Serhiy Stetskovych committed on
Commit ce548c0 · 1 Parent(s): 77d64d5

Update app to work with new code

Files changed (4):
  1. app.py +64 -16
  2. config.yml +0 -105
  3. infer.py +0 -237
  4. requirements.txt +1 -11
app.py CHANGED
@@ -1,11 +1,21 @@
 import glob
 import os
+import re
 import gradio as gr
-from infer import inference, split_to_parts
 import onnxruntime
 from transformers import AutoTokenizer
 from huggingface_hub import hf_hub_download
 import numpy as np
+import torch
+from ipa_uk import ipa
+from unicodedata import normalize
+from styletts2_inference.models import StyleTTS2
+from ukrainian_word_stress import Stressifier
+stressify = Stressifier()
+from text_utils import TextCleaner
+textclenaer = TextCleaner()
+
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
 
 prompts_dir = 'voices'
 prompts_list = sorted(glob.glob(os.path.join(prompts_dir, '*.wav')))
@@ -71,6 +81,26 @@ def init_verbalizer():
 tokenizer, encoder_session, decoder_session = init_verbalizer()
 
 
+def split_to_parts(text):
+    split_symbols = '.?!:'
+    parts = ['']
+    index = 0
+    for s in text:
+        parts[index] += s
+        if s in split_symbols and len(parts[index]) > 150:
+            index += 1
+            parts.append('')
+    return parts
+
+
+
+models = {
+    'multi': StyleTTS2(hf_path='patriotyk/styletts2_ukrainian_multispeaker', device=device),
+    'single': StyleTTS2(hf_path='patriotyk/styletts2_ukrainian_single', device=device)
+}
+
+
+
 def generate_text(text):
     """Generate text for a single input."""
     # Prepare input
@@ -137,8 +167,10 @@ examples = [
     ["Очікується, що цей застосунок буде запущено 22.08.2025.", 1.0],
 ]
 
-def synthesize_multi(text, voice_audio, speed, progress=gr.Progress()):
-    prompt_audio_path = os.path.join(prompts_dir, voice_audio+'.wav')
+
+
+def synthesize(model_name, text, speed, voice_audio = None, progress=gr.Progress()):
+
     if text.strip() == "":
         raise gr.Error("You must enter some text")
     if len(text) > 50000:
@@ -147,20 +179,35 @@ def synthesize_multi(text, voice_audio, speed, progress=gr.Progress()):
     print(text)
     print("*** end ***")
 
-    return 24000, inference('multi', text, prompt_audio_path, progress, speed=speed, alpha=0, beta=0, diffusion_steps=20, embedding_scale=1.0)[0]
+    diffusion_steps = 4
+    voice = None
+    if voice_audio:
+        prompt_audio_path = os.path.join(prompts_dir, voice_audio+'.wav')
+        voice = models[model_name].compute_style(prompt_audio_path)
+        diffusion_steps = 10
 
-
-
-def synthesize_single(text, speed, progress=gr.Progress()):
-    if text.strip() == "":
-        raise gr.Error("You must enter some text")
-    if len(text) > 50000:
-        raise gr.Error("Text must be <50k characters")
-    print("*** saying ***")
-    print(text)
-    print("*** end ***")
+    s_prev = torch.tensor([[0]])
+    result_wav = []
+    for t in progress.tqdm(split_to_parts(text)):
+        if t:
+            t = t.strip()
+            t = t.replace('"', '')
+            t = t.replace('+', 'ˈ')
+            t = normalize('NFKC', t)
 
-    return 24000, inference('single', text, None, progress, speed=speed, alpha=1, beta=0, diffusion_steps=4, embedding_scale=1.0)[0]
+            t = re.sub(r'[᠆‐‑‒–—―⁻₋−⸺⸻]', '-', t)
+            t = re.sub(r' - ', ': ', t)
+            ps = ipa(stressify(t))
+
+            tokens = textclenaer(ps)
+
+            wav, s_prev = models[model_name](torch.LongTensor(tokens), voice=voice, speed=speed, diffusion_steps=diffusion_steps, s_prev=s_prev)
+            result_wav.append(wav)
 
 
+    return 24000, torch.concatenate(result_wav).numpy()
+
+
 
 def select_example(df, evt: gr.SelectData):
     return evt.row_value
@@ -181,8 +228,8 @@ with gr.Blocks() as single:
         type="numpy",
     )
    synthesise_button = gr.Button("Синтезувати")
-
-    synthesise_button.click(synthesize_single, inputs=[input_text, speed], outputs=[output_audio])
+    single_text = gr.Text(value='single', visible=False)
+    synthesise_button.click(synthesize, inputs=[single_text, input_text, speed], outputs=[output_audio])
 
    with gr.Row():
        examples_table = gr.Dataframe(wrap=True, headers=["Текст", "Швидкість"], datatype=["str", "number"], value=examples, interactive=False)
@@ -205,8 +252,9 @@ with gr.Blocks() as multy:
        type="numpy",
    )
    synthesise_button = gr.Button("Синтезувати")
+    multi = gr.Text(value='multi', visible=False)
 
-    synthesise_button.click(synthesize_multi, inputs=[input_text, speaker, speed], outputs=[output_audio])
+    synthesise_button.click(synthesize, inputs=[multi, input_text, speed, speaker], outputs=[output_audio])
    with gr.Row():
        examples_table = gr.Dataframe(wrap=True, headers=["Текст", "Швидкість"], datatype=["str", "number"], value=examples, interactive=False)
        examples_table.select(select_example, inputs=[examples_table], outputs=[input_text, speed])
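
Review note: the chunking logic that used to live in infer.py now sits directly in app.py. A quick standalone sketch of how split_to_parts behaves; the function is copied verbatim from the diff above, and the sample input is made up:

def split_to_parts(text):
    split_symbols = '.?!:'
    parts = ['']
    index = 0
    for s in text:
        parts[index] += s
        # Start a new chunk only at a sentence-like boundary, and only once
        # the running chunk has grown past 150 characters.
        if s in split_symbols and len(parts[index]) > 150:
            index += 1
            parts.append('')
    return parts

# Hypothetical ~220-character input: chunks are cut at the first '.', '?',
# '!' or ':' after 150 characters; the final remainder may be shorter.
sample = ("Перше речення. " * 15).strip()
print([len(c) for c in split_to_parts(sample)])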
 
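
For reviewers who want to exercise the new dependency in isolation, here is a minimal sketch of the inference path as this diff uses it. The API shape (StyleTTS2(hf_path=..., device=...), .compute_style(path), and calling the model with voice/speed/diffusion_steps/s_prev to get back a waveform plus the next style state) is taken from the diff itself; the prompt file name and token ids below are placeholders:

# Minimal sketch of the new styletts2-inference path, assuming the API
# exactly as used in app.py above. File name and tokens are hypothetical.
import torch
from styletts2_inference.models import StyleTTS2

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = StyleTTS2(hf_path='patriotyk/styletts2_ukrainian_multispeaker', device=device)

voice = model.compute_style('voices/speaker1.wav')  # hypothetical prompt wav
s_prev = torch.tensor([[0]])                        # initial style state, as in app.py
tokens = [1, 2, 3]                                  # placeholder ids from TextCleaner
wav, s_prev = model(torch.LongTensor(tokens), voice=voice, speed=1.0,
                    diffusion_steps=10, s_prev=s_prev)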
config.yml DELETED
@@ -1,105 +0,0 @@
-F0_path: "weights/jdc.bin"
-ASR_config: "Utils/ASR/config.yml"
-ASR_path: "weights/asr.bin"
-
-
-model_params_multi:
-  multispeaker: true
-
-  dim_in: 64
-  hidden_dim: 512
-  max_conv_dim: 512
-  n_layer: 3
-  n_mels: 80
-
-  n_token: 181 # number of phoneme tokens
-  max_dur: 50 # maximum duration of a single phoneme
-  style_dim: 128 # style vector size
-
-  dropout: 0.2
-
-  # config for decoder
-  decoder:
-    type: 'hifigan' # either hifigan or istftnet
-    resblock_kernel_sizes: [3,7,11]
-    upsample_rates : [10,5,3,2]
-    upsample_initial_channel: 512
-    resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
-    upsample_kernel_sizes: [20,10,6,4]
-
-  # speech language model config
-  slm:
-    model: ''
-    sr: 16000 # sampling rate of SLM
-    hidden: 768 # hidden size of SLM
-    nlayers: 13 # number of layers of SLM
-    initial_channel: 64 # initial channels of SLM discriminator head
-
-  # style diffusion model config
-  diffusion:
-    embedding_mask_proba: 0.1
-    # transformer config
-    transformer:
-      num_layers: 3
-      num_heads: 8
-      head_features: 64
-      multiplier: 2
-
-    # diffusion distribution config
-    dist:
-      sigma_data: 0.19988229232390187 # placeholder for estimate_sigma_data set to false
-      estimate_sigma_data: true # estimate sigma_data from the current batch if set to true
-      mean: -3.0
-      std: 1.0
-
-model_params_single:
-  multispeaker: false
-
-  dim_in: 64
-  hidden_dim: 512
-  max_conv_dim: 512
-  n_layer: 3
-  n_mels: 80
-
-  n_token: 181 # number of phoneme tokens
-  max_dur: 50 # maximum duration of a single phoneme
-  style_dim: 128 # style vector size
-
-  dropout: 0.2
-
-  # config for decoder
-  decoder:
-    type: 'istftnet' # either hifigan or istftnet
-    resblock_kernel_sizes: [3,7,11]
-    upsample_rates : [10, 6]
-    upsample_initial_channel: 512
-    resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
-    upsample_kernel_sizes: [20, 12]
-    gen_istft_n_fft: 20
-    gen_istft_hop_size: 5
-
-  # speech language model config
-  slm:
-    model: 'openai/whisper-medium'
-    sr: 16000 # sampling rate of SLM
-    hidden: 768 # hidden size of SLM
-    nlayers: 13 # number of layers of SLM
-    initial_channel: 64 # initial channels of SLM discriminator head
-
-  # style diffusion model config
-  diffusion:
-    embedding_mask_proba: 0.1
-    # transformer config
-    transformer:
-      num_layers: 3
-      num_heads: 8
-      head_features: 64
-      multiplier: 2
-
-    # diffusion distribution config
-    dist:
-      sigma_data: 0.18 # placeholder for estimate_sigma_data set to false
-      estimate_sigma_data: true # estimate sigma_data from the current batch if set to true
-      mean: -3.0
-      std: 1.0
-
infer.py DELETED
@@ -1,237 +0,0 @@
-import torch
-torch.manual_seed(0)
-torch.backends.cudnn.benchmark = False
-torch.backends.cudnn.deterministic = True
-
-import random
-random.seed(0)
-
-import numpy as np
-np.random.seed(0)
-import librosa
-from copy import deepcopy
-from huggingface_hub import hf_hub_download
-
-import spaces
-import yaml
-import re
-import numpy as np
-import torch
-import torch.nn.functional as F
-import torchaudio
-from ipa_uk import ipa
-from unicodedata import normalize
-from ukrainian_word_stress import Stressifier, StressSymbol
-stressify = Stressifier()
-
-
-
-from models import *
-from utils import *
-from text_utils import TextCleaner
-textclenaer = TextCleaner()
-
-
-device = 'cuda' if torch.cuda.is_available() else 'cpu'
-
-to_mel = torchaudio.transforms.MelSpectrogram(
-    n_mels=80, n_fft=2048, win_length=1200, hop_length=300)
-mean, std = -4, 4
-
-def length_to_mask(lengths):
-    mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
-    mask = torch.gt(mask+1, lengths.unsqueeze(1))
-    return mask
-
-
-def load_state_dict(model, params):
-    for key in model:
-        if key in params:
-            print('%s loaded' % key)
-            try:
-                model[key].load_state_dict(params[key])
-            except:
-                from collections import OrderedDict
-                state_dict = params[key]
-                new_state_dict = OrderedDict()
-                for k, v in state_dict.items():
-                    name = k[7:] # remove `module.`
-                    new_state_dict[name] = v
-
-                model[key].load_state_dict(new_state_dict, strict=False)
-
-
-config = yaml.safe_load(open('config.yml'))
-
-# load pretrained ASR model
-ASR_config = config.get('ASR_config', False)
-ASR_path = config.get('ASR_path', False)
-text_aligner = load_ASR_models(ASR_path, ASR_config)
-
-# load pretrained F0 model
-F0_path = config.get('F0_path', False)
-pitch_extractor = load_F0_models(F0_path)
-
-# load BERT model
-from Utils.PLBERT.util import load_plbert
-
-plbert = load_plbert('weights/plbert.bin', 'Utils/PLBERT/config.yml')
-
-model_single = build_model(recursive_munch(config['model_params_single']), text_aligner, pitch_extractor, plbert)
-model_multi = build_model(recursive_munch(config['model_params_multi']), deepcopy(text_aligner), deepcopy(pitch_extractor), deepcopy(plbert))
-
-
-multi_path = hf_hub_download(repo_id='patriotyk/styletts2_ukrainian_multispeaker', filename="pytorch_model.bin")
-params_multi = torch.load(multi_path, map_location='cpu')
-
-
-single_path = hf_hub_download(repo_id='patriotyk/styletts2_ukrainian_single', filename="pytorch_model.bin")
-params_single = torch.load(single_path, map_location='cpu')
-
-
-load_state_dict(model_single, params_single)
-_ = [model_single[key].eval() for key in model_single]
-_ = [model_single[key].to(device) for key in model_single]
-
-
-load_state_dict(model_multi, params_multi)
-_ = [model_multi[key].eval() for key in model_multi]
-_ = [model_multi[key].to(device) for key in model_multi]
-
-
-
-models = {
-    'multi': model_multi,
-    'single': model_single
-}
-
-
-
-def preprocess(wave):
-    wave_tensor = torch.from_numpy(wave).float()
-    mel_tensor = to_mel(wave_tensor)
-    mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std
-    return mel_tensor
-
-def compute_style(voice_audio):
-    wave, sr = librosa.load(voice_audio, sr=24000)
-    audio, index = librosa.effects.trim(wave, top_db=30)
-    if sr != 24000:
-        audio = librosa.resample(audio, sr, 24000)
-    mel_tensor = preprocess(audio).to(device)
-
-    with torch.no_grad():
-        ref_s = models['multi'].style_encoder(mel_tensor.unsqueeze(1))
-        ref_p = models['multi'].predictor_encoder(mel_tensor.unsqueeze(1))
-
-    return torch.cat([ref_s, ref_p], dim=1)
-
-
-def split_to_parts(text):
-    split_symbols = '.?!:'
-    parts = ['']
-    index = 0
-    for s in text:
-        parts[index] += s
-        if s in split_symbols and len(parts[index]) > 150:
-            index += 1
-            parts.append('')
-    return parts
-
-
-
-def _inf(model, text, ref_s, speed, s_prev, noise, alpha, beta, diffusion_steps, embedding_scale):
-    model = models[model]
-    text = text.strip()
-    text = text.replace('"', '')
-    text = text.replace('+', 'ˈ')
-    text = normalize('NFKC', text)
-
-    text = re.sub(r'[᠆‐‑‒–—―⁻₋−⸺⸻]', '-', text)
-    text = re.sub(r' - ', ': ', text)
-    ps = ipa(stressify(text))
-    print(ps)
-
-    tokens = textclenaer(ps)
-    tokens.insert(0, 0)
-
-    tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)
-
-    with torch.no_grad():
-        input_lengths = torch.LongTensor([tokens.shape[-1]]).to(tokens.device)
-        text_mask = length_to_mask(input_lengths).to(tokens.device)
-
-        t_en = model.text_encoder(tokens, input_lengths, text_mask)
-        bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
-        d_en = model.bert_encoder(bert_dur).transpose(-1, -2)
-
-
-        if ref_s is None:
-            s_pred = model.sampler(noise,
-                embedding=bert_dur[0].unsqueeze(0), num_steps=diffusion_steps,
-                embedding_scale=embedding_scale).squeeze(0)
-        else:
-            s_pred = model.sampler(noise = noise,
-                embedding=bert_dur,
-                embedding_scale=embedding_scale,
-                features=ref_s, # reference from the same speaker as the embedding
-                num_steps=diffusion_steps).squeeze(1)
-
-        if s_prev is not None:
-            # convex combination of previous and current style
-            s_pred = alpha * s_prev + (1 - alpha) * s_pred
-
-        s = s_pred[:, 128:]
-        ref = s_pred[:, :128]
-
-        if ref_s is not None:
-            ref = alpha * ref + (1 - alpha) * ref_s[:, :128]
-            s = beta * s + (1 - beta) * ref_s[:, 128:]
-
-        d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)
-
-        x, _ = model.predictor.lstm(d)
-        duration = model.predictor.duration_proj(x)
-
-        duration = torch.sigmoid(duration).sum(axis=-1)/speed
-        pred_dur = torch.round(duration.squeeze()).clamp(min=1)
-
-        if ref_s is not None:
-            pred_dur[0] = 30
-
-
-        pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))
-        c_frame = 0
-        for i in range(pred_aln_trg.size(0)):
-            pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1
-            c_frame += int(pred_dur[i].data)
-
-        # encode prosody
-        en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))
-        F0_pred, N_pred = model.predictor.F0Ntrain(en, s)
-        asr = (t_en @ pred_aln_trg.unsqueeze(0).to(device))
-
-        out = model.decoder(asr, F0_pred, N_pred, ref.squeeze().unsqueeze(0))
-        if ref_s is not None:
-            out = out[:,:, 14500:]
-    return out.squeeze().cpu().numpy(), s_pred, ps
-
-
-@spaces.GPU
-def inference(model, text, voice_audio, progress, speed=1, alpha=0.4, beta=0.4, diffusion_steps=10, embedding_scale=1.2):
-
-    wavs = []
-    s_prev = None
-
-    #sentences = text.split('|')
-    sentences = split_to_parts(text)
-
-    phonemes = ''
-    noise = torch.randn(1,1,256).to(device)
-    ref_s = compute_style(voice_audio) if voice_audio else None
-    for text in progress.tqdm(sentences):
-        if text.strip() == "": continue
-        wav, s_prev, ps = _inf(model, text, ref_s, speed, s_prev, noise, alpha=alpha, beta=beta, diffusion_steps=diffusion_steps, embedding_scale=embedding_scale)
-        wavs.append(wav)
-        phonemes += ' ' + ps
-    return np.concatenate(wavs), phonemes
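
One behavioural detail worth noting in the deleted code: continuity between sentence chunks came from the convex combination s_pred = alpha * s_prev + (1 - alpha) * s_pred in _inf. A toy illustration of that blend, with invented values:

# Style carry-over in the old _inf: with alpha=0.4 (the old inference()
# default), the blended style keeps 40% of the previous chunk's style.
import torch

alpha = 0.4
s_prev = torch.tensor([1.0, 0.0])  # style of the previous chunk
s_pred = torch.tensor([0.0, 1.0])  # style predicted for this chunk
print(alpha * s_prev + (1 - alpha) * s_pred)  # tensor([0.4000, 0.6000])

In the new app.py this state-threading moves behind the wrapper: each model call returns s_prev, and the loop feeds it back into the next call.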
requirements.txt CHANGED
@@ -1,20 +1,10 @@
 SoundFile
 torchaudio==2.2.0
-munch
 torch==2.2.0
-pydub
-pyyaml
-librosa
-tqdm
-scipy
-gradio
-gruut
-einops
-einops_exts
-txtsplit
 transformers
 git+https://github.com/patriotyk/ukrainian-word-stress.git
 git+https://github.com/patriotyk/ipa-uk.git
+git+https://github.com/patriotyk/styletts2-inference
 spaces
 numpy<2
 huggingface_hub