import gradio as gr
import torch
from transformers.models.speecht5.number_normalizer import EnglishNumberNormalizer
from string import punctuation
import re

from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer, AutoFeatureExtractor, set_seed
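
# Device and checkpoint configuration: run the fine-tuned Parler-TTS Mini v1 checkpoint
# on CPU, reusing the base model's tokenizer and feature extractor.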
device = "cpu"

repo_id = "TArtx/parler-tts-mini-v1-finetuned-12"
model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-mini-v1")
feature_extractor = AutoFeatureExtractor.from_pretrained("parler-tts/parler-tts-mini-v1")

SAMPLE_RATE = feature_extractor.sampling_rate
SEED = 42

default_text = "This is a demonstration of my ability to convert written words into spoken language, seamlessly and naturally. As a text-to-speech model, my goal is to sound as clear and engaging as a human, making sure every word I say leaves an impression."
default_description = "moderate speed, very clear, monotone, wonderful speech quality"

number_normalizer = EnglishNumberNormalizer()
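

# Text normalization: spell out numbers, drop hyphens, ensure terminal punctuation,
# and space out all-caps abbreviations so they are read letter by letter.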
def preprocess(text):
    text = number_normalizer(text).strip()
    text = text.replace("-", " ")
    # Guard against empty input before indexing the last character.
    if text and text[-1] not in punctuation:
        text = f"{text}."

    abbreviations_pattern = r'\b[A-Z][A-Z\.]+\b'

    def separate_abb(chunk):
        chunk = chunk.replace(".", "")
        return " ".join(chunk)

    abbreviations = re.findall(abbreviations_pattern, text)
    for abv in abbreviations:
        if abv in text:
            text = text.replace(abv, separate_abb(abv))
    return text
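

# Speech generation: the description steers the speaking style, while the preprocessed
# text is the transcript to be spoken. Returns (sample_rate, waveform), the format
# expected by gr.Audio(type="numpy").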
def gen_tts(text, description):
    try:
        inputs = tokenizer(description.strip(), return_tensors="pt", truncation=True, max_length=128).to(device)
        prompt = tokenizer(preprocess(text), return_tensors="pt", truncation=True, max_length=128).to(device)

        set_seed(SEED)
        generation = model.generate(
            input_ids=inputs.input_ids,
            prompt_input_ids=prompt.input_ids,
            attention_mask=inputs.attention_mask,
            prompt_attention_mask=prompt.attention_mask,
            do_sample=True,
            temperature=1.0,
        )
        audio_arr = generation.cpu().numpy().squeeze()
        return SAMPLE_RATE, audio_arr
    except Exception as e:
        # Surface the failure in the UI instead of handing gr.Audio an error string.
        raise gr.Error(f"Generation failed: {e}")
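

# Gradio UI: text and description inputs plus a button on the left, generated audio on the right.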
with gr.Blocks() as block:
    gr.Markdown(
        """
## Parler-TTS 🗣️

Parler-TTS is a training and inference library for high-fidelity text-to-speech (TTS) models. This demo uses the Mini v1 model.
"""
    )
    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(label="Input Text", lines=2, value=default_text, elem_id="input_text")
            description = gr.Textbox(label="Description", lines=2, value=default_description, elem_id="input_description")
            run_button = gr.Button("Generate Audio", variant="primary")
        with gr.Column():
            audio_out = gr.Audio(label="Parler-TTS generation", type="numpy", elem_id="audio_out")

    inputs = [input_text, description]
    outputs = [audio_out]
    run_button.click(fn=gen_tts, inputs=inputs, outputs=outputs, queue=True)

block.queue()
block.launch()