|
import traceback

import gradio as gr
import numpy as np
import torch
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer, set_seed
|
|
|
|
|
# Run on the first CUDA GPU when torch reports one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# The model weights and the tokenizer are loaded from the same checkpoint id.
_CHECKPOINT = "TArtx/parler-tts-mini-narrated-13"
model = ParlerTTSForConditionalGeneration.from_pretrained(_CHECKPOINT)
model = model.to(device)
tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT)

# Sample rate is read from the loaded model's config; SEED is passed to
# set_seed() before every generation.
SAMPLE_RATE = model.config.sampling_rate
SEED = 42

# Values pre-filled into the two text boxes of the demo UI.
default_text = (
    "This is a demonstration of my ability to convert written words into "
    "spoken language, seamlessly and naturally. As a text-to-speech model, "
    "my goal is to sound as clear and engaging as a human, making sure "
    "every word I say leaves an impression."
)
default_description = "moderate speed, very clear, monotone, wonderful speech quality"
|
|
|
|
|
def _fallback_noise():
    """Return ten seconds of random int16 samples, the placeholder for failed runs."""
    return np.random.randint(-32768, 32767, SAMPLE_RATE * 10, dtype=np.int16)


def gen_tts(text: str, description: str):
    """Generate speech audio for ``text`` conditioned on ``description``.

    Args:
        text: The words to speak; tokenized and passed to the model as
            ``prompt_input_ids``.
        description: Free-form description of the desired voice; tokenized
            and passed to the model as ``input_ids``.

    Returns:
        A ``(SAMPLE_RATE, samples)`` tuple where ``samples`` is an int16
        NumPy array. On success the waveform is peak-normalized to the full
        int16 range. If the generated waveform is empty or has no positive
        peak, or if any exception is raised, ``samples`` is ten seconds of
        random noise instead; exceptions are logged, not propagated.
    """
    try:
        # Seed with the fixed SEED on every call, before sampling.
        set_seed(SEED)

        input_ids = tokenizer(description.strip(), return_tensors="pt").input_ids.to(device)
        prompt_input_ids = tokenizer(text.strip(), return_tensors="pt").input_ids.to(device)

        generation = model.generate(
            input_ids=input_ids,
            prompt_input_ids=prompt_input_ids,
            do_sample=True,
            temperature=0.7,
        )

        audio_arr = generation.cpu().numpy().squeeze()

        # Compute the peak once. An empty array is treated as silent rather
        # than letting np.max raise; a NaN peak also fails the `> 0` check.
        peak = np.max(np.abs(audio_arr)) if audio_arr.size else 0
        if peak > 0:
            audio_arr = audio_arr / peak
            audio_arr = (audio_arr * np.iinfo(np.int16).max).astype(np.int16)
        else:
            audio_arr = _fallback_noise()

        return SAMPLE_RATE, audio_arr

    except Exception as e:
        # UI boundary: keep the demo responsive, but log the full traceback
        # so the failure can actually be diagnosed from the console.
        print(f"Error in TTS generation: {str(e)}")
        traceback.print_exc()
        return SAMPLE_RATE, _fallback_noise()
|
|
|
|
|
# Page layout: a heading, then one row with the two text inputs and the
# button in the left column and the audio player in the right column.
with gr.Blocks() as block:
    gr.Markdown(
        """
## Parler-TTS 🗣️
Parler-TTS is a training and inference library for high-fidelity text-to-speech (TTS) models. This demo uses the Mini v1 model.
"""
    )

    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(
                label="Input Text",
                lines=2,
                value=default_text,
                elem_id="input_text",
            )
            description = gr.Textbox(
                label="Description",
                lines=2,
                value=default_description,
                elem_id="input_description",
            )
            run_button = gr.Button("Generate Audio", variant="primary")

        with gr.Column():
            audio_out = gr.Audio(
                label="Parler-TTS generation",
                type="numpy",
                elem_id="audio_out",
            )

    # Clicking the button calls gen_tts with both text boxes and sends the
    # returned (sample_rate, samples) pair to the audio component.
    inputs = [input_text, description]
    outputs = audio_out
    run_button.click(fn=gen_tts, inputs=inputs, outputs=outputs)

# Start the Gradio server for the page defined above.
block.launch(debug=True)