|
import gradio as gr |
|
import numpy as np |
|
import torch |
|
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan |
|
|
|
|
|
# Checkpoint name handed to from_pretrained for both the processor and the
# text-to-speech model.
checkpoint = "microsoft/speecht5_tts"

# These objects are created once at module level and reused by every call to
# text_to_speech.
processor = SpeechT5Processor.from_pretrained(checkpoint)
model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)

# The vocoder is loaded from its own checkpoint and later passed to
# model.generate_speech.
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
|
|
|
|
|
# Maps each selectable voice to the .npy file that text_to_speech loads with
# np.load to obtain the speaker embedding. Paths are relative, so they are
# resolved against the current working directory.
speaker_embeddings = {
    "male": "speaker/cmu_us_bdl_arctic-wav-arctic_a0009.npy",
    "female": "speaker/cmu_us_slt_arctic-wav-arctic_a0508.npy",
}
|
|
|
|
|
def text_to_speech(text, gender):
    """Synthesize speech for ``text`` using the voice selected by ``gender``.

    Args:
        text: Text to speak. ``None``, empty, or whitespace-only text yields
            an empty clip without running the model.
        gender: Key into the module-level ``speaker_embeddings`` mapping
            (``"male"`` or ``"female"``).

    Returns:
        A ``(sample_rate, samples)`` tuple where ``sample_rate`` is 16000 and
        ``samples`` is a NumPy ``int16`` array.

    Raises:
        gr.Error: If ``gender`` is not a key of ``speaker_embeddings`` (for
            example ``None`` when no voice was chosen).
    """
    sample_rate = 16000

    # Nothing to synthesize: return an empty clip instead of calling the model.
    if not text or not text.strip():
        return (sample_rate, np.zeros(0).astype(np.int16))

    # Fail with a readable message rather than a bare KeyError when the voice
    # is missing or unknown.
    speaker_embedding_path = speaker_embeddings.get(gender)
    if speaker_embedding_path is None:
        raise gr.Error("Please select a voice gender.")

    inputs = processor(text=text, return_tensors="pt")

    # Keep at most model.config.max_text_positions entries along the last axis.
    input_ids = inputs["input_ids"][..., :model.config.max_text_positions]

    # Load the stored embedding and add a leading dimension of size 1.
    speaker_embedding = torch.tensor(np.load(speaker_embedding_path)).unsqueeze(0)

    speech = model.generate_speech(input_ids, speaker_embedding, vocoder=vocoder)

    # Scale the model output by 32767 and store it as int16 samples.
    speech = (speech.numpy() * 32767).astype(np.int16)

    return (sample_rate, speech)
|
|
|
|
|
# Web UI: the text box and voice selector are passed, in order, as the
# (text, gender) arguments of text_to_speech; its return value is shown in
# the audio output component.
iface = gr.Interface(
    fn=text_to_speech,
    inputs=[
        gr.Textbox(label="Enter Text"),
        # Preselect a voice so that submitting without touching the selector
        # still sends a valid key to text_to_speech.
        gr.Radio(["male", "female"], value="male", label="Select Voice Gender"),
    ],
    outputs=gr.Audio(label="Generated Speech"),
    title="Text-to-Speech Bot",
    description="Enter text and select a voice gender to generate speech.",
)

# Start serving the interface.
iface.launch()
|
|