# wifix199's picture
# Upload 2 files
# e67e9cb verified
# raw
# history blame
# 1.77 kB
import gradio as gr
import numpy as np
import torch
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
# Paths to the saved speaker-embedding arrays (.npy), keyed by the voice
# label offered in the UI.
speaker_embeddings = dict(
    male="speaker/cmu_us_bdl_arctic-wav-arctic_a0009.npy",
    female="speaker/cmu_us_slt_arctic-wav-arctic_a0508.npy",
)

# The processor and the text-to-speech model are loaded from the same
# checkpoint; the vocoder is loaded from its own checkpoint.
checkpoint = "microsoft/speecht5_tts"
_vocoder_checkpoint = "microsoft/speecht5_hifigan"

processor = SpeechT5Processor.from_pretrained(checkpoint)
model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
vocoder = SpeechT5HifiGan.from_pretrained(_vocoder_checkpoint)
# Sample rate (Hz) reported alongside every waveform this module returns.
_SAMPLE_RATE = 16000


def text_to_speech(text, gender):
    """Synthesize ``text`` as speech using the voice selected by ``gender``.

    Args:
        text: Text to speak. ``None``, empty, or whitespace-only text yields
            an empty waveform without running the model.
        gender: Key into ``speaker_embeddings`` ("male" or "female").

    Returns:
        A ``(sample_rate, samples)`` tuple where ``sample_rate`` is 16000 and
        ``samples`` is a NumPy ``int16`` array.

    Raises:
        ValueError: If ``gender`` is not a key of ``speaker_embeddings``
            (for example ``None`` when no voice was selected).
    """
    # Nothing to say: return silence rather than feeding the model no input.
    if text is None or not text.strip():
        return (_SAMPLE_RATE, np.zeros(0, dtype=np.int16))

    # Validate the voice up front so a bad selection fails with a clear
    # message before any tokenization or model work is done.
    if gender not in speaker_embeddings:
        valid = ", ".join(sorted(speaker_embeddings))
        raise ValueError(
            f"Unknown voice gender {gender!r}; expected one of: {valid}"
        )

    inputs = processor(text=text, return_tensors="pt")
    # Truncate the token sequence to the maximum length the model config allows.
    input_ids = inputs["input_ids"][..., : model.config.max_text_positions]

    # Load the stored embedding for the chosen voice and add a leading
    # dimension (presumably the batch axis expected by generate_speech).
    embedding_array = np.load(speaker_embeddings[gender])
    speaker_embedding = torch.tensor(embedding_array).unsqueeze(0)

    speech = model.generate_speech(input_ids, speaker_embedding, vocoder=vocoder)

    # Clamp to [-1, 1] before scaling so out-of-range samples saturate
    # instead of wrapping around in the int16 cast.
    samples = np.clip(speech.numpy(), -1.0, 1.0)
    return (_SAMPLE_RATE, (samples * 32767).astype(np.int16))
# Create the Gradio interface: a text box and a voice selector are passed to
# text_to_speech, and its return value is rendered by an audio component.
iface = gr.Interface(
    fn=text_to_speech,
    inputs=[
        gr.Textbox(label="Enter Text"),
        # Pre-select a voice: without an initial value, submitting before
        # touching the radio passes gender=None, which text_to_speech cannot
        # map to a speaker embedding.
        gr.Radio(["male", "female"], value="male", label="Select Voice Gender"),
    ],
    outputs=gr.Audio(label="Generated Speech"),
    title="Text-to-Speech Bot",
    description="Enter text and select a voice gender to generate speech.",
)

# Launch the interface
iface.launch()