"""Gradio demo: text-to-speech with SpeechT5 and a selectable speaker voice."""

from functools import lru_cache

import gradio as gr
import numpy as np
import torch
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan

# Load the model and vocoder once at start-up; they are reused by every request.
checkpoint = "microsoft/speecht5_tts"
processor = SpeechT5Processor.from_pretrained(checkpoint)
model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Speaker embeddings for male and female (paths to .npy files on disk).
speaker_embeddings = {
    "male": "speaker/cmu_us_bdl_arctic-wav-arctic_a0009.npy",
    "female": "speaker/cmu_us_slt_arctic-wav-arctic_a0508.npy",
}

# Sample rate reported to Gradio together with the generated waveform.
SAMPLE_RATE = 16000

# Voice used when the caller does not pick one (also preselected in the UI).
DEFAULT_GENDER = "male"


@lru_cache(maxsize=None)
def _load_speaker_embedding(path):
    """Load the speaker embedding at *path* as a tensor with a leading batch axis.

    Results are cached per path so each ``.npy`` file is read from disk only
    once instead of on every request.
    """
    return torch.tensor(np.load(path)).unsqueeze(0)


# Function to generate speech
def text_to_speech(text, gender):
    """Synthesize speech for *text* using the voice selected by *gender*.

    Args:
        text: Text to speak. ``None``, empty, or whitespace-only text yields
            an empty audio clip instead of running the model.
        gender: Key into ``speaker_embeddings`` ("male" or "female").
            ``None`` (no radio option selected) falls back to
            ``DEFAULT_GENDER``.

    Returns:
        A ``(sample_rate, samples)`` tuple where ``samples`` is a NumPy
        ``int16`` array, the format accepted by ``gr.Audio``.

    Raises:
        KeyError: If *gender* is not ``None`` and is not a key of
            ``speaker_embeddings``.
    """
    # Guard against None as well as blank text; calling .strip() on None
    # would raise AttributeError.
    if not text or not text.strip():
        return (SAMPLE_RATE, np.zeros(0).astype(np.int16))

    # A Radio with nothing selected passes None; use the default voice rather
    # than failing with KeyError on the lookup below.
    if gender is None:
        gender = DEFAULT_GENDER

    inputs = processor(text=text, return_tensors="pt")

    # Truncate input if too long
    input_ids = inputs["input_ids"]
    input_ids = input_ids[..., :model.config.max_text_positions]

    # Load speaker embedding based on gender selection
    speaker_embedding = _load_speaker_embedding(speaker_embeddings[gender])

    # Generate speech
    speech = model.generate_speech(input_ids, speaker_embedding, vocoder=vocoder)

    # Clip before scaling: a sample outside [-1, 1] would otherwise wrap
    # around when cast to int16. In-range samples are unaffected.
    waveform = np.clip(speech.numpy(), -1.0, 1.0)
    speech = (waveform * 32767).astype(np.int16)

    return (SAMPLE_RATE, speech)


# Create the Gradio interface
iface = gr.Interface(
    fn=text_to_speech,
    inputs=[
        gr.Textbox(label="Enter Text"),
        # Gender selection; a preselected value means the function never
        # receives None from the UI.
        gr.Radio(["male", "female"], value=DEFAULT_GENDER, label="Select Voice Gender"),
    ],
    outputs=gr.Audio(label="Generated Speech"),
    title="Text-to-Speech Bot",
    description="Enter text and select a voice gender to generate speech.",
)

# Launch the interface
iface.launch()