import functools
import os
import tempfile

import gradio as gr
import torch
import torchaudio
from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor


# Raw-file ("resolve") URL of the speaker embedding. The Hub's "blob" URLs
# serve an HTML viewer page rather than the file bytes, so they cannot be
# downloaded and parsed.
# NOTE(review): confirm this path exists in the model repo; the model card
# sources its x-vectors from the "Matthijs/cmu-arctic-xvectors" dataset.
SPEAKER_EMBEDDING_URL = (
    "https://huggingface.co/microsoft/speecht5_tts/resolve/main/"
    "speaker_embeddings/english/vctk_speaker_0.pt"
)


def _load_speaker_embedding(url):
    """Download a serialized speaker embedding and return it as a 2-D tensor.

    Args:
        url: Direct-download URL of a file written with ``torch.save``.
            Assumes the file holds a single embedding vector (tensor or
            array-like) -- TODO confirm against the hosted file.

    Returns:
        A float32 tensor reshaped to ``(1, embedding_dim)``, the batched
        layout that ``generate_speech`` takes for ``speaker_embeddings``.
    """
    with tempfile.TemporaryDirectory() as download_dir:
        local_path = os.path.join(download_dir, "speaker_embedding.pt")
        torch.hub.download_url_to_file(url, local_path, progress=False)
        # weights_only=True restricts unpickling to tensors and plain
        # containers, since the file comes from the network.
        embedding = torch.load(local_path, map_location="cpu", weights_only=True)
    return torch.as_tensor(embedding, dtype=torch.float32).reshape(1, -1)


# Loaded once at import time and shared by every request.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")

# A speaker embedding is a fixed-size vector, not audio, so it is read with
# torch.load instead of torchaudio.load (which decodes audio files and
# returns a (waveform, sample_rate) pair).
speaker_embedding = _load_speaker_embedding(SPEAKER_EMBEDDING_URL)


# Sample rate (Hz) written to the WAV header; the original hard-coded 16000.
_SAMPLE_RATE_HZ = 16000


@functools.lru_cache(maxsize=1)
def _get_vocoder():
    """Load the HiFi-GAN vocoder on first use and reuse it afterwards."""
    return SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")


def text_to_speech(text):
    """Synthesize ``text`` to speech and return the path of a WAV file.

    Args:
        text: The text to speak.

    Returns:
        Path of a newly created temporary ``.wav`` file. The file is created
        with ``delete=False``, so it is not removed automatically.

    Raises:
        gr.Error: If ``text`` is empty or contains only whitespace.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    # SpeechT5Processor takes its inputs by keyword (text= / audio=), so the
    # text must not be passed positionally.
    inputs = processor(text=text, return_tensors="pt")

    # Clip overly long prompts to the number of text positions the model
    # supports instead of failing inside the encoder.
    input_ids = inputs["input_ids"][..., : model.config.max_text_positions]

    # Without a vocoder, generate_speech returns a mel spectrogram rather
    # than a waveform, which cannot be written out as audio.
    waveform = model.generate_speech(
        input_ids, speaker_embedding, vocoder=_get_vocoder()
    )

    # Reserve a unique path, then close the handle before writing: saving to
    # a file that is still open fails on Windows.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        wav_path = tmp.name

    # torchaudio.save expects a 2-D (channels, frames) tensor; the vocoder
    # output is 1-D, so add a mono channel dimension.
    torchaudio.save(wav_path, waveform.detach().cpu().unsqueeze(0), _SAMPLE_RATE_HZ)
    return wav_path


# Text shown in the page header of the demo.
APP_TITLE = "Text to Speech"
APP_DESCRIPTION = "Convert text to speech using the microsoft/speecht5_tts model"

# One text input wired to text_to_speech, one audio output for its result.
interface = gr.Interface(
    fn=text_to_speech,
    inputs="text",
    outputs="audio",
    title=APP_TITLE,
    description=APP_DESCRIPTION,
)

# Start the web server as soon as the script is run.
interface.launch()