# File size: 2,440 Bytes
# 29a7123 1610722 badff1c 828d42b 1610722 c7fbbca 1610722 badff1c 828d42b badff1c 1610722 c7fbbca 1610722 828d42b c7fbbca 1610722 c7fbbca 05020c4 badff1c 828d42b c7fbbca badff1c 05020c4 c7fbbca 05020c4 c7fbbca badff1c c7fbbca |
# 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 |
import os
import tempfile
from math import gcd

import gradio as gr
import soundfile as sf
import spaces
import torch
from scipy.signal import resample_poly
from speechbrain.pretrained import EncoderClassifier
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
# Run on the GPU when torch reports CUDA as available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
def load_models_and_data():
    """Load every model the demo needs and move them to ``device``.

    Returns:
        A ``(model, processor, vocoder, speaker_model)`` tuple holding the
        fine-tuned SpeechT5 text-to-speech model, the SpeechT5 processor,
        the HiFi-GAN vocoder and the SpeechBrain speaker encoder.
    """
    # The processor comes from the base checkpoint, while the text-to-speech
    # weights come from the fine-tuned checkpoint.
    tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    tts_model = SpeechT5ForTextToSpeech.from_pretrained(
        "emirhanbilgic/speecht5_finetuned_emirhan_tr"
    ).to(device)
    hifigan = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)

    # The speaker encoder's files are stored under /tmp/<source name>.
    encoder_source = "speechbrain/spkrec-xvect-voxceleb"
    speaker_encoder = EncoderClassifier.from_hparams(
        source=encoder_source,
        run_opts={"device": device},
        savedir=os.path.join("/tmp", encoder_source),
    )

    return tts_model, tts_processor, hifigan, speaker_encoder
# Load everything once at import time; the request handlers below reuse
# these module-level objects on every call.
model, processor, vocoder, speaker_model = load_models_and_data()
def create_speaker_embedding(waveform):
    """Compute a normalised speaker embedding for a single audio clip.

    Args:
        waveform: Audio samples of one clip, in any form ``torch.tensor``
            accepts.

    Returns:
        The embedding tensor on ``device``, L2-normalised along dimension 2
        and with all size-1 dimensions squeezed away.
    """
    # Add a leading batch dimension, since the encoder works on batches.
    batch = torch.tensor(waveform).unsqueeze(0).to(device)
    with torch.no_grad():
        raw_embedding = speaker_model.encode_batch(batch)
        unit_embedding = torch.nn.functional.normalize(raw_embedding, dim=2)
    return unit_embedding.squeeze().to(device)
# Sampling rate (Hz) fed to the speaker encoder and used for the written output.
_SAMPLE_RATE = 16000


def _load_reference_audio(audio_file):
    """Read *audio_file* as a mono waveform resampled to ``_SAMPLE_RATE`` Hz."""
    waveform, sample_rate = sf.read(audio_file)
    if waveform.ndim > 1:
        waveform = waveform[:, 0]  # Take the first channel if stereo
    if waveform.size == 0:
        raise gr.Error("The uploaded audio sample is empty.")
    if sample_rate != _SAMPLE_RATE:
        # The speaker encoder is trained on 16 kHz audio, so resample anything
        # else; dividing by the gcd keeps the polyphase factors small.
        common = gcd(_SAMPLE_RATE, sample_rate)
        waveform = resample_poly(
            waveform, _SAMPLE_RATE // common, sample_rate // common
        )
    return waveform


@spaces.GPU(duration=60)
def text_to_speech(text, audio_file):
    """Synthesise *text* in the voice of the speaker heard in *audio_file*.

    Args:
        text: The text to convert to speech.
        audio_file: Path to an audio file containing a sample of the target
            speaker.

    Returns:
        Path to a WAV file containing the generated speech.

    Raises:
        gr.Error: If the text is empty, no audio sample was provided, or the
            audio sample contains no samples.
    """
    # Fail early with a message the UI can display instead of an opaque
    # tokenizer or soundfile error.
    if not text or not text.strip():
        raise gr.Error("Please enter some text to convert to speech.")
    if not audio_file:
        raise gr.Error("Please upload an audio sample of the target speaker.")

    inputs = processor(text=text, return_tensors="pt").to(device)

    # Load the audio file and create speaker embedding
    waveform = _load_reference_audio(audio_file)
    speaker_embeddings = create_speaker_embedding(waveform)
    # generate_speech expects (batch_size, embedding_dim), so restore the
    # batch dimension when the embedding arrives as a bare vector.
    if speaker_embeddings.dim() == 1:
        speaker_embeddings = speaker_embeddings.unsqueeze(0)

    speech = model.generate_speech(
        inputs["input_ids"], speaker_embeddings, vocoder=vocoder
    )

    # Write to a unique temporary file so requests never overwrite each
    # other's result and the working directory need not be writable.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
        output_path = handle.name
    sf.write(output_path, speech.cpu().numpy(), samplerate=_SAMPLE_RATE)
    return output_path
# Gradio UI: a text box and a reference-audio upload as inputs, the
# synthesised audio as output.
iface = gr.Interface(
    fn=text_to_speech,
    inputs=[
        gr.Textbox(label="Enter Turkish text to convert to speech"),
        gr.Audio(label="Upload a short audio sample of the target speaker", type="filepath"),
    ],
    outputs=gr.Audio(label="Generated Speech"),
    title="Turkish SpeechT5 Text-to-Speech Demo with Custom Voice",
    description="Enter Turkish text, upload a short audio sample of the target speaker, and listen to the generated speech using the fine-tuned SpeechT5 model.",
)

# Start serving the interface.
iface.launch()