# File size: 2,193 Bytes | b035836
import gradio as gr
import librosa
import numpy as np
import torch
import os
import torch
from speechbrain.pretrained import EncoderClassifier
from scipy.io import wavfile
from IPython.display import Audio
from transformers import AutoProcessor, AutoModelForTextToSpectrogram, SpeechT5HifiGan
# Pretrained components, loaded once at import time. The names bound here
# (processor, model, vocoder, spk_model_name, device, speaker_model) are
# module-level globals used by the functions defined below.
_TTS_CHECKPOINT = "Prasada/speecht5_tts_voxpopuli_nl"

# The processor and the text-to-spectrogram model come from the same checkpoint;
# the HiFi-GAN vocoder is loaded from a separate one.
processor = AutoProcessor.from_pretrained(_TTS_CHECKPOINT)
model = AutoModelForTextToSpectrogram.from_pretrained(_TTS_CHECKPOINT)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

spk_model_name = "speechbrain/spkrec-xvect-voxceleb"

# Run the speaker encoder on the GPU when one is available.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Downloaded speaker-encoder files are stored under /tmp/<model name>.
_speaker_model_dir = os.path.join("/tmp", spk_model_name)
speaker_model = EncoderClassifier.from_hparams(
    source=spk_model_name,
    run_opts={"device": device},
    savedir=_speaker_model_dir,
)
def create_speaker_embedding(waveform):
    """Encode ``waveform`` with the speaker model and return a NumPy embedding.

    Args:
        waveform: Audio samples in any form accepted by ``torch.tensor``
            (presumably a 1-D array of samples -- confirm against callers).

    Returns:
        The output of ``speaker_model.encode_batch``, L2-normalised along
        dimension 2, with singleton dimensions squeezed out, moved to the CPU
        and converted to a NumPy array.
    """
    samples = torch.tensor(waveform)
    # Inference only, so gradient tracking is disabled while encoding.
    with torch.no_grad():
        encoded = speaker_model.encode_batch(samples)
        unit_norm = torch.nn.functional.normalize(encoded, dim=2)
    return unit_norm.squeeze().cpu().numpy()
def prepare_data(temp_text, temp_audio):
    """Build the speaker-embedding tensor for the template recording.

    Args:
        temp_text: Transcript of the template recording. Kept for interface
            compatibility; the embedding is computed from the audio alone.
        temp_audio: Either a ``(sample_rate, samples)`` pair, which is what
            ``gr.Audio(type="numpy")`` delivers, or a path / file object
            readable by ``scipy.io.wavfile.read``.

    Returns:
        The speaker embedding as a ``torch.Tensor`` with a leading batch
        dimension added by ``unsqueeze(0)``.

    Raises:
        ValueError: If ``temp_audio`` is ``None`` (no recording supplied).
    """
    if temp_audio is None:
        raise ValueError("Template speech is required to compute a speaker embedding.")

    # The rest of this file treats audio as 16 kHz (see the sampling rate used
    # when the generated speech is returned).
    target_rate = 16000

    # Gradio's numpy audio format is a (rate, data) tuple; wavfile.read() only
    # understands paths and file objects, so the two cases are handled apart.
    if isinstance(temp_audio, (tuple, list)):
        rate, audio_data = temp_audio
    else:
        rate, audio_data = wavfile.read(temp_audio)

    # Convert integer PCM to float32 in [-1, 1).
    audio_data = np.asarray(audio_data)
    if audio_data.dtype == np.uint8:
        # 8-bit WAV data is unsigned and centred on 128.
        audio_data = (audio_data.astype(np.float32) - 128.0) / 128.0
    elif np.issubdtype(audio_data.dtype, np.integer):
        full_scale = float(np.iinfo(audio_data.dtype).max) + 1.0
        audio_data = audio_data.astype(np.float32) / full_scale
    else:
        audio_data = audio_data.astype(np.float32)

    # Multi-channel audio arrives as (samples, channels); downmix to mono.
    if audio_data.ndim > 1:
        audio_data = audio_data.mean(axis=1)

    # Honour the real sample rate instead of assuming the input is 16 kHz.
    if rate != target_rate:
        audio_data = librosa.resample(
            audio_data, orig_sr=rate, target_sr=target_rate
        )

    speaker_embedding = create_speaker_embedding(audio_data)
    return torch.tensor(speaker_embedding).unsqueeze(0)
def predict(temp_text, temp_audio, text):
    """Synthesize ``text`` using a speaker embedding from the template audio.

    Args:
        temp_text: Transcript of the template recording; forwarded to
            ``prepare_data``.
        temp_audio: Template recording; forwarded to ``prepare_data``.
        text: The text to convert to speech.

    Returns:
        A ``(sample_rate, samples)`` tuple, the format consumed by a
        ``gr.Audio(type="numpy")`` output component. ``sample_rate`` is 16000
        and ``samples`` is the vocoder output as a NumPy array.
    """
    embeddings = prepare_data(temp_text, temp_audio)
    inputs = processor(text=text, return_tensors="pt")

    # Pure inference: keep both the spectrogram model and the vocoder out of
    # autograd so no graph is built.
    with torch.no_grad():
        spectrogram = model.generate_speech(inputs["input_ids"], embeddings)
        speech = vocoder(spectrogram)

    # Gradio's numpy audio output expects (rate, ndarray); an
    # IPython.display.Audio object is only meaningful inside a notebook.
    return 16000, speech.cpu().numpy()
# Gradio UI: template transcript, template recording and the text to
# synthesize go in; one generated audio clip comes out.
_template_text_box = gr.Text(label="Template Text")
_template_speech_input = gr.Audio(label="Template Speech", type="numpy")
_input_text_box = gr.Text(label="Input Text")
_generated_speech_output = gr.Audio(label="Generated Speech", type="numpy")

_demo = gr.Interface(
    fn=predict,
    inputs=[_template_text_box, _template_speech_input, _input_text_box],
    outputs=[_generated_speech_output],
)
# Start the web app as soon as the script runs.
_demo.launch()
|