File size: 2,154 Bytes
b035836 f7a3a17 b035836 fc3bfeb 1e25145 b035836 31ab4d4 b035836 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 |
import gradio as gr
import librosa
import numpy as np
import torch
import os
import torch
from speechbrain.pretrained import EncoderClassifier
from transformers import AutoProcessor, AutoModelForTextToSpectrogram, SpeechT5HifiGan
# Checkpoint shared by the text processor and the text-to-spectrogram model.
_TTS_CHECKPOINT = "Prasada/speecht5_tts_voxpopuli_nl"

processor = AutoProcessor.from_pretrained(_TTS_CHECKPOINT)
model = AutoModelForTextToSpectrogram.from_pretrained(_TTS_CHECKPOINT)

# Vocoder that turns the spectrogram produced by `model` into a waveform.
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Speaker encoder; it runs on the GPU when one is available, otherwise on CPU.
spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

speaker_model = EncoderClassifier.from_hparams(
    source=spk_model_name,
    savedir=os.path.join("/tmp", spk_model_name),
    run_opts={"device": device},
)
def create_speaker_embedding(waveform):
    """Return a normalized speaker embedding for *waveform* as a NumPy array.

    The waveform is converted to a tensor, encoded with the module-level
    ``speaker_model``, normalized along dimension 2, squeezed to drop
    singleton dimensions, and moved to the CPU.
    """
    # Inference only: no gradients are needed for the embedding.
    with torch.no_grad():
        encoded = speaker_model.encode_batch(torch.tensor(waveform))
        unit_norm = torch.nn.functional.normalize(encoded, dim=2)
        return unit_norm.squeeze().cpu().numpy()
def prepare_data(temp_text, temp_audio):
    """Build the speaker-embedding tensor for a template recording.

    Args:
        temp_text: Transcript of the template recording. It is accepted for
            interface compatibility but not needed to compute the embedding.
        temp_audio: ``(sample_rate, samples)`` tuple as produced by
            ``gr.Audio(type="numpy")``.

    Returns:
        A tensor holding the speaker embedding with a leading batch
        dimension added via ``unsqueeze(0)``.
    """
    target_rate = 16000
    rate, audio_data = temp_audio

    waveform = np.asarray(audio_data)
    if np.issubdtype(waveform.dtype, np.integer):
        # Integer PCM -> float32 scaled by the dtype's maximum value.
        waveform = waveform.astype(np.float32) / np.iinfo(waveform.dtype).max
    else:
        waveform = waveform.astype(np.float32)

    if waveform.ndim > 1:
        # Down-mix to mono; assumes a (samples, channels) layout as delivered
        # by Gradio -- TODO confirm for other callers.
        waveform = waveform.mean(axis=1)

    if rate != target_rate:
        # The recording previously reached the speaker encoder at its native
        # rate; resample so the waveform matches the 16 kHz the app assumes.
        waveform = librosa.resample(
            waveform, orig_sr=rate, target_sr=target_rate
        )

    # The former processor(text=..., audio_target=...) call was dropped: its
    # output was never used, only the speaker embedding is returned.
    embedding = create_speaker_embedding(waveform)
    return torch.tensor(embedding).unsqueeze(0)
def predict(temp_text, temp_audio, text):
    """Synthesize *text* in the voice of the template recording.

    Args:
        temp_text: Transcript of the template recording (forwarded to
            ``prepare_data``).
        temp_audio: ``(sample_rate, samples)`` tuple from the audio input,
            or ``None`` when nothing was uploaded.
        text: The text to synthesize.

    Returns:
        ``(16000, samples)`` where ``samples`` is an ``int16`` NumPy array.

    Raises:
        gr.Error: If the template recording or the input text is missing.
    """
    # Fail with a readable message instead of a TypeError on tuple unpacking.
    if temp_audio is None:
        raise gr.Error("Please provide a template speech recording.")
    if not text or not text.strip():
        raise gr.Error("Please enter the text to synthesize.")

    embeddings = prepare_data(temp_text, temp_audio)
    inputs = processor(text=text, return_tensors="pt")
    # Keep the token sequence within the model's configured text length.
    input_ids = inputs["input_ids"][..., : model.config.max_text_positions]

    with torch.no_grad():
        spectrogram = model.generate_speech(input_ids, embeddings)
        speech = vocoder(spectrogram)

    # Scale the float waveform to 16-bit PCM for the Gradio audio output.
    speech = (speech.numpy() * 32767).astype(np.int16)
    return (16000, speech)
# Web UI: template text + template speech + input text -> generated speech.
demo = gr.Interface(
    fn=predict,
    inputs=[
        gr.Text(label="Template Text"),
        gr.Audio(label="Template Speech", type="numpy"),
        gr.Text(label="Input Text"),
    ],
    outputs=[gr.Audio(label="Generated Speech", type="numpy")],
)
demo.launch()
|