"""Gradio demo: voice-cloning TTS with SpeechT5 and an x-vector speaker encoder.

The user supplies a short template recording (to extract a speaker embedding)
and a line of text; the app synthesizes that text in the template voice.
"""
import os

import gradio as gr
import numpy as np
import torch
from speechbrain.pretrained import EncoderClassifier
from transformers import AutoModelForTextToSpectrogram, AutoProcessor, SpeechT5HifiGan

# --- One-time model setup (runs at import) ---
processor = AutoProcessor.from_pretrained("Prasada/speecht5_tts_voxpopuli_nl")
model = AutoModelForTextToSpectrogram.from_pretrained("Prasada/speecht5_tts_voxpopuli_nl")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
device = "cuda" if torch.cuda.is_available() else "cpu"
speaker_model = EncoderClassifier.from_hparams(
    source=spk_model_name,
    run_opts={"device": device},
    savedir=os.path.join("/tmp", spk_model_name),
)


def create_speaker_embedding(waveform):
    """Return an L2-normalized x-vector speaker embedding as a numpy array.

    Args:
        waveform: 1-D audio samples (numpy array). Integer PCM is scaled
            to [-1, 1]; the x-vector model expects float input.
    """
    wav = np.asarray(waveform)
    if np.issubdtype(wav.dtype, np.integer):
        # Scale e.g. int16 PCM into the float range the encoder was trained on.
        wav = wav.astype(np.float32) / np.iinfo(wav.dtype).max
    with torch.no_grad():
        speaker_embeddings = speaker_model.encode_batch(
            torch.tensor(wav, dtype=torch.float32)
        )
        speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
        speaker_embeddings = speaker_embeddings.squeeze().cpu().numpy()
    return speaker_embeddings


def prepare_data(temp_text, temp_audio):
    """Build a (1, embedding_dim) speaker-embedding tensor from template audio.

    Args:
        temp_text: transcript of the template clip (kept for interface
            compatibility; the embedding depends only on the audio).
        temp_audio: the ``(sample_rate, samples)`` tuple that
            ``gr.Audio(type="numpy")`` delivers.
    """
    # gr.Audio(type="numpy") passes a (rate, ndarray) tuple, not a filepath.
    rate, audio_data = temp_audio
    embedding = create_speaker_embedding(audio_data)
    return torch.tensor(embedding).unsqueeze(0)


def predict(temp_text, temp_audio, text):
    """Synthesize ``text`` in the voice of the template recording.

    Returns a ``(sample_rate, samples)`` tuple, the format expected by a
    ``gr.Audio(type="numpy")`` output component.
    """
    embeddings = prepare_data(temp_text, temp_audio)
    inputs = processor(text=text, return_tensors="pt")
    with torch.no_grad():
        spectrogram = model.generate_speech(inputs["input_ids"], embeddings)
        speech = vocoder(spectrogram)
    # SpeechT5 + HiFi-GAN operate at a fixed 16 kHz sample rate.
    return (16000, speech.cpu().numpy())


gr.Interface(
    fn=predict,
    inputs=[
        gr.Text(label="Template Text"),
        gr.Audio(label="Template Speech", type="numpy"),
        gr.Text(label="Input Text"),
    ],
    outputs=[
        gr.Audio(label="Generated Speech", type="numpy"),
    ],
).launch()