"""Gradio demo: voice-conditioned text-to-speech with SpeechT5.

A short reference recording is turned into an x-vector speaker embedding,
which conditions the ``Prasada/speecht5_tts_voxpopuli_nl`` checkpoint while it
synthesizes the requested text. The HiFi-GAN vocoder converts the predicted
spectrogram into a waveform.
"""
import os

import gradio as gr
import librosa
import numpy as np
import torch
from IPython.display import Audio
from scipy.io import wavfile
from speechbrain.pretrained import EncoderClassifier
from transformers import AutoModelForTextToSpectrogram, AutoProcessor, SpeechT5HifiGan

# Sample rate (Hz) used for the reference audio fed to the speaker encoder and
# reported for the synthesized waveform.
TARGET_SAMPLE_RATE = 16000

processor = AutoProcessor.from_pretrained("Prasada/speecht5_tts_voxpopuli_nl")
model = AutoModelForTextToSpectrogram.from_pretrained("Prasada/speecht5_tts_voxpopuli_nl")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
device = "cuda" if torch.cuda.is_available() else "cpu"
speaker_model = EncoderClassifier.from_hparams(
    source=spk_model_name,
    run_opts={"device": device},
    savedir=os.path.join("/tmp", spk_model_name),
)


def _to_mono_float_16k(rate, audio_data):
    """Convert a Gradio ``(rate, samples)`` recording to mono float32 at 16 kHz.

    Gradio hands over integer PCM (possibly multi-channel, shaped
    ``(samples, channels)``) at whatever rate the browser or file used, so the
    samples are scaled to roughly [-1, 1], down-mixed and resampled here.
    """
    audio = np.asarray(audio_data)
    if np.issubdtype(audio.dtype, np.integer):
        # Read the integer range before the dtype changes in the cast below.
        scale = float(np.iinfo(audio.dtype).max)
        audio = audio.astype(np.float32) / scale
    else:
        audio = audio.astype(np.float32)

    if audio.ndim > 1:
        # Average the channels to obtain a single mono track.
        audio = audio.mean(axis=1)

    if rate != TARGET_SAMPLE_RATE:
        audio = librosa.resample(audio, orig_sr=rate, target_sr=TARGET_SAMPLE_RATE)
    return audio


def create_speaker_embedding(waveform):
    """Return the L2-normalized speaker embedding of ``waveform`` as a NumPy array.

    Args:
        waveform: 1-D array-like of audio samples.

    Returns:
        The squeezed embedding produced by ``speaker_model.encode_batch``,
        moved to the CPU and converted to ``numpy.ndarray``.
    """
    with torch.no_grad():
        speaker_embeddings = speaker_model.encode_batch(
            torch.as_tensor(waveform, dtype=torch.float32)
        )
        speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
        speaker_embeddings = speaker_embeddings.squeeze().cpu().numpy()
    return speaker_embeddings


def prepare_data(temp_text, temp_audio):
    """Build the batched speaker-embedding tensor from the reference recording.

    Args:
        temp_text: Transcript of the reference recording. It does not influence
            the embedding; the parameter is kept so existing callers keep working.
        temp_audio: ``(sample_rate, samples)`` tuple as delivered by
            ``gr.Audio(type="numpy")``, or ``None`` when nothing was provided.

    Returns:
        A tensor holding the speaker embedding with a leading batch dimension.

    Raises:
        gr.Error: If no reference audio was provided or it contains no samples.
    """
    if temp_audio is None:
        raise gr.Error("Please provide a template speech recording.")

    rate, audio_data = temp_audio
    waveform = _to_mono_float_16k(rate, audio_data)
    if waveform.size == 0:
        raise gr.Error("The template speech recording is empty.")

    embedding = create_speaker_embedding(waveform)
    return torch.tensor(embedding).unsqueeze(0)


def predict(temp_text, temp_audio, text):
    """Synthesize ``text`` in the voice of the reference recording.

    Args:
        temp_text: Transcript of the reference recording (forwarded to
            ``prepare_data``).
        temp_audio: Reference recording as a ``(sample_rate, samples)`` tuple.
        text: The text to be spoken.

    Returns:
        ``(sample_rate, waveform)``, the tuple form that a
        ``gr.Audio(type="numpy")`` output component expects.
    """
    embeddings = prepare_data(temp_text, temp_audio)
    inputs = processor(text=text, return_tensors="pt")
    with torch.no_grad():
        spectrogram = model.generate_speech(inputs["input_ids"], embeddings)
        speech = vocoder(spectrogram)
    return TARGET_SAMPLE_RATE, speech.cpu().numpy()


gr.Interface(
    fn=predict,
    inputs=[
        gr.Text(label="Template Text"),
        gr.Audio(label="Template Speech", type="numpy"),
        gr.Text(label="Input Text"),
    ],
    outputs=[
        gr.Audio(label="Generated Speech", type="numpy"),
    ],
).launch()