"""Gradio demo: synthesize speech with SpeechT5 in the voice of a reference clip.

The user supplies a reference ("template") recording and the text to speak.
A SpeechBrain x-vector encoder turns the recording into a speaker embedding,
SpeechT5 generates a spectrogram conditioned on that embedding, and the
HiFi-GAN vocoder converts the spectrogram into a waveform.
"""
import os

import gradio as gr
import librosa
import numpy as np
import torch
from speechbrain.pretrained import EncoderClassifier
from transformers import AutoModelForTextToSpectrogram, AutoProcessor, SpeechT5HifiGan

# Sample rate used for both the reference audio fed to the speaker encoder and
# the generated speech returned to Gradio.
TARGET_SAMPLE_RATE = 16000

processor = AutoProcessor.from_pretrained("microsoft/speecht5_tts")
model = AutoModelForTextToSpectrogram.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
device = "cuda" if torch.cuda.is_available() else "cpu"
speaker_model = EncoderClassifier.from_hparams(
    source=spk_model_name,
    run_opts={"device": device},
    savedir=os.path.join("/tmp", spk_model_name),
)


def _to_mono_float(audio_data, rate):
    """Convert raw Gradio audio into a mono float32 waveform at 16 kHz.

    Args:
        audio_data: Array-like of samples, either ``(samples,)`` or
            ``(samples, channels)``. Integer PCM is rescaled to [-1, 1];
            floating-point input is only cast to float32.
        rate: Sample rate of ``audio_data`` in Hz.

    Returns:
        A 1-D float32 ``numpy.ndarray`` sampled at ``TARGET_SAMPLE_RATE``.
    """
    samples = np.asarray(audio_data)
    if np.issubdtype(samples.dtype, np.integer):
        # Scale integer PCM by the largest magnitude the dtype can hold.
        limits = np.iinfo(samples.dtype)
        samples = samples.astype(np.float32) / max(abs(limits.min), limits.max)
    else:
        samples = samples.astype(np.float32)

    if samples.ndim > 1:
        # Multi-channel audio is laid out (samples, channels); average to mono.
        samples = samples.mean(axis=1)

    if rate != TARGET_SAMPLE_RATE:
        samples = librosa.resample(
            samples, orig_sr=rate, target_sr=TARGET_SAMPLE_RATE
        )
    return samples


def create_speaker_embedding(waveform):
    """Return the L2-normalized speaker embedding of ``waveform``.

    Args:
        waveform: 1-D array-like of float audio samples. The caller in this
            file passes mono audio at ``TARGET_SAMPLE_RATE``.

    Returns:
        A ``numpy.ndarray`` holding the embedding with singleton dimensions
        squeezed out.
    """
    with torch.no_grad():
        speaker_embeddings = speaker_model.encode_batch(
            torch.as_tensor(waveform, dtype=torch.float32)
        )
        speaker_embeddings = torch.nn.functional.normalize(
            speaker_embeddings, dim=2
        )
        speaker_embeddings = speaker_embeddings.squeeze().cpu().numpy()
    return speaker_embeddings


def prepare_data(temp_text, temp_audio):
    """Build the speaker-embedding tensor for a reference recording.

    Args:
        temp_text: Transcript of the reference recording. Accepted for
            interface compatibility; the embedding depends only on the audio.
        temp_audio: ``(sample_rate, samples)`` tuple as produced by
            ``gr.Audio(type="numpy")``, or ``None`` when nothing was provided.

    Returns:
        A ``torch.Tensor`` with a leading batch dimension of size 1.

    Raises:
        gr.Error: If no reference audio was provided or it contains no samples.
    """
    if temp_audio is None:
        raise gr.Error("Please provide a template speech recording.")

    rate, audio_data = temp_audio
    waveform = _to_mono_float(audio_data, rate)
    if waveform.size == 0:
        raise gr.Error("The template speech recording is empty.")

    embedding = create_speaker_embedding(waveform)
    return torch.tensor(embedding).unsqueeze(0)


def predict(temp_text, temp_audio, text):
    """Synthesize ``text`` in the voice of the reference recording.

    Args:
        temp_text: Transcript of the reference recording (see ``prepare_data``).
        temp_audio: ``(sample_rate, samples)`` tuple from the Gradio audio input.
        text: The text to convert to speech.

    Returns:
        ``(TARGET_SAMPLE_RATE, samples)`` where ``samples`` is an int16
        ``numpy.ndarray``, the format ``gr.Audio(type="numpy")`` accepts.

    Raises:
        gr.Error: Propagated from ``prepare_data`` for missing/empty audio.
    """
    embeddings = prepare_data(temp_text, temp_audio)
    inputs = processor(text=text, return_tensors="pt")
    spectrogram = model.generate_speech(inputs["input_ids"], embeddings)
    with torch.no_grad():
        speech = vocoder(spectrogram)
    # Clip before scaling so an out-of-range sample cannot wrap around in int16.
    speech = (np.clip(speech.numpy(), -1.0, 1.0) * 32767).astype(np.int16)
    return (TARGET_SAMPLE_RATE, speech)


gr.Interface(
    fn=predict,
    inputs=[
        gr.Text(label="Template Text"),
        gr.Audio(label="Template Speech.", type="numpy"),
        gr.Text(label="Input Text"),
    ],
    outputs=[
        gr.Audio(label="Generated Speech", type="numpy"),
    ],
).launch()