import os

import gradio as gr
import numpy as np
import scipy.signal as sps
import torch
# Note: on newer SpeechBrain releases this import lives under
# speechbrain.inference rather than speechbrain.pretrained.
from speechbrain.pretrained import EncoderClassifier
from transformers import AutoProcessor, AutoModelForTextToSpectrogram, SpeechT5HifiGan
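
# Load the SpeechT5 processor, the acoustic model (text -> log-mel spectrogram),
# and the HiFi-GAN vocoder (spectrogram -> waveform). These stay on CPU; only
# the speaker encoder below is moved to GPU when one is available.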
processor = AutoProcessor.from_pretrained("microsoft/speecht5_tts")
model = AutoModelForTextToSpectrogram.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# x-vector speaker encoder used to turn the reference recording into the
# 512-dim speaker embedding that SpeechT5 expects.
spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
device = "cuda" if torch.cuda.is_available() else "cpu"
speaker_model = EncoderClassifier.from_hparams(
    source=spk_model_name,
    run_opts={"device": device},
    savedir=os.path.join("/tmp", spk_model_name),
)


def create_speaker_embedding(waveform):
    # Encode the reference waveform into an x-vector and L2-normalize it.
    with torch.no_grad():
        speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform))
        speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
        speaker_embeddings = speaker_embeddings.squeeze().cpu().numpy()
    return speaker_embeddings


def prepare_data(temp_text, temp_audio):
    # temp_text (the reference transcript) is not needed at inference time;
    # the parameter is kept so the call matches the Gradio inputs.
    # Gradio's "numpy" audio type yields a (sample_rate, samples) tuple.
    rate, audio_data = temp_audio
    # Rescale integer PCM to float32 in [-1, 1] and downmix stereo to mono.
    if np.issubdtype(audio_data.dtype, np.integer):
        audio_data = audio_data.astype(np.float32) / np.iinfo(audio_data.dtype).max
    else:
        audio_data = audio_data.astype(np.float32)
    if audio_data.ndim > 1:
        audio_data = audio_data.mean(axis=1)
    # Resample to the 16 kHz rate both SpeechT5 and the x-vector model expect.
    new_rate = 16000
    number_of_samples = round(len(audio_data) * float(new_rate) / rate)
    audio_data = sps.resample(audio_data, number_of_samples)
    # Return the speaker embedding with a batch dimension for generate_speech.
    speaker_embeddings = create_speaker_embedding(audio_data)
    return torch.tensor(speaker_embeddings).unsqueeze(0)


def predict(temp_text, temp_audio, text):
    # Extract the speaker embedding from the reference recording.
    embeddings = prepare_data(temp_text, temp_audio)
    # Tokenize the input text, generate a mel spectrogram in the reference
    # speaker's voice, then vocode it into a waveform.
    inputs = processor(text=text, return_tensors="pt")
    with torch.no_grad():
        spectrogram = model.generate_speech(inputs["input_ids"], embeddings)
        speech = vocoder(spectrogram)
    # Convert to 16-bit PCM for Gradio playback.
    speech = (speech.numpy() * 32767).astype(np.int16)
    return (16000, speech)


gr.Interface(
    fn=predict,
    inputs=[
        gr.Text(label="Template Text"),
        gr.Audio(label="Template Speech", type="numpy"),
        gr.Text(label="Input Text"),
    ],
    outputs=[
        gr.Audio(label="Generated Speech", type="numpy"),
    ],
).launch()
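
# A minimal smoke test, kept commented out so the script still just launches
# the UI. It assumes a hypothetical short 16-bit WAV file "reference.wav" of
# clean speech next to this script; run it in place of .launch() above:
# from scipy.io import wavfile
# rate, data = wavfile.read("reference.wav")
# sr, out = predict("Transcript of the reference clip.", (rate, data),
#                   "Hello, this is a cloned voice.")
# wavfile.write("generated.wav", sr, out)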