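"""Gradio demo: zero-shot voice cloning with a SpeechT5 TTS model fine-tuned
on Dutch VoxPopuli.

A short "template" recording and its transcript supply the speaker identity
(via a SpeechBrain x-vector encoder); the input text is then synthesized in
that voice and returned as 16 kHz audio.
"""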
import os

import gradio as gr
import librosa
import numpy as np
import torch
from speechbrain.pretrained import EncoderClassifier
from transformers import AutoProcessor, AutoModelForTextToSpectrogram, SpeechT5HifiGan

# Fine-tuned SpeechT5 checkpoint plus the matching HiFi-GAN vocoder.
processor = AutoProcessor.from_pretrained("Prasada/speecht5_tts_voxpopuli_nl")
model = AutoModelForTextToSpectrogram.from_pretrained("Prasada/speecht5_tts_voxpopuli_nl")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")


# x-vector speaker encoder; its embedding conditions SpeechT5 on the target voice.
spk_model_name = "speechbrain/spkrec-xvect-voxceleb"

device = "cuda" if torch.cuda.is_available() else "cpu"
speaker_model = EncoderClassifier.from_hparams(
    source=spk_model_name,
    run_opts={"device": device},
    savedir=os.path.join("/tmp", spk_model_name),
)

def create_speaker_embedding(waveform):
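    # Encode the reference waveform into an L2-normalized x-vector;
    # encode_batch handles device placement and float conversion internally.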
    with torch.no_grad():
        speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform))
        speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
        speaker_embeddings = speaker_embeddings.squeeze().cpu().numpy()
    return speaker_embeddings

def prepare_data(temp_text, temp_audio):
    rate, audio_data = temp_audio
    # Gradio's numpy audio is int16 PCM at the recording's native rate;
    # convert to float32 in [-1, 1], downmix to mono, and resample to the
    # 16 kHz expected by SpeechT5 and the x-vector encoder.
    audio_data = audio_data.astype(np.float32) / 32768.0
    if audio_data.ndim > 1:
        audio_data = audio_data.mean(axis=1)
    if rate != 16000:
        audio_data = librosa.resample(audio_data, orig_sr=rate, target_sr=16000)
    example = processor(
        text=temp_text,
        audio_target=audio_data,
        sampling_rate=16000,
        return_attention_mask=False,
    )
    example["speaker_embeddings"] = create_speaker_embedding(audio_data)
    example_embeddings = torch.tensor(example["speaker_embeddings"]).unsqueeze(0)
    return example_embeddings


def predict(temp_text, temp_audio, text):
    # Build the speaker embedding from the template recording, then
    # synthesize the input text in that voice.
    embeddings = prepare_data(temp_text, temp_audio)
    inputs = processor(text=text, return_tensors="pt")
    spectrogram = model.generate_speech(inputs["input_ids"], embeddings)

    with torch.no_grad():
        speech = vocoder(spectrogram)

    # Scale the float waveform to 16-bit PCM for Gradio playback.
    speech = (speech.numpy() * 32767).astype(np.int16)
    return (16000, speech)



# Gradio UI: the template text/speech pair supplies the voice;
# "Input Text" is what gets spoken.
gr.Interface(
    fn=predict,
    inputs=[
        gr.Text(label="Template Text"),
        gr.Audio(label="Template Speech", type="numpy"),
        gr.Text(label="Input Text"),
    ],
    outputs=[
        gr.Audio(label="Generated Speech", type="numpy"),
    ],
).launch()