# Voice-cloning demo: SpeechT5 TTS conditioned on an x-vector speaker
# embedding extracted from a user-supplied template recording.
import os

import gradio as gr
import numpy as np
import scipy.signal as sps
import torch
from speechbrain.pretrained import EncoderClassifier
from transformers import AutoProcessor, AutoModelForTextToSpectrogram, SpeechT5HifiGan

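# Load the SpeechT5 text-to-spectrogram checkpoint, its processor, and the
# HiFi-GAN vocoder that turns generated spectrograms back into waveforms.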
processor = AutoProcessor.from_pretrained("microsoft/speecht5_tts")
model = AutoModelForTextToSpectrogram.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")


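# Speaker encoder: SpeechBrain's VoxCeleb x-vector model, used to derive the
# speaker embedding that conditions SpeechT5 on the template voice.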
spk_model_name = "speechbrain/spkrec-xvect-voxceleb"

device = "cuda" if torch.cuda.is_available() else "cpu"
speaker_model = EncoderClassifier.from_hparams(
    source=spk_model_name,
    run_opts={"device": device},
    savedir=os.path.join("/tmp", spk_model_name))

def create_speaker_embedding(waveform):
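    # Encode the template waveform into an x-vector and L2-normalize it;
    # SpeechT5 consumes this 512-dim vector as its speaker conditioning.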
    with torch.no_grad():
        speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform))
        speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
        speaker_embeddings = speaker_embeddings.squeeze().cpu().numpy()
    return speaker_embeddings

def prepare_data(temp_text, temp_audio):
    # gr.Audio(type="numpy") yields (sample_rate, samples); the samples are
    # typically int16 PCM, so normalize to float32 in [-1, 1] and downmix
    # any stereo input to mono before resampling and embedding.
    rate, audio_data = temp_audio
    if np.issubdtype(audio_data.dtype, np.integer):
        audio_data = audio_data.astype(np.float32) / np.iinfo(audio_data.dtype).max
    else:
        audio_data = audio_data.astype(np.float32)
    if audio_data.ndim > 1:
        audio_data = audio_data.mean(axis=1)
    # Resample to the 16 kHz rate that SpeechT5 and the x-vector model expect.
    new_rate = 16000
    number_of_samples = round(len(audio_data) * float(new_rate) / rate)
    audio_data = sps.resample(audio_data, number_of_samples)
    example = processor(
        text=temp_text,
        audio_target=audio_data,
        sampling_rate=new_rate,
        return_attention_mask=False,
    )
    example["speaker_embeddings"] = create_speaker_embedding(audio_data)
    return torch.tensor(example["speaker_embeddings"]).unsqueeze(0)


def predict(temp_text, temp_audio, text):
    # Derive the speaker embedding from the template recording, then
    # synthesize the input text in that voice.
    embeddings = prepare_data(temp_text, temp_audio)
    inputs = processor(text=text, return_tensors="pt")
    spectrogram = model.generate_speech(inputs["input_ids"], embeddings)

    with torch.no_grad():
        speech = vocoder(spectrogram)

    # Convert the float waveform in [-1, 1] back to int16 PCM for Gradio.
    speech = (speech.numpy() * 32767).astype(np.int16)
    return (16000, speech)



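# Gradio UI: the template transcript and recording define the voice to clone;
# "Input Text" is the sentence to synthesize in that voice.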
gr.Interface(
    fn=predict,
    inputs=[
        gr.Text(label="Template Text"),
        gr.Audio(label="Template Speech", type="numpy"),
        gr.Text(label="Input Text"),
    ],
    outputs=[
        gr.Audio(label="Generated Speech", type="numpy"),
    ],
).launch()