Prasada committed on
Commit
b035836
·
1 Parent(s): 686ed37

Create App2.py

Browse files
Files changed (1) hide show
  1. App2.py +67 -0
App2.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import librosa
3
+ import numpy as np
4
+ import torch
5
+ import os
6
+ import torch
7
+ from speechbrain.pretrained import EncoderClassifier
8
+ from scipy.io import wavfile
9
+ from IPython.display import Audio
10
+ from transformers import AutoProcessor, AutoModelForTextToSpectrogram, SpeechT5HifiGan
11
+
12
# Fine-tuned SpeechT5 checkpoint; the processor and the text-to-spectrogram
# model are both loaded from this same repository.
_tts_checkpoint = "Prasada/speecht5_tts_voxpopuli_nl"
processor = AutoProcessor.from_pretrained(_tts_checkpoint)
model = AutoModelForTextToSpectrogram.from_pretrained(_tts_checkpoint)

# Vocoder that is applied to the generated spectrogram to obtain a waveform.
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Speaker encoder used to derive an embedding from the template recording.
spk_model_name = "speechbrain/spkrec-xvect-voxceleb"

# Run the speaker encoder on the GPU when one is available.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Downloaded speaker-encoder files are stored under /tmp/<model name>.
_speaker_cache_dir = os.path.join("/tmp", spk_model_name)
speaker_model = EncoderClassifier.from_hparams(
    source=spk_model_name,
    run_opts={"device": device},
    savedir=_speaker_cache_dir,
)
24
+
25
def create_speaker_embedding(waveform):
    """Return the normalised speaker embedding of *waveform* as a NumPy array.

    The waveform is converted with ``torch.tensor`` and encoded by the
    module-level ``speaker_model``. The encoder output is normalised along
    dimension 2, stripped of singleton dimensions, moved to the CPU and
    converted to NumPy.
    """
    wave_tensor = torch.tensor(waveform)
    # Inference only: no autograd graph is needed for the encoder pass.
    with torch.no_grad():
        raw_embedding = speaker_model.encode_batch(wave_tensor)
        unit_embedding = torch.nn.functional.normalize(raw_embedding, dim=2)
    return unit_embedding.squeeze().cpu().numpy()
31
+
32
def prepare_data(temp_text, temp_audio):
    """Build the speaker-embedding tensor for the template recording.

    Args:
        temp_text: Transcript of the template recording. Kept for interface
            compatibility; the embedding is computed from the audio alone.
        temp_audio: Either a ``(sample_rate, samples)`` pair, which is what a
            ``gr.Audio(type="numpy")`` input delivers, or a WAV path / file
            object readable by ``scipy.io.wavfile.read``.

    Returns:
        The speaker embedding as a ``torch.Tensor`` with a leading batch
        dimension added via ``unsqueeze(0)``.

    Raises:
        ValueError: If no recording was supplied or it contains no samples.
    """
    target_rate = 16000  # sampling rate the rest of this app works at

    if temp_audio is None:
        raise ValueError("A template speech recording is required.")

    # Gradio's numpy audio input is a (rate, ndarray) pair, which
    # wavfile.read cannot open; only go through the WAV reader for
    # path-like or file-like input.
    if isinstance(temp_audio, (tuple, list)):
        rate, audio_data = temp_audio
    else:
        rate, audio_data = wavfile.read(temp_audio)

    samples = np.asarray(audio_data)
    if samples.size == 0:
        raise ValueError("The template speech recording is empty.")

    # Scale integer PCM to floats in [-1, 1) so the encoder always receives
    # floating-point audio regardless of the recording's bit depth.
    if np.issubdtype(samples.dtype, np.integer):
        pcm_info = np.iinfo(samples.dtype)
        if pcm_info.min == 0:
            # Unsigned PCM is offset-binary: silence sits at mid-range.
            midpoint = (pcm_info.max + 1) / 2.0
            samples = (samples.astype(np.float32) - midpoint) / midpoint
        else:
            samples = samples.astype(np.float32) / float(-pcm_info.min)
    else:
        samples = samples.astype(np.float32)

    # Multi-channel audio is laid out (n_samples, n_channels); mix to mono.
    if samples.ndim > 1:
        samples = samples.mean(axis=1)

    # Bring the recording to the app's working rate before embedding it.
    if rate != target_rate:
        samples = librosa.resample(samples, orig_sr=rate, target_sr=target_rate)

    samples = np.ascontiguousarray(samples, dtype=np.float32)

    speaker_embedding = create_speaker_embedding(samples)
    return torch.tensor(speaker_embedding).unsqueeze(0)
42
+
43
+
44
def predict(temp_text, temp_audio, text):
    """Synthesise *text* in the voice of the template recording.

    Args:
        temp_text: Transcript of the template recording; forwarded to
            ``prepare_data``.
        temp_audio: Template recording as received from the audio input;
            forwarded to ``prepare_data``.
        text: The text to convert to speech.

    Returns:
        A ``(sample_rate, waveform)`` tuple with ``sample_rate`` fixed at
        16000 and ``waveform`` a NumPy array. This is the form a
        ``gr.Audio(type="numpy")`` output component accepts; an
        ``IPython.display.Audio`` object is not.
    """
    embeddings = prepare_data(temp_text, temp_audio)
    inputs = processor(text=text, return_tensors="pt")

    # Inference only: no autograd graph is needed for either model.
    with torch.no_grad():
        spectrogram = model.generate_speech(inputs["input_ids"], embeddings)
        speech = vocoder(spectrogram)

    return 16000, speech.cpu().numpy()
54
+
55
+
56
# Input widgets, listed in the positional order `predict` receives them.
template_text_box = gr.Text(label="Template Text")
template_speech_box = gr.Audio(label="Template Speech", type="numpy")
input_text_box = gr.Text(label="Input Text")

# Single audio widget that presents the value returned by `predict`.
generated_speech_box = gr.Audio(label="Generated Speech", type="numpy")

demo = gr.Interface(
    fn=predict,
    inputs=[template_text_box, template_speech_box, input_text_box],
    outputs=[generated_speech_box],
)
demo.launch()