demo

- app.py +1 -36
- spkemb/speaker0.npy +2 -2
- spkemb/speaker1.npy +2 -2
- spkemb/speaker2.npy +2 -2
app.py
CHANGED
```diff
@@ -4,7 +4,7 @@ import gradio as gr
 import librosa
 import numpy as np
 import torch
-
+
 from transformers import pipeline
 
 synthesiser = pipeline("text-to-speech", "techiaith/microsoft_speecht5_finetuned_bu_tts_cy_en")
@@ -15,46 +15,11 @@ speaker_embeddings = {
     "BDP": "spkemb/speaker2.npy",
 }
 
-spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
-device = "cuda" if torch.cuda.is_available() else "cpu"
-print(f">>>>> DEVICE {device}")
-speaker_model = EncoderClassifier.from_hparams(
-    source=spk_model_name,
-    run_opts={"device": device},
-    savedir=os.path.join("/tmp", spk_model_name),
-)
-
-def prepare_dataset(examp):
-    audio = examp["audio"]
-
-    examp = processor(
-        text=examp["sentence"],
-        audio_target=audio["array"],
-        sampling_rate=audio["sampling_rate"],
-        return_attention_mask=False,
-    )
-
-    # strip off the batch dimension
-    examp["labels"] = examp["labels"][0]
-
-    # use SpeechBrain to obtain x-vector
-    examp["speaker_embeddings"] = create_speaker_embedding(audio["array"])
-
-    return examp
-
-def create_speaker_embedding(waveform):
-    with torch.no_grad():
-        se = speaker_model.encode_batch(torch.tensor(waveform))
-        se = torch.nn.functional.normalize(se, dim=2)
-        se = se.squeeze().cpu().numpy()
-        return se
-
 @spaces.GPU
 def predict(text, speaker):
     if len(text.strip()) == 0:
         return (16000, np.zeros(0).astype(np.int16))
     speaker_embedding = np.load(speaker_embeddings[speaker[:3]])
-    speaker_embedding = prepare_dataset(speaker_embedding)
     speaker_embedding = torch.tensor(speaker_embedding).unsqueeze(0)
     speech = synthesiser(text, forward_params={"speaker_embeddings": speaker_embedding})
     speech = (speech.numpy() * 32767).astype(np.int16)
```
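At inference time the new path is minimal: load the .npy file, add a batch dimension, and hand it to the pipeline via forward_params. A standalone sketch of that flow, using the model and embedding paths from the diff; the dict-style return value shown here matches recent transformers releases, while the diff's own speech.numpy() suggests an older pipeline version or extra handling outside the hunk. The sample text is an assumption.

```python
import numpy as np
import torch
from transformers import pipeline

synthesiser = pipeline("text-to-speech", "techiaith/microsoft_speecht5_finetuned_bu_tts_cy_en")

speaker_embedding = np.load("spkemb/speaker0.npy")                # precomputed x-vector
speaker_embedding = torch.tensor(speaker_embedding).unsqueeze(0)  # add batch dim -> (1, 512)

speech = synthesiser("Bore da!", forward_params={"speaker_embeddings": speaker_embedding})

# Recent transformers versions return a dict; scale the float waveform to
# 16-bit PCM the same way predict() does.
audio = (speech["audio"] * 32767).astype(np.int16)
rate = speech["sampling_rate"]  # 16000 Hz for SpeechT5
```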
spkemb/speaker0.npy
CHANGED
```diff
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:c638ff5f04c92bfeba658f6737ca814aa5d68bc36e0b062a74240388d7379563
+size 2176
```
spkemb/speaker1.npy
CHANGED
```diff
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:9d33076cea7d60d759e315461f25fc341f1df7d288cd7cda8f724166f7ec6fc4
+size 2176
```
spkemb/speaker2.npy
CHANGED
```diff
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:02c1fabf876cb14affc7dac02cde49e9d2b47e3d578983275918a1aa4e399fe9
+size 2176
```
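
The spkemb/*.npy files are Git LFS pointers, so only the oid and size lines change in the diff. The new size of 2176 bytes is consistent with a 512-dimensional float32 x-vector: 512 × 4 = 2048 bytes of data plus a 128-byte .npy header. 512 is the embedding width produced by speechbrain/spkrec-xvect-voxceleb and expected by SpeechT5. A quick sanity check after pulling the files from LFS:

```python
import numpy as np

# Each pointer should resolve to a 512-dim float32 x-vector; that layout
# accounts for the 2176-byte size recorded in the LFS pointer files.
for name in ["speaker0", "speaker1", "speaker2"]:
    emb = np.load(f"spkemb/{name}.npy")
    assert emb.shape == (512,) and emb.dtype == np.float32
```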