str20tbl committed
Commit b5f5075 · 1 Parent(s): 63bf2f5
Files changed (4)
  1. app.py +1 -36
  2. spkemb/speaker0.npy +2 -2
  3. spkemb/speaker1.npy +2 -2
  4. spkemb/speaker2.npy +2 -2
app.py CHANGED
@@ -4,7 +4,7 @@ import gradio as gr
 import librosa
 import numpy as np
 import torch
-from speechbrain.inference import EncoderClassifier
+
 from transformers import pipeline
 
 synthesiser = pipeline("text-to-speech", "techiaith/microsoft_speecht5_finetuned_bu_tts_cy_en")
@@ -15,46 +15,11 @@ speaker_embeddings = {
     "BDP": "spkemb/speaker2.npy",
 }
 
-spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
-device = "cuda" if torch.cuda.is_available() else "cpu"
-print(f">>>>> DEVICE {device}")
-speaker_model = EncoderClassifier.from_hparams(
-    source=spk_model_name,
-    run_opts={"device": device},
-    savedir=os.path.join("/tmp", spk_model_name),
-)
-
-def prepare_dataset(examp):
-    audio = examp["audio"]
-
-    examp = processor(
-        text=examp["sentence"],
-        audio_target=audio["array"],
-        sampling_rate=audio["sampling_rate"],
-        return_attention_mask=False,
-    )
-
-    # strip off the batch dimension
-    examp["labels"] = examp["labels"][0]
-
-    # use SpeechBrain to obtain x-vector
-    examp["speaker_embeddings"] = create_speaker_embedding(audio["array"])
-
-    return examp
-
-def create_speaker_embedding(waveform):
-    with torch.no_grad():
-        se = speaker_model.encode_batch(torch.tensor(waveform))
-        se = torch.nn.functional.normalize(se, dim=2)
-        se = se.squeeze().cpu().numpy()
-    return se
-
 @spaces.GPU
 def predict(text, speaker):
     if len(text.strip()) == 0:
         return (16000, np.zeros(0).astype(np.int16))
     speaker_embedding = np.load(speaker_embeddings[speaker[:3]])
-    speaker_embedding = prepare_dataset(speaker_embedding)
     speaker_embedding = torch.tensor(speaker_embedding).unsqueeze(0)
     speech = synthesiser(text, forward_params={"speaker_embeddings": speaker_embedding})
     speech = (speech.numpy() * 32767).astype(np.int16)
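This commit swaps runtime x-vector extraction for embeddings precomputed into spkemb/*.npy, so the Space no longer loads SpeechBrain at startup and predict reduces to an np.load. Below is a minimal offline sketch of how such files could be regenerated, mirroring the removed create_speaker_embedding code; speechbrain and librosa are assumed to be installed, and the reference WAV paths are hypothetical, not files in this repo:

# Offline sketch: precompute one x-vector per speaker and save it under spkemb/.
# The WAV paths below are illustrative placeholders, not files from this repo.
import os

import librosa
import numpy as np
import torch
from speechbrain.inference import EncoderClassifier

spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
speaker_model = EncoderClassifier.from_hparams(
    source=spk_model_name,
    savedir=os.path.join("/tmp", spk_model_name),
)

def create_speaker_embedding(waveform):
    # Same steps as the code removed above: encode, L2-normalize along the
    # embedding axis, and squeeze to a 1-D numpy vector.
    with torch.no_grad():
        se = speaker_model.encode_batch(torch.tensor(waveform))
        se = torch.nn.functional.normalize(se, dim=2)
        se = se.squeeze().cpu().numpy()
    return se

for idx, wav in enumerate(["speaker0.wav", "speaker1.wav", "speaker2.wav"]):
    waveform, _ = librosa.load(wav, sr=16000)  # the x-vector model expects 16 kHz mono
    np.save(f"spkemb/speaker{idx}.npy", create_speaker_embedding(waveform))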
spkemb/speaker0.npy CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:29a89ec165f28301a1dda1ef5e1a5c8b5ddd60c0bfa4094f7fb6b88035812ca3
-size 229448
+oid sha256:c638ff5f04c92bfeba658f6737ca814aa5d68bc36e0b062a74240388d7379563
+size 2176
spkemb/speaker1.npy CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:cc9f3a6e58a1e2f52c9258a7e9372b30a0966dd5b3fdc82de23cd2e38fd61c67
-size 599888
+oid sha256:9d33076cea7d60d759e315461f25fc341f1df7d288cd7cda8f724166f7ec6fc4
+size 2176
spkemb/speaker2.npy CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e61aa815909de199d2261c806674d5cbdc264120c930838dffbe4c21d957845c
-size 158888
+oid sha256:02c1fabf876cb14affc7dac02cde49e9d2b47e3d578983275918a1aa4e399fe9
+size 2176
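Each embedding file shrinks from hundreds of kilobytes to 2176 bytes, which corresponds to one (512,)-shaped float32 array (128-byte .npy header plus 512 × 4 data bytes), matching the 512-dimensional x-vectors produced by spkrec-xvect-voxceleb. A quick sanity check, assuming the new files decode as standard .npy arrays:

import numpy as np

# Each replaced file should hold a single 512-dim float32 x-vector; the shape
# here is inferred from the new LFS size (128 + 512 * 4 = 2176 bytes).
for name in ["spkemb/speaker0.npy", "spkemb/speaker1.npy", "spkemb/speaker2.npy"]:
    emb = np.load(name)
    assert emb.shape == (512,) and emb.dtype == np.float32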