str20tbl committed on
Commit
ddd7573
·
1 Parent(s): 6f5132b
Files changed (1) hide show
  1. app.py +18 -0
app.py CHANGED
@@ -17,11 +17,29 @@ speaker_embeddings = {
17
  "BDP": "spkemb/speaker2.npy",
18
  }
19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  @spaces.GPU
21
  def predict(text, speaker):
22
  if len(text.strip()) == 0:
23
  return (16000, np.zeros(0).astype(np.int16))
24
  speaker_embedding = np.load(speaker_embeddings[speaker[:3]])
 
25
  speaker_embedding = torch.tensor(speaker_embedding).unsqueeze(0)
26
  inputs = processor(text=text, return_tensors="pt")
27
  speech = model.generate_speech(inputs["input_ids"], speaker_embedding, vocoder=vocoder)
 
17
  "BDP": "spkemb/speaker2.npy",
18
  }
19
 
20
# Identifier handed to EncoderClassifier.from_hparams as the model source.
# NOTE(review): presumably a SpeechBrain x-vector speaker-recognition
# checkpoint, going by the name -- confirm against the model hub.
spk_model_name = "speechbrain/spkrec-xvect-voxceleb"

# Run the speaker encoder on the GPU when torch reports one, else on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f">>>>> DEVICE {device}")

# The encoder is built once at import time; its files are kept under
# /tmp/<spk_model_name>.
speaker_model = EncoderClassifier.from_hparams(
    source=spk_model_name,
    savedir=os.path.join("/tmp", spk_model_name),
    run_opts={"device": device},
)
28
+
29
+
30
def create_speaker_embedding(waveform):
    """Encode a waveform into a normalized speaker embedding.

    The waveform is converted to a tensor, passed through the module-level
    ``speaker_model`` encoder with gradient tracking disabled, normalized
    along dimension 2, and stripped of all size-1 dimensions.

    Args:
        waveform: Audio samples in any form ``torch.tensor`` accepts.
            NOTE(review): the expected sample rate and shape are not
            visible here -- confirm against the caller.

    Returns:
        The embedding as a NumPy array held in CPU memory.
    """
    # Inference only: no autograd graph is needed for the encoder pass.
    with torch.no_grad():
        raw_embedding = speaker_model.encode_batch(torch.tensor(waveform))
        # Normalizing over dim=2 implies the encoder output has >= 3 dims.
        unit_embedding = torch.nn.functional.normalize(raw_embedding, dim=2)
    # squeeze() drops every singleton dimension before leaving torch.
    return unit_embedding.squeeze().cpu().numpy()
36
+
37
  @spaces.GPU
38
  def predict(text, speaker):
39
  if len(text.strip()) == 0:
40
  return (16000, np.zeros(0).astype(np.int16))
41
  speaker_embedding = np.load(speaker_embeddings[speaker[:3]])
42
+ speaker_embedding = prepare_dataset(speaker_embedding)
43
  speaker_embedding = torch.tensor(speaker_embedding).unsqueeze(0)
44
  inputs = processor(text=text, return_tensors="pt")
45
  speech = model.generate_speech(inputs["input_ids"], speaker_embedding, vocoder=vocoder)