str20tbl committed
Commit 993a760 · 1 Parent(s): 7443526
Files changed (1)
  1. app.py +10 -5
app.py CHANGED
@@ -4,10 +4,11 @@ import librosa
 import numpy as np
 import torch
 
-from transformers import pipeline
+from transformers import SpeechT5ForTextToSpeech, SpeechT5Processor, SpeechT5HifiGan
 
-synthesiser = pipeline("text-to-speech", "techiaith/microsoft_speecht5_finetuned_bu_tts_cy_en", from_tf=True)
-synthesiser.to('cuda')
+checkpoint = "microsoft/speecht5_tts"
+processor = SpeechT5Processor.from_pretrained(checkpoint)
+vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
 
 speaker_embeddings = {
     "GGP": "spkemb/speaker0.npy",
@@ -15,7 +16,6 @@ speaker_embeddings = {
     "BDP": "spkemb/speaker2.npy",
 }
 
-
 @spaces.GPU
 def predict(text, speaker):
     if len(text.strip()) == 0:
@@ -23,7 +23,12 @@ def predict(text, speaker):
 
     speaker_embedding = np.load(speaker_embeddings[speaker[:3]])
     speaker_embedding = torch.tensor(speaker_embedding).unsqueeze(0)
-    speech = synthesiser(text, forward_params={"speaker_embeddings": speaker_embedding})
+    model = SpeechT5ForTextToSpeech.from_pretrained(
+        "techiaith/microsoft_speecht5_finetuned_bu_tts_cy_en"
+    )
+
+    inputs = processor(text=text, return_tensors="pt")
+    speech = model.generate_speech(inputs["input_ids"], speaker_embedding, vocoder=vocoder)
     speech = (speech.numpy() * 32767).astype(np.int16)
     return (16000, speech)
 
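
For reference, this commit moves from the high-level pipeline("text-to-speech", ...) API to the explicit SpeechT5 components: SpeechT5Processor tokenises the text, SpeechT5ForTextToSpeech.generate_speech synthesises speech conditioned on a speaker x-vector, and SpeechT5HifiGan acts as the vocoder. The snippet below is a minimal standalone sketch of that inference pattern; the checkpoint names and the spkemb/speaker0.npy path come from the diff, while the sample text and the soundfile output are illustrative additions, not part of app.py.

import numpy as np
import torch
import soundfile as sf
from transformers import SpeechT5ForTextToSpeech, SpeechT5Processor, SpeechT5HifiGan

# Processor and vocoder from the base SpeechT5 release; TTS weights from the
# fine-tuned Welsh/English checkpoint used in the diff.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained(
    "techiaith/microsoft_speecht5_finetuned_bu_tts_cy_en"
)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Speaker x-vector stored as a NumPy file; unsqueeze adds the batch dimension.
speaker_embedding = torch.tensor(np.load("spkemb/speaker0.npy")).unsqueeze(0)

# Tokenise the input text and synthesise a 16 kHz waveform.
inputs = processor(text="Bore da, sut mae?", return_tensors="pt")  # illustrative text
speech = model.generate_speech(inputs["input_ids"], speaker_embedding, vocoder=vocoder)

sf.write("out.wav", speech.numpy(), 16000)

One visible trade-off in the new predict(): the fine-tuned model is loaded with from_pretrained inside the function, so every request repeats that load; caching it at module level (as the processor and vocoder already are) would avoid the cost if memory allows.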