str20tbl committed on
Commit
a1c5c13
·
1 Parent(s): de1d53c
Files changed (1) hide show
  1. app.py +5 -11
app.py CHANGED
@@ -4,14 +4,9 @@ import librosa
4
  import numpy as np
5
  import torch
6
 
7
- from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
8
-
9
-
10
- checkpoint = "microsoft/speecht5_tts"
11
- processor = SpeechT5Processor.from_pretrained(checkpoint)
12
- model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
13
- vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
14
 
 
15
 
16
  speaker_embeddings = {
17
  "GGP": "spkemb/speaker0.npy",
@@ -25,8 +20,6 @@ def predict(text, speaker):
25
  if len(text.strip()) == 0:
26
  return (16000, np.zeros(0).astype(np.int16))
27
 
28
- inputs = processor(text=text, return_tensors="pt")
29
-
30
  # limit input length
31
  input_ids = inputs["input_ids"]
32
  input_ids = input_ids[..., :model.config.max_text_positions]
@@ -51,8 +44,9 @@ def predict(text, speaker):
51
 
52
  speaker_embedding = torch.tensor(speaker_embedding).unsqueeze(0)
53
 
54
- speech = model.generate_speech(input_ids, speaker_embedding, vocoder=vocoder)
55
-
 
56
  speech = (speech.numpy() * 32767).astype(np.int16)
57
  return (16000, speech)
58
 
 
4
  import numpy as np
5
  import torch
6
 
7
+ from transformers import pipeline
 
 
 
 
 
 
8
 
9
+ synthesiser = pipeline("text-to-speech", "techiaith/microsoft_speecht5_finetuned_bu_tts_cy_en")
10
 
11
  speaker_embeddings = {
12
  "GGP": "spkemb/speaker0.npy",
 
20
  if len(text.strip()) == 0:
21
  return (16000, np.zeros(0).astype(np.int16))
22
 
 
 
23
  # limit input length
24
  input_ids = inputs["input_ids"]
25
  input_ids = input_ids[..., :model.config.max_text_positions]
 
44
 
45
  speaker_embedding = torch.tensor(speaker_embedding).unsqueeze(0)
46
 
47
+
48
+ speech = synthesiser(text, forward_params={"speaker_embeddings": speaker_embedding})
49
+
50
  speech = (speech.numpy() * 32767).astype(np.int16)
51
  return (16000, speech)
52