Abrahamau committed on
Commit
1269470
·
verified ·
1 Parent(s): 68b077f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -3
app.py CHANGED
@@ -2,7 +2,8 @@ import torch
2
  import os
3
  import random
4
  import gradio as gr
5
- from transformers import pipeline
 
6
  import base64
7
  from datasets import load_dataset
8
  from diffusers import DiffusionPipeline
@@ -28,12 +29,15 @@ def guessanAge(model, image):
28
  def text2speech(model, text, voice):
29
  print(model, text, voice)
30
  if len(text) > 0:
31
- synthesiser = pipeline("text-to-speech", model="microsoft/speecht5_tts")
 
 
 
32
 
33
  embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
34
  speaker_embedding = torch.tensor(embeddings_dataset[voice]["xvector"]).unsqueeze(0)
35
 
36
- speech = synthesiser(text, forward_params={"speaker_embeddings": speaker_embedding})
37
  audio_data = np.frombuffer(speech["audio"], dtype=np.float32)
38
  audio_data_16bit = (audio_data * 32767).astype(np.int16)
39
  return speech["sampling_rate"], audio_data_16bit
 
2
  import os
3
  import random
4
  import gradio as gr
5
+
6
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan, pipeline
7
  import base64
8
  from datasets import load_dataset
9
  from diffusers import DiffusionPipeline
 
29
  def text2speech(model, text, voice):
30
  print(model, text, voice)
31
  if len(text) > 0:
32
+ processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
33
+ model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
34
+ vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
35
+ inputs = processor(text=text, return_tensors="pt")
36
 
37
  embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
38
  speaker_embedding = torch.tensor(embeddings_dataset[voice]["xvector"]).unsqueeze(0)
39
 
40
+ speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
41
  audio_data = np.frombuffer(speech["audio"], dtype=np.float32)
42
  audio_data_16bit = (audio_data * 32767).astype(np.int16)
43
  return speech["sampling_rate"], audio_data_16bit