JanLilan committed on
Commit
e5d5a17
·
1 Parent(s): dbfdf1a

Update app.py: set the Whisper language to Catalan and switch the text-to-speech model to the fine-tuned Catalan checkpoint

Browse files
Files changed (1) hide show
  1. app.py +9 -4
app.py CHANGED
@@ -2,7 +2,6 @@ import gradio as gr
2
  import numpy as np
3
  import torch
4
  from datasets import load_dataset
5
-
6
  from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, pipeline
7
 
8
 
@@ -14,15 +13,21 @@ asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base",
14
  # load text-to-speech checkpoint and speaker embeddings
15
  processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
16
 
17
- model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
 
 
 
18
  vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
19
-
 
 
 
20
  embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
21
  speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
22
 
23
 
24
  def translate(audio):
25
- outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "translate"})
26
  return outputs["text"]
27
 
28
 
 
2
  import numpy as np
3
  import torch
4
  from datasets import load_dataset
 
5
  from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, pipeline
6
 
7
 
 
13
  # load text-to-speech checkpoint and speaker embeddings
14
  processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
15
 
16
+ # model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
17
+ model = SpeechT5ForTextToSpeech.from_pretrained(
18
+ "JanLilan/speecht5_finetuned_openslr-slr69-cat"
19
+ ).to(device)
20
  vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
21
+ # We will try to translate with this voice embedding and see what happens. Otherwise:
22
+ # dataset = load_dataset("projecte-aina/openslr-slr69-ca-trimmed-denoised", split="train")
23
+ # dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
24
+ # etc.
25
  embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
26
  speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
27
 
28
 
29
  def translate(audio):
30
+ outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "translate", "language": "ca"})
31
  return outputs["text"]
32
 
33