mcamara committed
Commit 9d588a9 · 1 Parent(s): b33585a

Update app.py

Files changed (1)
app.py +37 -28
app.py CHANGED
@@ -1,51 +1,65 @@
-import gradio as gr
-import numpy as np
 import torch
-from datasets import load_dataset
-
-from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, pipeline
-
-
-device = "cuda:0" if torch.cuda.is_available() else "cpu"
-
-# load speech translation checkpoint
-pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)
-
-# load text-to-speech checkpoint and speaker embeddings
+from transformers import pipeline
+
+device = "cuda:0" if torch.cuda.is_available() else "cpu"
+pipe = pipeline(
+    "automatic-speech-recognition", model="openai/whisper-base", device=device
+)
+
+# %%
+
+def translate(audio):
+    outputs = pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe", "language": "nl"})
+    return outputs["text"]
+
+# %%
+
+from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+
 processor = SpeechT5Processor.from_pretrained("sanchit-gandhi/speecht5_tts_vox_nl")
 
-model = SpeechT5ForTextToSpeech.from_pretrained("sanchit-gandhi/speecht5_tts_vox_nl").to(device)
-vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
-
-embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
-speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
-
-
-def translate(audio):
-    outputs = pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe", "language": "nl"})
-    return outputs["text"]
-
+model = SpeechT5ForTextToSpeech.from_pretrained("sanchit-gandhi/speecht5_tts_vox_nl")
+vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+
+# %%
+
+model.to(device)
+vocoder.to(device)
+
+# %%
+
+from datasets import load_dataset, Audio
+
+embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
+
+# %%
 
 def synthesise(text):
     inputs = processor(text=text, return_tensors="pt")
-    speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)
+    speech = model.generate_speech(
+        inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder
+    )
     return speech.cpu()
 
+# %%
+
+import numpy as np
+
+target_dtype = np.int16
+max_range = np.iinfo(target_dtype).max
+
 
 def speech_to_speech_translation(audio):
     translated_text = translate(audio)
     synthesised_speech = synthesise(translated_text)
-    synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
+    synthesised_speech = (synthesised_speech.numpy() * max_range).astype(np.int16)
     return 16000, synthesised_speech
 
 
-title = "Cascaded STST"
-description = """
-Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in English. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation, and Microsoft's
-[SpeechT5 TTS](https://huggingface.co/microsoft/speecht5_tts) model for text-to-speech:
-
-![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
-"""
+# %%
+
+import gradio as gr
 
 demo = gr.Blocks()
 
@@ -53,17 +67,12 @@ mic_translate = gr.Interface(
     fn=speech_to_speech_translation,
     inputs=gr.Audio(source="microphone", type="filepath"),
     outputs=gr.Audio(label="Generated Speech", type="numpy"),
-    title=title,
-    description=description,
 )
 
 file_translate = gr.Interface(
     fn=speech_to_speech_translation,
     inputs=gr.Audio(source="upload", type="filepath"),
     outputs=gr.Audio(label="Generated Speech", type="numpy"),
-    examples=[["./example.wav"]],
-    title=title,
-    description=description,
 )
 
 with demo:
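
The scaling change in `speech_to_speech_translation` is behavior-preserving: `np.iinfo(np.int16).max` is exactly the 32767 that was previously hard-coded, so the refactor only replaces a magic number with a named constant. A minimal sketch of the conversion, using an illustrative waveform that is not part of the commit:

```python
import numpy as np

target_dtype = np.int16
max_range = np.iinfo(target_dtype).max  # 32767, same as the old hard-coded value

# Illustrative float waveform in [-1.0, 1.0], the range SpeechT5 emits
waveform = np.array([0.0, 0.5, -1.0, 1.0], dtype=np.float32)

# Same conversion the updated app applies before returning audio to Gradio
pcm16 = (waveform * max_range).astype(target_dtype)
print(pcm16)  # [     0  16383 -32767  32767]
```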
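
Both hunks end at `with demo:`, so the body of the Blocks context is outside this diff. For orientation only, a sketch of how the tail of such an app typically looks, assuming the tabbed layout used by the Hugging Face audio-course STST template; the tab labels and `launch()` call below are assumptions, not part of this commit:

```python
with demo:
    # Assumed layout (not shown in the diff): one tab per input source
    gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])

demo.launch()
```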