SamuelM0422 committed
Commit 943fd9a · verified · Parent: 0e72f25

Upload 7 files

app.py ADDED
@@ -0,0 +1,17 @@
+ import gradio as gr
+ from helper_function import speech_to_speech_translation, examples
+
+ title = 'Audio translator 🇧🇷 ➡️ 🇺🇸'
+ description = 'A stacked approach for translating audio from Portuguese to English'
+
+ translate = gr.Interface(
+     fn=speech_to_speech_translation,
+     inputs=gr.Audio(label='Input', sources=['upload', 'microphone'], type='filepath'),
+     outputs=[gr.Audio(label='Output', type='numpy'), gr.Textbox(label='Translation')],
+     flagging_mode='never',
+     examples=examples,
+     title=title,
+     description=description
+ )
+
+ translate.launch()
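If a gr.Blocks wrapper is wanted (for example, to add extra components around the Interface later), the same Interface can be rendered inside one instead of being launched directly; a minimal sketch:

    # Optional alternative wiring: render the Interface inside a Blocks
    # container, leaving room for additional components alongside it.
    with gr.Blocks() as demo:
        translate.render()
    demo.launch()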
examples/Example 01.mp3 ADDED
Binary file (19.7 kB)
 
examples/Example 02.mp3 ADDED
Binary file (20.8 kB)
 
examples/Example 03.mp3 ADDED
Binary file (54 kB)
 
examples/Example 04.mp3 ADDED
Binary file (33.7 kB)
 
examples/Example 05.mp3 ADDED
Binary file (43.9 kB)
 
helper_function.py ADDED
@@ -0,0 +1,38 @@
+ import numpy as np
+ import torch
+ from transformers import pipeline
+ from transformers import VitsModel, VitsTokenizer
+ from pathlib import Path
+
+ # Example clips bundled with the app, listed for the Gradio demo.
+ examples = sorted(str(p) for p in Path('examples').glob('*.mp3'))
+
+ # Whisper transcribes Portuguese speech and translates it to English text.
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
+ pipe = pipeline(
+     'automatic-speech-recognition', model='openai/whisper-base', device=device,
+ )
+
+ # MMS-TTS (VITS) synthesises the English text back into speech.
+ model = VitsModel.from_pretrained("facebook/mms-tts-eng")
+ tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
+
+ target_dtype = np.int16
+ max_range = np.iinfo(target_dtype).max
+
+ def speech_to_speech_translation(filepath):
+     print(filepath)
+     translation = pipe(filepath, max_new_tokens=256, generate_kwargs={'task': 'translate'})['text']
+
+     inputs = tokenizer(translation, return_tensors="pt")
+     input_ids = inputs["input_ids"]
+
+     model.eval()
+     with torch.inference_mode():
+         outputs = model(input_ids)
+
+     speech = outputs["waveform"]
+     synthesised_speech = speech / torch.max(torch.abs(speech))  # Normalize to [-1, 1]
+     synthesised_speech = (synthesised_speech * max_range).numpy().astype(target_dtype)
+
+     return (16000, synthesised_speech.squeeze()), translation
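For a quick check outside the Gradio UI, the helper can be called directly on one of the bundled clips; a minimal sketch, assuming the repository layout above (scipy is an assumed extra dependency, used only to write the WAV file, and is not required by the app itself):

    # Smoke test: run the full speech-to-speech pipeline on one example clip.
    from scipy.io import wavfile
    from helper_function import speech_to_speech_translation

    (rate, audio), text = speech_to_speech_translation('examples/Example 01.mp3')
    print(text)                                   # English translation of the clip
    wavfile.write('translated.wav', rate, audio)  # 16 kHz int16 waveform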