Spaces:
Running
Running
Upload 7 files
Browse files- app.py +19 -0
- examples/Example 01.mp3 +0 -0
- examples/Example 02.mp3 +0 -0
- examples/Example 03.mp3 +0 -0
- examples/Example 04.mp3 +0 -0
- examples/Example 05.mp3 +0 -0
- helper_function.py +40 -0
app.py
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Gradio front end for the speech-to-speech translation demo."""
import gradio as gr
from helper_function import examples, speech_to_speech_translation

title = 'Audio translator 🇧🇷 ➡️ 🇺🇸'
description = 'A stacked aproach for translating audios from Portuguese to English'

# ``examples`` was previously referenced without being imported, which raised
# NameError at startup. The entries are converted to plain strings for the
# Interface; an empty list becomes None so no examples panel is requested.
example_paths = [str(path) for path in examples] or None

# Single-input interface: one audio clip in (uploaded or recorded, handed to
# the callback as a file path), translated audio plus the translated text out.
translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(label='Input', sources=['upload', 'microphone'], type='filepath'),
    outputs=[gr.Audio(label='Output', type='numpy'), gr.Textbox(label="Tradução")],
    flagging_mode='never',
    examples=example_paths,
    title=title,
    description=description,
)

# Only start the server when executed as a script, so importing this module
# (e.g. from a test or another tool) has no launch side effect.
if __name__ == '__main__':
    translate.launch()
|
examples/Example 01.mp3
ADDED
Binary file (19.7 kB). View file
|
|
examples/Example 02.mp3
ADDED
Binary file (20.8 kB). View file
|
|
examples/Example 03.mp3
ADDED
Binary file (54 kB). View file
|
|
examples/Example 04.mp3
ADDED
Binary file (33.7 kB). View file
|
|
examples/Example 05.mp3
ADDED
Binary file (43.9 kB). View file
|
|
helper_function.py
ADDED
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import numpy as np
|
3 |
+
import torch
|
4 |
+
from transformers import pipeline
|
5 |
+
from transformers import VitsModel, VitsTokenizer
|
6 |
+
from IPython.display import Audio
|
7 |
+
from pathlib import Path
|
8 |
+
|
9 |
+
# Example clips are stored in the ``examples/`` folder next to this module.
# The previous lookup globbed the current working directory, where no mp3
# files live, so the list was always empty. Resolving relative to this file
# also makes the lookup independent of the directory the app is started from.
examples = sorted((Path(__file__).resolve().parent / 'examples').glob('*.mp3'))

# Run the speech-recognition pipeline on the GPU when one is available.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
pipe = pipeline(
    'automatic-speech-recognition', model='openai/whisper-base', device=device,
)

# Text-to-speech model and its tokenizer, loaded from the same checkpoint.
model = VitsModel.from_pretrained("facebook/mms-tts-eng")
tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")

# Output audio is emitted as 16-bit integers; ``max_range`` is the largest
# value of that dtype and is used to scale a [-1, 1] waveform.
target_dtype = np.int16
max_range = np.iinfo(target_dtype).max
|
22 |
+
|
23 |
+
def speech_to_speech_translation(filepath):
    """Translate the speech in an audio file and synthesise the translation.

    The file is run through the module-level ASR pipeline with the
    ``translate`` task, and the resulting text is fed to the module-level
    VITS model to produce a waveform.

    Args:
        filepath: Path of the audio file to translate, as accepted by the
            ASR pipeline.

    Returns:
        A pair ``((sampling_rate, waveform), translation)`` where ``waveform``
        is a NumPy array of ``target_dtype`` (int16) samples, peak-normalised
        to the full integer range, and ``translation`` is the translated text.
    """
    print(filepath)
    translation = pipe(
        filepath, max_new_tokens=256, generate_kwargs={'task': 'translate'}
    )['text']

    inputs = tokenizer(translation, return_tensors="pt")
    input_ids = inputs["input_ids"]

    model.eval()
    with torch.inference_mode():
        outputs = model(input_ids)

    speech = outputs["waveform"]

    # Normalise to [-1, 1]. The original computed this and then scaled the
    # un-normalised tensor, so the normalisation never took effect. A silent
    # (all-zero) or empty waveform is left as is to avoid dividing by zero.
    if speech.numel() > 0:
        peak = torch.max(torch.abs(speech))
        if peak > 0:
            speech = speech / peak

    synthesised_speech = (speech * max_range).numpy().astype(target_dtype)

    # Report the rate the TTS model actually generates at rather than a
    # hard-coded constant.
    sampling_rate = model.config.sampling_rate

    return (sampling_rate, synthesised_speech.squeeze()), translation
|