audio_translator / helper_function.py
SamuelM0422's picture
Upload 7 files
943fd9a verified
raw
history blame
1.2 kB
import numpy as np
import torch
from transformers import pipeline
from transformers import VitsModel, VitsTokenizer
from IPython.display import Audio
from pathlib import Path
# Example audio clips shipped alongside this script, in sorted order.
examples = sorted(Path('').glob('*mp3'))

# Run on GPU when one is available, otherwise fall back to CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Whisper ASR pipeline; used below with task='translate' to produce
# English text from speech in any supported language.
pipe = pipeline(
    'automatic-speech-recognition',
    model='openai/whisper-base',
    device=device,
)

# MMS text-to-speech model + tokenizer for English synthesis.
model = VitsModel.from_pretrained("facebook/mms-tts-eng")
tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")

# Synthesised audio is converted to 16-bit PCM; max_range is the peak
# representable int16 value used for scaling.
target_dtype = np.int16
max_range = np.iinfo(target_dtype).max
def speech_to_speech_translation(filepath):
    """Translate spoken audio into synthesised English speech.

    Runs Whisper with ``task='translate'`` to obtain English text from the
    audio at *filepath*, then synthesises that text with the MMS TTS model.

    Parameters
    ----------
    filepath : str
        Path to an audio file readable by the ASR pipeline.

    Returns
    -------
    tuple
        ``((16000, waveform), translation)`` where *waveform* is a
        ``numpy.int16`` array at 16 kHz (squeezed to 1-D) and *translation*
        is the translated English text.
    """
    print(filepath)
    # Whisper's 'translate' task emits English regardless of input language.
    translation = pipe(
        filepath, max_new_tokens=256, generate_kwargs={'task': 'translate'}
    )['text']

    inputs = tokenizer(translation, return_tensors="pt")
    input_ids = inputs["input_ids"]

    model.eval()
    with torch.inference_mode():
        outputs = model(input_ids)
    speech = outputs["waveform"]

    # Normalise to [-1, 1] before scaling to the int16 range.
    # BUG FIX: the original computed the normalised tensor but then scaled
    # the raw `speech` tensor instead, silently discarding the normalisation
    # and risking int16 overflow/clipping for loud waveforms.
    peak = torch.max(torch.abs(speech))
    if peak > 0:  # guard against an all-zero waveform (division by zero)
        speech = speech / peak
    synthesised_speech = (speech * max_range).numpy().astype(target_dtype)

    # NOTE(review): the original built an IPython `Audio(...)` widget here
    # and immediately discarded it — a no-op outside a notebook; removed.
    return (16000, synthesised_speech.squeeze()), translation