# audio_translator/helper_function.py
# Source: SamuelM0422's Hugging Face Space (commit 881ce66, verified)
import numpy as np
import torch
from transformers import pipeline
from transformers import VitsModel, VitsTokenizer
# Run on GPU when available; both the ASR pipeline and the TTS model use this.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Whisper ASR pipeline; used with task='translate' to produce English text.
pipe = pipeline(
'automatic-speech-recognition', model='openai/whisper-base', device=device,
)
# Facebook MMS English text-to-speech (VITS) model and its tokenizer.
# NOTE(review): model is loaded on CPU here (no .to(device)) — confirm intended.
model = VitsModel.from_pretrained("facebook/mms-tts-eng")
tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
# Output audio is emitted as int16 PCM; max_range scales a [-1, 1] float
# waveform to the full int16 range.
target_dtype=np.int16
max_range = np.iinfo(target_dtype).max
def speech_to_speech_translation(filepath):
    """Translate speech in *filepath* to English and synthesise it as audio.

    Parameters
    ----------
    filepath : str
        Path to an audio file accepted by the Whisper ASR pipeline.

    Returns
    -------
    tuple
        ``((16000, waveform), translation)`` where ``waveform`` is an int16
        numpy array of the synthesised English speech and ``translation`` is
        the English text produced by Whisper.
    """
    # Whisper with task='translate' transcribes and translates to English in one pass.
    translation = pipe(filepath, max_new_tokens=256, generate_kwargs={'task': 'translate'})['text']
    inputs = tokenizer(translation, return_tensors="pt")
    input_ids = inputs["input_ids"]
    model.eval()
    with torch.inference_mode():
        outputs = model(input_ids)
    speech = outputs["waveform"]
    # Normalize to [-1, 1] so the int16 scaling below cannot overflow/clip.
    synthesised_speech = speech / torch.max(torch.abs(speech))
    # BUG FIX: the original scaled `speech` (the unnormalized tensor) here,
    # discarding the normalization computed on the previous line.
    synthesised_speech = (synthesised_speech * max_range).numpy().astype(target_dtype)
    return (16000, synthesised_speech.squeeze()), translation