import gradio as gr
import numpy as np
import torch
from transformers import pipeline, VitsModel, VitsTokenizer
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# load speech translation checkpoint
asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)
# load text-to-speech checkpoint
model = VitsModel.from_pretrained("facebook/mms-tts-rus").to(device)
tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-rus")
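# Gradio's numpy audio output plays 16-bit PCM, so the model's float
# waveform is rescaled to the full int16 range before playback.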
target_dtype = np.int16
max_range = np.iinfo(target_dtype).max
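# Whisper trick: forcing task="transcribe" with language="russian" makes the
# multilingual checkpoint decode Russian text whatever the source language,
# so the ASR stage effectively doubles as a speech translator into Russian.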
def translate(audio):
    outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe", "language": "russian"})
    return outputs["text"]
def synthesise(text):
    input_ids = tokenizer(text, return_tensors="pt")["input_ids"].to(device)
    with torch.no_grad():
        outputs = model(input_ids)
    return outputs["waveform"].squeeze().cpu()
def speech_to_speech_translation(audio):
    translated_text = translate(audio)
    synthesised_speech = synthesise(translated_text)
    synthesised_speech = (synthesised_speech.numpy() * max_range).astype(target_dtype)
    return 16000, synthesised_speech
title = "Cascaded STST"
description = """
Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in Russian. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation, and Microsoft's
[SpeechT5 TTS](https://huggingface.co/microsoft/speecht5_tts) model for text-to-speech:

"""
demo = gr.Blocks()
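# Two front-ends over the same pipeline: live microphone capture and file upload.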
mic_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    title=title,
    description=description,
)
file_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(source="upload", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    examples=[["./example.wav"]],
    title=title,
    description=description,
)
with demo:
    gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])
demo.launch()
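# To get a public link when running locally rather than in a hosted Space,
# launch with Gradio's built-in tunnelling instead (not part of the original app):
# demo.launch(share=True)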