|
import gradio as gr |
|
import torch |
|
import torchaudio |
|
|
|
from transformers import AutoProcessor, SeamlessM4TModel |
|
# The processor and the model are both loaded from the same checkpoint id, so
# the id is written once and reused for the two from_pretrained() calls.
_CHECKPOINT = "facebook/hf-seamless-m4t-medium"

# Converts raw text / audio into model inputs and decodes generated token ids.
processor = AutoProcessor.from_pretrained(_CHECKPOINT)

# Translation model; loaded once at import time and reused by every request.
model = SeamlessM4TModel.from_pretrained(_CHECKPOINT)
|
|
|
|
|
# Maps the human-readable language names offered in the UI dropdowns to the
# three-letter language codes handed to the processor (src_lang) and to
# model.generate (tgt_lang).
language_dict = {
    "Modern Standard Arabic": "arb",
    "Bengali": "ben",
    "Catalan": "cat",
    "Czech": "ces",
    "Mandarin Chinese": "cmn",
    "Welsh": "cym",
    "Danish": "dan",
    "German": "deu",
    "English": "eng",
    "Estonian": "est",
    "Finnish": "fin",
    "French": "fra",
    "Hindi": "hin",
    "Indonesian": "ind",
    "Italian": "ita",
    "Japanese": "jpn",
    "Korean": "kor",
    "Maltese": "mlt",
    "Dutch": "nld",
    "Western Persian": "pes",
    "Polish": "pol",
    "Portuguese": "por",
    "Romanian": "ron",
    "Russian": "rus",
    "Slovak": "slk",
    "Spanish": "spa",
    "Swedish": "swe",
    "Swahili": "swh",
    "Telugu": "tel",
    "Tagalog": "tgl",
    "Thai": "tha",
    "Turkish": "tur",
    "Ukrainian": "ukr",
    "Urdu": "urd",
    "Northern Uzbek": "uzn",
    "Vietnamese": "vie",
}

# Dropdown choices, in the dict's insertion order. Iterating a dict yields its
# keys, so no explicit .keys() call is needed.
languages = list(language_dict)
|
|
|
|
|
def png(source_lang, target_lang, audio, text):
    """Translate speech or text into both speech and text in the target language.

    When ``audio`` is provided it takes precedence and ``text`` is ignored;
    otherwise ``text`` is translated.

    Args:
        source_lang: Display name of the input language (a key of
            ``language_dict``). Only used for text input.
        target_lang: Display name of the output language (a key of
            ``language_dict``).
        audio: ``None`` when no audio was supplied, otherwise a
            ``(sample_rate, samples)`` pair where ``samples`` is a NumPy array.
        text: Source-language text, used only when ``audio`` is ``None``.

    Returns:
        ``((16000, waveform), translated_text)`` where ``waveform`` is a NumPy
        array holding the generated speech and ``translated_text`` is a string.

    Raises:
        gr.Error: If a language has not been selected, or if neither audio nor
            non-empty text was provided.
    """
    # An untouched dropdown arrives as None; report it readably instead of
    # failing with a KeyError on the lookup below.
    if source_lang not in language_dict or target_lang not in language_dict:
        raise gr.Error("Please select both a source and a target language.")

    source_lang_code = language_dict[source_lang]
    target_lang_code = language_dict[target_lang]

    if audio is None:
        if not text or not text.strip():
            raise gr.Error("Please provide audio or enter text to translate.")
        processed_inputs = processor(text, src_lang=source_lang_code, return_tensors="pt")
    else:
        sample_rate, audio_data = audio
        waveform = torch.from_numpy(audio_data)

        if waveform.is_floating_point():
            waveform = waveform.to(torch.float32)
        else:
            # Integer PCM: scale to [-1, 1] floats.
            # NOTE(review): assumes the feature extractor expects normalized
            # float audio -- confirm against the processor documentation.
            peak = torch.iinfo(waveform.dtype).max
            waveform = waveform.to(torch.float32) / peak

        if waveform.ndim == 2:
            # resample() works along the LAST dimension, so a 2-D recording
            # must be reduced to 1-D first or the channel axis gets resampled.
            # NOTE(review): assumes the layout is (samples, channels), as
            # Gradio's numpy audio format is understood to be -- confirm.
            waveform = waveform.mean(dim=1)

        waveform = torchaudio.functional.resample(waveform, orig_freq=sample_rate, new_freq=16_000)
        processed_inputs = processor(audios=waveform, sampling_rate=16_000, return_tensors="pt")

    # One generate() call yields both the speech waveform and the intermediate
    # text tokens, instead of running the model twice (once per modality).
    output = model.generate(
        **processed_inputs,
        tgt_lang=target_lang_code,
        return_intermediate_token_ids=True,
    )

    generated_audio = output.waveform.cpu().numpy().squeeze()
    # sequences is batched; only the first (and only) item is decoded.
    generated_text = processor.decode(output.sequences[0].tolist(), skip_special_tokens=True)

    return (16000, generated_audio), generated_text
|
|
|
# Module-level title/description text for the demo.
# NOTE(review): in this file gr.Interface(...) is given its own literal title
# and description rather than these two names -- confirm whether they are
# still needed or should be passed to the interface instead.
title = "36 Language Translator"

# Spelling fixed: "form" -> "from", "amoung" -> "among".
description = """
This Demo can translate either Speech or Text from any of the selected SOURCE language among 36 languages to both Speech and Text in any of the selected TARGET language.
This Demo is powered by "facebook/hf-seamless-m4t-medium" model. Thanks for checking out.
Select source and target languages for translation.
"""
|
|
|
# Gradio UI definition. The four input components are passed positionally to
# png() as (source_lang, target_lang, audio, text); png()'s two return values
# fill the two output components in order.
iface = gr.Interface(
    png,
    inputs=[
        gr.Dropdown(languages, label="Source Language"),
        gr.Dropdown(languages, label="Target Language"),
        gr.Audio(),
        gr.Textbox(label="Enter Text in Source Language"),
    ],
    outputs=[
        gr.Audio(label="Translated Audio"),
        gr.Textbox(label="Translated Text"),
    ],
    title="Language Translation App",
    # Spelling fixed in the user-facing text: "form" -> "from",
    # "amoung" -> "among".
    # NOTE(review): "Try this Google Colab Notebook" reads like a hyperlink
    # whose URL is missing -- confirm and restore the link if so.
    description="""
This Demo can translate either Speech or Text from any of the selected SOURCE language among 36 languages to both Speech and Text in any of the selected TARGET language.
This Demo is powered by "facebook/hf-seamless-m4t-medium" model. This Demo can take 3-4 mins as it is running on a CPU.\n Try this Google Colab Notebook with GPU for faster processing.
\n Thanks for checking out.
Select source and target languages for translation.
""",
)
|
|
|
|
|
# Start the Gradio app only when this file is executed as a script, so that
# importing the module does not launch it.
if __name__ == "__main__":
    iface.launch()
|
|