File size: 3,455 Bytes
1c9197b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import gradio as gr
import sentencepiece as spm
import ctranslate2
from huggingface_hub import hf_hub_download
import os

# Display name -> ISO 639 code for every supported language.
# Codes are used to name the SentencePiece model files inside each
# Hugging Face repo (e.g. "ku.spm.model"); display names are shown in
# the dropdowns and lowercased to build the repo id.
languages = {
    "Kurdish": "ku",
    "Samoan": "sm",
    "Xhosa": "xh",
    "Lao": "lo",
    "Corsican": "co",
    "Cebuano": "ceb",
    "Galician": "gl",
    "Yiddish": "yi",
    "Swahili": "sw",
    "Yoruba": "yo",
    "English": "en",
}

def get_repo_id(src_lang, tgt_lang):
    """Build the Hugging Face repo id for a Lingvanex language-pair model.

    Display names are lowercased, e.g. ("English", "Kurdish") ->
    "lingvanex/english-to-kurdish-translation".
    """
    source = src_lang.lower()
    target = tgt_lang.lower()
    return "lingvanex/{}-to-{}-translation".format(source, target)

def download_models(src_lang, tgt_lang):
    """Fetch every artifact of the language-pair model from the Hub.

    Returns a dict mapping artifact role ("src_spm", "tgt_spm", "model",
    "source_vocab", "target_vocab", "config") to its local file path.
    hf_hub_download caches files, so repeat calls are cheap.
    """
    repo_id = get_repo_id(src_lang, tgt_lang)
    filenames = {
        "src_spm": f"{languages[src_lang]}.spm.model",
        "tgt_spm": f"{languages[tgt_lang]}.spm.model",
        "model": "model.bin",
        "source_vocab": "source_vocabulary.txt",
        "target_vocab": "target_vocabulary.txt",
        "config": "config.json",
    }
    return {
        role: hf_hub_download(repo_id=repo_id, filename=filename)
        for role, filename in filenames.items()
    }

# Loaded (encoder, decoder, translator) triples keyed by (src, tgt) language
# pair, so repeated requests don't rebuild tokenizers and the CTranslate2
# translator on every call.
_model_cache = {}


def _load_resources(src_lang, tgt_lang):
    """Load and memoize the tokenizers and translator for a language pair."""
    key = (src_lang, tgt_lang)
    if key not in _model_cache:
        models = download_models(src_lang, tgt_lang)
        spm_encoder = spm.SentencePieceProcessor(models["src_spm"])
        spm_decoder = spm.SentencePieceProcessor(models["tgt_spm"])
        # All artifacts land in the same local snapshot folder, so the
        # directory containing model.bin is the CTranslate2 model dir.
        model_dir = os.path.dirname(models["model"])
        translator = ctranslate2.Translator(model_dir, device="cpu")
        _model_cache[key] = (spm_encoder, spm_decoder, translator)
    return _model_cache[key]


def translate(text, src_lang, tgt_lang):
    """Translate *text* from src_lang to tgt_lang with a Lingvanex model.

    Returns the input unchanged when source and target languages match,
    and an empty string for empty input. Models are loaded once per
    language pair and cached for subsequent calls.
    """
    if src_lang == tgt_lang:
        return text
    if not text:
        # Nothing to translate — avoid downloading/loading models for it.
        return ""

    spm_encoder, spm_decoder, translator = _load_resources(src_lang, tgt_lang)

    tokens = spm_encoder.encode(text, out_type=str)
    result = translator.translate_batch([tokens])
    # Take the best hypothesis of the single batch entry.
    return spm_decoder.decode(result[0].hypotheses[0], out_type=str)


def update_target_lang(src_lang):
    """Constrain the target dropdown to match supported directions.

    Non-English sources can only translate to English (with an explanatory
    note); an English source may target any supported language.
    """
    if src_lang != "English":
        note = "Note: Translations are only supported from this language to English."
        return gr.Dropdown(choices=["English"]), note
    return gr.Dropdown(choices=sorted(languages.keys())), ""


# Gradio UI: language selectors, input/output boxes, examples, and the
# translate action, all wired inside a Blocks layout.
with gr.Blocks() as demo:
    gr.Markdown("# Multilingual Translation with Lingvanex")
    gr.Markdown("""
    This translator allows you to translate text between English and a variety of other languages. Please note that translations are supported only in the following directions:
    - From English to the target language (e.g., English → Kurdish).
    - From the source language to English (e.g., Kurdish → English).
    For this demo, language-specific model pairs are utilized, so translations between two non-English languages are not supported at this time.
    """)

    with gr.Row():
        src_lang = gr.Dropdown(choices=sorted(languages.keys()), label="Source Language", value="English")
        tgt_lang = gr.Dropdown(choices=sorted(languages.keys()), label="Target Language", value="Kurdish")
        # Shown only when the source is non-English (set by update_target_lang).
        note = gr.Markdown("")

    # Changing the source language restricts the target choices and note.
    src_lang.change(update_target_lang, inputs=src_lang, outputs=[tgt_lang, note])

    text_input = gr.Textbox(label="Input Text", placeholder="Enter text to translate...")
    text_output = gr.Textbox(label="Translated Text")

    # Clickable examples pre-fill the input text and both language dropdowns.
    examples = gr.Examples(
        examples=[
            ["Hello, how are you?", "English", "Kurdish"],
            ["Silav halê we çawa ye?", "Kurdish", "English"],
        ],
        inputs=[text_input, src_lang, tgt_lang],
    )

    translate_btn = gr.Button("Translate")
    translate_btn.click(translate, inputs=[text_input, src_lang, tgt_lang], outputs=text_output)

# share=True publishes a temporary public link in addition to the local server.
demo.launch(share=True)