File size: 3,607 Bytes
3d4f13a
b5f7961
3d4f13a
4f63972
ef9b88b
3b68341
ef9b88b
965bd13
 
b02baad
3b68341
 
4f70f9f
b5f7961
 
 
 
3d4f13a
3363f16
bfcf2ec
 
 
 
 
3363f16
1335053
b02baad
 
 
1335053
b02baad
 
 
 
 
 
 
 
 
 
 
1335053
4f70f9f
b02baad
 
692156b
 
b02baad
 
 
 
692156b
3746080
 
692156b
 
3746080
 
 
 
 
 
 
 
692156b
b02baad
 
 
 
3746080
b02baad
 
1335053
3363f16
692156b
 
3363f16
692156b
5f5a773
3363f16
5f5a773
 
 
 
 
 
 
692156b
 
 
 
 
 
 
 
 
 
 
3363f16
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import gradio as gr
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

# Hugging Face Hub checkpoint used for both the tokenizer and the model weights.
model_name = "dsfsi/nso-en-m2m100-gov"

# Load the seq2seq model and its matching tokenizer once, at import time,
# so every request served by `translate` reuses the same objects.
model = M2M100ForConditionalGeneration.from_pretrained(model_name)
tokenizer = M2M100Tokenizer.from_pretrained(model_name)

# Source-language code applied to every input the tokenizer encodes
# ("ns" is presumably this checkpoint's Northern Sotho code -- confirm
# against the mapping printed below).
tokenizer.src_lang = "ns"

# Startup diagnostic: dump the language-code -> token mapping to the log.
print(tokenizer.lang_code_to_token)

# Make English the default target language by storing its language id as
# the forced first generated token on the model config.
model.config.forced_bos_token_id = tokenizer.get_lang_id("en")

def translate(inp):
    """Translate user-supplied text to English with the module-level model.

    Args:
        inp: Source text to translate. ``None`` or whitespace-only input is
            accepted and treated as "nothing to translate".

    Returns:
        The decoded translation with special tokens stripped, or an empty
        string when ``inp`` is ``None`` or blank.
    """
    # Nothing to translate: skip the (expensive) model call entirely rather
    # than asking the model to generate from an empty source sequence.
    if inp is None or not inp.strip():
        return ""

    # `truncation=True` clips over-long input to the tokenizer's configured
    # maximum length instead of handing the model an oversized sequence.
    inputs = tokenizer(inp, return_tensors="pt", truncation=True)
    translated_tokens = model.generate(
        **inputs,
        max_length=512,
        # Force the first generated token to the English language id.
        forced_bos_token_id=tokenizer.get_lang_id("en"),
    )
    # Only the first (and only) sequence in the batch is decoded.
    return tokenizer.decode(translated_tokens[0], skip_special_tokens=True)


# HTML snippet that centers the logo image (served from the relative
# path `file/logo_transparent_small.png`).
logo = """
<div style='text-align: center;'>
    <img src='file/logo_transparent_small.png' alt='Logo' width='150'/>
</div>
"""

# Centered heading plus a short paragraph describing what the space does.
description = """
<p style='text-align: center;'>
    Northern Sotho to English Translation
</p>
<p>
    This space provides a translation service from Northern Sotho to English using the M2M100 model, fine-tuned for low-resource languages. It supports researchers, linguists, and users working with Northern Sotho texts.
</p>
"""

# Centered row of external links: GitHub repository, feedback form, arXiv paper.
article = """
<div style='text-align: center;'>
    <a href='https://github.com/dsfsi/nso-en-m2m100-gov' target='_blank'>GitHub</a> |
    <a href='https://docs.google.com/forms/d/e/1FAIpQLSf7S36dyAUPx2egmXbFpnTBuzoRulhL5Elu-N1eoMhaO7v10w/viewform' target='_blank'>Feedback Form</a> |
    <a href='https://arxiv.org/abs/2303.03750' target='_blank'>Arxiv</a>
</div>
<br/>
"""

# Centered author credits.
authors = """
<div style='text-align: center;'>
    Authors: Vukosi Marivate, Matimba Shingange, Richard Lastrucci, 
    Isheanesu Joseph Dzingirai, Jenalea Rajab
</div>
"""

# BibTeX entry for the accompanying paper, wrapped in <pre> so the line
# layout is preserved (with wrapping allowed) when rendered.
citation = """
<pre style="text-align: left; white-space: pre-wrap;">
@inproceedings{lastrucci-etal-2023-preparing,
    title = "Preparing the Vuk{'}uzenzele and {ZA}-gov-multilingual {S}outh {A}frican multilingual corpora",
    author = "Richard Lastrucci and Isheanesu Dzingirai and Jenalea Rajab 
              and Andani Madodonga and Matimba Shingange and Daniel Njini and Vukosi Marivate",
    booktitle = "Proceedings of the Fourth workshop on Resources for African Indigenous Languages (RAIL 2023)",
    month = may,
    year = "2023",
    address = "Dubrovnik, Croatia",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2023.rail-1.3",
    pages = "18--25"
}
</pre>
"""

# Centered DOI link for the arXiv version of the paper.
doi = """
<div style='text-align: center;'>
    DOI: <a href="https://doi.org/10.48550/arXiv.2303.03750" target="_blank">10.48550/arXiv.2303.03750</a>
</div>
"""

# Page layout: header (logo, description, links), the translation widget,
# and a collapsible "More Information" section. The whole page is one
# `gr.Blocks` app that is launched exactly once, after it is fully built.
with gr.Blocks() as demo:
    gr.Markdown(logo)
    gr.Markdown(description)
    gr.Markdown(article)

    # An Interface created inside a Blocks context becomes part of the
    # enclosing page, so it must NOT be launched on its own. The title,
    # description and article are not passed here because the Markdown
    # components above already render that content; passing them again
    # would show it twice.
    gr.Interface(
        fn=translate,
        inputs=gr.Textbox(lines=5, placeholder="Enter Northern Sotho text (maximum 5 lines)", label="Input"),
        outputs="text",
    )

    with gr.Accordion("More Information", open=False):
        gr.Markdown("""
        <h4 style="text-align: center;">More information about the space</h4>
        <p>This is a variant of the M2M100 model, fine-tuned on a multilingual dataset 
        to support translation from Northern Sotho (Sepedi) to English. The model was trained 
        with a focus on improving translation accuracy for low-resource languages.</p>
        """)
        gr.Markdown(authors)
        gr.Markdown(citation)
        gr.Markdown(doi)

# Request queuing is enabled with `.queue()`; the old `enable_queue`
# argument to `launch()` is not accepted by current Gradio releases.
demo.queue()
demo.launch()