File size: 3,129 Bytes
3d4f13a
b5f7961
3d4f13a
4f63972
ef9b88b
3b68341
ef9b88b
965bd13
 
b02baad
3b68341
 
4f70f9f
b5f7961
 
 
 
3d4f13a
e7da02f
b02baad
e7da02f
b02baad
1335053
b02baad
 
 
1335053
b02baad
 
 
 
 
 
 
 
 
 
 
 
 
1335053
 
4f70f9f
b02baad
 
 
 
 
 
 
 
4f70f9f
b02baad
1335053
4f63972
1335053
 
b02baad
4f70f9f
b02baad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1335053
b02baad
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import gradio as gr
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

# Checkpoint identifier handed to `from_pretrained`; the tokenizer and the
# model are both loaded from this same identifier at import time.
model_name = "dsfsi/nso-en-m2m100-gov"
tokenizer = M2M100Tokenizer.from_pretrained(model_name)
model = M2M100ForConditionalGeneration.from_pretrained(model_name)

# Startup diagnostic: prints the tokenizer's language-code -> token mapping.
print(tokenizer.lang_code_to_token)

# Source-language code used when tokenizing input.
# NOTE(review): "ns" is presumably the Northern Sotho code for this tokenizer —
# confirm against the mapping printed above.
tokenizer.src_lang = "ns"
# Store the English language id as the model's default forced first token.
model.config.forced_bos_token_id = tokenizer.get_lang_id("en")

def translate(inp) -> str:
    """Translate ``inp`` to English using the module-level tokenizer and model.

    Args:
        inp: Source text to translate. ``None``, an empty string, or a
            whitespace-only string is treated as "nothing to translate".

    Returns:
        The decoded translation with special tokens stripped, or ``""`` when
        there is nothing to translate.
    """
    # Guard first: an empty submission should not spend a model forward pass
    # (and whatever the model would generate for it is meaningless anyway).
    if inp is None or not inp.strip():
        return ""

    inputs = tokenizer(inp, return_tensors="pt")
    translated_tokens = model.generate(
        **inputs,
        max_length=512,
        # Force English as the first generated token.
        forced_bos_token_id=tokenizer.get_lang_id("en"),
    )
    # Only one input sequence is submitted, so only the first output is decoded.
    return tokenizer.decode(translated_tokens[0], skip_special_tokens=True)

# ---------------------------------------------------------------------------
# Static page content: HTML fragments shown around the translation widget.
# ---------------------------------------------------------------------------

description = """
<p style='text-align: center;'>
    Northern Sotho to English Translation
</p>
<p>
    This space provides a translation service from Northern Sotho to English using the M2M100 model, fine-tuned for low-resource languages. It supports researchers, linguists, and users working with Northern Sotho texts.
</p>
"""

article = """
<div style='text-align: center;'>
    <a href='https://github.com/dsfsi/nso-en-m2m100-gov' target='_blank'>GitHub</a> |
    <a href='https://docs.google.com/forms/d/e/1FAIpQLSf7S36dyAUPx2egmXbFpnTBuzoRulhL5Elu-N1eoMhaO7v10w/viewform' target='_blank'>Feedback Form</a> |
    <a href='https://arxiv.org/abs/2303.03750' target='_blank'>Arxiv</a>
</div>
<br/>
<p style='text-align: center;'>
    <h2>Translate | Northern Sotho to English (dsfsi/nso-en-m2m100-gov)</h2>
</p>
"""

extra_info = """
<div style='text-align: center;'>
    <h4>More information about the space</h4>
</div>
<p>
    This is a variant of the M2M100 model, fine-tuned on a multilingual dataset to support translation from Northern Sotho (Sepedi) to English. The model was trained with a focus on improving translation accuracy for low-resource languages.
</p>
"""

authors = """
<div style='text-align: center;'>
    Authors: Vukosi Marivate, Matimba Shingange, Richard Lastrucci, Isheanesu Joseph Dzingirai, Jenalea Rajab
</div>
"""

# NOTE(review): the doubled braces look like str.format / f-string escapes,
# but this string is never formatted, so they are displayed literally —
# confirm whether single braces were intended.
citation = """
<div style='text-align: center;'>
    <p>
        @inproceedings{{dsfsi2024, title={{Northern Sotho to English Translation using M2M100}}, 
        author={{DSFSI Research Team}}, year={{2024}}, 
        url={{https://huggingface.co/dsfsi/nso-en-m2m100-gov}}
    }}
    </p>
</div>
"""

doi = """
<div style='text-align: center;'>
    DOI: <a href="https://doi.org/10.1234/dsfsi.2024.001" target="_blank">10.1234/dsfsi.2024.001</a>
</div>
"""

# ---------------------------------------------------------------------------
# UI assembly.
# ---------------------------------------------------------------------------

# The translation widget. It is built on its own (outside any Blocks context)
# and rendered into the page below, so the logo and the extra information can
# be placed around it.
iface = gr.Interface(
    fn=translate,
    title="Northern Sotho to English Translation",
    description=description,
    article=article,
    inputs=gr.components.Textbox(
        lines=5,
        placeholder="Enter Northern Sotho text (maximum 5 lines)",
        label="Input",
    ),
    outputs="text",
)

# Page layout: every component must be created inside the Blocks context and
# *before* launch(), otherwise it never becomes part of the served page.
with gr.Blocks(title="Northern Sotho to English Translation") as demo:
    # Three equal columns with the logo in the middle one centres it; Gradio
    # has no `gr.Columns(n)` helper, layout is expressed with Row/Column.
    with gr.Row():
        with gr.Column(scale=1):
            pass
        with gr.Column(scale=1):
            # `gr.Image` takes no `alt` keyword; the label carries the
            # description and is hidden so only the picture is shown.
            gr.Image(
                "logo_transparent_small.png",
                elem_id="logo",
                label="DSFSI Logo",
                show_label=False,
                interactive=False,
            )
        with gr.Column(scale=1):
            pass

    iface.render()

    # The fragments are raw HTML, so they are shown with the HTML component
    # (Gradio has no lowercase `gr.markdown` and no `unsafe_allow_html` flag).
    gr.HTML(extra_info)
    gr.HTML(authors)
    gr.HTML(citation)
    gr.HTML(doi)

# Queueing is switched on through queue(); launch() no longer accepts the
# `enable_queue` keyword in current Gradio releases.
demo.queue()
demo.launch()