File size: 3,428 Bytes
3d4f13a
b5f7961
3d4f13a
4f63972
ef9b88b
3b68341
ef9b88b
965bd13
 
b02baad
3b68341
 
4f70f9f
b5f7961
 
 
 
3d4f13a
3363f16
bfcf2ec
 
 
 
 
3363f16
1335053
b02baad
 
 
1335053
b02baad
 
 
 
 
 
 
 
 
 
 
1335053
4f70f9f
b02baad
 
 
 
 
 
 
 
4f70f9f
b02baad
 
 
 
 
 
 
3746080
 
 
 
 
 
 
 
 
 
 
b02baad
 
 
 
3746080
b02baad
 
1335053
3363f16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import gradio as gr
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

# Hugging Face Hub checkpoint that supplies both the tokenizer and the weights.
model_name = "dsfsi/nso-en-m2m100-gov"

# Tokenizer: load it, dump its language-code -> token table to stdout
# (startup diagnostic), then pin the source language used for encoding.
# "ns" is presumably this checkpoint's code for Northern Sotho -- verify
# against the printed table.
tokenizer = M2M100Tokenizer.from_pretrained(model_name)
print(tokenizer.lang_code_to_token)
tokenizer.src_lang = "ns"

# Model: load the weights and record the English language id as the default
# forced first token in the model config.
model = M2M100ForConditionalGeneration.from_pretrained(model_name)
_english_token_id = tokenizer.get_lang_id("en")
model.config.forced_bos_token_id = _english_token_id

def translate(inp):
    """Translate the given text to English with the module-level M2M100 model.

    The source language is whatever ``tokenizer.src_lang`` is set to at
    module level; the target is forced to English on every call.

    Args:
        inp: Source text to translate. ``None``, an empty string, or a
            whitespace-only string is treated as "nothing to translate".

    Returns:
        The decoded translation with special tokens stripped, or an empty
        string when there is no input text.
    """
    # Skip the model entirely for blank input: generating from an empty
    # prompt only produces meaningless text and wastes a forward pass.
    if not inp or not inp.strip():
        return ""

    # truncation=True clips over-long input to the tokenizer's configured
    # maximum length rather than passing an arbitrarily long sequence on.
    inputs = tokenizer(inp, return_tensors="pt", truncation=True)
    translated_tokens = model.generate(
        **inputs,
        max_length=512,
        # Force the first generated token to be the English language tag.
        forced_bos_token_id=tokenizer.get_lang_id("en"),
    )
    # generate() returns a batch; only one sequence was submitted.
    return tokenizer.decode(translated_tokens[0], skip_special_tokens=True)


# --- Static page content -------------------------------------------------
# Each constant below is a raw HTML/text snippet that the UI section passes
# to gr.Markdown. They are runtime strings: edit the wording here, not there.

# Centered logo image shown at the top of the page.
logo = """
<div style='text-align: center;'>
    <img src='file/logo_transparent_small.png' alt='Logo' width='150'/>
</div>
"""

# Heading line plus a short paragraph describing what the space does.
description = """
<p style='text-align: center;'>
    Northern Sotho to English Translation
</p>
<p>
    This space provides a translation service from Northern Sotho to English using the M2M100 model, fine-tuned for low-resource languages. It supports researchers, linguists, and users working with Northern Sotho texts.
</p>
"""

# Centered row of external links: GitHub repo, feedback form, arXiv paper.
article = """
<div style='text-align: center;'>
    <a href='https://github.com/dsfsi/nso-en-m2m100-gov' target='_blank'>GitHub</a> |
    <a href='https://docs.google.com/forms/d/e/1FAIpQLSf7S36dyAUPx2egmXbFpnTBuzoRulhL5Elu-N1eoMhaO7v10w/viewform' target='_blank'>Feedback Form</a> |
    <a href='https://arxiv.org/abs/2303.03750' target='_blank'>Arxiv</a>
</div>
<br/>
"""

# "More information" section with background on the model.
extra_info = """
<div style='text-align: center;'>
    <h4>More information about the space</h4>
</div>
<p>
    This is a variant of the M2M100 model, fine-tuned on a multilingual dataset to support translation from Northern Sotho (Sepedi) to English. The model was trained with a focus on improving translation accuracy for low-resource languages.
</p>
"""

# Centered author credits line.
authors = """
<div style='text-align: center;'>
    Authors: Vukosi Marivate, Matimba Shingange, Richard Lastrucci, Isheanesu Joseph Dzingirai, Jenalea Rajab
</div>
"""

# BibTeX entry for the accompanying paper (plain text, not HTML).
citation = """
@inproceedings{lastrucci-etal-2023-preparing,
    title = "Preparing the Vuk{'}uzenzele and {ZA}-gov-multilingual {S}outh {A}frican multilingual corpora",
    author = "Richard Lastrucci and Isheanesu Dzingirai and Jenalea Rajab and Andani Madodonga and Matimba Shingange and Daniel Njini and Vukosi Marivate",
    booktitle = "Proceedings of the Fourth workshop on Resources for African Indigenous Languages (RAIL 2023)",
    month = may,
    year = "2023",
    address = "Dubrovnik, Croatia",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2023.rail-1.3",
    pages = "18--25"
}
"""

# Centered DOI link for the paper.
doi = """
<div style='text-align: center;'>
    DOI: <a href="https://doi.org/10.48550/arXiv.2303.03750" target="_blank">10.48550/arXiv.2303.03750</a>
</div>
"""

# Page layout, top to bottom: logo, description, link row, the translation
# form (input box -> translate() -> output box), then background info,
# authors, citation and DOI.
with gr.Blocks() as demo:
    gr.Markdown(logo)
    gr.Markdown(description)
    gr.Markdown(article)

    textbox = gr.Textbox(lines=5, placeholder="Enter Northern Sotho text (maximum 5 lines)", label="Input")
    output_text = gr.Textbox(label="Translation")

    # The Interface wires the two textboxes to translate() and supplies the
    # submit/clear controls.
    gr.Interface(
        fn=translate,
        inputs=textbox,
        outputs=output_text,
        title="Northern Sotho to English Translation",
    )

    gr.Markdown(extra_info)
    gr.Markdown(authors)
    # The citation is raw BibTeX; rendered as ordinary Markdown its line
    # breaks collapse into one paragraph. A fenced code block keeps the
    # entry verbatim and copy-pasteable.
    gr.Markdown(f"```bibtex\n{citation.strip()}\n```")
    gr.Markdown(doi)

# launch(enable_queue=True) is deprecated in Gradio 3 and rejected by
# Gradio 4; enabling the queue through queue() is the supported form.
demo.queue()
demo.launch()