Spaces:
Sleeping
Sleeping
File size: 3,607 Bytes
3d4f13a b5f7961 3d4f13a 4f63972 ef9b88b 3b68341 ef9b88b 965bd13 b02baad 3b68341 4f70f9f b5f7961 3d4f13a 3363f16 bfcf2ec 3363f16 1335053 b02baad 1335053 b02baad 1335053 4f70f9f b02baad 692156b b02baad 692156b 3746080 692156b 3746080 692156b b02baad 3746080 b02baad 1335053 3363f16 692156b 3363f16 692156b 5f5a773 3363f16 5f5a773 692156b 3363f16 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 |
import gradio as gr
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
# Hub identifier of the checkpoint used for both the tokenizer and the model.
model_name = "dsfsi/nso-en-m2m100-gov"

# Tokenizer: "ns" is configured as the source-language code for every request.
tokenizer = M2M100Tokenizer.from_pretrained(model_name)
tokenizer.src_lang = "ns"

model = M2M100ForConditionalGeneration.from_pretrained(model_name)

# Startup diagnostic: dump the tokenizer's language-code -> token table.
print(tokenizer.lang_code_to_token)

# Make generation start with the English language token by default.
model.config.forced_bos_token_id = tokenizer.get_lang_id("en")
def translate(inp: str) -> str:
    """Translate text into English with the module-level M2M100 model.

    Args:
        inp: Source text to translate. It is tokenized with the module-level
            ``tokenizer`` using whatever source language that tokenizer is
            configured for.

    Returns:
        The decoded translation with special tokens removed, or an empty
        string when ``inp`` is empty or contains only whitespace.
    """
    # Nothing to translate: skip the model call, which would otherwise
    # (presumably) generate text from the special tokens alone.
    if not inp or not inp.strip():
        return ""

    inputs = tokenizer(inp, return_tensors="pt")
    translated_tokens = model.generate(
        **inputs,
        max_length=512,
        # Force the English language token as the first generated token.
        forced_bos_token_id=tokenizer.get_lang_id("en"),
    )
    # generate() returns a batch; a single input yields a single sequence.
    return tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
# Static HTML fragments used to build the page. Their text is emitted
# verbatim at runtime, so edit with care.

# Centered logo image, 150px wide.
logo = """
<div style='text-align: center;'>
<img src='file/logo_transparent_small.png' alt='Logo' width='150'/>
</div>
"""
# Centered heading line followed by a short paragraph describing the space.
description = """
<p style='text-align: center;'>
Northern Sotho to English Translation
</p>
<p>
This space provides a translation service from Northern Sotho to English using the M2M100 model, fine-tuned for low-resource languages. It supports researchers, linguists, and users working with Northern Sotho texts.
</p>
"""
# Centered row of external links (GitHub, feedback form, arXiv); each opens
# in a new tab.
article = """
<div style='text-align: center;'>
<a href='https://github.com/dsfsi/nso-en-m2m100-gov' target='_blank'>GitHub</a> |
<a href='https://docs.google.com/forms/d/e/1FAIpQLSf7S36dyAUPx2egmXbFpnTBuzoRulhL5Elu-N1eoMhaO7v10w/viewform' target='_blank'>Feedback Form</a> |
<a href='https://arxiv.org/abs/2303.03750' target='_blank'>Arxiv</a>
</div>
<br/>
"""
# Centered author credits.
authors = """
<div style='text-align: center;'>
Authors: Vukosi Marivate, Matimba Shingange, Richard Lastrucci,
Isheanesu Joseph Dzingirai, Jenalea Rajab
</div>
"""
# BibTeX entry wrapped in <pre>; "white-space: pre-wrap" keeps the line
# breaks while still allowing long lines to wrap.
citation = """
<pre style="text-align: left; white-space: pre-wrap;">
@inproceedings{lastrucci-etal-2023-preparing,
title = "Preparing the Vuk{'}uzenzele and {ZA}-gov-multilingual {S}outh {A}frican multilingual corpora",
author = "Richard Lastrucci and Isheanesu Dzingirai and Jenalea Rajab
and Andani Madodonga and Matimba Shingange and Daniel Njini and Vukosi Marivate",
booktitle = "Proceedings of the Fourth workshop on Resources for African Indigenous Languages (RAIL 2023)",
month = may,
year = "2023",
address = "Dubrovnik, Croatia",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.rail-1.3",
pages = "18--25"
}
</pre>
"""
# Centered DOI link; opens in a new tab.
doi = """
<div style='text-align: center;'>
DOI: <a href="https://doi.org/10.48550/arXiv.2303.03750" target="_blank">10.48550/arXiv.2303.03750</a>
</div>
"""
# Page layout: header text, the translation widget, then a collapsible
# "More Information" section with background, authors and citation details.
with gr.Blocks(title="Northern Sotho to English Translation") as demo:
    gr.Markdown(logo)
    gr.Markdown(description)
    gr.Markdown(article)

    # Constructing the Interface inside the Blocks context embeds it in the
    # page. It must NOT be launched here: launch() blocks, so the accordion
    # below would never be built and only the bare Interface would be served.
    # The description/article are already rendered as Markdown above, so they
    # are not passed again (that would show the same text twice).
    gr.Interface(
        fn=translate,
        inputs=gr.components.Textbox(
            lines=5,
            placeholder="Enter Northern Sotho text (maximum 5 lines)",
            label="Input",
        ),
        outputs="text",
    )

    with gr.Accordion("More Information", open=False):
        gr.Markdown("""
<h4 style="text-align: center;">More information about the space</h4>
<p>This is a variant of the M2M100 model, fine-tuned on a multilingual dataset
to support translation from Northern Sotho (Sepedi) to English. The model was trained
with a focus on improving translation accuracy for low-resource languages.</p>
""")
        gr.Markdown(authors)
        gr.Markdown(citation)
        gr.Markdown(doi)

# queue() replaces the deprecated launch(enable_queue=True) argument.
demo.queue()
demo.launch()
|