Spaces:
Sleeping
Sleeping
import gradio as gr | |
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer | |
# Hugging Face Hub identifier of the checkpoint used by this app.
model_name = "dsfsi/nso-en-m2m100-gov"

# Tokenizer and model are both loaded from the same checkpoint.
tokenizer = M2M100Tokenizer.from_pretrained(model_name)
model = M2M100ForConditionalGeneration.from_pretrained(model_name)

# Startup diagnostic: print the tokenizer's language-code -> token mapping.
print(tokenizer.lang_code_to_token)

# Target side: store the id of the "en" language token on the model config as
# the forced first generated token.
_english_lang_id = tokenizer.get_lang_id("en")
model.config.forced_bos_token_id = _english_lang_id

# Source side: inputs are tokenized with the "ns" language code.
tokenizer.src_lang = "ns"
def translate(inp: str) -> str:
    """Translate text to English with the module-level M2M100 model.

    Args:
        inp: Text to translate. ``None``, empty, or whitespace-only input
            yields an empty string without invoking the model.

    Returns:
        The decoded translation with special tokens stripped.
    """
    # Nothing to translate: skip tokenization and generation entirely so the
    # model is not asked to produce output from an empty prompt.
    if inp is None or not inp.strip():
        return ""

    # truncation=True caps over-long input at the tokenizer's configured
    # maximum length instead of feeding an unbounded sequence to the model.
    inputs = tokenizer(inp, return_tensors="pt", truncation=True)
    translated_tokens = model.generate(
        **inputs,
        max_length=512,
        # Force the first generated token to be the "en" language token.
        forced_bos_token_id=tokenizer.get_lang_id("en"),
    )
    # generate() returns a batch; only one input was supplied, so take row 0.
    return tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
# Static HTML/Markdown fragments rendered by the page layout further down.
# They are runtime strings: edit their contents only when the displayed text
# itself should change.

# Centered logo image, referenced by a path relative to the app's files.
logo = """
<div style='text-align: center;'>
<img src='file/logo_transparent_small.png' alt='Logo' width='150'/>
</div>
"""

# Centered heading line followed by a one-paragraph summary of the service.
description = """
<p style='text-align: center;'>
Northern Sotho to English Translation
</p>
<p>
This space provides a translation service from Northern Sotho to English using the M2M100 model, fine-tuned for low-resource languages. It supports researchers, linguists, and users working with Northern Sotho texts.
</p>
"""

# Centered row of external links: GitHub repository, feedback form, arXiv.
article = """
<div style='text-align: center;'>
<a href='https://github.com/dsfsi/nso-en-m2m100-gov' target='_blank'>GitHub</a>
<a href='https://docs.google.com/forms/d/e/1FAIpQLSf7S36dyAUPx2egmXbFpnTBuzoRulhL5Elu-N1eoMhaO7v10w/viewform' target='_blank'>Feedback Form</a>
<a href='https://arxiv.org/abs/2303.03750' target='_blank'>Arxiv</a>
</div>
<br/>
"""

# Centered author credits.
authors = """
<div style='text-align: center;'>
Authors: Vukosi Marivate, Matimba Shingange, Richard Lastrucci,
Isheanesu Joseph Dzingirai, Jenalea Rajab
</div>
"""

# BibTeX entry wrapped in <pre> so its line breaks are kept when displayed.
citation = """
<pre style="text-align: left; white-space: pre-wrap;">
@inproceedings{lastrucci-etal-2023-preparing,
title = "Preparing the Vuk{'}uzenzele and {ZA}-gov-multilingual {S}outh {A}frican multilingual corpora",
author = "Richard Lastrucci and Isheanesu Dzingirai and Jenalea Rajab
and Andani Madodonga and Matimba Shingange and Daniel Njini and Vukosi Marivate",
booktitle = "Proceedings of the Fourth workshop on Resources for African Indigenous Languages (RAIL 2023)",
month = may,
year = "2023",
address = "Dubrovnik, Croatia",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.rail-1.3",
pages = "18--25"
}
</pre>
"""

# Centered DOI link for the arXiv record.
doi = """
<div style='text-align: center;'>
DOI: <a href="https://doi.org/10.48550/arXiv.2303.03750" target="_blank">10.48550/arXiv.2303.03750</a>
</div>
"""
# Build the page (header, translation widget, collapsible details) and start
# the app exactly once.
with gr.Blocks() as demo:
    gr.Markdown(logo)
    gr.Markdown(description)
    gr.Markdown(article)

    # Creating an Interface inside the Blocks context places the translation
    # widget at this point of the layout. It must NOT be launched here: a
    # nested launch() starts a separate server before the rest of the page
    # (the accordion below) has been defined.
    # `description` and `article` are already rendered above, so they are not
    # passed again; doing so would display them twice.
    gr.Interface(
        fn=translate,
        title="Northern Sotho to English Translation",
        inputs=gr.Textbox(
            lines=5,
            placeholder="Enter Northern Sotho text (maximum 5 lines)",
            label="Input",
        ),
        outputs="text",
    )

    with gr.Accordion("More Information", open=False):
        gr.Markdown("""
<h4 style="text-align: center;">More information about the space</h4>
<p>This is a variant of the M2M100 model, fine-tuned on a multilingual dataset
to support translation from Northern Sotho (Sepedi) to English. The model was trained
with a focus on improving translation accuracy for low-resource languages.</p>
""")
        gr.Markdown(authors)
        gr.Markdown(citation)
        gr.Markdown(doi)

# The `enable_queue` keyword of launch() is deprecated in Gradio 3.x and
# rejected by Gradio 4.x; request queuing is enabled explicitly instead.
demo.queue()
demo.launch()