| @comment{Meta AI blog post announcing the NLLB-200 model. The author is corporate, hence the extra braces. NOTE(review): author, year, month and the address in howpublished were added from the public blog page -- confirm against the Zotero item.} | |
| @misc{200LanguagesSingle, | |
| title = {200 Languages within a Single {{AI}} Model: {{A}} Breakthrough in High-Quality Machine Translation}, | |
| shorttitle = {200 Languages within a Single {{AI}} Model}, | |
| author = {{Meta AI}}, | |
| year = {2022}, | |
| month = jul, | |
| urldate = {2024-11-02}, | |
| abstract = {Meta AI has built a single AI model, NLLB-200, that is the first to translate across 200 different languages with state-of-the-art quality that has been validated through extensive evaluations for each of them.}, | |
| howpublished = {https://ai.meta.com/blog/nllb-200-high-quality-machine-translation/}, | |
| langid = {english}, | |
| keywords = {dataset,model,n=200}, | |
| file = {/Users/david/Zotero/storage/AU759RXC/nllb-200-high-quality-machine-translation.html} | |
| } | |
| @misc{adelaniSIB200SimpleInclusive2024, | |
| author = {Adelani, David Ifeoluwa and Liu, Hannah and Shen, Xiaoyu and Vassilyev, Nikita and Alabi, Jesujoba O. and Mao, Yanke and Gao, Haonan and Lee, Annie En-Shiun}, | |
| title = {{{SIB-200}}: {{A Simple}}, {{Inclusive}}, and {{Big Evaluation Dataset}} for {{Topic Classification}} in 200+ {{Languages}} and {{Dialects}}}, | |
| shorttitle = {{{SIB-200}}}, | |
| year = {2024}, | |
| month = mar, | |
| publisher = {arXiv}, | |
| number = {arXiv:2309.07445}, | |
| eprint = {2309.07445}, | |
| archiveprefix = {arXiv}, | |
| doi = {10.48550/arXiv.2309.07445}, | |
| urldate = {2024-11-02}, | |
| keywords = {dataset,n=200}, | |
| abstract = {Despite the progress we have recorded in the last few years in multilingual natural language processing, evaluation is typically limited to a small set of languages with available datasets which excludes a large number of low-resource languages. In this paper, we created SIB-200 -- a large-scale open-sourced benchmark dataset for topic classification in 200 languages and dialects to address the lack of evaluation dataset for Natural Language Understanding (NLU). For many of the languages covered in SIB-200, this is the first publicly available evaluation dataset for NLU. The dataset is based on Flores-200 machine translation corpus. We annotated the English portion of the dataset and extended the sentence-level annotation to the remaining 203 languages covered in the corpus. Despite the simplicity of this task, our evaluation in full-supervised setting, cross-lingual transfer setting and prompting of large language model setting show that there is still a large gap between the performance of high-resource and low-resource languages when multilingual evaluation is scaled to numerous world languages. We found that languages unseen during the pre-training of multilingual language models, under-represented language families (like Nilotic and Altantic-Congo), and languages from the regions of Africa, Americas, Oceania and South East Asia, often have the lowest performance on our topic classification dataset. We hope our dataset will encourage a more inclusive evaluation of multilingual language models on a more diverse set of languages. https://github.com/dadelani/sib-200}, | |
| file = {/Users/david/Zotero/storage/UFRJDZRG/Adelani et al. - 2024 - SIB-200 A Simple, Inclusive, and Big Evaluation Dataset for Topic Classification in 200+ Languages.pdf;/Users/david/Zotero/storage/T49BTFIH/2309.html}, | |
| } | |
| @misc{AfricaNLPCollection, | |
| keywords = {dataset-collection}, | |
| title = {{{AfricaNLP Collection}}}, | |
| } | |
| @misc{ahujaMEGAVERSEBenchmarkingLarge2024, | |
| author = {Ahuja, Sanchit and Aggarwal, Divyanshu and Gumma, Varun and Watts, Ishaan and Sathe, Ashutosh and Ochieng, Millicent and Hada, Rishav and Jain, Prachi and Axmed, Maxamed and Bali, Kalika and Sitaram, Sunayana}, | |
| title = {{{MEGAVERSE}}: {{Benchmarking Large Language Models Across Languages}}, {{Modalities}}, {{Models}} and {{Tasks}}}, | |
| shorttitle = {{{MEGAVERSE}}}, | |
| year = {2024}, | |
| month = apr, | |
| publisher = {arXiv}, | |
| number = {arXiv:2311.07463}, | |
| eprint = {2311.07463}, | |
| archiveprefix = {arXiv}, | |
| doi = {10.48550/arXiv.2311.07463}, | |
| urldate = {2024-11-02}, | |
| keywords = {dataset,evaluation,n=83}, | |
| abstract = {There has been a surge in LLM evaluation research to understand LLM capabilities and limitations. However, much of this research has been confined to English, leaving LLM building and evaluation for non-English languages relatively unexplored. Several new LLMs have been introduced recently, necessitating their evaluation on non-English languages. This study aims to perform a thorough evaluation of the non-English capabilities of SoTA LLMs (GPT-3.5-Turbo, GPT-4, PaLM2, Gemini-Pro, Mistral, Llama2, and Gemma) by comparing them on the same set of multilingual datasets. Our benchmark comprises 22 datasets covering 83 languages, including low-resource African languages. We also include two multimodal datasets in the benchmark and compare the performance of LLaVA models, GPT-4-Vision and Gemini-Pro-Vision. Our experiments show that larger models such as GPT-4, Gemini-Pro and PaLM2 outperform smaller models on various tasks, notably on low-resource languages, with GPT-4 outperforming PaLM2 and Gemini-Pro on more datasets. We also perform a study on data contamination and find that several models are likely to be contaminated with multilingual evaluation benchmarks, necessitating approaches to detect and handle contamination while assessing the multilingual performance of LLMs.}, | |
| file = {/Users/david/Zotero/storage/Q8A3WGUG/Ahuja et al. - 2024 - MEGAVERSE Benchmarking Large Language Models Across Languages, Modalities, Models and Tasks.pdf;/Users/david/Zotero/storage/ZHA8FR3E/2311.html}, | |
| } | |
| @misc{bapnaBuildingMachineTranslation2022, | |
| author = {Bapna, Ankur and Caswell, Isaac and Kreutzer, Julia and Firat, Orhan and van Esch, Daan and Siddhant, Aditya and Niu, Mengmeng and Baljekar, Pallavi and Garcia, Xavier and Macherey, Wolfgang and Breiner, Theresa and Axelrod, Vera and Riesa, Jason and Cao, Yuan and Chen, Mia Xu and Macherey, Klaus and Krikun, Maxim and Wang, Pidong and Gutkin, Alexander and Shah, Apurva and Huang, Yanping and Chen, Zhifeng and Wu, Yonghui and Hughes, Macduff}, | |
| title = {Building {{Machine Translation Systems}} for the {{Next Thousand Languages}}}, | |
| year = {2022}, | |
| month = jul, | |
| publisher = {arXiv}, | |
| number = {arXiv:2205.03983}, | |
| eprint = {2205.03983}, | |
| archiveprefix = {arXiv}, | |
| doi = {10.48550/arXiv.2205.03983}, | |
| urldate = {2024-11-02}, | |
| keywords = {dataset,model,n=1500}, | |
| abstract = {In this paper we share findings from our effort to build practical machine translation (MT) systems capable of translating across over one thousand languages. We describe results in three research domains: (i) Building clean, web-mined datasets for 1500+ languages by leveraging semi-supervised pre-training for language identification and developing data-driven filtering techniques; (ii) Developing practical MT models for under-served languages by leveraging massively multilingual models trained with supervised parallel data for over 100 high-resource languages and monolingual datasets for an additional 1000+ languages; and (iii) Studying the limitations of evaluation metrics for these languages and conducting qualitative analysis of the outputs from our MT models, highlighting several frequent error modes of these types of models. We hope that our work provides useful insights to practitioners working towards building MT systems for currently understudied languages, and highlights research directions that can complement the weaknesses of massively multilingual models in data-sparse settings.}, | |
| file = {/Users/david/Zotero/storage/YCW6FWWE/Bapna et al. - 2022 - Building Machine Translation Systems for the Next Thousand Languages.pdf;/Users/david/Zotero/storage/EL7PA6YJ/2205.html}, | |
| } | |
| @comment{Nature article describing the NLLB-200 translation model. The abstract is stored without the journal's superscript reference markers, which a plain-text export fuses onto the preceding word.} | |
| @article{costa-jussaScalingNeuralMachine2024, | |
| title = {Scaling Neural Machine Translation to 200 Languages}, | |
| author = {{Costa-juss{\`a}}, Marta R. and Cross, James and {\c C}elebi, Onur and Elbayad, Maha and Heafield, Kenneth and Heffernan, Kevin and Kalbassi, Elahe and Lam, Janice and Licht, Daniel and Maillard, Jean and Sun, Anna and Wang, Skyler and Wenzek, Guillaume and Youngblood, Al and Akula, Bapi and Barrault, Loic and Gonzalez, Gabriel Mejia and Hansanti, Prangthip and Hoffman, John and Jarrett, Semarley and Sadagopan, Kaushik Ram and Rowe, Dirk and Spruit, Shannon and Tran, Chau and Andrews, Pierre and Ayan, Necip Fazil and Bhosale, Shruti and Edunov, Sergey and Fan, Angela and Gao, Cynthia and Goswami, Vedanuj and Guzm{\'a}n, Francisco and Koehn, Philipp and Mourachko, Alexandre and Ropers, Christophe and Saleem, Safiyyah and Schwenk, Holger and Wang, Jeff and {NLLB Team}}, | |
| year = {2024}, | |
| month = jun, | |
| journal = {Nature}, | |
| volume = {630}, | |
| number = {8018}, | |
| pages = {841--846}, | |
| publisher = {Nature Publishing Group}, | |
| issn = {1476-4687}, | |
| doi = {10.1038/s41586-024-07335-x}, | |
| urldate = {2024-11-02}, | |
| abstract = {The development of neural techniques has opened up new avenues for research in machine translation. Today, neural machine translation (NMT) systems can leverage highly multilingual capacities and even perform zero-shot translation, delivering promising results in terms of language coverage and quality. However, scaling quality NMT requires large volumes of parallel bilingual data, which are not equally available for the 7,000+ languages in the world. Focusing on improving the translation qualities of a relatively small group of high-resource languages comes at the expense of directing research attention to low-resource languages, exacerbating digital inequities in the long run. To break this pattern, here we introduce No Language Left Behind---a single massively multilingual model that leverages transfer learning across languages. We developed a conditional computational model based on the Sparsely Gated Mixture of Experts architecture, which we trained on data obtained with new mining techniques tailored for low-resource languages. Furthermore, we devised multiple architectural and training improvements to counteract overfitting while training on thousands of tasks. We evaluated the performance of our model over 40,000 translation directions using tools created specifically for this purpose---an automatic benchmark (FLORES-200), a human evaluation metric (XSTS) and a toxicity detector that covers every language in our model. Compared with the previous state-of-the-art models, our model achieves an average of 44\% improvement in translation quality as measured by BLEU. By demonstrating how to scale NMT to 200 languages and making all contributions in this effort freely available for non-commercial use, our work lays important groundwork for the development of a universal translation system.}, | |
| copyright = {2024 Meta}, | |
| langid = {english}, | |
| keywords = {dataset,model,n=200}, | |
| file = {/Users/david/Zotero/storage/R7AB5BR3/Costa-jussà et al. - 2024 - Scaling neural machine translation to 200 languages.pdf} | |
| } | |
| @book{eberhard2024ethnologue, | |
| editor = {Eberhard, David M. and Simons, Gary F. and Fennig, Charles D.}, | |
| title = {Ethnologue: {{Languages}} of the World}, | |
| edition = {27}, | |
| publisher = {SIL International}, | |
| address = {Dallas, Texas}, | |
| year = {2024}, | |
| } | |
| @inproceedings{federmannNTREX128NewsTest2022, | |
| author = {Federmann, Christian and Kocmi, Tom and Xin, Ying}, | |
| editor = {Ahuja, Kabir and Anastasopoulos, Antonios and Patra, Barun and Neubig, Graham and Choudhury, Monojit and Dandapat, Sandipan and Sitaram, Sunayana and Chaudhary, Vishrav}, | |
| title = {{{NTREX-128}} -- {{News Test References}} for {{MT Evaluation}} of 128 {{Languages}}}, | |
| booktitle = {Proceedings of the {{First Workshop}} on {{Scaling Up Multilingual Evaluation}}}, | |
| publisher = {Association for Computational Linguistics}, | |
| address = {Online}, | |
| year = {2022}, | |
| month = nov, | |
| pages = {21--24}, | |
| urldate = {2024-11-02}, | |
| keywords = {dataset,n=128}, | |
| file = {/Users/david/Zotero/storage/E286EDPU/Federmann et al. - 2022 - NTREX-128 – News Test References for MT Evaluation of 128 Languages.pdf}, | |
| } | |
| @misc{gurgurovLowREmRepositoryWord2024, | |
| author = {Gurgurov, Daniil and Kumar, Rishu and Ostermann, Simon}, | |
| title = {{{LowREm}}: {{A Repository}} of {{Word Embeddings}} for 87 {{Low-Resource Languages Enhanced}} with {{Multilingual Graph Knowledge}}}, | |
| shorttitle = {{{LowREm}}}, | |
| year = {2024}, | |
| month = sep, | |
| publisher = {arXiv}, | |
| number = {arXiv:2409.18193}, | |
| eprint = {2409.18193}, | |
| archiveprefix = {arXiv}, | |
| doi = {10.48550/arXiv.2409.18193}, | |
| urldate = {2024-11-02}, | |
| keywords = {model,n=87}, | |
| abstract = {Contextualized embeddings based on large language models (LLMs) are available for various languages, but their coverage is often limited for lower resourced languages. Training LLMs for such languages is often difficult due to insufficient data and high computational cost. Especially for very low resource languages, static word embeddings thus still offer a viable alternative. There is, however, a notable lack of comprehensive repositories with such embeddings for diverse languages. To address this, we present LowREm, a centralized repository of static embeddings for 87 low-resource languages. We also propose a novel method to enhance GloVe-based embeddings by integrating multilingual graph knowledge, utilizing another source of knowledge. We demonstrate the superior performance of our enhanced embeddings as compared to contextualized embeddings extracted from XLM-R on sentiment analysis. Our code and data are publicly available under https://huggingface.co/DFKI.}, | |
| file = {/Users/david/Zotero/storage/CGG3Y22P/Gurgurov et al. - 2024 - LowREm A Repository of Word Embeddings for 87 Low-Resource Languages Enhanced with Multilingual Gra.pdf;/Users/david/Zotero/storage/TJLLL6RT/2409.html}, | |
| } | |
| @misc{HPLTDatasetsV2, | |
| keywords = {dataset,n=193}, | |
| title = {{{HPLT Datasets}} V2}, | |
| file = {/Users/david/Zotero/storage/UFLXIHNC/v2.html}, | |
| urldate = {2024-11-02}, | |
| } | |
| @inproceedings{joshiStateFateLinguistic2020, | |
| author = {Joshi, Pratik and Santy, Sebastin and Budhiraja, Amar and Bali, Kalika and Choudhury, Monojit}, | |
| title = {The {{State}} and {{Fate}} of {{Linguistic Diversity}} and {{Inclusion}} in the {{NLP World}}}, | |
| booktitle = {Proceedings of the 58th {{Annual Meeting}} of the {{Association}} for {{Computational Linguistics}}}, | |
| publisher = {Association for Computational Linguistics}, | |
| address = {Online}, | |
| year = {2020}, | |
| pages = {6282--6293}, | |
| doi = {10.18653/v1/2020.acl-main.560}, | |
| urldate = {2024-11-02}, | |
| keywords = {evaluation,n=2500}, | |
| langid = {english}, | |
| file = {/Users/david/Zotero/storage/TDKP4GV9/Joshi et al. - 2020 - The State and Fate of Linguistic Diversity and Inclusion in the NLP World.pdf}, | |
| } | |
| @misc{LacunaLanguageDatasets, | |
| keywords = {dataset-collection}, | |
| langid = {american}, | |
| title = {Lacuna {{Language Datasets}}}, | |
| file = {/Users/david/Zotero/storage/SMRV9HE2/language.html}, | |
| urldate = {2024-11-02}, | |
| } | |
| @misc{Lanfrica, | |
| howpublished = {https://lanfrica.com/records}, | |
| title = {Lanfrica}, | |
| langid = {english}, | |
| urldate = {2024-11-02}, | |
| abstract = {Lanfrica catalogues, archives and links African language resources in order to mitigate the difficulty encountered in discovering African works.}, | |
| } | |
| @comment{ACL 2023 long paper on small professionally translated datasets for low-resource MT. The last author's surname carries the acute accent, matching how the same person is spelled in the NLLB entries of this file.} | |
| @inproceedings{maillardSmallDataBig2023, | |
| title = {Small {{Data}}, {{Big Impact}}: {{Leveraging Minimal Data}} for {{Effective Machine Translation}}}, | |
| shorttitle = {Small {{Data}}, {{Big Impact}}}, | |
| booktitle = {Proceedings of the 61st {{Annual Meeting}} of the {{Association}} for {{Computational Linguistics}} ({{Volume}} 1: {{Long Papers}})}, | |
| author = {Maillard, Jean and Gao, Cynthia and Kalbassi, Elahe and Sadagopan, Kaushik Ram and Goswami, Vedanuj and Koehn, Philipp and Fan, Angela and Guzm{\'a}n, Francisco}, | |
| editor = {Rogers, Anna and {Boyd-Graber}, Jordan and Okazaki, Naoaki}, | |
| year = {2023}, | |
| month = jul, | |
| pages = {2740--2756}, | |
| publisher = {Association for Computational Linguistics}, | |
| address = {Toronto, Canada}, | |
| doi = {10.18653/v1/2023.acl-long.154}, | |
| urldate = {2024-11-02}, | |
| abstract = {For many languages, machine translation progress is hindered by the lack of reliable training data. Models are trained on whatever pre-existing datasets may be available and then augmented with synthetic data, because it is often not economical to pay for the creation of large-scale datasets. But for the case of low-resource languages, would the creation of a few thousand professionally translated sentence pairs give any benefit? In this paper, we show that it does. We describe a broad data collection effort involving around 6k professionally translated sentence pairs for each of 39 low-resource languages, which we make publicly available. We analyse the gains of models trained on this small but high-quality data, showing that it has significant impact even when larger but lower quality pre-existing corpora are used, or when data is augmented with millions of sentences through backtranslation.}, | |
| keywords = {dataset,n=39}, | |
| file = {/Users/david/Zotero/storage/6BYYZ7V2/Maillard et al. - 2023 - Small Data, Big Impact Leveraging Minimal Data for Effective Machine Translation.pdf} | |
| } | |
| @inproceedings{nekotoParticipatoryResearchLowresourced2020, | |
| author = {Nekoto, Wilhelmina and Marivate, Vukosi and Matsila, Tshinondiwa and Fasubaa, Timi and Fagbohungbe, Taiwo and Akinola, Solomon Oluwole and Muhammad, Shamsuddeen and Kabongo Kabenamualu, Salomon and Osei, Salomey and Sackey, Freshia and Niyongabo, Rubungo Andre and Macharm, Ricky and Ogayo, Perez and Ahia, Orevaoghene and Berhe, Musie Meressa and Adeyemi, Mofetoluwa and {Mokgesi-Selinga}, Masabata and Okegbemi, Lawrence and Martinus, Laura and Tajudeen, Kolawole and Degila, Kevin and Ogueji, Kelechi and Siminyu, Kathleen and Kreutzer, Julia and Webster, Jason and Ali, Jamiil Toure and Abbott, Jade and Orife, Iroro and Ezeani, Ignatius and Dangana, Idris Abdulkadir and Kamper, Herman and Elsahar, Hady and Duru, Goodness and Kioko, Ghollah and Espoir, Murhabazi and {van Biljon}, Elan and Whitenack, Daniel and Onyefuluchi, Christopher and Emezue, Chris Chinenye and Dossou, Bonaventure F. P. and Sibanda, Blessing and Bassey, Blessing and Olabiyi, Ayodele and Ramkilowan, Arshath and {\"O}ktem, Alp and Akinfaderin, Adewale and Bashir, Abdallah}, | |
| editor = {Cohn, Trevor and He, Yulan and Liu, Yang}, | |
| title = {Participatory {{Research}} for {{Low-resourced Machine Translation}}: {{A Case Study}} in {{African Languages}}}, | |
| shorttitle = {Participatory {{Research}} for {{Low-resourced Machine Translation}}}, | |
| booktitle = {Findings of the {{Association}} for {{Computational Linguistics}}: {{EMNLP}} 2020}, | |
| publisher = {Association for Computational Linguistics}, | |
| address = {Online}, | |
| year = {2020}, | |
| month = nov, | |
| pages = {2144--2160}, | |
| doi = {10.18653/v1/2020.findings-emnlp.195}, | |
| urldate = {2024-11-02}, | |
| keywords = {dataset-collection,n=30}, | |
| abstract = {Research in NLP lacks geographic diversity, and the question of how NLP can be scaled to low-resourced languages has not yet been adequately solved. `Low-resourced'-ness is a complex problem going beyond data availability and reflects systemic problems in society. In this paper, we focus on the task of Machine Translation (MT), that plays a crucial role for information accessibility and communication worldwide. Despite immense improvements in MT over the past decade, MT is centered around a few high-resourced languages. As MT researchers cannot solve the problem of low-resourcedness alone, we propose participatory research as a means to involve all necessary agents required in the MT development process. We demonstrate the feasibility and scalability of participatory research with a case study on MT for African languages. Its implementation leads to a collection of novel translation datasets, MT benchmarks for over 30 languages, with human evaluations for a third of them, and enables participants without formal training to make a unique scientific contribution. Benchmarks, models, data, code, and evaluation results are released at https://github.com/masakhane-io/masakhane-mt.}, | |
| file = {/Users/david/Zotero/storage/JJ2S8CT3/Nekoto et al. - 2020 - Participatory Research for Low-resourced Machine Translation A Case Study in African Languages.pdf}, | |
| } | |
| @comment{Software repository holding the FLORES+ machine translation benchmark. NOTE(review): the url is derived from the owner/repository slug in the title and assumes the repository is hosted on GitHub -- confirm.} | |
| @misc{OpenlanguagedataFlores2024, | |
| title = {Openlanguagedata/Flores}, | |
| year = {2024}, | |
| month = nov, | |
| url = {https://github.com/openlanguagedata/flores}, | |
| urldate = {2024-11-02}, | |
| abstract = {The FLORES+ Machine Translation Benchmark}, | |
| copyright = {CC-BY-SA-4.0}, | |
| howpublished = {openlanguagedata}, | |
| keywords = {dataset,n=200} | |
| } | |
| @inproceedings{robinsonChatGPTMTCompetitive2023, | |
| author = {Robinson, Nathaniel and Ogayo, Perez and Mortensen, David R. and Neubig, Graham}, | |
| editor = {Koehn, Philipp and Haddow, Barry and Kocmi, Tom and Monz, Christof}, | |
| title = {{{ChatGPT MT}}: {{Competitive}} for {{High-}} (but {{Not Low-}}) {{Resource Languages}}}, | |
| shorttitle = {{{ChatGPT MT}}}, | |
| booktitle = {Proceedings of the {{Eighth Conference}} on {{Machine Translation}}}, | |
| publisher = {Association for Computational Linguistics}, | |
| address = {Singapore}, | |
| year = {2023}, | |
| month = dec, | |
| pages = {392--418}, | |
| doi = {10.18653/v1/2023.wmt-1.40}, | |
| urldate = {2024-11-02}, | |
| keywords = {evaluation,n=200}, | |
| abstract = {Large language models (LLMs) implicitly learn to perform a range of language tasks, including machine translation (MT). Previous studies explore aspects of LLMs' MT capabilities. However, there exist a wide variety of languages for which recent LLM MT performance has never before been evaluated. Without published experimental evidence on the matter, it is difficult for speakers of the world's diverse languages to know how and whether they can use LLMs for their languages. We present the first experimental evidence for an expansive set of 204 languages, along with MT cost analysis, using the FLORES-200 benchmark. Trends reveal that GPT models approach or exceed traditional MT model performance for some high-resource languages (HRLs) but consistently lag for low-resource languages (LRLs), under-performing traditional MT for 84.1\% of languages we covered. Our analysis reveals that a language's resource level is the most important feature in determining ChatGPT's relative ability to translate it, and suggests that ChatGPT is especially disadvantaged for LRLs and African languages.}, | |
| file = {/Users/david/Zotero/storage/BWFBTAZ9/Robinson et al. - 2023 - ChatGPT MT Competitive for High- (but Not Low-) Resource Languages.pdf}, | |
| } | |
| @comment{arXiv preprint describing the AI4D African Language Program. Author names use the Last, First form in normal capitalisation; Sackey, Ali and Adelani are spelled as they appear in the other entries of this file.} | |
| @misc{siminyuAI4DAfricanLanguage2021, | |
| title = {{{AI4D}} -- {{African Language Program}}}, | |
| author = {Siminyu, Kathleen and Kalipe, Godson and Orlic, Davor and Abbott, Jade and Marivate, Vukosi and Sackey, Freshia and Sibal, Prateek and Neupane, Bhanu and Adelani, David Ifeoluwa and Taylor, Amelia and Ali, Jamiil Toure and Degila, Kevin and Balogoun, Momboladji and Diop, Thierno Ibrahima and David, Davis and Fourati, Chayma and Haddad, Hatem and Naski, Malek}, | |
| year = {2021}, | |
| month = apr, | |
| number = {arXiv:2104.02516}, | |
| eprint = {2104.02516}, | |
| publisher = {arXiv}, | |
| doi = {10.48550/arXiv.2104.02516}, | |
| urldate = {2024-11-02}, | |
| abstract = {Advances in speech and language technologies enable tools such as voice-search, text-to-speech, speech recognition and machine translation. These are however only available for high resource languages like English, French or Chinese. Without foundational digital resources for African languages, which are considered low-resource in the digital context, these advanced tools remain out of reach. This work details the AI4D - African Language Program, a 3-part project that 1) incentivised the crowd-sourcing, collection and curation of language datasets through an online quantitative and qualitative challenge, 2) supported research fellows for a period of 3-4 months to create datasets annotated for NLP tasks, and 3) hosted competitive Machine Learning challenges on the basis of these datasets. Key outcomes of the work so far include 1) the creation of 9+ open source, African language datasets annotated for a variety of ML tasks, and 2) the creation of baseline models for these datasets through hosting of competitive ML challenges.}, | |
| archiveprefix = {arXiv}, | |
| keywords = {dataset-collection}, | |
| file = {/Users/david/Zotero/storage/VU6IFENR/Siminyu et al. - 2021 - AI4D -- African Language Program.pdf;/Users/david/Zotero/storage/7TV2PS8J/2104.html} | |
| } | |
| @misc{Tatoeba, | |
| file = {/Users/david/Zotero/storage/4NDTCGWG/sentences_by_language.html}, | |
| title = {Tatoeba}, | |
| urldate = {2024-11-03}, | |
| } | |
| @comment{arXiv preprint of the No Language Left Behind paper. The NLLB Team is a corporate author and is braced as one indivisible name, the same form the Nature entry in this file uses.} | |
| @misc{teamNoLanguageLeft2022, | |
| title = {No {{Language Left Behind}}: {{Scaling Human-Centered Machine Translation}}}, | |
| shorttitle = {No {{Language Left Behind}}}, | |
| author = {{NLLB Team} and {Costa-juss{\`a}}, Marta R. and Cross, James and {\c C}elebi, Onur and Elbayad, Maha and Heafield, Kenneth and Heffernan, Kevin and Kalbassi, Elahe and Lam, Janice and Licht, Daniel and Maillard, Jean and Sun, Anna and Wang, Skyler and Wenzek, Guillaume and Youngblood, Al and Akula, Bapi and Barrault, Loic and Gonzalez, Gabriel Mejia and Hansanti, Prangthip and Hoffman, John and Jarrett, Semarley and Sadagopan, Kaushik Ram and Rowe, Dirk and Spruit, Shannon and Tran, Chau and Andrews, Pierre and Ayan, Necip Fazil and Bhosale, Shruti and Edunov, Sergey and Fan, Angela and Gao, Cynthia and Goswami, Vedanuj and Guzm{\'a}n, Francisco and Koehn, Philipp and Mourachko, Alexandre and Ropers, Christophe and Saleem, Safiyyah and Schwenk, Holger and Wang, Jeff}, | |
| year = {2022}, | |
| month = aug, | |
| number = {arXiv:2207.04672}, | |
| eprint = {2207.04672}, | |
| publisher = {arXiv}, | |
| doi = {10.48550/arXiv.2207.04672}, | |
| urldate = {2024-11-02}, | |
| abstract = {Driven by the goal of eradicating language barriers on a global scale, machine translation has solidified itself as a key focus of artificial intelligence research today. However, such efforts have coalesced around a small subset of languages, leaving behind the vast majority of mostly low-resource languages. What does it take to break the 200 language barrier while ensuring safe, high quality results, all while keeping ethical considerations in mind? In No Language Left Behind, we took on this challenge by first contextualizing the need for low-resource language translation support through exploratory interviews with native speakers. Then, we created datasets and models aimed at narrowing the performance gap between low and high-resource languages. More specifically, we developed a conditional compute model based on Sparsely Gated Mixture of Experts that is trained on data obtained with novel and effective data mining techniques tailored for low-resource languages. We propose multiple architectural and training improvements to counteract overfitting while training on thousands of tasks. Critically, we evaluated the performance of over 40,000 different translation directions using a human-translated benchmark, Flores-200, and combined human evaluation with a novel toxicity benchmark covering all languages in Flores-200 to assess translation safety. Our model achieves an improvement of 44\% BLEU relative to the previous state-of-the-art, laying important groundwork towards realizing a universal translation system. Finally, we open source all contributions described in this work, accessible at https://github.com/facebookresearch/fairseq/tree/nllb.}, | |
| archiveprefix = {arXiv}, | |
| keywords = {dataset,model,n=200}, | |
| file = {/Users/david/Zotero/storage/GHWEGFFS/Team et al. - 2022 - No Language Left Behind Scaling Human-Centered Machine Translation.pdf;/Users/david/Zotero/storage/SZK3CP9C/2207.html} | |
| } | |
