nitikdias commited on
Commit
74ee63f
·
verified ·
1 Parent(s): df298e7

Upload 114 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +7 -0
  2. IndicTrans2/.gitignore +148 -0
  3. IndicTrans2/LICENSE +21 -0
  4. IndicTrans2/README.md +528 -0
  5. IndicTrans2/apply_sentence_piece.sh +48 -0
  6. IndicTrans2/baseline_eval/azure_translate.py +183 -0
  7. IndicTrans2/baseline_eval/google_translate.py +129 -0
  8. IndicTrans2/baseline_eval/m2m100_inference.py +148 -0
  9. IndicTrans2/baseline_eval/mbart_inference.py +159 -0
  10. IndicTrans2/baseline_eval/nllb_moe_cpu_inference.py +157 -0
  11. IndicTrans2/compute_comet_score.sh +84 -0
  12. IndicTrans2/compute_metrics.sh +29 -0
  13. IndicTrans2/compute_metrics_significance.sh +66 -0
  14. IndicTrans2/eval.sh +54 -0
  15. IndicTrans2/eval_rev.sh +55 -0
  16. IndicTrans2/finetune.sh +54 -0
  17. IndicTrans2/huggingface_interface/.gitignore +1 -0
  18. IndicTrans2/huggingface_interface/IndicTransToolkit/.gitignore +4 -0
  19. IndicTrans2/huggingface_interface/IndicTransToolkit/CHANGELOG.md +16 -0
  20. IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit.egg-info/PKG-INFO +130 -0
  21. IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit.egg-info/SOURCES.txt +15 -0
  22. IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit.egg-info/dependency_links.txt +1 -0
  23. IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit.egg-info/not-zip-safe +1 -0
  24. IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit.egg-info/requires.txt +8 -0
  25. IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit.egg-info/top_level.txt +1 -0
  26. IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/__init__.py +9 -0
  27. IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/__pycache__/__init__.cpython-310.pyc +0 -0
  28. IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/__pycache__/__init__.cpython-313.pyc +0 -0
  29. IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/__pycache__/collator.cpython-310.pyc +0 -0
  30. IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/__pycache__/collator.cpython-313.pyc +0 -0
  31. IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/__pycache__/evaluator.cpython-310.pyc +0 -0
  32. IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/__pycache__/evaluator.cpython-313.pyc +0 -0
  33. IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/__pycache__/processor.cpython-310.pyc +0 -0
  34. IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/collator.py +74 -0
  35. IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/evaluator.py +151 -0
  36. IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/processor.c +0 -0
  37. IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/processor.cp310-win_amd64.pyd +3 -0
  38. IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/processor.cpython-310-x86_64-linux-gnu.so +3 -0
  39. IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/processor.pyx +503 -0
  40. IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/version.py +1 -0
  41. IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/version.txt +1 -0
  42. IndicTrans2/huggingface_interface/IndicTransToolkit/LICENSE +21 -0
  43. IndicTrans2/huggingface_interface/IndicTransToolkit/README.md +97 -0
  44. IndicTrans2/huggingface_interface/IndicTransToolkit/app.py +118 -0
  45. IndicTrans2/huggingface_interface/IndicTransToolkit/build/lib.linux-x86_64-cpython-310/IndicTransToolkit/fast_processor.cpython-310-x86_64-linux-gnu.so +3 -0
  46. IndicTrans2/huggingface_interface/IndicTransToolkit/build/lib.linux-x86_64-cpython-310/IndicTransToolkit/processor.cpython-310-x86_64-linux-gnu.so +3 -0
  47. IndicTrans2/huggingface_interface/IndicTransToolkit/build/temp.linux-x86_64-cpython-310/IndicTransToolkit/fast_processor.o +3 -0
  48. IndicTrans2/huggingface_interface/IndicTransToolkit/build/temp.linux-x86_64-cpython-310/IndicTransToolkit/processor.o +3 -0
  49. IndicTrans2/huggingface_interface/IndicTransToolkit/main.py +113 -0
  50. IndicTrans2/huggingface_interface/IndicTransToolkit/pyproject.toml +25 -0
.gitattributes CHANGED
@@ -33,3 +33,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ IndicTrans2/huggingface_interface/IndicTransToolkit/build/lib.linux-x86_64-cpython-310/IndicTransToolkit/fast_processor.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
37
+ IndicTrans2/huggingface_interface/IndicTransToolkit/build/lib.linux-x86_64-cpython-310/IndicTransToolkit/processor.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
38
+ IndicTrans2/huggingface_interface/IndicTransToolkit/build/temp.linux-x86_64-cpython-310/IndicTransToolkit/fast_processor.o filter=lfs diff=lfs merge=lfs -text
39
+ IndicTrans2/huggingface_interface/IndicTransToolkit/build/temp.linux-x86_64-cpython-310/IndicTransToolkit/processor.o filter=lfs diff=lfs merge=lfs -text
40
+ IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/processor.cp310-win_amd64.pyd filter=lfs diff=lfs merge=lfs -text
41
+ IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/processor.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
42
+ IndicTrans2/translation_guidelines.pdf filter=lfs diff=lfs merge=lfs -text
IndicTrans2/.gitignore ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ignore libs and data folder we use
2
+ indic_nlp_library
3
+ indic_nlp_resources
4
+ fairseq
5
+ devtest
6
+ checkpoints
7
+ eval_benchmarks
8
+
9
+ # Byte-compiled / optimized / DLL files
10
+ __pycache__/
11
+ *.py[cod]
12
+ *$py.class
13
+
14
+ # C extensions
15
+ *.so
16
+
17
+ # Distribution / packaging
18
+ .Python
19
+ build/
20
+ develop-eggs/
21
+ dist/
22
+ downloads/
23
+ eggs/
24
+ .eggs/
25
+ lib/
26
+ lib64/
27
+ parts/
28
+ sdist/
29
+ var/
30
+ wheels/
31
+ share/python-wheels/
32
+ *.egg-info/
33
+ .installed.cfg
34
+ *.egg
35
+ MANIFEST
36
+
37
+ # PyInstaller
38
+ # Usually these files are written by a python script from a template
39
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
40
+ *.manifest
41
+ *.spec
42
+
43
+ # Installer logs
44
+ pip-log.txt
45
+ pip-delete-this-directory.txt
46
+
47
+ # Unit test / coverage reports
48
+ htmlcov/
49
+ .tox/
50
+ .nox/
51
+ .coverage
52
+ .coverage.*
53
+ .cache
54
+ nosetests.xml
55
+ coverage.xml
56
+ *.cover
57
+ *.py,cover
58
+ .hypothesis/
59
+ .pytest_cache/
60
+ cover/
61
+
62
+ # Translations
63
+ *.mo
64
+ *.pot
65
+
66
+ # Django stuff:
67
+ *.log
68
+ local_settings.py
69
+ db.sqlite3
70
+ db.sqlite3-journal
71
+
72
+ # Flask stuff:
73
+ instance/
74
+ .webassets-cache
75
+
76
+ # Scrapy stuff:
77
+ .scrapy
78
+
79
+ # Sphinx documentation
80
+ docs/_build/
81
+
82
+ # PyBuilder
83
+ .pybuilder/
84
+ target/
85
+
86
+ # Jupyter Notebook
87
+ .ipynb_checkpoints
88
+
89
+ # IPython
90
+ profile_default/
91
+ ipython_config.py
92
+
93
+ # pyenv
94
+ # For a library or package, you might want to ignore these files since the code is
95
+ # intended to run in multiple environments; otherwise, check them in:
96
+ # .python-version
97
+
98
+ # pipenv
99
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
100
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
101
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
102
+ # install all needed dependencies.
103
+ #Pipfile.lock
104
+
105
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
106
+ __pypackages__/
107
+
108
+ # Celery stuff
109
+ celerybeat-schedule
110
+ celerybeat.pid
111
+
112
+ # SageMath parsed files
113
+ *.sage.py
114
+
115
+ # Environments
116
+ .env
117
+ .venv
118
+ env/
119
+ venv/
120
+ ENV/
121
+ env.bak/
122
+ venv.bak/
123
+
124
+ # Spyder project settings
125
+ .spyderproject
126
+ .spyproject
127
+
128
+ # Rope project settings
129
+ .ropeproject
130
+
131
+ # mkdocs documentation
132
+ /site
133
+
134
+ # mypy
135
+ .mypy_cache/
136
+ .dmypy.json
137
+ dmypy.json
138
+
139
+ # Pyre type checker
140
+ .pyre/
141
+
142
+ # pytype static type analyzer
143
+ .pytype/
144
+
145
+ # Cython debug symbols
146
+ cython_debug/
147
+
148
+ .DS_Store
IndicTrans2/LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) AI4Bharat.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE
IndicTrans2/README.md ADDED
@@ -0,0 +1,528 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # IndicTrans2
2
+
3
+ [📜 Paper](https://arxiv.org/abs/2305.16307) | [🌐 Website](https://ai4bharat.iitm.ac.in/indic-trans2) | [▶️ Demo](https://models.ai4bharat.org/#/nmt/v2) | [🤗 HF Interface](https://github.com/AI4Bharat/IndicTrans2/tree/main/huggingface_interface) | [![colab link](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AI4Bharat/IndicTrans2/blob/main/huggingface_interface/colab_inference.ipynb)
4
+
5
+ IndicTrans2 is the first open-source transformer-based multilingual NMT model that supports high-quality translations across all the 22 scheduled Indic languages — including multiple scripts for low-resource languages like Kashmiri, Manipuri and Sindhi. It adopts script unification wherever feasible to leverage transfer learning by lexical sharing between languages. Overall, the model supports five scripts: Perso-Arabic (Kashmiri, Sindhi, Urdu), Ol Chiki (Santali), Meitei (Manipuri), Latin (English), and Devanagari (used for all the remaining languages).
6
+
7
+ We open-source all our training datasets (BPCC), back-translation data (BPCC-BT), final IndicTrans2 models, evaluation benchmarks (IN22, which includes IN22-Gen and IN22-Conv) and training and inference scripts for easier use and adoption within the research community. We hope that this will foster even more research in low-resource Indic languages, leading to further improvements in the quality of low-resource translation through contributions from the research community.
8
+
9
+ This code repository contains instructions for downloading the artifacts associated with IndicTrans2, as well as the code for training/fine-tuning the multilingual NMT models.
10
+
11
+ Here is the list of languages supported by the IndicTrans2 models:
12
+
13
+ <table>
14
+ <tbody>
15
+ <tr>
16
+ <td>Assamese (asm_Beng)</td>
17
+ <td>Kashmiri (Arabic) (kas_Arab)</td>
18
+ <td>Punjabi (pan_Guru)</td>
19
+ </tr>
20
+ <tr>
21
+ <td>Bengali (ben_Beng)</td>
22
+ <td>Kashmiri (Devanagari) (kas_Deva)</td>
23
+ <td>Sanskrit (san_Deva)</td>
24
+ </tr>
25
+ <tr>
26
+ <td>Bodo (brx_Deva)</td>
27
+ <td>Maithili (mai_Deva)</td>
28
+ <td>Santali (sat_Olck)</td>
29
+ </tr>
30
+ <tr>
31
+ <td>Dogri (doi_Deva)</td>
32
+ <td>Malayalam (mal_Mlym)</td>
33
+ <td>Sindhi (Arabic) (snd_Arab)</td>
34
+ </tr>
35
+ <tr>
36
+ <td>English (eng_Latn)</td>
37
+ <td>Marathi (mar_Deva)</td>
38
+ <td>Sindhi (Devanagari) (snd_Deva)</td>
39
+ </tr>
40
+ <tr>
41
+ <td>Konkani (gom_Deva)</td>
42
+ <td>Manipuri (Bengali) (mni_Beng)</td>
43
+ <td>Tamil (tam_Taml)</td>
44
+ </tr>
45
+ <tr>
46
+ <td>Gujarati (guj_Gujr)</td>
47
+ <td>Manipuri (Meitei) (mni_Mtei)</td>
48
+ <td>Telugu (tel_Telu)</td>
49
+ </tr>
50
+ <tr>
51
+ <td>Hindi (hin_Deva)</td>
52
+ <td>Nepali (npi_Deva)</td>
53
+ <td>Urdu (urd_Arab)</td>
54
+ </tr>
55
+ <tr>
56
+ <td>Kannada (kan_Knda)</td>
57
+ <td>Odia (ory_Orya)</td>
58
+ <td></td>
59
+ </tr>
60
+ </tbody>
61
+ </table>
62
+
63
+ ## Updates
64
+ - 🚨 Jan 18, 2025 - Long Context Models- RoPE-based variants of IndicTrans2 models capable of handling sequence lengths **up to 2048 tokens** are available [here](https://huggingface.co/collections/prajdabre/indictrans2-rope-6742ddac669a05db0804db35).
65
+ - 🚨 Dec 20, 2024 - The latest releases of the high-quality human-annotated BPCC-Seed dataset would henceforth be made available on the [AI4Bharat Website](https://ai4bharat.iitm.ac.in/datasets/bpcc).
66
+ - 🚨 Dec 30, 2023 - Migrated IndicTrans2 tokenizer for HF compatible IndicTrans2 models to [IndicTransToolkit](https://github.com/VarunGumma/IndicTransToolkit) and will be maintained separately there from now onwards. Add LoRA fine-tuning scripts for our IndicTrans2 models in [huggingface_interface](https://github.com/AI4Bharat/IndicTrans2/tree/main/huggingface_interface).
67
+ - 🚨 Dec 1, 2023 - Release of Indic-Indic model and corresponding distilled variants for each base model. Please refer to the [Download section](https://github.com/AI4Bharat/IndicTrans2#multilingual-translation-models) for the checkpoints.
68
+ - 🚨 Sep 9, 2023 - Added HF compatible IndicTrans2 models. Please refer to the [README](https://github.com/AI4Bharat/IndicTrans2/tree/main/huggingface_interface) for detailed example usage.
69
+
70
+ ## Table of Contents
71
+
72
+ - [Download Models and Other Artifacts](#download-models-and-other-artifacts)
73
+ - [Multilingual Translation Models](#multilingual-translation-models)
74
+ - [Training Data](#training-data)
75
+ - [Evaluation Data](#evaluation-data)
76
+ - [Installation](#installation)
77
+ - [Data](#data)
78
+ - [Training](#training)
79
+ - [Evaluation](#evaluation)
80
+ - [Preparing Data for Training](#preparing-data-for-training)
81
+ - [Using our SPM model and Fairseq dictionary](#using-our-spm-model-and-fairseq-dictionary)
82
+ - [Training your own SPM models and learning Fairseq dictionary](#training-your-own-spm-models-and-learning-fairseq-dictionary)
83
+ - [Training / Fine-tuning](#training--fine-tuning)
84
+ - [Inference](#inference)
85
+ - [Fairseq Inference](#fairseq-inference)
86
+ - [CT2 Inference](#ct2-inference)
87
+ - [Evaluations](#evaluations)
88
+ - [Baseline Evaluation](#baseline-evaluation)
89
+ - [LICENSE](#license)
90
+ - [Citation](#citation)
91
+
92
+ ## Download Models and Other Artifacts
93
+
94
+ ### Multilingual Translation Models
95
+
96
+ | Model | En-Indic | Indic-En | Indic-Indic | Evaluations |
97
+ | ---------------------------- | ----------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
98
+ | Base (used for benchmarking) | [Fairseq](https://huggingface.co/datasets/ai4bharat/BPCC/resolve/main/additional/en-indic-preprint.tar.gz) & [HF](https://huggingface.co/ai4bharat/indictrans2-en-indic-1B) | [fairseq](https://huggingface.co/datasets/ai4bharat/BPCC/resolve/main/additional/indic-en-preprint.tar.gz) & [HF](https://huggingface.co/ai4bharat/indictrans2-indic-en-1B) | [HF](https://huggingface.co/ai4bharat/indictrans2-indic-indic-1B) | [translations](https://huggingface.co/datasets/ai4bharat/BPCC/resolve/main/additional/translation_outputs.tar.gz) (as of May 10, 2023), [metrics](https://drive.google.com/drive/folders/1lOOdaU0VdRSBgJEsNav5zC7wwLBis9NI?usp=sharing) |
99
+ | Distilled | [Fairseq](https://huggingface.co/datasets/ai4bharat/BPCC/resolve/main/additional/en-indic-dist.tar.gz) & [HF](https://huggingface.co/ai4bharat/indictrans2-en-indic-dist-200M) | [Fairseq](https://huggingface.co/datasets/ai4bharat/BPCC/resolve/main/additional/indic-en-dist.tar.gz) & [HF](https://huggingface.co/ai4bharat/indictrans2-indic-en-dist-200M) | [HF](https://huggingface.co/ai4bharat/indictrans2-indic-indic-dist-320M) |
100
+
101
+ ### Training Data
102
+
103
+ |Data | URL |
104
+ |-------------------------------------------|--------------------------------------------------------------------------------------------------|
105
+ | ✨ BPCC-Seed Latest Release | [HF Config: bpcc-seed-latest](https://huggingface.co/datasets/ai4bharat/BPCC) |
106
+ | BPCC (*Used in Paper - utilizes the BPCC-Seed V1 dataset*) | [HF Config: bpcc-seed-v1](https://huggingface.co/datasets/ai4bharat/BPCC) |
107
+ | Back-translation (BPCC-BT) | Will be updated |
108
+ | Full Data Split | [Download](https://huggingface.co/datasets/ai4bharat/BPCC/resolve/main/additional/BPCC.zip) |
109
+
110
+
111
+
112
+ ### Evaluation Data
113
+
114
+ | Data | URL |
115
+ | ----------------------- | ------------------------------------------------------------------------------------ |
116
+ | IN22 test set | [download](https://huggingface.co/datasets/ai4bharat/BPCC/resolve/main/additional/IN22_testset.zip) |
117
+ | FLORES-22 Indic dev set | [download](https://huggingface.co/datasets/ai4bharat/BPCC/resolve/main/additional/flores-22_dev.zip) |
118
+
119
+ ## Installation
120
+
121
+ Instructions to setup and install everything before running the code.
122
+
123
+ ```bash
124
+ # Clone the github repository and navigate to the project directory.
125
+ git clone https://github.com/AI4Bharat/IndicTrans2
126
+ cd IndicTrans2
127
+
128
+ # Install all the dependencies and requirements associated with the project.
129
+ source install.sh
130
+ ```
131
+
132
+ Note: We recommend creating a virtual environment with python>=3.7.
133
+
134
+ ### Additional notes about Installation
135
+ The ``prepare_data_joint_finetuning.sh`` and ``prepare_data_joint_training.sh`` scripts expect that the sentencepiece commandline utility and GNU parallel are installed.
136
+ 1. To install the sentencepiece command line utility, please follow the instructions [here](https://github.com/google/sentencepiece?tab=readme-ov-file#build-and-install-sentencepiece-command-line-tools-from-c-source).
137
+ 2. Please check if GNU parallel is installed, if not please install the same or alternatively in case of installation issues, remove ``parallel --pipe --keep-order`` from the respective training / finetuning script as well as ``apply_sentence_piece.sh``.
138
+
139
+
140
+ ## Data
141
+
142
+ ### Training
143
+
144
+ Bharat Parallel Corpus Collection (BPCC) is a comprehensive and publicly available parallel corpus that includes both existing and new data for all 22 scheduled Indic languages. It is comprised of two parts: BPCC-Mined and BPCC-Human, totaling approximately 230 million bitext pairs. BPCC-Mined contains about 228 million pairs, with nearly 126 million pairs newly added as a part of this work. On the other hand, BPCC-Human consists of 2.2 million gold standard English-Indic pairs, with an additional 644K bitext pairs from English Wikipedia sentences (forming the BPCC-H-Wiki subset) and 139K sentences covering everyday use cases (forming the BPCC-H-Daily subset). It is worth highlighting that BPCC provides the first available datasets for 7 languages and significantly increases the available data for all languages covered.
145
+
146
+ You can find the contribution from different sources in the following table:
147
+
148
+ <table>
149
+ <tbody>
150
+ <tr>
151
+ <td rowspan="4">BPCC-Mined</th>
152
+ <td rowspan="2">Existing</th>
153
+ <td>Samanantar</th>
154
+ <td>19.4M</th>
155
+ </tr>
156
+ <tr>
157
+ <td>NLLB</th>
158
+ <td>85M</th>
159
+ </tr>
160
+ <tr>
161
+ <td rowspan="2">Newly Added</th>
162
+ <td>Samanantar++</th>
163
+ <td>121.6M</th>
164
+ </tr>
165
+ <tr>
166
+ <td>Comparable</th>
167
+ <td>4.3M</th>
168
+ </tr>
169
+ <tr>
170
+ <td rowspan="5">BPCC-Human</td>
171
+ <td rowspan="3">Existing</td>
172
+ <td>NLLB</td>
173
+ <td>18.5K</td>
174
+ </tr>
175
+ <tr>
176
+ <td>ILCI</td>
177
+ <td>1.3M</td>
178
+ </tr>
179
+ <tr>
180
+ <td>Massive</td>
181
+ <td>115K</td>
182
+ </tr>
183
+ <tr>
184
+ <td rowspan="2">Newly Added</td>
185
+ <td>Wiki</td>
186
+ <td>644K</td>
187
+ </tr>
188
+ <tr>
189
+ <td>Daily</td>
190
+ <td>139K</td>
191
+ </tr>
192
+ </tbody>
193
+ </table>
194
+
195
+ Additionally, we provide augmented back-translation data generated by our intermediate IndicTrans2 models for training purposes. Please refer to our paper for more details on the selection of sample proportions and sources.
196
+
197
+ <table>
198
+ <tbody>
199
+ <tr>
200
+ <td>English BT data (English Original)</td>
201
+ <td>401.9M</td>
202
+ </tr>
203
+ <tr>
204
+ <td>Indic BT data (Indic Original)</td>
205
+ <td>400.9M</td>
206
+ </tr>
207
+ </tbody>
208
+ </table>
209
+
210
+ <br>
211
+
212
+ ### Evaluation
213
+
214
+ IN22 test set is a newly created comprehensive benchmark for evaluating machine translation performance in multi-domain, n-way parallel contexts across 22 Indic languages. It has been created from three distinct subsets, namely IN22-Wiki, IN22-Web and IN22-Conv. The Wikipedia and Web sources subsets offer diverse content spanning news, entertainment, culture, legal, and India-centric topics. IN22-Wiki and IN22-Web have been combined and considered for evaluation purposes and released as IN22-Gen. Meanwhile, IN22-Conv, the conversation-domain subset, is designed to assess translation quality in typical day-to-day conversational-style applications.
215
+
216
+ <table>
217
+ <tbody>
218
+ <tr>
219
+ <td>IN22-Gen (IN22-Wiki + IN22-Web)</td>
220
+ <td>1024 sentences</td>
221
+ <td>🤗 <a href="https://huggingface.co/datasets/ai4bharat/IN22-Gen">ai4bharat/IN22-Gen</td>
222
+ </tr>
223
+ <tr>
224
+ <td>IN22-Conv</td>
225
+ <td>1503 sentences</td>
226
+ <td>🤗 <a href="https://huggingface.co/datasets/ai4bharat/IN22-Conv">ai4bharat/IN22-Conv</td>
227
+ </tr>
228
+ </tbody>
229
+ </table>
230
+
231
+ You can download the data artifacts released as a part of this work from the [following section](#download-models-and-other-artifacts).
232
+
233
+ ## Preparing Data for Training
234
+
235
+ BPCC data is organized under different subsets as described above, where each subset contains language pair subdirectories with the sentence pairs. We also provide LaBSE and LASER scores for the mined subsets of BPCC. In order to replicate our training setup, you will need to combine the data for corresponding language pairs from different subsets and remove overlapping bitext pairs, if any.
236
+
237
+ Here is the expected directory structure of the data:
238
+
239
+ ```bash
240
+ BPCC
241
+ ├── eng_Latn-asm_Beng
242
+ │ ├── train.eng_Latn
243
+ │ └── train.asm_Beng
244
+ ├── eng_Latn-ben_Beng
245
+ └── ...
246
+ ```
247
+
248
+ While we provide deduplicated subsets with the current available benchmarks, we highly recommend performing deduplication using the combined monolingual side of all the benchmarks. You can use the following command for deduplication once you combine the monolingual side of all the benchmarks in the directory.
249
+
250
+ ```python3
251
+ python3 scripts/dedup_benchmark.py <in_data_dir> <out_data_dir> <benchmark_dir>
252
+ ```
253
+
254
+ - `<in_data_dir>`: path to the directory containing train data for each language pair in the format `{src_lang}-{tgt_lang}`
255
+ - `<out_data_dir>`: path to the directory where the deduplicated train data will be written for each language pair in the format `{src_lang}-{tgt_lang}`
256
+ - `<benchmark_dir>`: path to the directory containing the language-wise monolingual side of dev/test set, with monolingual files named as `test.{lang}`
257
+
258
+ ### Using our SPM model and Fairseq dictionary
259
+
260
+ Once you complete the deduplication of the training data with the available benchmarks, you can preprocess and binarize the data for training models. Please download our trained SPM model and learned Fairseq dictionary using the following links for your experiments.
261
+
262
+ | | En-Indic | Indic-En | Indic-Indic |
263
+ | ------------------ | -------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------- |
264
+ | SPM model | [download](https://huggingface.co/datasets/ai4bharat/BPCC/resolve/main/additional/en-indic-spm.zip) | [download](https://huggingface.co/datasets/ai4bharat/BPCC/resolve/main/additional/indic-en-spm.zip) | [download](https://huggingface.co/datasets/ai4bharat/BPCC/resolve/main/additional/indic-indic-spm.zip) |
265
+ | Fairseq dictionary | [download](https://huggingface.co/datasets/ai4bharat/BPCC/resolve/main/additional/en-indic-fairseq-dict.zip) | [download](https://huggingface.co/datasets/ai4bharat/BPCC/resolve/main/additional/indic-en-fairseq-dict.zip) | [download](https://huggingface.co/datasets/ai4bharat/BPCC/resolve/main/additional/indic-indic-fairseq-dict.zip) |
266
+
267
+ To prepare the data for training En-Indic model, please do the following:
268
+
269
+ 1. Download the SPM model in the experiment directory and rename it as `vocab`.
270
+ 2. Download the Fairseq dictionary in the experiment directory and rename it as `final_bin`.
271
+
272
+ Here is the expected directory for training En-Indic model:
273
+
274
+ ```bash
275
+ en-indic-exp
276
+ ├── train
277
+ │ ├── eng_Latn-asm_Beng
278
+ │ │ ├── train.eng_Latn
279
+ │ │ └── train.asm_Beng
280
+ │ ├── eng_Latn-ben_Beng
281
+ │ └── ...
282
+ ├── devtest
283
+ │ └── all
284
+ │ ├── eng_Latn-asm_Beng
285
+ │ │ ├── dev.eng_Latn
286
+ │ │ └── dev.asm_Beng
287
+ │ ├── eng_Latn-ben_Beng
288
+ │ └── ...
289
+ ├── vocab
290
+ │ ├── model.SRC
291
+ │ ├── model.TGT
292
+ │ ├── vocab.SRC
293
+ │ └── vocab.TGT
294
+ └── final_bin
295
+ ├── dict.SRC.txt
296
+ └── dict.TGT.txt
297
+ ```
298
+
299
+ To prepare data for training the Indic-En model, you should reverse the language pair directories within the train and devtest directories. Additionally, make sure to download the corresponding SPM model and Fairseq dictionary and put them in the experiment directory, similar to the procedure mentioned above for En-Indic model training.
300
+
301
+ You can binarize the data for model training using the following:
302
+
303
+ ```bash
304
+ bash prepare_data_joint_finetuning.sh <exp_dir>
305
+ ```
306
+
307
+ - `<exp_dir>`: path to the directory containing the raw data for binarization
308
+
309
+ You will need to follow the same steps for data preparation in case of fine-tuning models.
310
+
311
+ ### Training your own SPM models and learning Fairseq dictionary
312
+
313
+ If you want to train your own SPM model and learn Fairseq dictionary, then please do the following:
314
+
315
+ 1. Collect a balanced amount of English and Indic monolingual data (we use around 3 million sentences per language-script combination). If some languages have limited data available, increase their representation to achieve a fair distribution of tokens across languages.
316
+ 2. Perform script unification for Indic languages wherever possible using `scripts/preprocess_translate.py` and concatenate all Indic data into a single file.
317
+ 3. Train two SPM models, one for the English side and the other for the Indic side, using the following:
318
+
319
+ ```bash
320
+ spm_train --input=train.indic --model_prefix=<model_name> --vocab_size=<vocab_size> --character_coverage=1.0 --model_type=BPE
321
+ ```
322
+
323
+ 4. Copy the trained SPM models in the experiment directory mentioned earlier and learn the Fairseq dictionary using the following:
324
+
325
+ ```bash
326
+ bash prepare_data_joint_training.sh <exp_dir>
327
+ ```
328
+
329
+ 5. You will need to use the same Fairseq dictionary for any subsequent fine-tuning experiments and refer to the steps described above ([link](#using-our-spm-model-and-fairseq-dictionary)).
330
+
331
+ ## Training / Fine-tuning
332
+
333
+ After binarizing the data, you can use train.sh to train the models. We provide the default hyperparameters used in this work. You can modify the hyperparameters as per your requirement if needed. If you want to train the model on a customized architecture, then please define the architecture in `model_configs/custom_transformer.py`. You can start the model training with the following command:
334
+
335
+ ```bash
336
+ bash train.sh <exp_dir> <model_arch>
337
+ ```
338
+
339
+ - `<exp_dir>`: path to the directory containing the binarized data
340
+ - `<model_arch>`: custom transformer architecture used for model training
341
+
342
+ For fine-tuning, the initial steps remain the same. However, the `finetune.sh` script includes an additional argument, `pretrained_ckpt`, which specifies the model checkpoint to be loaded for further fine-tuning. You can perform fine-tuning using the following command:
343
+
344
+ ```bash
345
+ bash finetune.sh <exp_dir> <model_arch> <pretrained_ckpt>
346
+ ```
347
+
348
+ - `<exp_dir>`: path to the directory containing the binarized data
349
+ - `<model_arch>`: custom transformer architecture used for model training
350
+ - `transformer_18_18` - For IT2 Base models
351
+ - `transformer_base18L` - For IT2 Distilled models
352
+ - `<pretrained_ckpt>`: path to the fairseq model checkpoint to be loaded for further fine-tuning
353
+
354
+ You can download the model artifacts released as a part of this work from the [following section](#download-models-and-other-artifacts).
355
+
356
+ The pretrained checkpoints have 3 directories, a fairseq model directory and 2 CT2-ported model directories. Please note that the CT2 models are provided only for efficient inference. For fine-tuning purposes you should use the `fairseq_model`. Post that you can use the [fairseq-ct2-converter](https://opennmt.net/CTranslate2/guides/fairseq.html) to port your fine-tuned checkpoints to CT2 for faster inference.
357
+
358
+ ## Inference
359
+
360
+ ### Fairseq Inference
361
+
362
+ In order to run inference on our pretrained models using bash interface, please use the following:
363
+
364
+ ```bash
365
+ bash joint_translate.sh <infname> <outfname> <src_lang> <tgt_lang> <ckpt_dir>
366
+ ```
367
+
368
+ - `infname`: path to the input file containing sentences
369
+ - `outfname`: path to the output file where the translations should be stored
370
+ - `src_lang`: source language
371
+ - `tgt_lang`: target language
372
+ - `ckpt_dir`: path to the fairseq model checkpoint directory
373
+
374
+ If you want to run the inference using python interface then please execute the following block of code from the root directory:
375
+
376
+ ```python3
377
+ from inference.engine import Model
378
+
379
+ model = Model(ckpt_dir, model_type="fairseq")
380
+
381
+ sents = [sent1, sent2,...]
382
+
383
+ # for a batch of sentences
384
+ model.batch_translate(sents, src_lang, tgt_lang)
385
+
386
+ # for a paragraph
387
+ model.translate_paragraph(text, src_lang, tgt_lang)
388
+ ```
389
+
390
+ ### CT2 Inference
391
+
392
+ In order to run inference on CT2-ported model using python inference then please execute the following block of code from the root directory:
393
+
394
+ ```python3
395
+ from inference.engine import Model
396
+
397
+ model = Model(ckpt_dir, model_type="ctranslate2")
398
+
399
+ sents = [sent1, sent2,...]
400
+
401
+ # for a batch of sentences
402
+ model.batch_translate(sents, src_lang, tgt_lang)
403
+
404
+ # for a paragraph
405
+ model.translate_paragraph(text, src_lang, tgt_lang)
406
+ ```
407
+
408
+ ## Evaluations
409
+
410
+ We consider the chrF++ score as our primary metric. Additionally, we also report the BLEU and Comet scores.
411
+ We also perform statistical significance tests for each metric to ascertain whether the differences are statistically significant.
412
+
413
+ In order to run our evaluation scripts, you will need to organize the evaluation test sets into the following directory structure:
414
+
415
+ ```bash
416
+ eval_benchmarks
417
+ ├── flores
418
+ │ └── eng_Latn-asm_Beng
419
+ │ ├── test.eng_Latn
420
+ │ └── test.asm_Beng
421
+ ├── in22-gen
422
+ ├── in22-conv
423
+ ├── ntrex
424
+ └── ...
425
+ ```
426
+
427
+ To compute the BLEU and chrF++ scores for prediction file, you can use the following command:
428
+
429
+ ```bash
430
+ bash compute_metrics.sh <pred_fname> <ref_fname> <tgt_lang>
431
+ ```
432
+
433
+ - `pred_fname`: path to the model translations
434
+ - `ref_fname`: path to the reference translations
435
+ - `tgt_lang`: target language
436
+
437
+ In order to automate the inference over the individual test sets for En-Indic, you can use the following command:
438
+
439
+ ```bash
440
+ bash eval.sh <devtest_data_dir> <ckpt_dir> <system>
441
+ ```
442
+
443
+ - `<devtest_data_dir>`: path to the evaluation set with language pair subdirectories (for example, flores directory in the above tree structure)
444
+ - `<ckpt_dir>`: path to the fairseq model checkpoint directory
445
+ - `<system>`: system name suffix to store the predictions in the format `test.{lang}.pred.{system}`
446
+
447
+ In case of Indic-En evaluation, please use the following command:
448
+
449
+ ```bash
450
+ bash eval_rev.sh <devtest_data_dir> <ckpt_dir> <system>
451
+ ```
452
+
453
+ - `<devtest_data_dir>`: path to the evaluation set with language pair subdirectories (for example, flores directory in the above tree structure)
454
+ - `<ckpt_dir>`: path to the fairseq model checkpoint directory
455
+ - `<system>`: system name suffix to store the predictions in the format `test.{lang}.pred.{system}`
456
+
457
+ **_Note: You don’t need to reverse the test set directions for each language pair._**
458
+
459
+ In case of Indic-Indic evaluation, please use the following command:
460
+
461
+ ```bash
462
+ bash pivot_eval.sh <devtest_data_dir> <pivot_lang> <src2pivot_ckpt_dir> <pivot2tgt_ckpt_dir> <system>
463
+ ```
464
+
465
+ - `<devtest_data_dir>`: path to the evaluation set with language pair subdirectories (for example, flores directory in the above tree structure)
466
+ - `<pivot_lang>`: pivot language (default should be `eng_Latn`)
467
+ - `<src2pivot_ckpt_dir>`: path to the fairseq Indic-En model checkpoint directory
468
+ - `<pivot2tgt_ckpt_dir>`: path to the fairseq En-Indic model checkpoint directory
469
+ - `<system>`: system name suffix to store the predictions in the format `test.{lang}.pred.{system}`
470
+
471
+ In order to perform significance testing for BLEU and chrF++ metrics after you have the predictions for different systems, you can use the following command:
472
+
473
+ ```bash
474
+ bash compute_metrics_significance.sh <devtest_data_dir>
475
+ ```
476
+
477
+ - `<devtest_data_dir>`: path to the evaluation set with language pair subdirectories (for example, flores directory in the above tree structure)
478
+
479
+ Similarly, to compute the COMET scores and perform significance testing on predictions of different systems, you can use the following command.
480
+
481
+ ```bash
482
+ bash compute_comet_score.sh <devtest_data_dir>
483
+ ```
484
+
485
+ - `<devtest_data_dir>`: path to the evaluation set with language pair subdirectories (for example, flores directory in the above tree structure)
486
+
487
+ Please note that as we compute significance tests with the same script and automate everything, it is best to have all the predictions for all the systems in place to avoid repeating anything.
488
+ Also, we define the systems in the script itself, if you want to try out other systems, make sure to edit it there itself.
489
+
490
+ ### Baseline Evaluation
491
+
492
+ To generate the translation results for baseline models such as M2M-100, MBART, Azure, Google, and NLLB MoE, you can check the scripts provided in the "baseline_eval" directory of this repository. For NLLB distilled, you can either modify NLLB_MoE eval or use this [repository](https://github.com/pluiez/NLLB-inference). Similarly, for IndicTrans inference, please refer to this [repository](https://github.com/ai4bharat/IndicTrans).
493
+
494
+ You can download the translation outputs released as a part of this work from the [following section](#download-models-and-other-artifacts).
495
+
496
+ ## LICENSE
497
+
498
+ The following table lists the licenses associated with the different artifacts released as a part of this work:
499
+
500
+ | Artifact | LICENSE |
501
+ | ----------------------------------------------------- | --------------------------------------------------------------------- |
502
+ | Existing Mined Corpora (NLLB & Samanantar) | [CC0](https://creativecommons.org/share-your-work/public-domain/cc0/) |
503
+ | Existing Seed Corpora (NLLB-Seed, ILCI, MASSIVE) | [CC0](https://creativecommons.org/share-your-work/public-domain/cc0/) |
504
+ | Newly Added Mined Corpora (Samanantar++ & Comparable) | [CC0](https://creativecommons.org/share-your-work/public-domain/cc0/) |
505
+ | Newly Added Seed Corpora (BPCC-H-Wiki & BPCC-H-Daily) | [CC-BY-4.0](https://creativecommons.org/licenses/by/4.0/) |
506
+ | Newly Created IN-22 test set (IN22-Gen & IN22-Conv) | [CC-BY-4.0](https://creativecommons.org/licenses/by/4.0/) |
507
+ | Back-translation data (BPCC-BT) | [CC0](https://creativecommons.org/share-your-work/public-domain/cc0/) |
508
+ | Model checkpoints | [MIT](https://github.com/ai4bharat/IndicTrans2/blob/main/LICENSE) |
509
+
510
+ The mined corpora collection (BPCC-Mined), existing seed corpora (NLLB-Seed, ILCI, MASSIVE), Backtranslation data (BPCC-BT), are released under the following licensing scheme:
511
+
512
+ - We do not own any of the text from which this data has been extracted.
513
+ - We license the actual packaging of this data under the Creative Commons [CC0 license (“no rights reserved”)](https://creativecommons.org/share-your-work/public-domain/cc0/).
514
+ - To the extent possible under law, [AI4Bharat](https://ai4bharat.iitm.ac.in/) has waived all copyright and related or neighboring rights to BPCC-Mined, existing seed corpora (NLLB-Seed, ILCI, MASSIVE) and BPCC-BT.
515
+
516
+ ## Citation
517
+
518
+ ```bibtex
519
+ @article{gala2023indictrans,
520
+ title={IndicTrans2: Towards High-Quality and Accessible Machine Translation Models for all 22 Scheduled Indian Languages},
521
+ author={Jay Gala and Pranjal A Chitale and A K Raghavan and Varun Gumma and Sumanth Doddapaneni and Aswanth Kumar M and Janki Atul Nawale and Anupama Sujatha and Ratish Puduppully and Vivek Raghavan and Pratyush Kumar and Mitesh M Khapra and Raj Dabre and Anoop Kunchukuttan},
522
+ journal={Transactions on Machine Learning Research},
523
+ issn={2835-8856},
524
+ year={2023},
525
+ url={https://openreview.net/forum?id=vfT4YuzAYA},
526
+ note={}
527
+ }
528
+ ```
IndicTrans2/apply_sentence_piece.sh ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash

# Tokenizes the preprocessed train and dev splits using the trained
# SentencePiece models (one model per side: SRC and TGT).
#
# Usage: bash apply_sentence_piece.sh <exp_dir> <data_dir> <bpe_dir> \
#            <src_lang> <tgt_lang> <split> [parallel_installed]

echo `date`
exp_dir=$1                      # path to the experiment directory
data_dir=$2                     # path to the data directory where all lang pairs are concatenated
bpe_dir=$3                      # path to the tokenized data directory
src_lang=$4                     # source language
tgt_lang=$5                     # target language
split=$6                        # name of the split
parallel_installed=${7:-false}  # whether GNU Parallel is installed

in_split_dir=$data_dir/$split
out_split_dir=$bpe_dir/$split

# Encode one side of the corpus with the given SPM model. Variables are
# quoted so paths containing spaces do not break word-splitting. For very
# large datasets GNU Parallel is used to speed up applying bpe.
tokenize () {
    local model=$1 infile=$2 outfile=$3
    if $parallel_installed; then
        parallel --pipe --keep-order \
            spm_encode --model="$model" \
            --output_format=piece \
            < "$infile" \
            > "$outfile"
    else
        spm_encode --model="$model" \
            --output_format=piece \
            < "$infile" \
            > "$outfile"
    fi
}

echo "Apply Sentence Piece tokenization to SRC corpus"
tokenize "$exp_dir/vocab/model.SRC" "$in_split_dir.$src_lang" "$out_split_dir.$src_lang"

echo "Apply Sentence Piece tokenization to TGT corpus"
tokenize "$exp_dir/vocab/model.TGT" "$in_split_dir.$tgt_lang" "$out_split_dir.$tgt_lang"
IndicTrans2/baseline_eval/azure_translate.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import glob
4
+ import requests
5
+ from urllib.parse import urlencode
6
+ from dotenv import dotenv_values
7
+ import traceback
8
+ import time
9
+
10
# Maps FLORES-200 language codes to the ISO-style codes accepted by the
# Azure Translator API (script-suffixed variants where Azure distinguishes
# scripts, e.g. Kashmiri and Sindhi in Devanagari).
flores_to_iso = {
    "asm_Beng": "as",
    "ben_Beng": "bn",
    "brx_Deva": "brx",
    "doi_Deva": "doi",
    "eng_Latn": "en",
    "gom_Deva": "gom",
    "guj_Gujr": "gu",
    "hin_Deva": "hi",
    "kan_Knda": "kn",
    "kas_Arab": "ks",
    "kas_Deva": "ks_Deva",
    "mai_Deva": "mai",
    "mal_Mlym": "ml",
    "mar_Deva": "mr",
    "mni_Beng": "mni_Beng",
    "mni_Mtei": "mni",
    "npi_Deva": "ne",
    "ory_Orya": "or",
    "pan_Guru": "pa",
    "san_Deva": "sa",
    "sat_Olck": "sat",
    "snd_Arab": "sd",
    "snd_Deva": "sd_Deva",
    "tam_Taml": "ta",
    "tel_Telu": "te",
    "urd_Arab": "ur",
}
38
+
39
+
40
class AzureTranslator:
    """Thin wrapper around the Azure Translator Text REST API (v3.0)."""

    def __init__(
        self,
        subscription_key: str,
        region: str,
        endpoint: str = "https://api.cognitive.microsofttranslator.com",
    ) -> None:
        """Store credentials and resolve the set of supported languages.

        Args:
            subscription_key: Azure subscription key of the Translator resource.
            region: Azure region of the Translator resource.
            endpoint: Base URL of the Translator service.
        """
        self.http_headers = {
            "Ocp-Apim-Subscription-Key": subscription_key,
            "Ocp-Apim-Subscription-Region": region,
        }
        self.translate_endpoint = endpoint + "/translate?api-version=3.0&"
        self.languages_endpoint = endpoint + "/languages?api-version=3.0"

        self.supported_languages = self.get_supported_languages()

    def get_supported_languages(self) -> dict:
        """Return the languages supported by the `translate` operation."""
        return requests.get(self.languages_endpoint).json()["translation"]

    def batch_translate(self, texts: list, src_lang: str, tgt_lang: str) -> list:
        """Translate a batch of sentences from `src_lang` to `tgt_lang`.

        Language codes are FLORES codes and are mapped via `flores_to_iso`.

        Returns:
            The list of translations, or None if the HTTP request or the
            response decoding failed (the error is printed, not raised).

        Raises:
            NotImplementedError: if either language is not supported by Azure.
        """
        if not texts:
            return texts

        src_lang = flores_to_iso[src_lang]
        tgt_lang = flores_to_iso[tgt_lang]

        if src_lang not in self.supported_languages:
            raise NotImplementedError(
                f"Source language code: `{src_lang}` not supported!"
            )

        if tgt_lang not in self.supported_languages:
            raise NotImplementedError(
                f"Target language code: `{tgt_lang}` not supported!"
            )

        body = [{"text": text} for text in texts]
        query_string = urlencode(
            {
                "from": src_lang,
                "to": tgt_lang,
            }
        )

        # BUG FIX: the original used bare `except:` clauses here, which also
        # swallow KeyboardInterrupt/SystemExit; catch the specific failures.
        try:
            response = requests.post(
                self.translate_endpoint + query_string,
                headers=self.http_headers,
                json=body,
            )
        except requests.RequestException:
            traceback.print_exc()
            return None

        try:
            payloads = response.json()
        except ValueError:
            # JSON decoding failed, so `response` is still the HTTP response
            # object and its raw text can be logged for debugging.
            traceback.print_exc()
            print("Response:", response.text)
            return None

        return [payload["translations"][0]["text"] for payload in payloads]

    def text_translate(self, text: str, src_lang: str, tgt_lang: str) -> str:
        """Translate a single sentence."""
        return self.batch_translate([text], src_lang, tgt_lang)[0]
105
+
106
+
107
if __name__ == "__main__":
    root_dir = sys.argv[1]

    # API credentials are read from a .env file placed next to this script.
    config = dotenv_values(os.path.join(os.path.dirname(__file__), ".env"))

    translator = AzureTranslator(
        config["AZURE_TRANSLATOR_TEXT_SUBSCRIPTION_KEY"],
        config["AZURE_TRANSLATOR_TEXT_REGION"],
        config["AZURE_TRANSLATOR_TEXT_ENDPOINT"],
    )

    batch_size = 128

    for pair_dir in sorted(glob.glob(os.path.join(root_dir, "*"))):
        print(pair_dir)

        src_lang, tgt_lang = os.path.basename(pair_dir).split("-")
        print(f"{src_lang} - {tgt_lang}")

        # ------------------------------------------------------------------
        # source -> target translations
        # ------------------------------------------------------------------
        src_infname = os.path.join(pair_dir, f"test.{src_lang}")
        tgt_outfname = os.path.join(pair_dir, f"test.{tgt_lang}.pred.azure")
        if not os.path.exists(src_infname):
            continue

        src_sents = [
            line.replace("\n", "").strip()
            for line in open(src_infname, "r").read().split("\n")
            if line
        ]

        if not os.path.exists(tgt_outfname):
            try:
                translations = []
                for offset in range(0, len(src_sents), batch_size):
                    chunk = src_sents[offset : offset + batch_size]
                    translations.extend(
                        translator.batch_translate(chunk, src_lang, tgt_lang)
                    )
                with open(tgt_outfname, "w") as f:
                    f.write("\n".join(translations))

                # pause between pairs to stay under the API rate limit
                time.sleep(10)
            except Exception as e:
                print(e)
                continue

        # ------------------------------------------------------------------
        # target -> source translations
        # ------------------------------------------------------------------
        tgt_infname = os.path.join(pair_dir, f"test.{tgt_lang}")
        src_outfname = os.path.join(pair_dir, f"test.{src_lang}.pred.azure")
        if not os.path.exists(tgt_infname):
            continue

        tgt_sents = [
            line.replace("\n", "").strip()
            for line in open(tgt_infname, "r").read().split("\n")
            if line
        ]

        if not os.path.exists(src_outfname):
            try:
                translations = []
                for offset in range(0, len(tgt_sents), batch_size):
                    chunk = tgt_sents[offset : offset + batch_size]
                    translations.extend(
                        translator.batch_translate(chunk, tgt_lang, src_lang)
                    )
                with open(src_outfname, "w") as f:
                    f.write("\n".join(translations))
            except Exception as e:
                continue

        # NOTE(review): this sleep sits at loop level in the original
        # (runs once per pair) — confirm against the committed file.
        time.sleep(10)
IndicTrans2/baseline_eval/google_translate.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import glob
4
+ from tqdm import tqdm
5
+ from google.cloud import translate
6
+
7
# Expects a json file containing the API credentials.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = os.path.join(
    os.path.dirname(__file__), r"api_key.json"
)

# Maps FLORES-200 language codes to the codes used by Google Translate
# (Meitei keeps its script suffix; Google's hyphenated form is handled in
# translate_text below).
flores_to_iso = {
    "asm_Beng": "as",
    "ben_Beng": "bn",
    "doi_Deva": "doi",
    "eng_Latn": "en",
    "gom_Deva": "gom",
    "guj_Gujr": "gu",
    "hin_Deva": "hi",
    "kan_Knda": "kn",
    "mai_Deva": "mai",
    "mal_Mlym": "ml",
    "mar_Deva": "mr",
    "mni_Mtei": "mni_Mtei",
    "npi_Deva": "ne",
    "ory_Orya": "or",
    "pan_Guru": "pa",
    "san_Deva": "sa",
    "sat_Olck": "sat",
    "snd_Arab": "sd",
    "tam_Taml": "ta",
    "tel_Telu": "te",
    "urd_Arab": "ur",
}
35
+
36
+
37
# Copy the project id from the json file containing API credentials
def translate_text(text, src_lang, tgt_lang, project_id="project_id"):
    """Translate `text` with the Google Cloud Translation API.

    `src_lang` / `tgt_lang` are FLORES codes; they are mapped to the codes
    Google expects. Returns the translated string.
    """
    src_lang = flores_to_iso[src_lang]
    tgt_lang = flores_to_iso[tgt_lang]

    # Google uses a hyphenated tag for Meitei.
    if src_lang == "mni_Mtei":
        src_lang = "mni-Mtei"
    if tgt_lang == "mni_Mtei":
        tgt_lang = "mni-Mtei"

    client = translate.TranslationServiceClient()
    parent = f"projects/{project_id}/locations/global"

    response = client.translate_text(
        request={
            "parent": parent,
            "contents": [text],
            "mime_type": "text/plain",  # mime types: text/plain, text/html
            "source_language_code": src_lang,
            "target_language_code": tgt_lang,
        }
    )

    # Concatenate all returned segments (normally a single one).
    return "".join(t.translated_text for t in response.translations)
70
+
71
+
72
if __name__ == "__main__":
    root_dir = sys.argv[1]

    for pair_dir in sorted(glob.glob(os.path.join(root_dir, "*"))):
        print(pair_dir)

        src_lang, tgt_lang = os.path.basename(pair_dir).split("-")
        if src_lang not in flores_to_iso or tgt_lang not in flores_to_iso:
            continue

        # The non-English side of the pair determines Google coverage.
        lang = tgt_lang if src_lang == "eng_Latn" else src_lang
        lang = flores_to_iso[lang]

        # NOTE(review): substring membership on this space-separated string
        # is preserved from the original — confirm it is intentional.
        if lang not in "as bn doi gom gu hi kn mai ml mni_Mtei mr ne or pa sa sd ta te ur":
            continue

        print(f"{src_lang} - {tgt_lang}")

        # source to target translations
        src_infname = os.path.join(pair_dir, f"test.{src_lang}")
        tgt_outfname = os.path.join(pair_dir, f"test.{tgt_lang}.pred.google")
        if os.path.exists(src_infname) and not os.path.exists(tgt_outfname):
            src_sents = [
                line.replace("\n", "").strip()
                for line in open(src_infname, "r").read().split("\n")
                if line
            ]
            translations = [
                translate_text(s, src_lang, tgt_lang).strip() for s in tqdm(src_sents)
            ]
            with open(tgt_outfname, "w") as f:
                f.write("\n".join(translations))

        # target to source translations
        tgt_infname = os.path.join(pair_dir, f"test.{tgt_lang}")
        src_outfname = os.path.join(pair_dir, f"test.{src_lang}.pred.google")
        if os.path.exists(tgt_infname) and not os.path.exists(src_outfname):
            tgt_sents = [
                line.replace("\n", "").strip()
                for line in open(tgt_infname, "r").read().split("\n")
                if line
            ]
            translations = [
                translate_text(s, tgt_lang, src_lang).strip() for s in tqdm(tgt_sents)
            ]
            with open(src_outfname, "w") as f:
                f.write("\n".join(translations))
IndicTrans2/baseline_eval/m2m100_inference.py ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import sys
4
+ from tqdm import tqdm
5
+ import torch
6
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
7
+
8
+
9
# Maps FLORES-200 codes to the language codes supported by M2M-100.
langs_supported = {
    "eng_Latn": "en",
    "ben_Beng": "bn",
    "guj_Gujr": "gu",
    "hin_Deva": "hi",
    "kan_Knda": "kn",
    "mal_Mlym": "ml",
    "mar_Deva": "mr",
    "npi_Deva": "ne",
    "ory_Orya": "or",
    "pan_Guru": "pa",
    "snd_Arab": "sd",
    "tam_Taml": "ta",
    "urd_Arab": "ur",
}
25
+
26
+
27
def predict(batch, tokenizer, model, bos_token_id):
    """Generate translations for `batch` with beam search (5 beams).

    `bos_token_id` forces the target-language token at the start of
    generation; returns the detokenized hypotheses.
    """
    inputs = tokenizer(batch, padding=True, return_tensors="pt").to(model.device)
    output_ids = model.generate(
        **inputs,
        num_beams=5,
        max_length=256,
        min_length=0,
        forced_bos_token_id=bos_token_id,
    )
    return tokenizer.batch_decode(output_ids, skip_special_tokens=True)
38
+
39
+
40
def _translate_file(infname, outfname, tokenizer, model, src_code, tgt_code, batch_size):
    """Translate `infname` (one sentence per line) into `outfname`.

    `src_code`/`tgt_code` are M2M-100 language codes. Restores a trailing
    newline in the output when the input file ended with one.
    """
    with open(infname, "r") as f:
        src_sents = f.read().split("\n")

    # A trailing newline yields a final empty entry; drop it for translation
    # and restore it when writing the output.
    add_new_line = False
    if src_sents[-1] == "":
        add_new_line = True
        src_sents = src_sents[:-1]

    # set the source language for tokenization
    tokenizer.src_lang = src_code
    bos_token_id = tokenizer.lang_code_to_id[tgt_code]

    # process sentences in batches and generate predictions
    hypothesis = []
    for i in tqdm(range(0, len(src_sents), batch_size)):
        hypothesis += predict(src_sents[i : i + batch_size], tokenizer, model, bos_token_id)

    assert len(hypothesis) == len(src_sents)

    # collapse internal whitespace so the output stays one sentence per line
    hypothesis = [
        re.sub(r"\s+", " ", x.replace("\n", " ").replace("\t", " ")).strip()
        for x in hypothesis
    ]
    # BUG FIX: the original `hypothesis = hypothesis` was a no-op; append an
    # empty entry so the written file ends with a newline again.
    if add_new_line:
        hypothesis.append("")

    with open(outfname, "w") as f:
        f.write("\n".join(hypothesis))


def main(devtest_data_dir, batch_size):
    """Run M2M-100 inference in both directions for every supported pair."""
    # load the pre-trained M2M-100 tokenizer and model
    # NOTE(review): the model is never moved to GPU here even though
    # __main__ requires CUDA — confirm intent.
    model_name = "facebook/m2m100-12B-last-ckpt"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    model.eval()

    # iterate over a list of language pairs from `devtest_data_dir`
    for pair in sorted(os.listdir(devtest_data_dir)):
        if "-" not in pair:
            continue

        src_lang, tgt_lang = pair.split("-")

        # check if the source and target languages are supported
        if src_lang not in langs_supported or tgt_lang not in langs_supported:
            print(f"Skipping {src_lang}-{tgt_lang} ...")
            continue

        print(f"Evaluating {src_lang}-{tgt_lang} ...")

        # source to target evaluation
        _translate_file(
            os.path.join(devtest_data_dir, pair, f"test.{src_lang}"),
            os.path.join(devtest_data_dir, pair, f"test.{tgt_lang}.pred.m2m100"),
            tokenizer,
            model,
            langs_supported[src_lang],
            langs_supported[tgt_lang],
            batch_size,
        )

        # target to source evaluation
        _translate_file(
            os.path.join(devtest_data_dir, pair, f"test.{tgt_lang}"),
            os.path.join(devtest_data_dir, pair, f"test.{src_lang}.pred.m2m100"),
            tokenizer,
            model,
            langs_supported[tgt_lang],
            langs_supported[src_lang],
            batch_size,
        )
137
+
138
+
139
if __name__ == "__main__":
    # Usage: python m2m100_inference.py <devtest_data_dir> <batch_size>
    # (expects En-X language-pair subdirectories within the devtest data dir)
    data_dir = sys.argv[1]
    bsz = int(sys.argv[2])

    if torch.cuda.is_available():
        main(data_dir, bsz)
    else:
        print("No GPU available")
        sys.exit(1)
IndicTrans2/baseline_eval/mbart_inference.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import sys
4
+ from tqdm import tqdm
5
+ import torch
6
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
7
+
8
+
9
# Maps FLORES-200 codes to the locale-style codes used by mBART-50.
langs_supported = {
    "eng_Latn": "en_XX",
    "guj_Gujr": "gu_IN",
    "hin_Deva": "hi_IN",
    "npi_Deva": "ne_NP",
    "ben_Beng": "bn_IN",
    "mal_Mlym": "ml_IN",
    "mar_Deva": "mr_IN",
    "tam_Taml": "ta_IN",
    "tel_Telu": "te_IN",
    "urd_Arab": "ur_PK",
}
22
+
23
+
24
def predict(batch, tokenizer, model, bos_token_id):
    """Beam-search decode `batch` and return the detokenized hypotheses."""
    model_inputs = tokenizer(batch, padding=True, return_tensors="pt").to(model.device)
    generated = model.generate(
        **model_inputs,
        num_beams=5,
        max_length=256,
        min_length=0,
        forced_bos_token_id=bos_token_id,  # forces the target-language tag
    )
    return tokenizer.batch_decode(generated, skip_special_tokens=True)
35
+
36
+
37
def _translate_file(infname, outfname, tokenizer, model, src_code, tgt_code, batch_size):
    """Translate `infname` (one sentence per line) into `outfname`.

    `src_code`/`tgt_code` are mBART-50 locale codes. Restores a trailing
    newline in the output when the input file ended with one.
    """
    with open(infname, "r") as f:
        src_sents = f.read().split("\n")

    # A trailing newline yields a final empty entry; drop it for translation
    # and restore it when writing the output.
    add_new_line = False
    if src_sents[-1] == "":
        add_new_line = True
        src_sents = src_sents[:-1]

    # set the source language for tokenization
    tokenizer.src_lang = src_code
    bos_token_id = tokenizer.lang_code_to_id[tgt_code]

    # process sentences in batches and generate predictions
    hypothesis = []
    for i in tqdm(range(0, len(src_sents), batch_size)):
        hypothesis += predict(src_sents[i : i + batch_size], tokenizer, model, bos_token_id)

    assert len(hypothesis) == len(src_sents)

    # collapse internal whitespace so the output stays one sentence per line
    hypothesis = [
        re.sub(r"\s+", " ", x.replace("\n", " ").replace("\t", " ")).strip()
        for x in hypothesis
    ]
    # BUG FIX: the original `hypothesis = hypothesis` was a no-op; append an
    # empty entry so the written file ends with a newline again.
    if add_new_line:
        hypothesis.append("")

    with open(outfname, "w") as f:
        f.write("\n".join(hypothesis))


def main(devtest_data_dir, batch_size):
    """Run mBART-50 inference in both directions for every supported pair."""
    # English->XX and XX->English use separate mBART-50 checkpoints.
    enxx_model_name = "facebook/mbart-large-50-one-to-many-mmt"
    xxen_model_name = "facebook/mbart-large-50-many-to-one-mmt"
    tokenizers = {
        "enxx": AutoTokenizer.from_pretrained(enxx_model_name),
        "xxen": AutoTokenizer.from_pretrained(xxen_model_name),
    }
    models = {
        "enxx": AutoModelForSeq2SeqLM.from_pretrained(enxx_model_name).cuda(),
        "xxen": AutoModelForSeq2SeqLM.from_pretrained(xxen_model_name).cuda(),
    }

    # set the models to evaluation mode
    for model in models.values():
        model.eval()

    # iterate over a list of language pairs from `devtest_data_dir`
    for pair in sorted(os.listdir(devtest_data_dir)):
        if "-" not in pair:
            continue

        src_lang, tgt_lang = pair.split("-")

        # check if the source and target languages are supported
        if src_lang not in langs_supported or tgt_lang not in langs_supported:
            print(f"Skipping {src_lang}-{tgt_lang} ...")
            continue

        print(f"Evaluating {src_lang}-{tgt_lang} ...")

        # source to target evaluation (English -> XX checkpoint)
        _translate_file(
            os.path.join(devtest_data_dir, pair, f"test.{src_lang}"),
            os.path.join(devtest_data_dir, pair, f"test.{tgt_lang}.pred.mbart50"),
            tokenizers["enxx"],
            models["enxx"],
            langs_supported[src_lang],
            langs_supported[tgt_lang],
            batch_size,
        )

        # target to source evaluation (XX -> English checkpoint)
        _translate_file(
            os.path.join(devtest_data_dir, pair, f"test.{tgt_lang}"),
            os.path.join(devtest_data_dir, pair, f"test.{src_lang}.pred.mbart50"),
            tokenizers["xxen"],
            models["xxen"],
            langs_supported[tgt_lang],
            langs_supported[src_lang],
            batch_size,
        )
148
+
149
+
150
if __name__ == "__main__":
    # Usage: python mbart_inference.py <devtest_data_dir> <batch_size>
    # (expects En-X language-pair subdirectories within the devtest data dir)
    data_dir = sys.argv[1]
    bsz = int(sys.argv[2])

    if torch.cuda.is_available():
        main(data_dir, bsz)
    else:
        print("No GPU available")
        sys.exit(1)
IndicTrans2/baseline_eval/nllb_moe_cpu_inference.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import sys
4
+ from tqdm import tqdm
5
+ import torch
6
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
7
+
8
# FLORES-200 codes of the languages covered by this evaluation script.
# NOTE: `main` uses this both for membership tests (`lang in langs_supported.keys()`)
# and as a mapping (`langs_supported[lang]`); the original plain list raised
# AttributeError on `.keys()` and TypeError on string indexing. An identity
# mapping (code -> code) satisfies both usages without changing any lookup result.
langs_supported = {
    code: code
    for code in (
        "asm_Beng",
        "ben_Beng",
        "guj_Gujr",
        "eng_Latn",
        "hin_Deva",
        "kas_Deva",
        "kas_Arab",
        "kan_Knda",
        "mal_Mlym",
        "mai_Deva",
        "mar_Deva",
        "mni_Beng",
        "npi_Deva",
        "ory_Orya",
        "pan_Guru",
        "san_Deva",
        "snd_Arab",
        "sat_Olck",
        "tam_Taml",
        "tel_Telu",
        "urd_Arab",
    )
}
31
+
32
+
33
def predict(batch, tokenizer, model, bos_token_id):
    """Translate one batch of sentences with beam search.

    Args:
        batch: list of raw source-language sentences.
        tokenizer: HF tokenizer whose ``src_lang`` the caller has already set.
        model: HF seq2seq model used for generation.
        bos_token_id: token id forced as the first generated token
            (selects the output language for NLLB-style models).

    Returns:
        List of detokenized hypothesis strings, one per input sentence.
    """
    encoded_batch = tokenizer(batch, padding=True, return_tensors="pt").to(model.device)
    # inference_mode disables autograd bookkeeping, reducing memory and
    # compute during generation — this script never backpropagates.
    with torch.inference_mode():
        generated_tokens = model.generate(
            **encoded_batch,
            num_beams=5,
            max_length=256,
            min_length=0,
            forced_bos_token_id=bos_token_id,
        )
    hypothesis = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
    return hypothesis
44
+
45
+
46
def main(devtest_data_dir, batch_size):
    """Evaluate NLLB-MoE on every supported language pair in `devtest_data_dir`.

    For each `src-tgt` subdirectory, translates the test set in both
    directions (src->tgt and tgt->src) and writes the predictions next to
    the inputs as ``test.<lang>.pred.nllb_moe``.
    """
    # load the pre-trained NLLB tokenizer and model
    model_name = "facebook/nllb-moe-54b"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    model.eval()

    def _nllb_code(lang):
        # NLLB has no `sat_Olck` code; it ships Santali as `sat_Beng`.
        return "sat_Beng" if lang == "sat_Olck" else lang

    def _translate_file(infname, outfname, source_lang, target_lang):
        """Translate `infname` (in `source_lang`) into `target_lang` at `outfname`."""
        with open(infname, "r") as f:
            sents = f.read().split("\n")

        # remember whether the input ended with a trailing newline so the
        # output file can round-trip it exactly
        add_new_line = False
        if sents[-1] == "":
            add_new_line = True
            sents = sents[:-1]

        # set the source language for tokenization
        tokenizer.src_lang = _nllb_code(source_lang)
        bos_token_id = tokenizer.lang_code_to_id[_nllb_code(target_lang)]

        # process sentences in batches and generate predictions
        hypothesis = []
        for i in tqdm(range(0, len(sents), batch_size)):
            batch = sents[i : min(len(sents), i + batch_size)]
            hypothesis += predict(batch, tokenizer, model, bos_token_id)

        assert len(hypothesis) == len(sents)

        # collapse all internal whitespace so each hypothesis stays on one line
        hypothesis = [
            re.sub(r"\s+", " ", x.replace("\n", " ").replace("\t", " ")).strip()
            for x in hypothesis
        ]
        if add_new_line:
            # restore the trailing newline (the previous version assigned
            # `hypothesis = hypothesis`, a no-op that dropped it)
            hypothesis.append("")

        with open(outfname, "w") as f:
            f.write("\n".join(hypothesis))

    # iterate over a list of language pairs from `devtest_data_dir`
    for pair in sorted(os.listdir(devtest_data_dir)):
        if "-" not in pair:
            continue

        src_lang, tgt_lang = pair.split("-")

        # check if the source and target languages are supported
        # (`in` works whether `langs_supported` is a list or a mapping;
        # the previous `.keys()` call crashed on the list form)
        if src_lang not in langs_supported or tgt_lang not in langs_supported:
            print(f"Skipping {src_lang}-{tgt_lang} ...")
            continue

        print(f"Evaluating {src_lang}-{tgt_lang} ...")

        # -------------------------------------------------------------------
        # source to target evaluation
        # -------------------------------------------------------------------
        _translate_file(
            os.path.join(devtest_data_dir, pair, f"test.{src_lang}"),
            os.path.join(devtest_data_dir, pair, f"test.{tgt_lang}.pred.nllb_moe"),
            source_lang=src_lang,
            target_lang=tgt_lang,
        )

        # -------------------------------------------------------------------
        # target to source evaluation
        # -------------------------------------------------------------------
        # (the previous version indexed `langs_supported[src_lang]` here,
        # which failed on the list form; the language code is used directly)
        _translate_file(
            os.path.join(devtest_data_dir, pair, f"test.{tgt_lang}"),
            os.path.join(devtest_data_dir, pair, f"test.{src_lang}.pred.nllb_moe"),
            source_lang=tgt_lang,
            target_lang=src_lang,
        )
150
+
151
+
152
if __name__ == "__main__":
    # CLI: <devtest_data_dir> <batch_size>
    # expects En-X subdirectories pairs within the devtest data directory
    devtest_data_dir = sys.argv[1]
    batch_size = int(sys.argv[2])

    # NOTE: per the filename, this script is intended for CPU inference,
    # so no torch.cuda availability check is performed here.
    main(devtest_data_dir, batch_size)
IndicTrans2/compute_comet_score.sh ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash

# This script computes COMET metrics and also performs significance testing on the evaluation set
# where each subdirectory contains En-X pair
#
# NOTE(review): paths are deliberately expanded unquoted in several places so
# that shell globs reach the tools (see the significance section); as a
# consequence the evaluation directory path must not contain spaces.


echo `date`
devtest_data_dir=$1 # path to the evaluation directory
# `${2-...}`: default applies only when $2 is UNSET (an explicit empty string is kept)
model_name=${2-"Unbabel/wmt22-comet-da"} # name of the model checkpoint

# predefined list of languages supported by COMET
langs=(asm_Beng ben_Beng guj_Gujr hin_Deva kan_Knda mal_Mlym mar_Deva ory_Orya pan_Guru tam_Taml tel_Telu urd_Arab)

# we predefine a set of systems which we consider for evaluation
# feel free to change the below line in case you want to add or remove any system
system=(google azure nllb mbart50 m2m100 it1 it2)


# iterate over the list of predefined languages
for lang in "${langs[@]}"; do

    mkdir -p "$devtest_data_dir/eng_Latn-$lang/comet"

    # --------------------------------------------------------------
    # COMET score computation
    # --------------------------------------------------------------

    # iterate over the list of predefined systems
    for sys in "${system[@]}"; do

        echo "${sys}"

        # en - indic direction: score only when this system's prediction exists
        if [ -f "$devtest_data_dir/eng_Latn-$lang/test.$lang.pred.$sys" ]; then
            echo "eng_Latn-${lang}"

            src_fname=$devtest_data_dir/eng_Latn-$lang/test.eng_Latn
            pred_fname=$devtest_data_dir/eng_Latn-$lang/test.$lang.pred.$sys
            ref_fname=$devtest_data_dir/eng_Latn-$lang/test.$lang
            out_fname=$devtest_data_dir/eng_Latn-$lang/comet/eng_Latn_${lang}_${sys}_comet.txt

            # Compute COMET scores using the `comet-score`
            comet-score -s $src_fname -t $pred_fname -r $ref_fname --gpus 1 --model $model_name --quiet --only_system > $out_fname
        fi

        # indic - en direction: score only when this system's prediction exists
        if [ -f "$devtest_data_dir/eng_Latn-$lang/test.eng_Latn.pred.$sys" ]; then
            echo "${lang}-eng_Latn"

            src_fname=$devtest_data_dir/eng_Latn-$lang/test.$lang
            pred_fname=$devtest_data_dir/eng_Latn-$lang/test.eng_Latn.pred.$sys
            ref_fname=$devtest_data_dir/eng_Latn-$lang/test.eng_Latn
            out_fname=$devtest_data_dir/eng_Latn-$lang/comet/${lang}_eng_Latn_${sys}_comet.txt

            # Compute COMET scores using the `comet-score`
            comet-score -s $src_fname -t $pred_fname -r $ref_fname --gpus 1 --model $model_name --quiet --only_system > $out_fname
        fi

    done

    # --------------------------------------------------------------
    # COMET significance testing
    # --------------------------------------------------------------

    # en - indic direction
    src_fname=$devtest_data_dir/eng_Latn-$lang/test.eng_Latn
    # NOTE: the `*` is NOT expanded in the assignment below; it expands at the
    # unquoted use in `comet-compare -t`, yielding one file per system
    pred_fname=$devtest_data_dir/eng_Latn-$lang/test.$lang.pred.*
    ref_fname=$devtest_data_dir/eng_Latn-$lang/test.$lang
    out_fname=$devtest_data_dir/eng_Latn-$lang/comet/eng_Latn_${lang}_comet_stat.txt

    # Compute COMET significance scores using the `comet-compare`
    comet-compare -s $src_fname -t $pred_fname -r $ref_fname > $out_fname


    # indic-en direction
    src_fname=$devtest_data_dir/eng_Latn-$lang/test.$lang
    pred_fname=$devtest_data_dir/eng_Latn-$lang/test.eng_Latn.pred.*
    ref_fname=$devtest_data_dir/eng_Latn-$lang/test.eng_Latn
    out_fname=$devtest_data_dir/eng_Latn-$lang/comet/${lang}_eng_Latn_comet_stat.txt

    # Compute COMET significance scores using the `comet-compare`
    comet-compare -s $src_fname -t $pred_fname -r $ref_fname > $out_fname

done
IndicTrans2/compute_metrics.sh ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash

# This script computes the evaluation metrics such as BLEU, chrF, chrF++ using the
# detokenized predictions of the translation systems using sacrebleu (version 2.3.1).
# If the target language is:
#   English: directly use Moses tokenizer that is internally supported (`mteval-v13a`)
#   Indic: use IndicNLP tokenizers and skip tokenization step in sacrebleu.


echo "$(date)"
pred_fname=$1 # path to the prediction file
ref_fname=$2  # path to the reference file
tgt_lang=$3   # target language (FLORES code, e.g. eng_Latn, hin_Deva)


# `[[ ... ]]` is used so an unset/empty $tgt_lang does not break the test,
# and all path expansions are quoted so paths with spaces work
if [[ "$tgt_lang" == "eng_Latn" ]]; then
    # directly tokenize the prediction and reference files using sacrebleu and compute the metric
    sacrebleu "$ref_fname" < "$pred_fname" -m bleu chrf
    sacrebleu "$ref_fname" < "$pred_fname" -m chrf --chrf-word-order 2
else

    # indicnlp tokenize prediction and reference files before evaluation
    input_size=$(python scripts/preprocess_translate.py "$ref_fname" "$ref_fname.tok" "$tgt_lang" false false)
    input_size=$(python scripts/preprocess_translate.py "$pred_fname" "$pred_fname.tok" "$tgt_lang" false false)

    # since we are tokenizing with indicnlp separately, we are setting tokenize to none here
    sacrebleu --tokenize none "$ref_fname.tok" < "$pred_fname.tok" -m bleu chrf
    sacrebleu --tokenize none "$ref_fname.tok" < "$pred_fname.tok" -m chrf --chrf-word-order 2
fi
IndicTrans2/compute_metrics_significance.sh ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash

# This script performs significance testing for metrics such as BLEU, chrF++ using sacrebleu on the evaluation set
# where each subdirectory contains En-X pair


echo "$(date)"
devtest_data_dir=$1 # path to the evaluation directory

# we predefine a set of systems which we consider for evaluation
# feel free to change the below line in case you want to add or remove any system
system=(google azure nllb mbart50 m2m100 it1 it2)


# get a list of language pairs in the `devtest_data_dir`
pairs=$(ls -d $devtest_data_dir/eng_Latn-* | sort)


# iterate over each language pair
for pair in ${pairs[@]}; do
    # extract the source and target languages from the pair name
    pair=$(basename $pair)
    src_lang=$(echo "$pair" | cut -d "-" -f 1)
    tgt_lang=$(echo "$pair" | cut -d "-" -f 2)

    if [[ $src_lang == "eng_Latn" ]]; then

        # ----------------------------------------------------------------------
        # en - indic direction
        # ----------------------------------------------------------------------
        echo "${src_lang} - ${tgt_lang}"

        # find all the prediction files for different systems and tokenize them using IndicNLP
        pred_fnames=$devtest_data_dir/$pair/test.${tgt_lang}.pred.*
        ref_fname=$devtest_data_dir/$pair/test.${tgt_lang}

        # iterate over the glob directly; the previous
        # `find . -type f -name $pred_fnames` could never match because
        # `-name` takes a bare filename pattern, not a path pattern
        for pred_fname in $pred_fnames; do
            input_size=$(python scripts/preprocess_translate.py $pred_fname $pred_fname.tok $tgt_lang false false)
        done

        input_size=$(python scripts/preprocess_translate.py $ref_fname $ref_fname.tok $tgt_lang false false)

        ref_fname=$devtest_data_dir/$pair/test.${tgt_lang}.tok
        it2_fname=$devtest_data_dir/$pair/test.${tgt_lang}.pred.it2.tok
        sys_fnames=$devtest_data_dir/$pair/test.${tgt_lang}.pred.*.tok
        bleu_out_fname=$devtest_data_dir/$pair/${src_lang}_${tgt_lang}_bleu_significance.txt
        chrF_out_fname=$devtest_data_dir/$pair/${src_lang}_${tgt_lang}_chrF++_significance.txt

        # paired bootstrap significance tests with IT2 as the baseline system
        sacrebleu --tokenize none $ref_fname -i $it2_fname $sys_fnames --paired-bs -m bleu --format text > $bleu_out_fname
        # fixed: the chrF invocation previously omitted the reference file and
        # the `-i` flag, making sacrebleu treat the system outputs as references
        sacrebleu --tokenize none $ref_fname -i $it2_fname $sys_fnames --paired-bs -m chrf --chrf-word-order 2 --format text > $chrF_out_fname

        # ----------------------------------------------------------------------
        # indic - en direction
        # ----------------------------------------------------------------------
        echo "${tgt_lang} - ${src_lang}"

        # English references/predictions are scored untokenized (detokenized text)
        ref_fname=$devtest_data_dir/$pair/test.${src_lang}
        it2_fname=$devtest_data_dir/$pair/test.${src_lang}.pred.it2
        sys_fnames=$devtest_data_dir/$pair/test.${src_lang}.pred.*
        bleu_out_fname=$devtest_data_dir/$pair/${tgt_lang}_${src_lang}_bleu_significance.txt
        chrF_out_fname=$devtest_data_dir/$pair/${tgt_lang}_${src_lang}_chrF++_significance.txt

        sacrebleu --tokenize none $ref_fname -i $it2_fname $sys_fnames --paired-bs -m bleu --format text > $bleu_out_fname
        # fixed: same missing-reference bug as above
        sacrebleu --tokenize none $ref_fname -i $it2_fname $sys_fnames --paired-bs -m chrf --chrf-word-order 2 --format text > $chrF_out_fname

    fi
# fixed: the loop was previously missing its closing `done` (syntax error)
done
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash

# This script evaluates the performance of a machine translation system
# on a evaluation set in forward direction. For example, if the evaluation set
# consists of language pairs, such as En-X, where En represents the English language
# and X represents the target Indic language then this script accesses the translation
# system from the English language (En) to the target Indic language (X) direction.


echo `date`
devtest_data_dir=$1 # path to the evaluation directory
ckpt_dir=$2 # path to the checkpoint directory
system=${3:-"it2"} # name of the machine translation system


# get a list of language pairs in the `devtest_data_dir`
# (subdirectories are expected to be named `<src>-<tgt>`)
pairs=$(ls -d $devtest_data_dir/* | sort)


# iterate over each language pair
for pair in ${pairs[@]}; do
    # extract the source and target languages from the pair name
    pair=$(basename $pair)
    src_lang=$(echo "$pair" | cut -d "-" -f 1)
    tgt_lang=$(echo "$pair" | cut -d "-" -f 2)

    src_fname=$devtest_data_dir/$src_lang-$tgt_lang/test.$src_lang
    tgt_fname=$devtest_data_dir/$src_lang-$tgt_lang/test.$tgt_lang

    # check if the source and target files exists
    if [ -f "$src_fname" ] && [ -f "$tgt_fname" ]; then
        echo "Evaluating $src_lang-$tgt_lang ..."
    else
        echo "Skipping $src_lang-$tgt_lang ..."
        continue
    fi

    # generate translations if the system name contains "it2"
    # (predictions of external baselines are assumed to be precomputed)
    if [[ $system == *"it2"* ]]; then
        echo "Generating Translations"
        bash joint_translate.sh $src_fname $tgt_fname.pred.$system $src_lang $tgt_lang $ckpt_dir
    fi

    # compute automatic string-based metrics if the prediction exists for the system
    if [[ -f "${tgt_fname}.pred.${system}" ]]; then
        echo "Computing Metrics"
        bash compute_metrics.sh $tgt_fname.pred.$system $tgt_fname $tgt_lang > $devtest_data_dir/$src_lang-$tgt_lang/${src_lang}_${tgt_lang}_${system}_scores.txt
    fi

    # remove the intermediate files
    # NOTE: the glob `pred.$system.*` matches derived files (e.g. `.tok`)
    # but not the prediction file itself, which is kept
    rm -rf $tgt_fname.pred.$system.*
    rm -rf $devtest_data_dir/$src_lang-$tgt_lang/*.tok

done
IndicTrans2/eval_rev.sh ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash

# This script evaluates the performance of a machine translation system
# on a evaluation set in forward direction. For example, if the evaluation set
# consists of language pairs, such as En-X, where En represents the English language
# and X represents the target Indic language then this script accesses the translation
# system from the target Indic language (X) to the English language (En) direction.


echo `date`
devtest_data_dir=$1 # path to the evaluation directory
ckpt_dir=$2 # path to the checkpoint directory
system=${3:-"it2"} # name of the machine translation system


# get a list of language pairs in the `devtest_data_dir`
# (subdirectories are expected to be named `<src>-<tgt>`)
pairs=$(ls -d $devtest_data_dir/* | sort)


# iterate over each language pair
for pair in ${pairs[@]}; do
    # extract the source and target languages from the pair name
    pair=$(basename $pair)
    src_lang=$(echo "$pair" | cut -d "-" -f 1)
    tgt_lang=$(echo "$pair" | cut -d "-" -f 2)

    # reversed direction: the pair's target side is translated INTO the source side,
    # so `src_fname` here points at the X-language file and `tgt_fname` at English
    src_fname=$devtest_data_dir/$src_lang-$tgt_lang/test.$tgt_lang
    tgt_fname=$devtest_data_dir/$src_lang-$tgt_lang/test.$src_lang

    # check if the source and target files exists
    # in this case, we flip the actual target file as source and vice-versa
    if [ -f "$src_fname" ] && [ -f "$tgt_fname" ]; then
        echo "Evaluating $src_lang-$tgt_lang ..."
    else
        echo "Skipping $src_lang-$tgt_lang ..."
        continue
    fi

    # generate translations if the system name contains "it2"
    # (note the flipped language arguments: tgt_lang is the translation source)
    if [[ $system == *"it2"* ]]; then
        echo "Generating Translations"
        bash joint_translate.sh $src_fname $tgt_fname.pred.$system $tgt_lang $src_lang $ckpt_dir
    fi

    # compute automatic string-based metrics if the prediction exists for the system
    if [[ -f "${tgt_fname}.pred.${system}" ]]; then
        echo "Computing Metrics"
        bash compute_metrics.sh $tgt_fname.pred.$system $tgt_fname $src_lang > $devtest_data_dir/$src_lang-$tgt_lang/${tgt_lang}_${src_lang}_${system}_scores.txt
    fi

    # remove the intermediate files
    # NOTE: the glob `pred.$system.*` matches derived files (e.g. `.tok`)
    # but not the prediction file itself, which is kept
    rm -rf $tgt_fname.pred.$system.*
    rm -rf $devtest_data_dir/$src_lang-$tgt_lang/*.tok

done
IndicTrans2/finetune.sh ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # This script finetunes the pretrained translation model on the binarized data using fairseq.
4
+
5
+
6
+ echo `date`
7
+ exp_dir=$1 # path of the experiment directory
8
+ model_arch=${2:-"transformer_18_18"} # model architecture (defaults to `transformer_18_18`)
9
+ pretrained_ckpt=$3 # path to the pretrained checkpoint `.pt` file
10
+
11
+
12
+ fairseq-train $exp_dir/final_bin \
13
+ --max-source-positions=256 \
14
+ --max-target-positions=256 \
15
+ --source-lang=SRC \
16
+ --target-lang=TGT \
17
+ --max-update=1000000 \
18
+ --save-interval-updates=1000 \
19
+ --arch=$model_arch \
20
+ --activation-fn gelu \
21
+ --criterion=label_smoothed_cross_entropy \
22
+ --label-smoothing=0.1 \
23
+ --optimizer adam \
24
+ --adam-betas "(0.9, 0.98)" \
25
+ --lr-scheduler=inverse_sqrt \
26
+ --clip-norm 1.0 \
27
+ --warmup-init-lr 1e-07 \
28
+ --lr 3e-5 \
29
+ --warmup-updates 2000 \
30
+ --dropout 0.2 \
31
+ --save-dir $exp_dir/model \
32
+ --keep-last-epochs 5 \
33
+ --keep-interval-updates 3 \
34
+ --patience 10 \
35
+ --skip-invalid-size-inputs-valid-test \
36
+ --fp16 \
37
+ --user-dir model_configs \
38
+ --update-freq=4 \
39
+ --distributed-world-size 8 \
40
+ --num-workers 24 \
41
+ --max-tokens 1024 \
42
+ --eval-bleu \
43
+ --eval-bleu-args "{\"beam\": 1, \"lenpen\": 1.0, \"max_len_a\": 1.2, \"max_len_b\": 10}" \
44
+ --eval-bleu-detok moses \
45
+ --eval-bleu-remove-bpe sentencepiece \
46
+ --eval-bleu-print-samples \
47
+ --best-checkpoint-metric bleu \
48
+ --maximize-best-checkpoint-metric \
49
+ --restore-file $pretrained_ckpt \
50
+ --reset-lr-scheduler \
51
+ --reset-meters \
52
+ --reset-dataloader \
53
+ --reset-optimizer \
54
+ --task translation
IndicTrans2/huggingface_interface/.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ IndicTransTokenizer
IndicTrans2/huggingface_interface/IndicTransToolkit/.gitignore ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ dist/
2
+ build/
3
+ *.egg-info/
4
+ */*/__pycache__/
IndicTrans2/huggingface_interface/IndicTransToolkit/CHANGELOG.md ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Changelog
2
+
3
+ # 📢 Release v1.0.3
4
+ - 🚨 The `IndicProcessor` class has been re-written in [Cython](https://github.com/cython/cython) for faster implementation. This gives us at least `+10 lines/s`.
5
+ - A new `visualize` argument has been added to `preprocess_batch` to track the processing with a `tqdm` bar.
6
+
7
+ # 📢 Release v1.0.2
8
+ - The repository has been renamed to `IndicTransToolkit`.
9
+ - 🚨 The custom tokenizer is now **removed** from the repository. Please revert to a previous commit ([v1.0.1](https://github.com/VarunGumma/IndicTransToolkit/tree/0e68fb5872f4d821578a5252f90ad43c9649370f)) to use it **(strongly discouraged)**. The official _(and only tokenizer)_ is available on HF along with the models.
10
+
11
+ # 📢 Release v1.0.0
12
+ - The [PreTrainedTokenizer](https://huggingface.co/docs/transformers/main_classes/tokenizer) for IndicTrans2 is now available on HF 🎉🎉 Note that, you still need the `IndicProcessor` to pre-process the sentences before tokenization.
13
+ - 🚨 **In favor of the standard PreTrainedTokenizer, we deprecated the custom tokenizer. However, this custom tokenizer will still be available here for backward compatibility, but no further updates/bug-fixes will be provided.**
14
+ - The `indic_evaluate` function is now consolidated into a concrete `IndicEvaluator` class.
15
+ - The data collation function for training is consolidated into a concrete `IndicDataCollator` class.
16
+ - A simple batching method is now available in the `IndicProcessor`.
IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit.egg-info/PKG-INFO ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Metadata-Version: 2.2
2
+ Name: IndicTransToolkit
3
+ Version: 1.0.3
4
+ Summary: A simple, consistent, and extendable module for IndicTrans2 tokenizer compatible with HuggingFace models
5
+ Home-page: https://github.com/VarunGumma/IndicTransToolkit
6
+ Author: Varun Gumma
7
+ Author-email: [email protected]
8
+ License: MIT
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Operating System :: OS Independent
12
+ Requires-Python: >=3.8
13
+ Description-Content-Type: text/markdown
14
+ License-File: LICENSE
15
+ Requires-Dist: setuptools>=68.2.2
16
+ Requires-Dist: torch
17
+ Requires-Dist: cython
18
+ Requires-Dist: sacremoses
19
+ Requires-Dist: sentencepiece
20
+ Requires-Dist: transformers
21
+ Requires-Dist: sacrebleu
22
+ Requires-Dist: indic-nlp-library-IT2@ git+https://github.com/VarunGumma/indic_nlp_library.git
23
+ Dynamic: author
24
+ Dynamic: author-email
25
+ Dynamic: classifier
26
+ Dynamic: description
27
+ Dynamic: description-content-type
28
+ Dynamic: home-page
29
+ Dynamic: license
30
+ Dynamic: requires-dist
31
+ Dynamic: requires-python
32
+ Dynamic: summary
33
+
34
+ # IndicTransToolkit
35
+
36
+ ## About
37
+ The goal of this repository is to provide a simple, modular, and extendable toolkit for [IndicTrans2](https://github.com/AI4Bharat/IndicTrans2) and be compatible with the HuggingFace models released. Please refer to the `CHANGELOG.md` for latest developments.
38
+
39
+ ## Pre-requisites
40
+ - `Python 3.8+`
41
+ - [Indic NLP Library](https://github.com/VarunGumma/indic_nlp_library)
42
+ - Other requirements as listed in `requirements.txt`
43
+
44
+ ## Configuration
45
+ - Editable installation (Note, this may take a while):
46
+ ```bash
47
+ git clone https://github.com/VarunGumma/IndicTransToolkit
48
+ cd IndicTransToolkit
49
+
50
+ pip install --editable . --use-pep517 # required for pip >= 25.0
51
+
52
+ # in case it fails, try:
53
+ # pip install --editable . --use-pep517 --config-settings editable_mode=compat
54
+ ```
55
+
56
+ ## Examples
57
+ For the training usecase, please refer [here](https://github.com/AI4Bharat/IndicTrans2/tree/main/huggingface_interface).
58
+
59
+ ### PreTrainedTokenizer
60
+ ```python
61
+ import torch
62
+ from IndicTransToolkit.processor import IndicProcessor # NOW IMPLEMENTED IN CYTHON !!
63
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
64
+
65
+ ip = IndicProcessor(inference=True)
66
+ tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indictrans2-en-indic-dist-200M", trust_remote_code=True)
67
+ model = AutoModelForSeq2SeqLM.from_pretrained("ai4bharat/indictrans2-en-indic-dist-200M", trust_remote_code=True)
68
+
69
+ sentences = [
70
+ "This is a test sentence.",
71
+ "This is another longer different test sentence.",
72
+ "Please send an SMS to 9876543210 and an email on [email protected] by 15th October, 2023.",
73
+ ]
74
+
75
+ batch = ip.preprocess_batch(sentences, src_lang="eng_Latn", tgt_lang="hin_Deva", visualize=False) # set it to visualize=True to print a progress bar
76
+ batch = tokenizer(batch, padding="longest", truncation=True, max_length=256, return_tensors="pt")
77
+
78
+ with torch.inference_mode():
79
+ outputs = model.generate(**batch, num_beams=5, num_return_sequences=1, max_length=256)
80
+
81
+ with tokenizer.as_target_tokenizer():
82
+ # This scoping is absolutely necessary, as it will instruct the tokenizer to tokenize using the target vocabulary.
83
+ # Failure to use this scoping will result in gibberish/unexpected predictions as the output will be de-tokenized with the source vocabulary instead.
84
+ outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True, clean_up_tokenization_spaces=True)
85
+
86
+ outputs = ip.postprocess_batch(outputs, lang="hin_Deva")
87
+ print(outputs)
88
+
89
+ >>> ['यह एक परीक्षण वाक्य है।', 'यह एक और लंबा अलग परीक्षण वाक्य है।', 'कृपया 9876543210 पर एक एस. एम. एस. भेजें और 15 अक्टूबर, 2023 तक [email protected] पर एक ईमेल भेजें।']
90
+ ```
91
+
92
+ ### Evaluation
93
+ - `IndicEvaluator` is a python implementation of [compute_metrics.sh](https://github.com/AI4Bharat/IndicTrans2/blob/main/compute_metrics.sh).
94
+ - We have found that this python implementation gives slightly lower scores than the original `compute_metrics.sh`. So, please use this function cautiously, and feel free to raise a PR if you have found the bug/fix.
95
+ ```python
96
+ from IndicTransToolkit import IndicEvaluator
97
+
98
+ # this method returns a dictionary with BLEU and ChrF2++ scores with appropriate signatures
99
+ evaluator = IndicEvaluator()
100
+ scores = evaluator.evaluate(tgt_lang=tgt_lang, preds=pred_file, refs=ref_file)
101
+
102
+ # alternatively, you can pass the list of predictions and references instead of files
103
+ # scores = evaluator.evaluate(tgt_lang=tgt_lang, preds=preds, refs=refs)
104
+ ```
105
+
106
+ ## Authors
107
+ - Varun Gumma ([email protected])
108
+ - Jay Gala ([email protected])
109
+ - Pranjal Agadh Chitale ([email protected])
110
+ - Raj Dabre ([email protected])
111
+
112
+
113
+ ## Bugs and Contribution
114
+ Since this is a bleeding-edge module, you may encounter broken stuff and import issues once in a while. In case you encounter any bugs or want additional functionalities, please feel free to raise `Issues`/`Pull Requests` or contact the authors.
115
+
116
+
117
+ ## Citation
118
+ If you use our codebase, or models, please do cite the following paper:
119
+ ```bibtex
120
+ @article{
121
+ gala2023indictrans,
122
+ title={IndicTrans2: Towards High-Quality and Accessible Machine Translation Models for all 22 Scheduled Indian Languages},
123
+ author={Jay Gala and Pranjal A Chitale and A K Raghavan and Varun Gumma and Sumanth Doddapaneni and Aswanth Kumar M and Janki Atul Nawale and Anupama Sujatha and Ratish Puduppully and Vivek Raghavan and Pratyush Kumar and Mitesh M Khapra and Raj Dabre and Anoop Kunchukuttan},
124
+ journal={Transactions on Machine Learning Research},
125
+ issn={2835-8856},
126
+ year={2023},
127
+ url={https://openreview.net/forum?id=vfT4YuzAYA},
128
+ note={}
129
+ }
130
+ ```
IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ setup.py
5
+ IndicTransToolkit/__init__.py
6
+ IndicTransToolkit/collator.py
7
+ IndicTransToolkit/evaluator.py
8
+ IndicTransToolkit/processor.c
9
+ IndicTransToolkit/version.py
10
+ IndicTransToolkit.egg-info/PKG-INFO
11
+ IndicTransToolkit.egg-info/SOURCES.txt
12
+ IndicTransToolkit.egg-info/dependency_links.txt
13
+ IndicTransToolkit.egg-info/not-zip-safe
14
+ IndicTransToolkit.egg-info/requires.txt
15
+ IndicTransToolkit.egg-info/top_level.txt
IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
 
 
1
+
IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit.egg-info/not-zip-safe ADDED
@@ -0,0 +1 @@
 
 
1
+
IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit.egg-info/requires.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ setuptools>=68.2.2
2
+ torch
3
+ cython
4
+ sacremoses
5
+ sentencepiece
6
+ transformers
7
+ sacrebleu
8
+ indic-nlp-library-IT2@ git+https://github.com/VarunGumma/indic_nlp_library.git
IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit.egg-info/top_level.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ IndicTransToolkit
IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/__init__.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
"""Public API of IndicTransToolkit: evaluator, data collator, and processor."""

from .evaluator import IndicEvaluator
from .collator import IndicDataCollator
from .processor import IndicProcessor

# Names exported via `from IndicTransToolkit import *`.
__all__ = [
    "IndicEvaluator",
    "IndicDataCollator",
    "IndicProcessor",
]
IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (377 Bytes). View file
 
IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (391 Bytes). View file
 
IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/__pycache__/collator.cpython-310.pyc ADDED
Binary file (2.18 kB). View file
 
IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/__pycache__/collator.cpython-313.pyc ADDED
Binary file (3.21 kB). View file
 
IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/__pycache__/evaluator.cpython-310.pyc ADDED
Binary file (4.19 kB). View file
 
IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/__pycache__/evaluator.cpython-313.pyc ADDED
Binary file (6.38 kB). View file
 
IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/__pycache__/processor.cpython-310.pyc ADDED
Binary file (11.7 kB). View file
 
IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/collator.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from dataclasses import dataclass
3
+ from typing import Any, Optional, Union
4
+
5
+ from transformers.utils import PaddingStrategy
6
+ from transformers.tokenization_utils import PreTrainedTokenizerBase
7
+ from transformers.data.data_collator import pad_without_fast_tokenizer_warning
8
+
9
+
10
+ @dataclass
11
+ class IndicDataCollator:
12
+ tokenizer: PreTrainedTokenizerBase
13
+ model: Optional[Any] = None
14
+ padding: Union[bool, str, PaddingStrategy] = True
15
+ max_length: Optional[int] = None
16
+ pad_to_multiple_of: Optional[int] = None
17
+ label_pad_token_id: int = -100
18
+ return_tensors: str = "pt"
19
+
20
+ def __call__(self, features, return_tensors=None):
21
+
22
+ if return_tensors is None:
23
+ return_tensors = self.return_tensors
24
+
25
+ labels = (
26
+ [feature["labels"] for feature in features]
27
+ if "labels" in features[0].keys()
28
+ else None
29
+ )
30
+ # We have to pad the labels before calling `tokenizer.pad` as
31
+ # this method won't pad them and needs them of the same length to return tensors.
32
+ if labels is not None:
33
+ max_label_length = max(len(l) for l in labels)
34
+ if self.pad_to_multiple_of is not None:
35
+ max_label_length = (
36
+ (max_label_length + self.pad_to_multiple_of - 1)
37
+ // self.pad_to_multiple_of
38
+ * self.pad_to_multiple_of
39
+ )
40
+
41
+ # fairseq by defaults right pad the labels for seq2seq tasks
42
+ for feature in features:
43
+ remainder = [self.label_pad_token_id] * (
44
+ max_label_length - len(feature["labels"])
45
+ )
46
+ if isinstance(feature["labels"], list):
47
+ feature["labels"] = feature["labels"] + remainder
48
+ else:
49
+ feature["labels"] = np.concatenate(
50
+ [feature["labels"], remainder]
51
+ ).astype(np.int64)
52
+
53
+ self.tokenizer.padding_side = "left"
54
+ features = pad_without_fast_tokenizer_warning(
55
+ self.tokenizer,
56
+ features,
57
+ padding=self.padding,
58
+ max_length=self.max_length,
59
+ return_tensors=return_tensors,
60
+ pad_to_multiple_of=self.pad_to_multiple_of,
61
+ )
62
+
63
+ # prepare decoder_input_ids
64
+ if (
65
+ labels is not None
66
+ and self.model is not None
67
+ and hasattr(self.model, "prepare_decoder_input_ids_from_labels")
68
+ ):
69
+ decoder_input_ids = self.model.prepare_decoder_input_ids_from_labels(
70
+ labels=features["labels"]
71
+ )
72
+ features["decoder_input_ids"] = decoder_input_ids
73
+
74
+ return features
IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/evaluator.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Union
2
+ from sacrebleu.metrics import CHRF, BLEU
3
+
4
+ from indicnlp.tokenize import indic_tokenize
5
+ from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
6
+
7
+
8
+ class IndicEvaluator:
9
+ def __init__(self):
10
+ # === Metrics ===
11
+ self._chrf2_metric = CHRF(word_order=2)
12
+ self._bleu_metric_13a = BLEU(tokenize="13a")
13
+ self._bleu_metric_none = BLEU(tokenize="none")
14
+
15
+ # === Normalizer factory and cache ===
16
+ self._indic_norm_factory = IndicNormalizerFactory()
17
+ self._normalizer_cache = {} # Cache normalizers by iso_lang
18
+
19
+ # === FLORES -> ISO codes ===
20
+ self._flores_codes = {
21
+ "asm_Beng": "as",
22
+ "awa_Deva": "hi",
23
+ "ben_Beng": "bn",
24
+ "bho_Deva": "hi",
25
+ "brx_Deva": "hi",
26
+ "doi_Deva": "hi",
27
+ "eng_Latn": "en",
28
+ "gom_Deva": "kK",
29
+ "gon_Deva": "hi",
30
+ "guj_Gujr": "gu",
31
+ "hin_Deva": "hi",
32
+ "hne_Deva": "hi",
33
+ "kan_Knda": "kn",
34
+ "kas_Arab": "ur",
35
+ "kas_Deva": "hi",
36
+ "kha_Latn": "en",
37
+ "lus_Latn": "en",
38
+ "mag_Deva": "hi",
39
+ "mai_Deva": "hi",
40
+ "mal_Mlym": "ml",
41
+ "mar_Deva": "mr",
42
+ "mni_Beng": "bn",
43
+ "mni_Mtei": "hi",
44
+ "npi_Deva": "ne",
45
+ "ory_Orya": "or",
46
+ "pan_Guru": "pa",
47
+ "san_Deva": "hi",
48
+ "sat_Olck": "or",
49
+ "snd_Arab": "ur",
50
+ "snd_Deva": "hi",
51
+ "tam_Taml": "ta",
52
+ "tel_Telu": "te",
53
+ "urd_Arab": "ur",
54
+ "unr_Deva": "hi",
55
+ }
56
+
57
+ def _get_normalizer(self, iso_lang: str):
58
+ """
59
+ Return a cached normalizer for a given iso_lang.
60
+ """
61
+ if iso_lang not in self._normalizer_cache:
62
+ self._normalizer_cache[iso_lang] = self._indic_norm_factory.get_normalizer(iso_lang)
63
+ return self._normalizer_cache[iso_lang]
64
+
65
+ def _preprocess(self, sentences: List[str], lang: str) -> List[str]:
66
+ """
67
+ Preprocess the sentences using IndicNLP:
68
+ 1) Normalization (using a cached normalizer),
69
+ 2) Trivial tokenization.
70
+ """
71
+ iso_lang = self._flores_codes.get(lang, "hi")
72
+ # Fetch from cache to avoid reconstructing the normalizer
73
+ normalizer = self._get_normalizer(iso_lang)
74
+
75
+ # Local references for speed
76
+ trivial_tokenize = indic_tokenize.trivial_tokenize
77
+ normalize_fn = normalizer.normalize
78
+
79
+ processed_sentences = []
80
+ for line in sentences:
81
+ # single .strip() before normalizing
82
+ line = line.strip()
83
+ norm_line = normalize_fn(line)
84
+ tokens = trivial_tokenize(norm_line, iso_lang)
85
+ processed_sentences.append(" ".join(tokens))
86
+
87
+ return processed_sentences
88
+
89
+ def evaluate(
90
+ self,
91
+ tgt_lang: str,
92
+ preds: Union[List[str], str],
93
+ refs: Union[List[str], str],
94
+ ):
95
+ """
96
+ Evaluate BLEU and chrF2++ scores for the given predictions and references.
97
+ - If preds/refs are strings (filenames), read them from disk.
98
+ - If they are lists, evaluate them directly.
99
+ - For non-English languages, applies Indic NLP preprocessing before scoring.
100
+ """
101
+ assert preds is not None and refs is not None, "Predictions and References cannot be None"
102
+
103
+ # Convert file paths to lists if needed
104
+ if isinstance(preds, str):
105
+ with open(preds, "r", encoding="utf-8") as fp:
106
+ preds = [line.strip() for line in fp]
107
+ if isinstance(refs, str):
108
+ with open(refs, "r", encoding="utf-8") as fr:
109
+ refs = [line.strip() for line in fr]
110
+
111
+ assert len(preds) == len(refs), "Number of predictions and references do not match"
112
+
113
+ # Local references to metrics for speed
114
+ bleu_none = self._bleu_metric_none
115
+ bleu_13a = self._bleu_metric_13a
116
+ chrf2 = self._chrf2_metric
117
+
118
+ scores = {}
119
+
120
+ # For English (eng_Latn), skip Indic NLP normalization
121
+ if tgt_lang != "eng_Latn":
122
+ preds_ = self._preprocess(preds, tgt_lang)
123
+ refs_ = self._preprocess(refs, tgt_lang)
124
+
125
+ bleu_score = bleu_none.corpus_score(preds_, [refs_])
126
+ chrf_score = chrf2.corpus_score(preds_, [refs_])
127
+
128
+ scores["bleu"] = {
129
+ "score": round(bleu_score.score, 1),
130
+ "signature": bleu_none.get_signature().format(),
131
+ }
132
+ scores["chrF2++"] = {
133
+ "score": round(chrf_score.score, 1),
134
+ "signature": chrf2.get_signature().format(),
135
+ }
136
+
137
+ else:
138
+ # For English, 13a tokenization is standard
139
+ bleu_score = bleu_13a.corpus_score(preds, [refs])
140
+ chrf_score = chrf2.corpus_score(preds, [refs])
141
+
142
+ scores["bleu"] = {
143
+ "score": round(bleu_score.score, 1),
144
+ "signature": bleu_13a.get_signature().format(),
145
+ }
146
+ scores["chrF2++"] = {
147
+ "score": round(chrf_score.score, 1),
148
+ "signature": chrf2.get_signature().format(),
149
+ }
150
+
151
+ return scores
IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/processor.c ADDED
The diff for this file is too large to render. See raw diff
 
IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/processor.cp310-win_amd64.pyd ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ba760d57f3accf3b24d9cc331dcda273d0612998a034d9250eb8c9db5b9f908a
3
+ size 141312
IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/processor.cpython-310-x86_64-linux-gnu.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c1a101ecb27adaf367f00c90b3f8e96e7fbda3bf0560d48c368fec3750a040a4
3
+ size 229200
IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/processor.pyx ADDED
@@ -0,0 +1,503 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # cython: language_level=3, boundscheck=False, cdivision=True, wraparound=False
2
+ """
3
+ Cython version of the IndicProcessor class with optimizations for performance.
4
+ Only preprocess_batch and postprocess_batch are exposed as cpdef methods.
5
+ All other methods are internal (cdef) for optimized Cython usage.
6
+ """
7
+
8
+ import regex as re
9
+ from tqdm import tqdm
10
+ from queue import Queue
11
+ from typing import List, Dict, Union
12
+
13
+ # Importing Python objects since these libraries don't offer C-extensions
14
+ from indicnlp.tokenize import indic_tokenize, indic_detokenize
15
+ from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
16
+ from sacremoses import MosesPunctNormalizer, MosesTokenizer, MosesDetokenizer
17
+ from indicnlp.transliterate.unicode_transliterate import UnicodeIndicTransliterator
18
+
19
+
20
+ cdef class IndicProcessor:
21
+ cdef public bint inference
22
+
23
+ # Precompiled regex patterns and placeholders
24
+ cdef object _MULTISPACE_REGEX
25
+ cdef object _DIGIT_SPACE_PERCENT
26
+ cdef object _DOUBLE_QUOT_PUNC
27
+ cdef object _DIGIT_NBSP_DIGIT
28
+ cdef object _END_BRACKET_SPACE_PUNC_REGEX
29
+
30
+ cdef object _URL_PATTERN
31
+ cdef object _NUMERAL_PATTERN
32
+ cdef object _EMAIL_PATTERN
33
+ cdef object _OTHER_PATTERN
34
+
35
+ cdef list _PUNC_REPLACEMENTS
36
+ cdef list _INDIC_FAILURE_CASES
37
+
38
+ cdef dict _flores_codes
39
+ cdef dict _digits_translation_table
40
+
41
+ # Placeholder maps stored in a Python Queue (treated as `object` for Cython)
42
+ cdef object _placeholder_entity_maps
43
+
44
+ # Tools (also Python objects)
45
+ cdef object _en_tok
46
+ cdef object _en_normalizer
47
+ cdef object _en_detok
48
+ cdef object _xliterator
49
+
50
+ def __cinit__(self, bint inference=True):
51
+ """
52
+ Constructor for IndicProcessor. Initializes all necessary components.
53
+ """
54
+ self.inference = inference
55
+
56
+ ##############################
57
+ # FLORES -> ISO CODES
58
+ ##############################
59
+ self._flores_codes = {
60
+ "asm_Beng": "as",
61
+ "awa_Deva": "hi",
62
+ "ben_Beng": "bn",
63
+ "bho_Deva": "hi",
64
+ "brx_Deva": "hi",
65
+ "doi_Deva": "hi",
66
+ "eng_Latn": "en",
67
+ "gom_Deva": "kK",
68
+ "gon_Deva": "hi",
69
+ "guj_Gujr": "gu",
70
+ "hin_Deva": "hi",
71
+ "hne_Deva": "hi",
72
+ "kan_Knda": "kn",
73
+ "kas_Arab": "ur",
74
+ "kas_Deva": "hi",
75
+ "kha_Latn": "en",
76
+ "lus_Latn": "en",
77
+ "mag_Deva": "hi",
78
+ "mai_Deva": "hi",
79
+ "mal_Mlym": "ml",
80
+ "mar_Deva": "mr",
81
+ "mni_Beng": "bn",
82
+ "mni_Mtei": "hi",
83
+ "npi_Deva": "ne",
84
+ "ory_Orya": "or",
85
+ "pan_Guru": "pa",
86
+ "san_Deva": "hi",
87
+ "sat_Olck": "or",
88
+ "snd_Arab": "ur",
89
+ "snd_Deva": "hi",
90
+ "tam_Taml": "ta",
91
+ "tel_Telu": "te",
92
+ "urd_Arab": "ur",
93
+ "unr_Deva": "hi",
94
+ }
95
+
96
+ ##############################
97
+ # INDIC DIGIT TRANSLATION (str.translate)
98
+ ##############################
99
+ self._digits_translation_table = {}
100
+ cdef dict digits_dict = {
101
+ "\u09e6": "0", "\u0ae6": "0", "\u0ce6": "0", "\u0966": "0",
102
+ "\u0660": "0", "\uabf0": "0", "\u0b66": "0", "\u0a66": "0",
103
+ "\u1c50": "0", "\u06f0": "0",
104
+
105
+ "\u09e7": "1", "\u0ae7": "1", "\u0967": "1", "\u0ce7": "1",
106
+ "\u06f1": "1", "\uabf1": "1", "\u0b67": "1", "\u0a67": "1",
107
+ "\u1c51": "1", "\u0c67": "1",
108
+
109
+ "\u09e8": "2", "\u0ae8": "2", "\u0968": "2", "\u0ce8": "2",
110
+ "\u06f2": "2", "\uabf2": "2", "\u0b68": "2", "\u0a68": "2",
111
+ "\u1c52": "2", "\u0c68": "2",
112
+
113
+ "\u09e9": "3", "\u0ae9": "3", "\u0969": "3", "\u0ce9": "3",
114
+ "\u06f3": "3", "\uabf3": "3", "\u0b69": "3", "\u0a69": "3",
115
+ "\u1c53": "3", "\u0c69": "3",
116
+
117
+ "\u09ea": "4", "\u0aea": "4", "\u096a": "4", "\u0cea": "4",
118
+ "\u06f4": "4", "\uabf4": "4", "\u0b6a": "4", "\u0a6a": "4",
119
+ "\u1c54": "4", "\u0c6a": "4",
120
+
121
+ "\u09eb": "5", "\u0aeb": "5", "\u096b": "5", "\u0ceb": "5",
122
+ "\u06f5": "5", "\uabf5": "5", "\u0b6b": "5", "\u0a6b": "5",
123
+ "\u1c55": "5", "\u0c6b": "5",
124
+
125
+ "\u09ec": "6", "\u0aec": "6", "\u096c": "6", "\u0cec": "6",
126
+ "\u06f6": "6", "\uabf6": "6", "\u0b6c": "6", "\u0a6c": "6",
127
+ "\u1c56": "6", "\u0c6c": "6",
128
+
129
+ "\u09ed": "7", "\u0aed": "7", "\u096d": "7", "\u0ced": "7",
130
+ "\u06f7": "7", "\uabf7": "7", "\u0b6d": "7", "\u0a6d": "7",
131
+ "\u1c57": "7", "\u0c6d": "7",
132
+
133
+ "\u09ee": "8", "\u0aee": "8", "\u096e": "8", "\u0cee": "8",
134
+ "\u06f8": "8", "\uabf8": "8", "\u0b6e": "8", "\u0a6e": "8",
135
+ "\u1c58": "8", "\u0c6e": "8",
136
+
137
+ "\u09ef": "9", "\u0aef": "9", "\u096f": "9", "\u0cef": "9",
138
+ "\u06f9": "9", "\uabf9": "9", "\u0b6f": "9", "\u0a6f": "9",
139
+ "\u1c59": "9", "\u0c6f": "9",
140
+ }
141
+ for k, v in digits_dict.items():
142
+ self._digits_translation_table[ord(k)] = v
143
+
144
+ # Also map ASCII '0'-'9'
145
+ for c in range(ord('0'), ord('9') + 1):
146
+ self._digits_translation_table[c] = chr(c)
147
+
148
+ ##############################
149
+ # PLACEHOLDER MAP QUEUE
150
+ ##############################
151
+ self._placeholder_entity_maps = Queue()
152
+
153
+ ##############################
154
+ # MOSES (as Python objects)
155
+ ##############################
156
+ self._en_tok = MosesTokenizer(lang="en")
157
+ self._en_normalizer = MosesPunctNormalizer()
158
+ self._en_detok = MosesDetokenizer(lang="en")
159
+
160
+ ##############################
161
+ # TRANSLITERATOR (Python object)
162
+ ##############################
163
+ self._xliterator = UnicodeIndicTransliterator()
164
+
165
+ ##############################
166
+ # Precompiled Patterns
167
+ ##############################
168
+ self._MULTISPACE_REGEX = re.compile(r"[ ]{2,}")
169
+ self._DIGIT_SPACE_PERCENT = re.compile(r"(\d) %")
170
+ self._DOUBLE_QUOT_PUNC = re.compile(r"\"([,\.]+)")
171
+ self._DIGIT_NBSP_DIGIT = re.compile(r"(\d) (\d)")
172
+ self._END_BRACKET_SPACE_PUNC_REGEX = re.compile(r"\) ([\.!:?;,])")
173
+
174
+ self._URL_PATTERN = re.compile(
175
+ r"\b(?<![\w/.])(?:(?:https?|ftp)://)?(?:(?:[\w-]+\.)+(?!\.))(?:[\w/\-?#&=%.]+)+(?!\.\w+)\b"
176
+ )
177
+ self._NUMERAL_PATTERN = re.compile(
178
+ r"(~?\d+\.?\d*\s?%?\s?-?\s?~?\d+\.?\d*\s?%|~?\d+%|\d+[-\/.,:']\d+[-\/.,:'+]\d+(?:\.\d+)?|\d+[-\/.:'+]\d+(?:\.\d+)?)"
179
+ )
180
+ self._EMAIL_PATTERN = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}")
181
+ self._OTHER_PATTERN = re.compile(r"[A-Za-z0-9]*[#|@]\w+")
182
+
183
+ # Combined punctuation replacements
184
+ self._PUNC_REPLACEMENTS = [
185
+ (re.compile(r"\r"), ""),
186
+ (re.compile(r"\(\s*"), "("),
187
+ (re.compile(r"\s*\)"), ")"),
188
+ (re.compile(r"\s:\s?"), ":"),
189
+ (re.compile(r"\s;\s?"), ";"),
190
+ (re.compile(r"[`´‘‚’]"), "'"),
191
+ (re.compile(r"[„“”«»]"), '"'),
192
+ (re.compile(r"[–—]"), "-"),
193
+ (re.compile(r"\.\.\."), "..."),
194
+ (re.compile(r" %"), "%"),
195
+ (re.compile(r"nº "), "nº "),
196
+ (re.compile(r" ºC"), " ºC"),
197
+ (re.compile(r" [?!;]"), lambda m: m.group(0).strip()),
198
+ (re.compile(r", "), ", "),
199
+ ]
200
+
201
+ self._INDIC_FAILURE_CASES = [
202
+ "آی ڈی ",
203
+ "ꯑꯥꯏꯗꯤ",
204
+ "आईडी",
205
+ "आई . डी . ",
206
+ "आई . डी .",
207
+ "आई. डी. ",
208
+ "आई. डी.",
209
+ "आय. डी. ",
210
+ "आय. डी.",
211
+ "आय . डी . ",
212
+ "आय . डी .",
213
+ "ऐटि",
214
+ "آئی ڈی ",
215
+ "ᱟᱭᱰᱤ ᱾",
216
+ "आयडी",
217
+ "ऐडि",
218
+ "आइडि",
219
+ "ᱟᱭᱰᱤ",
220
+ ]
221
+
222
+ # Internal Method: Apply punctuation replacements
223
+ cdef str _apply_punc_replacements(self, str text, list replacements) except *:
224
+ """
225
+ Apply a list of (pattern, replacement) in sequence to text.
226
+ """
227
+ cdef int i
228
+ cdef tuple pair
229
+ for i in range(len(replacements)):
230
+ pair = replacements[i]
231
+ text = pair[0].sub(pair[1], text)
232
+ return text
233
+
234
+ # Internal Method: Punctuation Normalization
235
+ cdef str _punc_norm(self, str text) except *:
236
+ """
237
+ Consolidate punctuation normalization in fewer passes.
238
+ """
239
+ # 1) Apply replacements
240
+ text = self._apply_punc_replacements(text, self._PUNC_REPLACEMENTS)
241
+
242
+ # 2) Additional patterns
243
+ text = self._MULTISPACE_REGEX.sub(" ", text)
244
+ text = self._END_BRACKET_SPACE_PUNC_REGEX.sub(r")\1", text)
245
+ text = self._DIGIT_SPACE_PERCENT.sub(r"\1%", text)
246
+ text = self._DOUBLE_QUOT_PUNC.sub(r'\1"', text)
247
+ text = self._DIGIT_NBSP_DIGIT.sub(r"\1.\2", text)
248
+ return text.strip()
249
+
250
+ # Internal Method: Wrap Text with Placeholders
251
+ cdef str _wrap_with_placeholders(self, str text) except *:
252
+ """
253
+ Wrap substrings with matched patterns in the text with placeholders.
254
+ Store the placeholder map in the queue for retrieval in postprocessing.
255
+ """
256
+ cdef int serial_no = 1
257
+ cdef dict placeholder_entity_map = {}
258
+ cdef list patterns = [
259
+ self._EMAIL_PATTERN,
260
+ self._URL_PATTERN,
261
+ self._NUMERAL_PATTERN,
262
+ self._OTHER_PATTERN,
263
+ ]
264
+ cdef object pattern
265
+ cdef set matches
266
+ cdef str match
267
+ cdef str base_placeholder
268
+ cdef int i
269
+
270
+ for pattern in patterns:
271
+ matches = set(pattern.findall(text))
272
+ for match in matches:
273
+ # Additional checks
274
+ if pattern is self._URL_PATTERN:
275
+ if len(match.replace(".", "")) < 4:
276
+ continue
277
+ if pattern is self._NUMERAL_PATTERN:
278
+ if len(match.replace(" ", "").replace(".", "").replace(":", "")) < 4:
279
+ continue
280
+
281
+ base_placeholder = f"<ID{serial_no}>"
282
+ # Map various placeholder formats to the matched text
283
+ placeholder_entity_map[f"<ID{serial_no}>"] = match
284
+ placeholder_entity_map[f"< ID{serial_no} >"] = match
285
+ placeholder_entity_map[f"[ID{serial_no}]"] = match
286
+ placeholder_entity_map[f"[ ID{serial_no} ]"] = match
287
+ placeholder_entity_map[f"[ID {serial_no}]"] = match
288
+ placeholder_entity_map[f"<ID{serial_no}]"] = match
289
+ placeholder_entity_map[f"< ID{serial_no}]"] = match
290
+ placeholder_entity_map[f"<ID{serial_no} ]"] = match
291
+
292
+ # Handle Indic failure cases
293
+ for i in range(len(self._INDIC_FAILURE_CASES)):
294
+ indic_case = self._INDIC_FAILURE_CASES[i]
295
+ placeholder_entity_map[f"<{indic_case}{serial_no}>"] = match
296
+ placeholder_entity_map[f"< {indic_case}{serial_no} >"] = match
297
+ placeholder_entity_map[f"< {indic_case} {serial_no} >"] = match
298
+ placeholder_entity_map[f"<{indic_case} {serial_no}]"] = match
299
+ placeholder_entity_map[f"< {indic_case} {serial_no} ]"] = match
300
+ placeholder_entity_map[f"[{indic_case}{serial_no}]"] = match
301
+ placeholder_entity_map[f"[{indic_case} {serial_no}]"] = match
302
+ placeholder_entity_map[f"[ {indic_case}{serial_no} ]"] = match
303
+ placeholder_entity_map[f"[ {indic_case} {serial_no} ]"] = match
304
+ placeholder_entity_map[f"{indic_case} {serial_no}"] = match
305
+ placeholder_entity_map[f"{indic_case}{serial_no}"] = match
306
+
307
+ # Replace the match with the base placeholder
308
+ text = text.replace(match, base_placeholder)
309
+ serial_no += 1
310
+
311
+ # Clean up any remaining placeholder artifacts
312
+ text = re.sub(r"\s+", " ", text).replace(">/", ">").replace("]/", "]")
313
+ self._placeholder_entity_maps.put(placeholder_entity_map)
314
+ return text
315
+
316
+ # Internal Method: Normalize Text
317
+ cdef str _normalize(self, str text) except *:
318
+ """
319
+ Normalizes numerals and optionally wraps placeholders.
320
+ """
321
+ # Single-pass digit translation
322
+ text = text.translate(self._digits_translation_table)
323
+
324
+ if self.inference:
325
+ text = self._wrap_with_placeholders(text)
326
+ return text
327
+
328
+ # Internal Method: Indic Tokenize and Transliterate
329
+ cdef str _do_indic_tokenize_and_transliterate(
330
+ self,
331
+ str sentence,
332
+ object normalizer,
333
+ str iso_lang,
334
+ bint transliterate
335
+ ) except *:
336
+ """
337
+ Helper method: normalizes, tokenizes, optionally transliterates from iso_lang -> 'hi'.
338
+ """
339
+ cdef str normed
340
+ cdef list tokens
341
+ cdef str joined
342
+ cdef str xlated
343
+
344
+ normed = normalizer.normalize(sentence.strip())
345
+ tokens = indic_tokenize.trivial_tokenize(normed, iso_lang)
346
+ joined = " ".join(tokens)
347
+ xlated = joined
348
+ if transliterate:
349
+ xlated = self._xliterator.transliterate(joined, iso_lang, "hi")
350
+ xlated = xlated.replace(" ् ", "्")
351
+ return xlated
352
+
353
+ # Internal Method: Preprocess a Single Sentence
354
+ cdef str _preprocess(
355
+ self,
356
+ str sent,
357
+ str src_lang,
358
+ str tgt_lang,
359
+ object normalizer,
360
+ bint is_target
361
+ ) except *:
362
+ """
363
+ Preprocess a single sentence: punctuation normalization, numeral normalization,
364
+ tokenization, transliteration, and adding language tags if necessary.
365
+ """
366
+ cdef str iso_lang = self._flores_codes.get(src_lang, "hi")
367
+ cdef str script_part = src_lang.split("_")[1]
368
+ cdef bint do_transliterate = True
369
+ cdef str e_strip
370
+ cdef str e_norm
371
+ cdef list e_tokens
372
+ cdef str processed_sent
373
+
374
+ # 1) Punctuation normalization
375
+ sent = self._punc_norm(sent)
376
+
377
+ # 2) Numerals & placeholders
378
+ sent = self._normalize(sent)
379
+
380
+ if script_part in ["Arab", "Aran", "Olck", "Mtei", "Latn"]:
381
+ do_transliterate = False
382
+
383
+ if iso_lang == "en":
384
+ # English path
385
+ e_strip = sent.strip()
386
+ e_norm = self._en_normalizer.normalize(e_strip)
387
+ e_tokens = self._en_tok.tokenize(e_norm, escape=False)
388
+ processed_sent = " ".join(e_tokens)
389
+ else:
390
+ # Indic path
391
+ processed_sent = self._do_indic_tokenize_and_transliterate(sent, normalizer, iso_lang, do_transliterate)
392
+
393
+ processed_sent = processed_sent.strip()
394
+ if not is_target:
395
+ return f"{src_lang} {tgt_lang} {processed_sent}"
396
+ else:
397
+ return processed_sent
398
+
399
+ # Internal Method: Postprocess a Single Sentence
400
+ cdef str _postprocess(self, object sent, str lang) except *:
401
+ """
402
+ Postprocess a single sentence:
403
+ 1) Pull placeholder map from queue
404
+ 2) Fix scripts for Perso-Arabic
405
+ 3) Restore placeholders
406
+ 4) Detokenize
407
+ """
408
+ cdef dict placeholder_entity_map
409
+ cdef str lang_code
410
+ cdef str script_code
411
+ cdef str iso_lang
412
+ cdef str k
413
+ cdef str v
414
+ cdef str xlated
415
+
416
+ # Unwrap if sent is a tuple or list
417
+ if isinstance(sent, (tuple, list)):
418
+ sent = sent[0]
419
+
420
+ placeholder_entity_map = self._placeholder_entity_maps.get()
421
+ lang_code, script_code = lang.split("_", 1)
422
+ iso_lang = self._flores_codes.get(lang, "hi")
423
+
424
+ # Fix for Perso-Arabic scripts
425
+ if script_code in ["Arab", "Aran"]:
426
+ sent = (
427
+ sent.replace(" ؟", "؟")
428
+ .replace(" ۔", "۔")
429
+ .replace(" ،", "،")
430
+ .replace("ٮ۪", "ؠ")
431
+ )
432
+
433
+ # Oriya fix
434
+ if lang_code == "ory":
435
+ sent = sent.replace("ଯ଼", "ୟ")
436
+
437
+ # Restore placeholders
438
+ for k, v in placeholder_entity_map.items():
439
+ sent = sent.replace(k, v)
440
+
441
+ # Detokenize
442
+ if lang == "eng_Latn":
443
+ return self._en_detok.detokenize(sent.split(" "))
444
+ else:
445
+ xlated = self._xliterator.transliterate(sent, "hi", iso_lang)
446
+ return indic_detokenize.trivial_detokenize(xlated, iso_lang)
447
+
448
+ # Exposed Method: Preprocess a Batch of Sentences
449
+ cpdef list preprocess_batch(
450
+ self,
451
+ List[str] batch,
452
+ str src_lang,
453
+ str tgt_lang=None,
454
+ bint is_target=False,
455
+ bint visualize=False
456
+ ):
457
+ """
458
+ Preprocess an array of sentences (normalize, tokenize, transliterate).
459
+ This is exposed for external use.
460
+ """
461
+ cdef object normalizer = None
462
+ cdef str iso_code = self._flores_codes.get(src_lang, "hi")
463
+ cdef object iterator
464
+ cdef list results
465
+ cdef int i
466
+ cdef int n = len(batch)
467
+
468
+ if src_lang != "eng_Latn":
469
+ normalizer = IndicNormalizerFactory().get_normalizer(iso_code)
470
+
471
+ if visualize:
472
+ iterator = tqdm(batch, total=n, desc=f" | > Pre-processing {src_lang}", unit="line")
473
+ else:
474
+ iterator = batch
475
+
476
+ return [self._preprocess(s, src_lang, tgt_lang, normalizer, is_target) for s in iterator]
477
+
478
+ # Exposed Method: Postprocess a Batch of Sentences
479
+ cpdef list postprocess_batch(
480
+ self,
481
+ List[str] sents,
482
+ str lang="hin_Deva",
483
+ bint visualize=False
484
+ ):
485
+ """
486
+ Postprocess a batch of sentences:
487
+ Restore placeholders, fix script issues, and detokenize.
488
+ This is exposed for external use.
489
+ """
490
+ cdef object iterator
491
+ cdef list results
492
+ cdef int i
493
+ cdef int n = len(sents)
494
+
495
+ if visualize:
496
+ iterator = tqdm(sents, total=n, desc=f" | > Post-processing {lang}", unit="line")
497
+ else:
498
+ iterator = sents
499
+
500
+ results = [self._postprocess(s, lang) for s in iterator]
501
+ self._placeholder_entity_maps.queue.clear()
502
+
503
+ return results
IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/version.py ADDED
@@ -0,0 +1 @@
 
 
1
+ __version__ = "1.0.3"
IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/version.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ 1.0.3
IndicTrans2/huggingface_interface/IndicTransToolkit/LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) Varun Gumma.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
IndicTrans2/huggingface_interface/IndicTransToolkit/README.md ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # IndicTransToolkit
2
+
3
+ ## About
4
+ The goal of this repository is to provide a simple, modular, and extendable toolkit for [IndicTrans2](https://github.com/AI4Bharat/IndicTrans2) and be compatible with the HuggingFace models released. Please refer to the `CHANGELOG.md` for latest developments.
5
+
6
+ ## Pre-requisites
7
+ - `Python 3.8+`
8
+ - [Indic NLP Library](https://github.com/VarunGumma/indic_nlp_library)
9
+ - Other requirements as listed in `requirements.txt`
10
+
11
+ ## Configuration
12
+ - Editable installation (Note, this may take a while):
13
+ ```bash
14
+ git clone https://github.com/VarunGumma/IndicTransToolkit
15
+ cd IndicTransToolkit
16
+
17
+ pip install --editable . --use-pep517 # required for pip >= 25.0
18
+
19
+ # in case it fails, try:
20
+ # pip install --editable . --use-pep517 --config-settings editable_mode=compat
21
+ ```
22
+
23
+ ## Examples
24
+ For the training use case, please refer [here](https://github.com/AI4Bharat/IndicTrans2/tree/main/huggingface_interface).
25
+
26
+ ### PreTrainedTokenizer
27
+ ```python
28
+ import torch
29
+ from IndicTransToolkit.processor import IndicProcessor # NOW IMPLEMENTED IN CYTHON !!
30
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
31
+
32
+ ip = IndicProcessor(inference=True)
33
+ tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indictrans2-en-indic-dist-200M", trust_remote_code=True)
34
+ model = AutoModelForSeq2SeqLM.from_pretrained("ai4bharat/indictrans2-en-indic-dist-200M", trust_remote_code=True)
35
+
36
+ sentences = [
37
+ "This is a test sentence.",
38
+ "This is another longer different test sentence.",
39
+ "Please send an SMS to 9876543210 and an email on [email protected] by 15th October, 2023.",
40
+ ]
41
+
42
+ batch = ip.preprocess_batch(sentences, src_lang="eng_Latn", tgt_lang="hin_Deva", visualize=False) # set it to visualize=True to print a progress bar
43
+ batch = tokenizer(batch, padding="longest", truncation=True, max_length=256, return_tensors="pt")
44
+
45
+ with torch.inference_mode():
46
+ outputs = model.generate(**batch, num_beams=5, num_return_sequences=1, max_length=256)
47
+
48
+ with tokenizer.as_target_tokenizer():
49
+ # This scoping is absolutely necessary, as it will instruct the tokenizer to tokenize using the target vocabulary.
50
+ # Failure to use this scoping will result in gibberish/unexpected predictions as the output will be de-tokenized with the source vocabulary instead.
51
+ outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True, clean_up_tokenization_spaces=True)
52
+
53
+ outputs = ip.postprocess_batch(outputs, lang="hin_Deva")
54
+ print(outputs)
55
+
56
+ >>> ['यह एक परीक्षण वाक्य है।', 'यह एक और लंबा अलग परीक्षण वाक्य है।', 'कृपया 9876543210 पर एक एस. एम. एस. भेजें और 15 अक्टूबर, 2023 तक [email protected] पर एक ईमेल भेजें।']
57
+ ```
58
+
59
+ ### Evaluation
60
+ - `IndicEvaluator` is a python implementation of [compute_metrics.sh](https://github.com/AI4Bharat/IndicTrans2/blob/main/compute_metrics.sh).
61
+ - We have found that this python implementation gives slightly lower scores than the original `compute_metrics.sh`. So, please use this function cautiously, and feel free to raise a PR if you have found the bug/fix.
62
+ ```python
63
+ from IndicTransToolkit import IndicEvaluator
64
+
65
+ # this method returns a dictionary with BLEU and ChrF2++ scores with appropriate signatures
66
+ evaluator = IndicEvaluator()
67
+ scores = evaluator.evaluate(tgt_lang=tgt_lang, preds=pred_file, refs=ref_file)
68
+
69
+ # alternatively, you can pass the list of predictions and references instead of files
70
+ # scores = evaluator.evaluate(tgt_lang=tgt_lang, preds=preds, refs=refs)
71
+ ```
72
+
73
+ ## Authors
74
+ - Varun Gumma ([email protected])
75
+ - Jay Gala ([email protected])
76
+ - Pranjal Agadh Chitale ([email protected])
77
+ - Raj Dabre ([email protected])
78
+
79
+
80
+ ## Bugs and Contribution
81
+ Since this is a bleeding-edge module, you may encounter broken stuff and import issues once in a while. In case you encounter any bugs or want additional functionalities, please feel free to raise `Issues`/`Pull Requests` or contact the authors.
82
+
83
+
84
+ ## Citation
85
+ If you use our codebase, or models, please do cite the following paper:
86
+ ```bibtex
87
+ @article{
88
+ gala2023indictrans,
89
+ title={IndicTrans2: Towards High-Quality and Accessible Machine Translation Models for all 22 Scheduled Indian Languages},
90
+ author={Jay Gala and Pranjal A Chitale and A K Raghavan and Varun Gumma and Sumanth Doddapaneni and Aswanth Kumar M and Janki Atul Nawale and Anupama Sujatha and Ratish Puduppully and Vivek Raghavan and Pratyush Kumar and Mitesh M Khapra and Raj Dabre and Anoop Kunchukuttan},
91
+ journal={Transactions on Machine Learning Research},
92
+ issn={2835-8856},
93
+ year={2023},
94
+ url={https://openreview.net/forum?id=vfT4YuzAYA},
95
+ note={}
96
+ }
97
+ ```
IndicTrans2/huggingface_interface/IndicTransToolkit/app.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ from transformers import AutoModelForSeq2SeqLM, BitsAndBytesConfig, AutoTokenizer
4
+ from IndicTransToolkit import IndicProcessor
5
+ import speech_recognition as sr
6
+
7
+ # Constants
8
+ BATCH_SIZE = 4
9
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
10
+ quantization = None
11
+
12
+ # ---- IndicTrans2 Model Initialization ----
13
def initialize_model_and_tokenizer(ckpt_dir, quantization):
    """Load the IndicTrans2 tokenizer and seq2seq model from ``ckpt_dir``.

    ``quantization`` selects an optional bitsandbytes setup: ``"4-bit"``,
    ``"8-bit"``, or anything else for full precision. Returns a
    ``(tokenizer, model)`` pair with the model already in eval mode.
    """
    # Translate the quantization flag into a BitsAndBytesConfig
    # (None means no quantization at all).
    qconfig = None
    if quantization == "4-bit":
        qconfig = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
    elif quantization == "8-bit":
        qconfig = BitsAndBytesConfig(
            load_in_8bit=True,
            bnb_8bit_use_double_quant=True,
            bnb_8bit_compute_dtype=torch.bfloat16,
        )

    tokenizer = AutoTokenizer.from_pretrained(ckpt_dir, trust_remote_code=True)
    model = AutoModelForSeq2SeqLM.from_pretrained(
        ckpt_dir,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        quantization_config=qconfig,
    )

    # Quantized weights are placed by bitsandbytes; only move/halve the
    # model ourselves when running unquantized.
    if qconfig is None:
        model = model.to(DEVICE)
        if DEVICE == "cuda":
            model.half()

    model.eval()
    return tokenizer, model
44
+
45
def batch_translate(input_sentences, src_lang, tgt_lang, model, tokenizer, ip):
    """Translate ``input_sentences`` from ``src_lang`` to ``tgt_lang``.

    Sentences are processed in chunks of ``BATCH_SIZE``; returns the list of
    translated sentences in input order.
    """
    translations = []
    for start in range(0, len(input_sentences), BATCH_SIZE):
        # Preprocess the current slice with the Indic processor.
        chunk = ip.preprocess_batch(
            input_sentences[start : start + BATCH_SIZE],
            src_lang=src_lang,
            tgt_lang=tgt_lang,
        )

        # Encode the chunk into padded tensors on the target device.
        encoded = tokenizer(
            chunk,
            truncation=True,
            padding="longest",
            return_tensors="pt",
            return_attention_mask=True,
        ).to(DEVICE)

        # Beam-search decode without gradient tracking.
        with torch.no_grad():
            output_ids = model.generate(
                **encoded,
                use_cache=True,
                min_length=0,
                max_length=256,
                num_beams=5,
                num_return_sequences=1,
            )

        # Convert token ids back to text in target-tokenizer mode.
        with tokenizer.as_target_tokenizer():
            decoded = tokenizer.batch_decode(
                output_ids.detach().cpu().tolist(),
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True,
            )

        translations.extend(ip.postprocess_batch(decoded, lang=tgt_lang))

        # Drop the encodings and release cached GPU memory between chunks.
        del encoded
        torch.cuda.empty_cache()

    return translations
80
+
81
+ # Initialize IndicTrans2
82
# Load the IndicTrans2 checkpoint and text processor once at startup.
# NOTE(review): the variable names say "en_indic" but the checkpoint id is
# indictrans2-indic-en (Indic -> English); names kept for compatibility.
en_indic_ckpt_dir = "ai4bharat/indictrans2-indic-en-1B"
en_indic_tokenizer, en_indic_model = initialize_model_and_tokenizer(
    en_indic_ckpt_dir, quantization
)
ip = IndicProcessor(inference=True)
85
+
86
+ # ---- Gradio Function ----
87
def transcribe_and_translate(audio):
    """Transcribe a Malayalam audio file and translate it to English.

    Returns a ``(malayalam_text, english_translation)`` pair. On a
    recognition failure the first element carries the error message and the
    second element is an empty string.
    """
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio) as source:
        audio_data = recognizer.record(source)

    try:
        # Malayalam transcription via the Google Web Speech API.
        malayalam_text = recognizer.recognize_google(audio_data, language="ml-IN")
    except sr.UnknownValueError:
        return "Could not understand audio", ""
    except sr.RequestError as e:
        return f"Google API Error: {e}", ""

    # Translate the transcribed text Malayalam -> English with IndicTrans2.
    src_lang, tgt_lang = "mal_Mlym", "eng_Latn"
    translated = batch_translate(
        [malayalam_text], src_lang, tgt_lang, en_indic_model, en_indic_tokenizer, ip
    )
    return malayalam_text, translated[0]
105
+
106
+ # ---- Gradio Interface ----
107
# Wire the pipeline into a Gradio UI: audio in (mic or upload), two text
# boxes out (transcription and translation).
output_boxes = [
    gr.Textbox(label="Malayalam Transcription"),
    gr.Textbox(label="English Translation"),
]
iface = gr.Interface(
    fn=transcribe_and_translate,
    inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
    outputs=output_boxes,
    title="Malayalam Speech Recognition & Translation",
    description="Speak in Malayalam → Transcribe using Google Speech Recognition → Translate to English using IndicTrans2.",
)

iface.launch(debug=True)
IndicTrans2/huggingface_interface/IndicTransToolkit/build/lib.linux-x86_64-cpython-310/IndicTransToolkit/fast_processor.cpython-310-x86_64-linux-gnu.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d57d4239b3638a272e4b70292f10494ee4a0fee201a9d74c62fc35a3d263a45
3
+ size 260304
IndicTrans2/huggingface_interface/IndicTransToolkit/build/lib.linux-x86_64-cpython-310/IndicTransToolkit/processor.cpython-310-x86_64-linux-gnu.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c1a101ecb27adaf367f00c90b3f8e96e7fbda3bf0560d48c368fec3750a040a4
3
+ size 229200
IndicTrans2/huggingface_interface/IndicTransToolkit/build/temp.linux-x86_64-cpython-310/IndicTransToolkit/fast_processor.o ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f9e82df38b208dc0a9b468ff669c9da159c7deaabcb389fcfacd43e038504fec
3
+ size 347184
IndicTrans2/huggingface_interface/IndicTransToolkit/build/temp.linux-x86_64-cpython-310/IndicTransToolkit/processor.o ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d27c2cc00c97a89f97f7c28bc9175c5c403a0e2a372a0b39f1c5fe8609adda09
3
+ size 303696
IndicTrans2/huggingface_interface/IndicTransToolkit/main.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import AutoModelForSeq2SeqLM, BitsAndBytesConfig, AutoTokenizer
3
+ from IndicTransToolkit import IndicProcessor
4
+
5
+ # Constants
6
+ BATCH_SIZE = 4
7
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
8
+ quantization = None
9
+
10
def initialize_model_and_tokenizer(ckpt_dir, quantization):
    """Initialize the model and tokenizer with optional quantization.

    ``quantization`` may be ``"4-bit"``, ``"8-bit"``, or any other value for
    full precision. Returns ``(tokenizer, model)`` with the model in eval mode.
    """
    # Lookup table of bitsandbytes keyword sets; an unknown flag maps to None.
    quant_kwargs = {
        "4-bit": dict(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
        ),
        "8-bit": dict(
            load_in_8bit=True,
            bnb_8bit_use_double_quant=True,
            bnb_8bit_compute_dtype=torch.bfloat16,
        ),
    }.get(quantization)
    qconfig = BitsAndBytesConfig(**quant_kwargs) if quant_kwargs is not None else None

    tokenizer = AutoTokenizer.from_pretrained(ckpt_dir, trust_remote_code=True)
    model = AutoModelForSeq2SeqLM.from_pretrained(
        ckpt_dir,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        quantization_config=qconfig,
    )

    # Only relocate/halve the model ourselves when it is not quantized.
    if qconfig is None:
        model = model.to(DEVICE)
        if DEVICE == "cuda":
            model.half()

    model.eval()
    return tokenizer, model
42
+
43
+
44
def batch_translate(input_sentences, src_lang, tgt_lang, model, tokenizer, ip):
    """Batch translate sentences from src_lang to tgt_lang.

    Sentences are consumed in chunks of ``BATCH_SIZE``; the translated
    sentences are returned in input order.
    """
    results = []
    cursor = 0
    total = len(input_sentences)

    while cursor < total:
        # Preprocess the current slice (entity mappings are handled by ip).
        prepared = ip.preprocess_batch(
            input_sentences[cursor : cursor + BATCH_SIZE],
            src_lang=src_lang,
            tgt_lang=tgt_lang,
        )

        # Tokenize the slice into padded tensors on the target device.
        model_inputs = tokenizer(
            prepared,
            truncation=True,
            padding="longest",
            return_tensors="pt",
            return_attention_mask=True,
        ).to(DEVICE)

        # Generate translations with beam search, no gradients needed.
        with torch.no_grad():
            token_ids = model.generate(
                **model_inputs,
                use_cache=True,
                min_length=0,
                max_length=256,
                num_beams=5,
                num_return_sequences=1,
            )

        # Decode the generated ids into text.
        with tokenizer.as_target_tokenizer():
            decoded = tokenizer.batch_decode(
                token_ids.detach().cpu().tolist(),
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True,
            )

        # Postprocess the translations, including entity replacement.
        results += ip.postprocess_batch(decoded, lang=tgt_lang)

        del model_inputs
        torch.cuda.empty_cache()
        cursor += BATCH_SIZE

    return results
89
+
90
+
91
+ # Initialize the model and processor
92
# Load the IndicTrans2 checkpoint and text processor.
# NOTE(review): the variable names say "en_indic" but the checkpoint id is
# indictrans2-indic-en (Indic -> English); names kept for compatibility.
en_indic_ckpt_dir = "ai4bharat/indictrans2-indic-en-1B"
en_indic_tokenizer, en_indic_model = initialize_model_and_tokenizer(
    en_indic_ckpt_dir, quantization
)
ip = IndicProcessor(inference=True)

# Sample Malayalam input: a single long passage.
input_sents = [
    """ഹലോ ഫ്രണ്ട്സ് കോളേജ് സ്കൂളിൻറെ മറ്റൊരു അധ്യായത്തിലേക്ക് ഏവർക്കും സ്വാഗതം ഇന്ന് ഞാൻ വന്നിരിക്കുന്നത് ചെറിയ കുട്ടികൾക്കായുള്ള ഒരു മലയാളം പ്രസംഗവും ആയിട്ടാണ് പ്രസംഗ വിഷയം ഇന്ത്യ എൻറെ രാജ്യം ആയിരക്കണക്കിന് വർഷങ്ങളുടെ പാരമ്പര്യം പേറുന്ന മഹത്തായ രാജ്യമാണ് ഇന്ത്യ 1947 ൽ ബ്രിട്ടീഷുകാരിൽ നിന്നും സ്വാതന്ത്ര്യം നേടിയ നമ്മുടെ ഭാരതം അനേകം നാട്ടുരാജ്യങ്ങൾ ചേർന്ന് ഏറ്റവും വലിയ ജനാധിപത്യ രാജ്യമായി ആശയുടെ അടിസ്ഥാനത്തിൽ നല്ല ഭരണത്തിന് സഹായകമാകും വിധം സംസ്ഥാനങ്ങൾ രൂപം കൊണ്ടും എന്ന് 28 സംസ്ഥാനങ്ങൾ ആണ് ഇന്ത്യയിൽ ഉള്ളത് നാനാത്വത്തിലെ ഏകത്വം എന്ന ചിന്ത വിവിധ ഭാഷകളും ജാതികളും മതങ്ങളും ആചാരങ്ങളും ജീവിതരീതികളും ഉള്ള ഒരു വലിയ ജനതയെ ഒറ്റക്കെട്ടായി നിർത്തുന്നു അതാണ് ഭാരതത്തിൻറെ വിജയം നേടിയ ലോകമേ തറവാട് എന്നതാണ് ഭാരത സംസ്കാരം അതുകൊണ്ട് തന്നെ ഇന്ത്യക്കാരെ മാത്രമല്ല ലോകം മുഴുവനും ഉള്ള എല്ലാവരെയും ഭാരതം സന്തോഷത്തോടെ ഉൾക്കൊള്ളുകയും സ്നേഹിക്കുകയും ചെയ്യുന്ന പ്രസിഡണ്ടും പ്രധാനമന്ത്രിയും മന്ത്രിമാരും ചേർന്ന് നമ്മുടെ രാജ്യം ഭരിക്കുന്നു മുഖ്യമന്ത്രിയും മന്ത്രിമാരും ചേർന്ന് സംസ്ഥാനങ്ങളെയും പരിപാലിക്കുന്നു എൻറെ ഇന്ത്യ അഭിമാനമാണ് സംസ്കാരങ്ങൾ ചേർന്ന് മനോഹരിയായി പുഞ്ചിരിക്കുന്ന എൻറെ അമ്മ ഭാരതമെന്നു കേട്ടാൽ തിളക്കണം ചോര നമുക്ക് ഞരമ്പുകളിൽ"""
]

# Translate Malayalam -> English.
src_lang, tgt_lang = "mal_Mlym", "eng_Latn"
translations = batch_translate(
    input_sents, src_lang, tgt_lang, en_indic_model, en_indic_tokenizer, ip
)

# Print source/translation pairs.
print(f"\n{src_lang} - {tgt_lang}")
for source_sentence, translated_sentence in zip(input_sents, translations):
    print(f"{src_lang}: {source_sentence}")
    print(f"{tgt_lang}: {translated_sentence}")

# Free GPU memory.
del en_indic_tokenizer, en_indic_model
torch.cuda.empty_cache()
IndicTrans2/huggingface_interface/IndicTransToolkit/pyproject.toml ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [build-system]
2
+ requires = [
3
+ "setuptools>=68.2.2",
4
+ "wheel",
5
+ "Cython",
6
+ ]
7
+ build-backend = "setuptools.build_meta"
8
+
9
+ [tool.black]
10
+ # Black configuration for code formatting
11
+ line-length = 88
12
+ target-version = ['py38']
13
+ exclude = '''
14
+ /(
15
+ \.git
16
+ | \.hg
17
+ | \.mypy_cache
18
+ | \.tox
19
+ | \.venv
20
+ | _build
21
+ | buck-out
22
+ | build
23
+ | dist
24
+ )/
25
+ '''