nitikdias commited on
Commit
74ee63f
·
verified ·
1 Parent(s): df298e7

Upload 114 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +7 -0
  2. IndicTrans2/.gitignore +148 -0
  3. IndicTrans2/LICENSE +21 -0
  4. IndicTrans2/README.md +528 -0
  5. IndicTrans2/apply_sentence_piece.sh +48 -0
  6. IndicTrans2/baseline_eval/azure_translate.py +183 -0
  7. IndicTrans2/baseline_eval/google_translate.py +129 -0
  8. IndicTrans2/baseline_eval/m2m100_inference.py +148 -0
  9. IndicTrans2/baseline_eval/mbart_inference.py +159 -0
  10. IndicTrans2/baseline_eval/nllb_moe_cpu_inference.py +157 -0
  11. IndicTrans2/compute_comet_score.sh +84 -0
  12. IndicTrans2/compute_metrics.sh +29 -0
  13. IndicTrans2/compute_metrics_significance.sh +66 -0
  14. IndicTrans2/eval.sh +54 -0
  15. IndicTrans2/eval_rev.sh +55 -0
  16. IndicTrans2/finetune.sh +54 -0
  17. IndicTrans2/huggingface_interface/.gitignore +1 -0
  18. IndicTrans2/huggingface_interface/IndicTransToolkit/.gitignore +4 -0
  19. IndicTrans2/huggingface_interface/IndicTransToolkit/CHANGELOG.md +16 -0
  20. IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit.egg-info/PKG-INFO +130 -0
  21. IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit.egg-info/SOURCES.txt +15 -0
  22. IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit.egg-info/dependency_links.txt +1 -0
  23. IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit.egg-info/not-zip-safe +1 -0
  24. IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit.egg-info/requires.txt +8 -0
  25. IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit.egg-info/top_level.txt +1 -0
  26. IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/__init__.py +9 -0
  27. IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/__pycache__/__init__.cpython-310.pyc +0 -0
  28. IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/__pycache__/__init__.cpython-313.pyc +0 -0
  29. IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/__pycache__/collator.cpython-310.pyc +0 -0
  30. IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/__pycache__/collator.cpython-313.pyc +0 -0
  31. IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/__pycache__/evaluator.cpython-310.pyc +0 -0
  32. IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/__pycache__/evaluator.cpython-313.pyc +0 -0
  33. IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/__pycache__/processor.cpython-310.pyc +0 -0
  34. IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/collator.py +74 -0
  35. IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/evaluator.py +151 -0
  36. IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/processor.c +0 -0
  37. IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/processor.cp310-win_amd64.pyd +3 -0
  38. IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/processor.cpython-310-x86_64-linux-gnu.so +3 -0
  39. IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/processor.pyx +503 -0
  40. IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/version.py +1 -0
  41. IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/version.txt +1 -0
  42. IndicTrans2/huggingface_interface/IndicTransToolkit/LICENSE +21 -0
  43. IndicTrans2/huggingface_interface/IndicTransToolkit/README.md +97 -0
  44. IndicTrans2/huggingface_interface/IndicTransToolkit/app.py +118 -0
  45. IndicTrans2/huggingface_interface/IndicTransToolkit/build/lib.linux-x86_64-cpython-310/IndicTransToolkit/fast_processor.cpython-310-x86_64-linux-gnu.so +3 -0
  46. IndicTrans2/huggingface_interface/IndicTransToolkit/build/lib.linux-x86_64-cpython-310/IndicTransToolkit/processor.cpython-310-x86_64-linux-gnu.so +3 -0
  47. IndicTrans2/huggingface_interface/IndicTransToolkit/build/temp.linux-x86_64-cpython-310/IndicTransToolkit/fast_processor.o +3 -0
  48. IndicTrans2/huggingface_interface/IndicTransToolkit/build/temp.linux-x86_64-cpython-310/IndicTransToolkit/processor.o +3 -0
  49. IndicTrans2/huggingface_interface/IndicTransToolkit/main.py +113 -0
  50. IndicTrans2/huggingface_interface/IndicTransToolkit/pyproject.toml +25 -0
.gitattributes CHANGED
@@ -33,3 +33,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ IndicTrans2/huggingface_interface/IndicTransToolkit/build/lib.linux-x86_64-cpython-310/IndicTransToolkit/fast_processor.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
37
+ IndicTrans2/huggingface_interface/IndicTransToolkit/build/lib.linux-x86_64-cpython-310/IndicTransToolkit/processor.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
38
+ IndicTrans2/huggingface_interface/IndicTransToolkit/build/temp.linux-x86_64-cpython-310/IndicTransToolkit/fast_processor.o filter=lfs diff=lfs merge=lfs -text
39
+ IndicTrans2/huggingface_interface/IndicTransToolkit/build/temp.linux-x86_64-cpython-310/IndicTransToolkit/processor.o filter=lfs diff=lfs merge=lfs -text
40
+ IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/processor.cp310-win_amd64.pyd filter=lfs diff=lfs merge=lfs -text
41
+ IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/processor.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
42
+ IndicTrans2/translation_guidelines.pdf filter=lfs diff=lfs merge=lfs -text
IndicTrans2/.gitignore ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ignore libs and data folder we use
2
+ indic_nlp_library
3
+ indic_nlp_resources
4
+ fairseq
5
+ devtest
6
+ checkpoints
7
+ eval_benchmarks
8
+
9
+ # Byte-compiled / optimized / DLL files
10
+ __pycache__/
11
+ *.py[cod]
12
+ *$py.class
13
+
14
+ # C extensions
15
+ *.so
16
+
17
+ # Distribution / packaging
18
+ .Python
19
+ build/
20
+ develop-eggs/
21
+ dist/
22
+ downloads/
23
+ eggs/
24
+ .eggs/
25
+ lib/
26
+ lib64/
27
+ parts/
28
+ sdist/
29
+ var/
30
+ wheels/
31
+ share/python-wheels/
32
+ *.egg-info/
33
+ .installed.cfg
34
+ *.egg
35
+ MANIFEST
36
+
37
+ # PyInstaller
38
+ # Usually these files are written by a python script from a template
39
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
40
+ *.manifest
41
+ *.spec
42
+
43
+ # Installer logs
44
+ pip-log.txt
45
+ pip-delete-this-directory.txt
46
+
47
+ # Unit test / coverage reports
48
+ htmlcov/
49
+ .tox/
50
+ .nox/
51
+ .coverage
52
+ .coverage.*
53
+ .cache
54
+ nosetests.xml
55
+ coverage.xml
56
+ *.cover
57
+ *.py,cover
58
+ .hypothesis/
59
+ .pytest_cache/
60
+ cover/
61
+
62
+ # Translations
63
+ *.mo
64
+ *.pot
65
+
66
+ # Django stuff:
67
+ *.log
68
+ local_settings.py
69
+ db.sqlite3
70
+ db.sqlite3-journal
71
+
72
+ # Flask stuff:
73
+ instance/
74
+ .webassets-cache
75
+
76
+ # Scrapy stuff:
77
+ .scrapy
78
+
79
+ # Sphinx documentation
80
+ docs/_build/
81
+
82
+ # PyBuilder
83
+ .pybuilder/
84
+ target/
85
+
86
+ # Jupyter Notebook
87
+ .ipynb_checkpoints
88
+
89
+ # IPython
90
+ profile_default/
91
+ ipython_config.py
92
+
93
+ # pyenv
94
+ # For a library or package, you might want to ignore these files since the code is
95
+ # intended to run in multiple environments; otherwise, check them in:
96
+ # .python-version
97
+
98
+ # pipenv
99
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
100
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
101
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
102
+ # install all needed dependencies.
103
+ #Pipfile.lock
104
+
105
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
106
+ __pypackages__/
107
+
108
+ # Celery stuff
109
+ celerybeat-schedule
110
+ celerybeat.pid
111
+
112
+ # SageMath parsed files
113
+ *.sage.py
114
+
115
+ # Environments
116
+ .env
117
+ .venv
118
+ env/
119
+ venv/
120
+ ENV/
121
+ env.bak/
122
+ venv.bak/
123
+
124
+ # Spyder project settings
125
+ .spyderproject
126
+ .spyproject
127
+
128
+ # Rope project settings
129
+ .ropeproject
130
+
131
+ # mkdocs documentation
132
+ /site
133
+
134
+ # mypy
135
+ .mypy_cache/
136
+ .dmypy.json
137
+ dmypy.json
138
+
139
+ # Pyre type checker
140
+ .pyre/
141
+
142
+ # pytype static type analyzer
143
+ .pytype/
144
+
145
+ # Cython debug symbols
146
+ cython_debug/
147
+
148
+ .DS_Store
IndicTrans2/LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) AI4Bharat.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE
IndicTrans2/README.md ADDED
@@ -0,0 +1,528 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # IndicTrans2
2
+
3
+ [📜 Paper](https://arxiv.org/abs/2305.16307) | [🌐 Website](https://ai4bharat.iitm.ac.in/indic-trans2) | [▶️ Demo](https://models.ai4bharat.org/#/nmt/v2) | [🤗 HF Interface](https://github.com/AI4Bharat/IndicTrans2/tree/main/huggingface_interface) | [![colab link](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AI4Bharat/IndicTrans2/blob/main/huggingface_interface/colab_inference.ipynb)
4
+
5
+ IndicTrans2 is the first open-source transformer-based multilingual NMT model that supports high-quality translations across all the 22 scheduled Indic languages — including multiple scripts for low-resource languages like Kashmiri, Manipuri and Sindhi. It adopts script unification wherever feasible to leverage transfer learning by lexical sharing between languages. Overall, the model supports five scripts: Perso-Arabic (Kashmiri, Sindhi, Urdu), Ol Chiki (Santali), Meitei (Manipuri), Latin (English), and Devanagari (used for all the remaining languages).
6
+
7
+ We open-source all our training datasets (BPCC), back-translation data (BPCC-BT), final IndicTrans2 models, evaluation benchmarks (IN22, which includes IN22-Gen and IN22-Conv) and training and inference scripts for easier use and adoption within the research community. We hope that this will foster even more research in low-resource Indic languages, leading to further improvements in the quality of low-resource translation through contributions from the research community.
8
+
9
+ This code repository contains instructions for downloading the artifacts associated with IndicTrans2, as well as the code for training/fine-tuning the multilingual NMT models.
10
+
11
+ Here is the list of languages supported by the IndicTrans2 models:
12
+
13
+ <table>
14
+ <tbody>
15
+ <tr>
16
+ <td>Assamese (asm_Beng)</td>
17
+ <td>Kashmiri (Arabic) (kas_Arab)</td>
18
+ <td>Punjabi (pan_Guru)</td>
19
+ </tr>
20
+ <tr>
21
+ <td>Bengali (ben_Beng)</td>
22
+ <td>Kashmiri (Devanagari) (kas_Deva)</td>
23
+ <td>Sanskrit (san_Deva)</td>
24
+ </tr>
25
+ <tr>
26
+ <td>Bodo (brx_Deva)</td>
27
+ <td>Maithili (mai_Deva)</td>
28
+ <td>Santali (sat_Olck)</td>
29
+ </tr>
30
+ <tr>
31
+ <td>Dogri (doi_Deva)</td>
32
+ <td>Malayalam (mal_Mlym)</td>
33
+ <td>Sindhi (Arabic) (snd_Arab)</td>
34
+ </tr>
35
+ <tr>
36
+ <td>English (eng_Latn)</td>
37
+ <td>Marathi (mar_Deva)</td>
38
+ <td>Sindhi (Devanagari) (snd_Deva)</td>
39
+ </tr>
40
+ <tr>
41
+ <td>Konkani (gom_Deva)</td>
42
+ <td>Manipuri (Bengali) (mni_Beng)</td>
43
+ <td>Tamil (tam_Taml)</td>
44
+ </tr>
45
+ <tr>
46
+ <td>Gujarati (guj_Gujr)</td>
47
+ <td>Manipuri (Meitei) (mni_Mtei)</td>
48
+ <td>Telugu (tel_Telu)</td>
49
+ </tr>
50
+ <tr>
51
+ <td>Hindi (hin_Deva)</td>
52
+ <td>Nepali (npi_Deva)</td>
53
+ <td>Urdu (urd_Arab)</td>
54
+ </tr>
55
+ <tr>
56
+ <td>Kannada (kan_Knda)</td>
57
+ <td>Odia (ory_Orya)</td>
58
+ <td></td>
59
+ </tr>
60
+ </tbody>
61
+ </table>
62
+
63
+ ## Updates
64
+ - 🚨 Jan 18, 2025 - Long Context Models- RoPE-based variants of IndicTrans2 models capable of handling sequence lengths **up to 2048 tokens** are available [here](https://huggingface.co/collections/prajdabre/indictrans2-rope-6742ddac669a05db0804db35).
65
+ - 🚨 Dec 20, 2024 - The latest releases of the high-quality human-annotated BPCC-Seed dataset would henceforth be made available on the [AI4Bharat Website](https://ai4bharat.iitm.ac.in/datasets/bpcc).
66
+ - 🚨 Dec 30, 2023 - Migrated IndicTrans2 tokenizer for HF compatible IndicTrans2 models to [IndicTransToolkit](https://github.com/VarunGumma/IndicTransToolkit) and will be maintained separately there from now onwards. Add LoRA fine-tuning scripts for our IndicTrans2 models in [huggingface_interface](https://github.com/AI4Bharat/IndicTrans2/tree/main/huggingface_interface).
67
+ - 🚨 Dec 1, 2023 - Release of Indic-Indic model and corresponding distilled variants for each base model. Please refer to the [Download section](https://github.com/AI4Bharat/IndicTrans2#multilingual-translation-models) for the checkpoints.
68
+ - 🚨 Sep 9, 2023 - Added HF compatible IndicTrans2 models. Please refer to the [README](https://github.com/AI4Bharat/IndicTrans2/tree/main/huggingface_interface) for detailed example usage.
69
+
70
+ ## Table of Contents
71
+
72
+ - [Download Models and Other Artifacts](#download-models-and-other-artifacts)
73
+ - [Multilingual Translation Models](#multilingual-translation-models)
74
+ - [Training Data](#training-data)
75
+ - [Evaluation Data](#evaluation-data)
76
+ - [Installation](#installation)
77
+ - [Data](#data)
78
+ - [Training](#training)
79
+ - [Evaluation](#evaluation)
80
+ - [Preparing Data for Training](#preparing-data-for-training)
81
+ - [Using our SPM model and Fairseq dictionary](#using-our-spm-model-and-fairseq-dictionary)
82
+ - [Training your own SPM models and learning Fairseq dictionary](#training-your-own-spm-models-and-learning-fairseq-dictionary)
83
+ - [Training / Fine-tuning](#training--fine-tuning)
84
+ - [Inference](#inference)
85
+ - [Fairseq Inference](#fairseq-inference)
86
+ - [CT2 Inference](#ct2-inference)
87
+ - [Evaluations](#evaluations)
88
+ - [Baseline Evaluation](#baseline-evaluation)
89
+ - [LICENSE](#license)
90
+ - [Citation](#citation)
91
+
92
+ ## Download Models and Other Artifacts
93
+
94
+ ### Multilingual Translation Models
95
+
96
+ | Model | En-Indic | Indic-En | Indic-Indic | Evaluations |
97
+ | ---------------------------- | ----------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
98
+ | Base (used for benchmarking) | [Fairseq](https://huggingface.co/datasets/ai4bharat/BPCC/resolve/main/additional/en-indic-preprint.tar.gz) & [HF](https://huggingface.co/ai4bharat/indictrans2-en-indic-1B) | [fairseq](https://huggingface.co/datasets/ai4bharat/BPCC/resolve/main/additional/indic-en-preprint.tar.gz) & [HF](https://huggingface.co/ai4bharat/indictrans2-indic-en-1B) | [HF](https://huggingface.co/ai4bharat/indictrans2-indic-indic-1B) | [translations](https://huggingface.co/datasets/ai4bharat/BPCC/resolve/main/additional/translation_outputs.tar.gz) (as of May 10, 2023), [metrics](https://drive.google.com/drive/folders/1lOOdaU0VdRSBgJEsNav5zC7wwLBis9NI?usp=sharing) |
99
+ | Distilled | [Fairseq](https://huggingface.co/datasets/ai4bharat/BPCC/resolve/main/additional/en-indic-dist.tar.gz) & [HF](https://huggingface.co/ai4bharat/indictrans2-en-indic-dist-200M) | [Fairseq](https://huggingface.co/datasets/ai4bharat/BPCC/resolve/main/additional/indic-en-dist.tar.gz) & [HF](https://huggingface.co/ai4bharat/indictrans2-indic-en-dist-200M) | [HF](https://huggingface.co/ai4bharat/indictrans2-indic-indic-dist-320M) |
100
+
101
+ ### Training Data
102
+
103
+ |Data | URL |
104
+ |-------------------------------------------|--------------------------------------------------------------------------------------------------|
105
+ | ✨ BPCC-Seed Latest Release | [HF Config: bpcc-seed-latest](https://huggingface.co/datasets/ai4bharat/BPCC) |
106
+ | BPCC (*Used in Paper - utilizes the BPCC-Seed V1 dataset*) | [HF Config: bpcc-seed-v1](https://huggingface.co/datasets/ai4bharat/BPCC) |
107
+ | Back-translation (BPCC-BT) | Will be updated |
108
+ | Full Data Split | [Download](https://huggingface.co/datasets/ai4bharat/BPCC/resolve/main/additional/BPCC.zip) |
109
+
110
+
111
+
112
+ ### Evaluation Data
113
+
114
+ | Data | URL |
115
+ | ----------------------- | ------------------------------------------------------------------------------------ |
116
+ | IN22 test set | [download](https://huggingface.co/datasets/ai4bharat/BPCC/resolve/main/additional/IN22_testset.zip) |
117
+ | FLORES-22 Indic dev set | [download](https://huggingface.co/datasets/ai4bharat/BPCC/resolve/main/additional/flores-22_dev.zip) |
118
+
119
+ ## Installation
120
+
121
+ Instructions to setup and install everything before running the code.
122
+
123
+ ```bash
124
+ # Clone the github repository and navigate to the project directory.
125
+ git clone https://github.com/AI4Bharat/IndicTrans2
126
+ cd IndicTrans2
127
+
128
+ # Install all the dependencies and requirements associated with the project.
129
+ source install.sh
130
+ ```
131
+
132
+ Note: We recommend creating a virtual environment with python>=3.7.
133
+
134
+ ### Additional notes about Installation
135
+ The ``prepare_data_joint_finetuning.sh`` and ``prepare_data_joint_training.sh`` scripts expect that the sentencepiece commandline utility and GNU parallel are installed.
136
+ 1. To install the sentencepiece command line utility, please follow the instructions [here](https://github.com/google/sentencepiece?tab=readme-ov-file#build-and-install-sentencepiece-command-line-tools-from-c-source).
137
+ 2. Please check if GNU parallel is installed, if not please install the same or alternatively in case of installation issues, remove ``parallel --pipe --keep-order`` from the respective training / finetuning script as well as ``apply_sentence_piece.sh``.
138
+
139
+
140
+ ## Data
141
+
142
+ ### Training
143
+
144
+ Bharat Parallel Corpus Collection (BPCC) is a comprehensive and publicly available parallel corpus that includes both existing and new data for all 22 scheduled Indic languages. It is comprised of two parts: BPCC-Mined and BPCC-Human, totaling approximately 230 million bitext pairs. BPCC-Mined contains about 228 million pairs, with nearly 126 million pairs newly added as a part of this work. On the other hand, BPCC-Human consists of 2.2 million gold standard English-Indic pairs, with an additional 644K bitext pairs from English Wikipedia sentences (forming the BPCC-H-Wiki subset) and 139K sentences covering everyday use cases (forming the BPCC-H-Daily subset). It is worth highlighting that BPCC provides the first available datasets for 7 languages and significantly increases the available data for all languages covered.
145
+
146
+ You can find the contribution from different sources in the following table:
147
+
148
+ <table>
149
+ <tbody>
150
+ <tr>
151
+ <td rowspan="4">BPCC-Mined</th>
152
+ <td rowspan="2">Existing</th>
153
+ <td>Samanantar</th>
154
+ <td>19.4M</th>
155
+ </tr>
156
+ <tr>
157
+ <td>NLLB</th>
158
+ <td>85M</th>
159
+ </tr>
160
+ <tr>
161
+ <td rowspan="2">Newly Added</th>
162
+ <td>Samanantar++</th>
163
+ <td>121.6M</th>
164
+ </tr>
165
+ <tr>
166
+ <td>Comparable</th>
167
+ <td>4.3M</th>
168
+ </tr>
169
+ <tr>
170
+ <td rowspan="5">BPCC-Human</td>
171
+ <td rowspan="3">Existing</td>
172
+ <td>NLLB</td>
173
+ <td>18.5K</td>
174
+ </tr>
175
+ <tr>
176
+ <td>ILCI</td>
177
+ <td>1.3M</td>
178
+ </tr>
179
+ <tr>
180
+ <td>Massive</td>
181
+ <td>115K</td>
182
+ </tr>
183
+ <tr>
184
+ <td rowspan="2">Newly Added</td>
185
+ <td>Wiki</td>
186
+ <td>644K</td>
187
+ </tr>
188
+ <tr>
189
+ <td>Daily</td>
190
+ <td>139K</td>
191
+ </tr>
192
+ </tbody>
193
+ </table>
194
+
195
+ Additionally, we provide augmented back-translation data generated by our intermediate IndicTrans2 models for training purposes. Please refer to our paper for more details on the selection of sample proportions and sources.
196
+
197
+ <table>
198
+ <tbody>
199
+ <tr>
200
+ <td>English BT data (English Original)</td>
201
+ <td>401.9M</td>
202
+ </tr>
203
+ <tr>
204
+ <td>Indic BT data (Indic Original)</td>
205
+ <td>400.9M</td>
206
+ </tr>
207
+ </tbody>
208
+ </table>
209
+
210
+ <br>
211
+
212
+ ### Evaluation
213
+
214
+ IN22 test set is a newly created comprehensive benchmark for evaluating machine translation performance in multi-domain, n-way parallel contexts across 22 Indic languages. It has been created from three distinct subsets, namely IN22-Wiki, IN22-Web and IN22-Conv. The Wikipedia and Web sources subsets offer diverse content spanning news, entertainment, culture, legal, and India-centric topics. IN22-Wiki and IN22-Web have been combined and considered for evaluation purposes and released as IN22-Gen. Meanwhile, IN22-Conv, the conversation-domain subset, is designed to assess translation quality in typical day-to-day conversational-style applications.
215
+
216
+ <table>
217
+ <tbody>
218
+ <tr>
219
+ <td>IN22-Gen (IN22-Wiki + IN22-Web)</td>
220
+ <td>1024 sentences</td>
221
+ <td>🤗 <a href="https://huggingface.co/datasets/ai4bharat/IN22-Gen">ai4bharat/IN22-Gen</td>
222
+ </tr>
223
+ <tr>
224
+ <td>IN22-Conv</td>
225
+ <td>1503 sentences</td>
226
+ <td>🤗 <a href="https://huggingface.co/datasets/ai4bharat/IN22-Conv">ai4bharat/IN22-Conv</td>
227
+ </tr>
228
+ </tbody>
229
+ </table>
230
+
231
+ You can download the data artifacts released as a part of this work from the [following section](#download-models-and-other-artifacts).
232
+
233
+ ## Preparing Data for Training
234
+
235
+ BPCC data is organized under different subsets as described above, where each subset contains language pair subdirectories with the sentence pairs. We also provide LaBSE and LASER scores for the mined subsets of BPCC. In order to replicate our training setup, you will need to combine the data for corresponding language pairs from different subsets and remove overlapping bitext pairs, if any.
236
+
237
+ Here is the expected directory structure of the data:
238
+
239
+ ```bash
240
+ BPCC
241
+ ├── eng_Latn-asm_Beng
242
+ │ ├── train.eng_Latn
243
+ │ └── train.asm_Beng
244
+ ├── eng_Latn-ben_Beng
245
+ └── ...
246
+ ```
247
+
248
+ While we provide deduplicated subsets with the current available benchmarks, we highly recommend performing deduplication using the combined monolingual side of all the benchmarks. You can use the following command for deduplication once you combine the monolingual side of all the benchmarks in the directory.
249
+
250
+ ```python3
251
+ python3 scripts/dedup_benchmark.py <in_data_dir> <out_data_dir> <benchmark_dir>
252
+ ```
253
+
254
+ - `<in_data_dir>`: path to the directory containing train data for each language pair in the format `{src_lang}-{tgt_lang}`
255
+ - `<out_data_dir>`: path to the directory where the deduplicated train data will be written for each language pair in the format `{src_lang}-{tgt_lang}`
256
+ - `<benchmark_dir>`: path to the directory containing the language-wise monolingual side of dev/test set, with monolingual files named as `test.{lang}`
257
+
258
+ ### Using our SPM model and Fairseq dictionary
259
+
260
+ Once you complete the deduplication of the training data with the available benchmarks, you can preprocess and binarize the data for training models. Please download our trained SPM model and learned Fairseq dictionary using the following links for your experiments.
261
+
262
+ | | En-Indic | Indic-En | Indic-Indic |
263
+ | ------------------ | -------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------- |
264
+ | SPM model | [download](https://huggingface.co/datasets/ai4bharat/BPCC/resolve/main/additional/en-indic-spm.zip) | [download](https://huggingface.co/datasets/ai4bharat/BPCC/resolve/main/additional/indic-en-spm.zip) | [download](https://huggingface.co/datasets/ai4bharat/BPCC/resolve/main/additional/indic-indic-spm.zip) |
265
+ | Fairseq dictionary | [download](https://huggingface.co/datasets/ai4bharat/BPCC/resolve/main/additional/en-indic-fairseq-dict.zip) | [download](https://huggingface.co/datasets/ai4bharat/BPCC/resolve/main/additional/indic-en-fairseq-dict.zip) | [download](https://huggingface.co/datasets/ai4bharat/BPCC/resolve/main/additional/indic-indic-fairseq-dict.zip) |
266
+
267
+ To prepare the data for training En-Indic model, please do the following:
268
+
269
+ 1. Download the SPM model in the experiment directory and rename it as `vocab`.
270
+ 2. Download the Fairseq dictionary in the experiment directory and rename it as `final_bin`.
271
+
272
+ Here is the expected directory for training En-Indic model:
273
+
274
+ ```bash
275
+ en-indic-exp
276
+ ├── train
277
+ │ ├── eng_Latn-asm_Beng
278
+ │ │ ├── train.eng_Latn
279
+ │ │ └── train.asm_Beng
280
+ │ ├── eng_Latn-ben_Beng
281
+ │ └── ...
282
+ ├── devtest
283
+ │ └── all
284
+ │ ├── eng_Latn-asm_Beng
285
+ │ │ ├── dev.eng_Latn
286
+ │ │ └── dev.asm_Beng
287
+ │ ├── eng_Latn-ben_Beng
288
+ │ └── ...
289
+ ├── vocab
290
+ │ ├── model.SRC
291
+ │ ├── model.TGT
292
+ │ ├── vocab.SRC
293
+ │ └── vocab.TGT
294
+ └── final_bin
295
+ ├── dict.SRC.txt
296
+ └── dict.TGT.txt
297
+ ```
298
+
299
+ To prepare data for training the Indic-En model, you should reverse the language pair directories within the train and devtest directories. Additionally, make sure to download the corresponding SPM model and Fairseq dictionary and put them in the experiment directory, similar to the procedure mentioned above for En-Indic model training.
300
+
301
+ You can binarize the data for model training using the following:
302
+
303
+ ```bash
304
+ bash prepare_data_joint_finetuning.sh <exp_dir>
305
+ ```
306
+
307
+ - `<exp_dir>`: path to the directory containing the raw data for binarization
308
+
309
+ You will need to follow the same steps for data preparation in case of fine-tuning models.
310
+
311
+ ### Training your own SPM models and learning Fairseq dictionary
312
+
313
+ If you want to train your own SPM model and learn Fairseq dictionary, then please do the following:
314
+
315
+ 1. Collect a balanced amount of English and Indic monolingual data (we use around 3 million sentences per language-script combination). If some languages have limited data available, increase their representation to achieve a fair distribution of tokens across languages.
316
+ 2. Perform script unification for Indic languages wherever possible using `scripts/preprocess_translate.py` and concatenate all Indic data into a single file.
317
+ 3. Train two SPM models, one for the English side and the other for the Indic side, using the following:
318
+
319
+ ```bash
320
+ spm_train --input=train.indic --model_prefix=<model_name> --vocab_size=<vocab_size> --character_coverage=1.0 --model_type=BPE
321
+ ```
322
+
323
+ 4. Copy the trained SPM models in the experiment directory mentioned earlier and learn the Fairseq dictionary using the following:
324
+
325
+ ```bash
326
+ bash prepare_data_joint_training.sh <exp_dir>
327
+ ```
328
+
329
+ 5. You will need to use the same Fairseq dictionary for any subsequent fine-tuning experiments and refer to the steps described above ([link](#using-our-spm-model-and-fairseq-dictionary)).
330
+
331
+ ## Training / Fine-tuning
332
+
333
+ After binarizing the data, you can use train.sh to train the models. We provide the default hyperparameters used in this work. You can modify the hyperparameters as per your requirement if needed. If you want to train the model on a customized architecture, then please define the architecture in `model_configs/custom_transformer.py`. You can start the model training with the following command:
334
+
335
+ ```bash
336
+ bash train.sh <exp_dir> <model_arch>
337
+ ```
338
+
339
+ - `<exp_dir>`: path to the directory containing the binarized data
340
+ - `<model_arch>`: custom transformer architecture used for model training
341
+
342
+ For fine-tuning, the initial steps remain the same. However, the `finetune.sh` script includes an additional argument, `pretrained_ckpt`, which specifies the model checkpoint to be loaded for further fine-tuning. You can perform fine-tuning using the following command:
343
+
344
+ ```bash
345
+ bash finetune.sh <exp_dir> <model_arch> <pretrained_ckpt>
346
+ ```
347
+
348
+ - `<exp_dir>`: path to the directory containing the binarized data
349
+ - `<model_arch>`: custom transformer architecture used for model training
350
+ - `transformer_18_18` - For IT2 Base models
351
+ - `transformer_base18L` - For IT2 Distilled models
352
+ - `<pretrained_ckpt>`: path to the fairseq model checkpoint to be loaded for further fine-tuning
353
+
354
+ You can download the model artifacts released as a part of this work from the [following section](#download-models-and-other-artifacts).
355
+
356
+ The pretrained checkpoints have 3 directories, a fairseq model directory and 2 CT2-ported model directories. Please note that the CT2 models are provided only for efficient inference. For fine-tuning purposes you should use the `fairseq_model`. Post that you can use the [fairseq-ct2-converter](https://opennmt.net/CTranslate2/guides/fairseq.html) to port your fine-tuned checkpoints to CT2 for faster inference.
357
+
358
+ ## Inference
359
+
360
+ ### Fairseq Inference
361
+
362
+ In order to run inference on our pretrained models using bash interface, please use the following:
363
+
364
+ ```bash
365
+ bash joint_translate.sh <infname> <outfname> <src_lang> <tgt_lang> <ckpt_dir>
366
+ ```
367
+
368
+ - `infname`: path to the input file containing sentences
369
+ - `outfname`: path to the output file where the translations should be stored
370
+ - `src_lang`: source language
371
+ - `tgt_lang`: target language
372
+ - `ckpt_dir`: path to the fairseq model checkpoint directory
373
+
374
+ If you want to run the inference using python interface then please execute the following block of code from the root directory:
375
+
376
+ ```python3
377
+ from inference.engine import Model
378
+
379
+ model = Model(ckpt_dir, model_type="fairseq")
380
+
381
+ sents = [sent1, sent2,...]
382
+
383
+ # for a batch of sentences
384
+ model.batch_translate(sents, src_lang, tgt_lang)
385
+
386
+ # for a paragraph
387
+ model.translate_paragraph(text, src_lang, tgt_lang)
388
+ ```
389
+
390
+ ### CT2 Inference
391
+
392
+ In order to run inference on CT2-ported model using python inference then please execute the following block of code from the root directory:
393
+
394
+ ```python3
395
+ from inference.engine import Model
396
+
397
+ model = Model(ckpt_dir, model_type="ctranslate2")
398
+
399
+ sents = [sent1, sent2,...]
400
+
401
+ # for a batch of sentences
402
+ model.batch_translate(sents, src_lang, tgt_lang)
403
+
404
+ # for a paragraph
405
+ model.translate_paragraph(text, src_lang, tgt_lang)
406
+ ```
407
+
408
+ ## Evaluations
409
+
410
+ We consider the chrF++ score as our primary metric. Additionally, we also report the BLEU and Comet scores.
411
+ We also perform statistical significance tests for each metric to ascertain whether the differences are statistically significant.
412
+
413
+ In order to run our evaluation scripts, you will need to organize the evaluation test sets into the following directory structure:
414
+
415
+ ```bash
416
+ eval_benchmarks
417
+ ├── flores
418
+ │ └── eng_Latn-asm_Beng
419
+ │ ├── test.eng_Latn
420
+ │ └── test.asm_Beng
421
+ ├── in22-gen
422
+ ├── in22-conv
423
+ ├── ntrex
424
+ └── ...
425
+ ```
426
+
427
+ To compute the BLEU and chrF++ scores for prediction file, you can use the following command:
428
+
429
+ ```bash
430
+ bash compute_metrics.sh <pred_fname> <ref_fname> <tgt_lang>
431
+ ```
432
+
433
+ - `pred_fname`: path to the model translations
434
+ - `ref_fname`: path to the reference translations
435
+ - `tgt_lang`: target language
436
+
437
+ In order to automate the inference over the individual test sets for En-Indic, you can use the following command:
438
+
439
+ ```bash
440
+ bash eval.sh <devtest_data_dir> <ckpt_dir> <system>
441
+ ```
442
+
443
+ - `<devtest_data_dir>`: path to the evaluation set with language pair subdirectories (for example, flores directory in the above tree structure)
444
+ - `<ckpt_dir>`: path to the fairseq model checkpoint directory
445
+ - `<system>`: system name suffix to store the predictions in the format `test.{lang}.pred.{system}`
446
+
447
+ In case of Indic-En evaluation, please use the following command:
448
+
449
+ ```bash
450
+ bash eval_rev.sh <devtest_data_dir> <ckpt_dir> <system>
451
+ ```
452
+
453
+ - `<devtest_data_dir>`: path to the evaluation set with language pair subdirectories (for example, flores directory in the above tree structure)
454
+ - `<ckpt_dir>`: path to the fairseq model checkpoint directory
455
+ - `<system>`: system name suffix to store the predictions in the format `test.{lang}.pred.{system}`
456
+
457
+ **_Note: You don’t need to reverse the test set directions for each language pair._**
458
+
459
+ In case of Indic-Indic evaluation, please use the following command:
460
+
461
+ ```bash
462
+ bash pivot_eval.sh <devtest_data_dir> <pivot_lang> <src2pivot_ckpt_dir> <pivot2tgt_ckpt_dir> <system>
463
+ ```
464
+
465
+ - `<devtest_data_dir>`: path to the evaluation set with language pair subdirectories (for example, flores directory in the above tree structure)
466
+ - `<pivot_lang>`: pivot language (default should be `eng_Latn`)
467
+ - `<src2pivot_ckpt_dir>`: path to the fairseq Indic-En model checkpoint directory
468
+ - `<pivot2tgt_ckpt_dir>`: path to the fairseq En-Indic model checkpoint directory
469
+ - `<system>`: system name suffix to store the predictions in the format `test.{lang}.pred.{system}`
470
+
471
+ In order to perform significance testing for BLEU and chrF++ metrics after you have the predictions for different systems, you can use the following command:
472
+
473
+ ```bash
474
+ bash compute_metrics_significance.sh <devtest_data_dir>
475
+ ```
476
+
477
+ - `<devtest_data_dir>`: path to the evaluation set with language pair subdirectories (for example, flores directory in the above tree structure)
478
+
479
+ Similarly, to compute the COMET scores and perform significance testing on predictions of different systems, you can use the following command.
480
+
481
+ ```bash
482
+ bash compute_comet_score.sh <devtest_data_dir>
483
+ ```
484
+
485
+ - `<devtest_data_dir>`: path to the evaluation set with language pair subdirectories (for example, flores directory in the above tree structure)
486
+
487
+ Please note that as we compute significance tests with the same script and automate everything, it is best to have all the predictions for all the systems in place to avoid repeating anything.
488
+ Also, we define the systems in the script itself, if you want to try out other systems, make sure to edit it there itself.
489
+
490
+ ### Baseline Evaluation
491
+
492
+ To generate the translation results for baseline models such as M2M-100, MBART, Azure, Google, and NLLB MoE, you can check the scripts provided in the "baseline_eval" directory of this repository. For NLLB distilled, you can either modify NLLB_MoE eval or use this [repository](https://github.com/pluiez/NLLB-inference). Similarly, for IndicTrans inference, please refer to this [repository](https://github.com/ai4bharat/IndicTrans).
493
+
494
+ You can download the translation outputs released as a part of this work from the [following section](#download-models-and-other-artifacts).
495
+
496
+ ## LICENSE
497
+
498
+ The following table lists the licenses associated with the different artifacts released as a part of this work:
499
+
500
+ | Artifact | LICENSE |
501
+ | ----------------------------------------------------- | --------------------------------------------------------------------- |
502
+ | Existing Mined Corpora (NLLB & Samanantar) | [CC0](https://creativecommons.org/share-your-work/public-domain/cc0/) |
503
+ | Existing Seed Corpora (NLLB-Seed, ILCI, MASSIVE) | [CC0](https://creativecommons.org/share-your-work/public-domain/cc0/) |
504
+ | Newly Added Mined Corpora (Samanantar++ & Comparable) | [CC0](https://creativecommons.org/share-your-work/public-domain/cc0/) |
505
+ | Newly Added Seed Corpora (BPCC-H-Wiki & BPCC-H-Daily) | [CC-BY-4.0](https://creativecommons.org/licenses/by/4.0/) |
506
+ | Newly Created IN-22 test set (IN22-Gen & IN22-Conv) | [CC-BY-4.0](https://creativecommons.org/licenses/by/4.0/) |
507
+ | Back-translation data (BPCC-BT) | [CC0](https://creativecommons.org/share-your-work/public-domain/cc0/) |
508
+ | Model checkpoints | [MIT](https://github.com/ai4bharat/IndicTrans2/blob/main/LICENSE) |
509
+
510
+ The mined corpora collection (BPCC-Mined), existing seed corpora (NLLB-Seed, ILCI, MASSIVE), Backtranslation data (BPCC-BT), are released under the following licensing scheme:
511
+
512
+ - We do not own any of the text from which this data has been extracted.
513
+ - We license the actual packaging of this data under the Creative Commons [CC0 license (“no rights reserved”)](https://creativecommons.org/share-your-work/public-domain/cc0/).
514
+ - To the extent possible under law, [AI4Bharat](https://ai4bharat.iitm.ac.in/) has waived all copyright and related or neighboring rights to BPCC-Mined, existing seed corpora (NLLB-Seed, ILCI, MASSIVE) and BPCC-BT.
515
+
516
+ ## Citation
517
+
518
+ ```bibtex
519
+ @article{gala2023indictrans,
520
+ title={IndicTrans2: Towards High-Quality and Accessible Machine Translation Models for all 22 Scheduled Indian Languages},
521
+ author={Jay Gala and Pranjal A Chitale and A K Raghavan and Varun Gumma and Sumanth Doddapaneni and Aswanth Kumar M and Janki Atul Nawale and Anupama Sujatha and Ratish Puduppully and Vivek Raghavan and Pratyush Kumar and Mitesh M Khapra and Raj Dabre and Anoop Kunchukuttan},
522
+ journal={Transactions on Machine Learning Research},
523
+ issn={2835-8856},
524
+ year={2023},
525
+ url={https://openreview.net/forum?id=vfT4YuzAYA},
526
+ note={}
527
+ }
528
+ ```
IndicTrans2/apply_sentence_piece.sh ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash

# Tokenizes the preprocessed train and dev splits using the trained
# SentencePiece models (one model per side: SRC and TGT).
#
# Usage: bash apply_sentence_piece.sh <exp_dir> <data_dir> <bpe_dir> \
#            <src_lang> <tgt_lang> <split> [parallel_installed]

echo `date`
exp_dir=$1                      # path to the experiment directory
data_dir=$2                     # path to the data directory where all lang pairs are concatenated
bpe_dir=$3                      # path to the tokenized data directory
src_lang=$4                     # source language
tgt_lang=$5                     # target language
split=$6                        # name of the split
parallel_installed=${7:-false}  # whether GNU Parallel is installed

in_split_dir=$data_dir/$split
out_split_dir=$bpe_dir/$split

# Encode one side of the corpus with the given SPM model. Variables are
# quoted so paths containing spaces do not break word-splitting. For very
# large datasets GNU Parallel is used to speed up applying bpe.
tokenize () {
    local model=$1 infile=$2 outfile=$3
    if $parallel_installed; then
        parallel --pipe --keep-order \
            spm_encode --model="$model" \
            --output_format=piece \
            < "$infile" \
            > "$outfile"
    else
        spm_encode --model="$model" \
            --output_format=piece \
            < "$infile" \
            > "$outfile"
    fi
}

echo "Apply Sentence Piece tokenization to SRC corpus"
tokenize "$exp_dir/vocab/model.SRC" "$in_split_dir.$src_lang" "$out_split_dir.$src_lang"

echo "Apply Sentence Piece tokenization to TGT corpus"
tokenize "$exp_dir/vocab/model.TGT" "$in_split_dir.$tgt_lang" "$out_split_dir.$tgt_lang"
IndicTrans2/baseline_eval/azure_translate.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import glob
4
+ import requests
5
+ from urllib.parse import urlencode
6
+ from dotenv import dotenv_values
7
+ import traceback
8
+ import time
9
+
10
# Maps FLORES-200 language codes to the ISO-style codes accepted by the
# Azure Translator API (script-suffixed variants where Azure distinguishes
# scripts, e.g. Kashmiri and Sindhi in Devanagari).
flores_to_iso = {
    "asm_Beng": "as",
    "ben_Beng": "bn",
    "brx_Deva": "brx",
    "doi_Deva": "doi",
    "eng_Latn": "en",
    "gom_Deva": "gom",
    "guj_Gujr": "gu",
    "hin_Deva": "hi",
    "kan_Knda": "kn",
    "kas_Arab": "ks",
    "kas_Deva": "ks_Deva",
    "mai_Deva": "mai",
    "mal_Mlym": "ml",
    "mar_Deva": "mr",
    "mni_Beng": "mni_Beng",
    "mni_Mtei": "mni",
    "npi_Deva": "ne",
    "ory_Orya": "or",
    "pan_Guru": "pa",
    "san_Deva": "sa",
    "sat_Olck": "sat",
    "snd_Arab": "sd",
    "snd_Deva": "sd_Deva",
    "tam_Taml": "ta",
    "tel_Telu": "te",
    "urd_Arab": "ur",
}
38
+
39
+
40
class AzureTranslator:
    """Thin wrapper around the Azure Translator Text REST API (v3.0)."""

    def __init__(
        self,
        subscription_key: str,
        region: str,
        endpoint: str = "https://api.cognitive.microsofttranslator.com",
    ) -> None:
        """Store credentials and resolve the set of supported languages.

        Args:
            subscription_key: Azure subscription key of the Translator resource.
            region: Azure region of the Translator resource.
            endpoint: Base URL of the Translator service.
        """
        self.http_headers = {
            "Ocp-Apim-Subscription-Key": subscription_key,
            "Ocp-Apim-Subscription-Region": region,
        }
        self.translate_endpoint = endpoint + "/translate?api-version=3.0&"
        self.languages_endpoint = endpoint + "/languages?api-version=3.0"

        self.supported_languages = self.get_supported_languages()

    def get_supported_languages(self) -> dict:
        """Return the languages supported by the `translate` operation."""
        return requests.get(self.languages_endpoint).json()["translation"]

    def batch_translate(self, texts: list, src_lang: str, tgt_lang: str) -> list:
        """Translate a batch of sentences from `src_lang` to `tgt_lang`.

        Language codes are FLORES codes and are mapped via `flores_to_iso`.

        Returns:
            The list of translations, or None if the HTTP request or the
            response decoding failed (the error is printed, not raised).

        Raises:
            NotImplementedError: if either language is not supported by Azure.
        """
        if not texts:
            return texts

        src_lang = flores_to_iso[src_lang]
        tgt_lang = flores_to_iso[tgt_lang]

        if src_lang not in self.supported_languages:
            raise NotImplementedError(
                f"Source language code: `{src_lang}` not supported!"
            )

        if tgt_lang not in self.supported_languages:
            raise NotImplementedError(
                f"Target language code: `{tgt_lang}` not supported!"
            )

        body = [{"text": text} for text in texts]
        query_string = urlencode(
            {
                "from": src_lang,
                "to": tgt_lang,
            }
        )

        # BUG FIX: the original used bare `except:` clauses here, which also
        # swallow KeyboardInterrupt/SystemExit; catch the specific failures.
        try:
            response = requests.post(
                self.translate_endpoint + query_string,
                headers=self.http_headers,
                json=body,
            )
        except requests.RequestException:
            traceback.print_exc()
            return None

        try:
            payloads = response.json()
        except ValueError:
            # JSON decoding failed, so `response` is still the HTTP response
            # object and its raw text can be logged for debugging.
            traceback.print_exc()
            print("Response:", response.text)
            return None

        return [payload["translations"][0]["text"] for payload in payloads]

    def text_translate(self, text: str, src_lang: str, tgt_lang: str) -> str:
        """Translate a single sentence."""
        return self.batch_translate([text], src_lang, tgt_lang)[0]
105
+
106
+
107
if __name__ == "__main__":
    root_dir = sys.argv[1]

    # API credentials are read from a .env file placed next to this script.
    config = dotenv_values(os.path.join(os.path.dirname(__file__), ".env"))

    translator = AzureTranslator(
        config["AZURE_TRANSLATOR_TEXT_SUBSCRIPTION_KEY"],
        config["AZURE_TRANSLATOR_TEXT_REGION"],
        config["AZURE_TRANSLATOR_TEXT_ENDPOINT"],
    )

    batch_size = 128

    for pair_dir in sorted(glob.glob(os.path.join(root_dir, "*"))):
        print(pair_dir)

        src_lang, tgt_lang = os.path.basename(pair_dir).split("-")
        print(f"{src_lang} - {tgt_lang}")

        # ------------------------------------------------------------------
        # source -> target translations
        # ------------------------------------------------------------------
        src_infname = os.path.join(pair_dir, f"test.{src_lang}")
        tgt_outfname = os.path.join(pair_dir, f"test.{tgt_lang}.pred.azure")
        if not os.path.exists(src_infname):
            continue

        src_sents = [
            line.replace("\n", "").strip()
            for line in open(src_infname, "r").read().split("\n")
            if line
        ]

        if not os.path.exists(tgt_outfname):
            try:
                translations = []
                for offset in range(0, len(src_sents), batch_size):
                    chunk = src_sents[offset : offset + batch_size]
                    translations.extend(
                        translator.batch_translate(chunk, src_lang, tgt_lang)
                    )
                with open(tgt_outfname, "w") as f:
                    f.write("\n".join(translations))

                # pause between pairs to stay under the API rate limit
                time.sleep(10)
            except Exception as e:
                print(e)
                continue

        # ------------------------------------------------------------------
        # target -> source translations
        # ------------------------------------------------------------------
        tgt_infname = os.path.join(pair_dir, f"test.{tgt_lang}")
        src_outfname = os.path.join(pair_dir, f"test.{src_lang}.pred.azure")
        if not os.path.exists(tgt_infname):
            continue

        tgt_sents = [
            line.replace("\n", "").strip()
            for line in open(tgt_infname, "r").read().split("\n")
            if line
        ]

        if not os.path.exists(src_outfname):
            try:
                translations = []
                for offset in range(0, len(tgt_sents), batch_size):
                    chunk = tgt_sents[offset : offset + batch_size]
                    translations.extend(
                        translator.batch_translate(chunk, tgt_lang, src_lang)
                    )
                with open(src_outfname, "w") as f:
                    f.write("\n".join(translations))
            except Exception as e:
                continue

        # NOTE(review): this sleep sits at loop level in the original
        # (runs once per pair) — confirm against the committed file.
        time.sleep(10)
IndicTrans2/baseline_eval/google_translate.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import glob
4
+ from tqdm import tqdm
5
+ from google.cloud import translate
6
+
7
# Expects a json file containing the API credentials.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = os.path.join(
    os.path.dirname(__file__), r"api_key.json"
)

# Maps FLORES-200 language codes to the codes used by Google Translate
# (Meitei keeps its script suffix; Google's hyphenated form is handled in
# translate_text below).
flores_to_iso = {
    "asm_Beng": "as",
    "ben_Beng": "bn",
    "doi_Deva": "doi",
    "eng_Latn": "en",
    "gom_Deva": "gom",
    "guj_Gujr": "gu",
    "hin_Deva": "hi",
    "kan_Knda": "kn",
    "mai_Deva": "mai",
    "mal_Mlym": "ml",
    "mar_Deva": "mr",
    "mni_Mtei": "mni_Mtei",
    "npi_Deva": "ne",
    "ory_Orya": "or",
    "pan_Guru": "pa",
    "san_Deva": "sa",
    "sat_Olck": "sat",
    "snd_Arab": "sd",
    "tam_Taml": "ta",
    "tel_Telu": "te",
    "urd_Arab": "ur",
}
35
+
36
+
37
# Copy the project id from the json file containing API credentials
def translate_text(text, src_lang, tgt_lang, project_id="project_id"):
    """Translate `text` with the Google Cloud Translation API.

    `src_lang` / `tgt_lang` are FLORES codes; they are mapped to the codes
    Google expects. Returns the translated string.
    """
    src_lang = flores_to_iso[src_lang]
    tgt_lang = flores_to_iso[tgt_lang]

    # Google uses a hyphenated tag for Meitei.
    if src_lang == "mni_Mtei":
        src_lang = "mni-Mtei"
    if tgt_lang == "mni_Mtei":
        tgt_lang = "mni-Mtei"

    client = translate.TranslationServiceClient()
    parent = f"projects/{project_id}/locations/global"

    response = client.translate_text(
        request={
            "parent": parent,
            "contents": [text],
            "mime_type": "text/plain",  # mime types: text/plain, text/html
            "source_language_code": src_lang,
            "target_language_code": tgt_lang,
        }
    )

    # Concatenate all returned segments (normally a single one).
    return "".join(t.translated_text for t in response.translations)
70
+
71
+
72
if __name__ == "__main__":
    root_dir = sys.argv[1]

    for pair_dir in sorted(glob.glob(os.path.join(root_dir, "*"))):
        print(pair_dir)

        src_lang, tgt_lang = os.path.basename(pair_dir).split("-")
        if src_lang not in flores_to_iso or tgt_lang not in flores_to_iso:
            continue

        # The non-English side of the pair determines Google coverage.
        lang = tgt_lang if src_lang == "eng_Latn" else src_lang
        lang = flores_to_iso[lang]

        # NOTE(review): substring membership on this space-separated string
        # is preserved from the original — confirm it is intentional.
        if lang not in "as bn doi gom gu hi kn mai ml mni_Mtei mr ne or pa sa sd ta te ur":
            continue

        print(f"{src_lang} - {tgt_lang}")

        # source to target translations
        src_infname = os.path.join(pair_dir, f"test.{src_lang}")
        tgt_outfname = os.path.join(pair_dir, f"test.{tgt_lang}.pred.google")
        if os.path.exists(src_infname) and not os.path.exists(tgt_outfname):
            src_sents = [
                line.replace("\n", "").strip()
                for line in open(src_infname, "r").read().split("\n")
                if line
            ]
            translations = [
                translate_text(s, src_lang, tgt_lang).strip() for s in tqdm(src_sents)
            ]
            with open(tgt_outfname, "w") as f:
                f.write("\n".join(translations))

        # target to source translations
        tgt_infname = os.path.join(pair_dir, f"test.{tgt_lang}")
        src_outfname = os.path.join(pair_dir, f"test.{src_lang}.pred.google")
        if os.path.exists(tgt_infname) and not os.path.exists(src_outfname):
            tgt_sents = [
                line.replace("\n", "").strip()
                for line in open(tgt_infname, "r").read().split("\n")
                if line
            ]
            translations = [
                translate_text(s, tgt_lang, src_lang).strip() for s in tqdm(tgt_sents)
            ]
            with open(src_outfname, "w") as f:
                f.write("\n".join(translations))
IndicTrans2/baseline_eval/m2m100_inference.py ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import sys
4
+ from tqdm import tqdm
5
+ import torch
6
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
7
+
8
+
9
# Maps FLORES-200 codes to the language codes supported by M2M-100.
langs_supported = {
    "eng_Latn": "en",
    "ben_Beng": "bn",
    "guj_Gujr": "gu",
    "hin_Deva": "hi",
    "kan_Knda": "kn",
    "mal_Mlym": "ml",
    "mar_Deva": "mr",
    "npi_Deva": "ne",
    "ory_Orya": "or",
    "pan_Guru": "pa",
    "snd_Arab": "sd",
    "tam_Taml": "ta",
    "urd_Arab": "ur",
}
25
+
26
+
27
def predict(batch, tokenizer, model, bos_token_id):
    """Generate translations for `batch` with beam search (5 beams).

    `bos_token_id` forces the target-language token at the start of
    generation; returns the detokenized hypotheses.
    """
    inputs = tokenizer(batch, padding=True, return_tensors="pt").to(model.device)
    output_ids = model.generate(
        **inputs,
        num_beams=5,
        max_length=256,
        min_length=0,
        forced_bos_token_id=bos_token_id,
    )
    return tokenizer.batch_decode(output_ids, skip_special_tokens=True)
38
+
39
+
40
def _translate_file(infname, outfname, tokenizer, model, src_code, tgt_code, batch_size):
    """Translate `infname` (one sentence per line) into `outfname`.

    `src_code`/`tgt_code` are M2M-100 language codes. Restores a trailing
    newline in the output when the input file ended with one.
    """
    with open(infname, "r") as f:
        src_sents = f.read().split("\n")

    # A trailing newline yields a final empty entry; drop it for translation
    # and restore it when writing the output.
    add_new_line = False
    if src_sents[-1] == "":
        add_new_line = True
        src_sents = src_sents[:-1]

    # set the source language for tokenization
    tokenizer.src_lang = src_code
    bos_token_id = tokenizer.lang_code_to_id[tgt_code]

    # process sentences in batches and generate predictions
    hypothesis = []
    for i in tqdm(range(0, len(src_sents), batch_size)):
        hypothesis += predict(src_sents[i : i + batch_size], tokenizer, model, bos_token_id)

    assert len(hypothesis) == len(src_sents)

    # collapse internal whitespace so the output stays one sentence per line
    hypothesis = [
        re.sub(r"\s+", " ", x.replace("\n", " ").replace("\t", " ")).strip()
        for x in hypothesis
    ]
    # BUG FIX: the original `hypothesis = hypothesis` was a no-op; append an
    # empty entry so the written file ends with a newline again.
    if add_new_line:
        hypothesis.append("")

    with open(outfname, "w") as f:
        f.write("\n".join(hypothesis))


def main(devtest_data_dir, batch_size):
    """Run M2M-100 inference in both directions for every supported pair."""
    # load the pre-trained M2M-100 tokenizer and model
    # NOTE(review): the model is never moved to GPU here even though
    # __main__ requires CUDA — confirm intent.
    model_name = "facebook/m2m100-12B-last-ckpt"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    model.eval()

    # iterate over a list of language pairs from `devtest_data_dir`
    for pair in sorted(os.listdir(devtest_data_dir)):
        if "-" not in pair:
            continue

        src_lang, tgt_lang = pair.split("-")

        # check if the source and target languages are supported
        if src_lang not in langs_supported or tgt_lang not in langs_supported:
            print(f"Skipping {src_lang}-{tgt_lang} ...")
            continue

        print(f"Evaluating {src_lang}-{tgt_lang} ...")

        # source to target evaluation
        _translate_file(
            os.path.join(devtest_data_dir, pair, f"test.{src_lang}"),
            os.path.join(devtest_data_dir, pair, f"test.{tgt_lang}.pred.m2m100"),
            tokenizer,
            model,
            langs_supported[src_lang],
            langs_supported[tgt_lang],
            batch_size,
        )

        # target to source evaluation
        _translate_file(
            os.path.join(devtest_data_dir, pair, f"test.{tgt_lang}"),
            os.path.join(devtest_data_dir, pair, f"test.{src_lang}.pred.m2m100"),
            tokenizer,
            model,
            langs_supported[tgt_lang],
            langs_supported[src_lang],
            batch_size,
        )
137
+
138
+
139
if __name__ == "__main__":
    # Usage: python m2m100_inference.py <devtest_data_dir> <batch_size>
    # (expects En-X language-pair subdirectories within the devtest data dir)
    data_dir = sys.argv[1]
    bsz = int(sys.argv[2])

    if torch.cuda.is_available():
        main(data_dir, bsz)
    else:
        print("No GPU available")
        sys.exit(1)
IndicTrans2/baseline_eval/mbart_inference.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import sys
4
+ from tqdm import tqdm
5
+ import torch
6
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
7
+
8
+
9
# Maps FLORES-200 codes to the locale-style codes used by mBART-50.
langs_supported = {
    "eng_Latn": "en_XX",
    "guj_Gujr": "gu_IN",
    "hin_Deva": "hi_IN",
    "npi_Deva": "ne_NP",
    "ben_Beng": "bn_IN",
    "mal_Mlym": "ml_IN",
    "mar_Deva": "mr_IN",
    "tam_Taml": "ta_IN",
    "tel_Telu": "te_IN",
    "urd_Arab": "ur_PK",
}
22
+
23
+
24
def predict(batch, tokenizer, model, bos_token_id):
    """Beam-search decode `batch` and return the detokenized hypotheses."""
    model_inputs = tokenizer(batch, padding=True, return_tensors="pt").to(model.device)
    generated = model.generate(
        **model_inputs,
        num_beams=5,
        max_length=256,
        min_length=0,
        forced_bos_token_id=bos_token_id,  # forces the target-language tag
    )
    return tokenizer.batch_decode(generated, skip_special_tokens=True)
35
+
36
+
37
def _translate_file(infname, outfname, tokenizer, model, src_code, tgt_code, batch_size):
    """Translate `infname` (one sentence per line) into `outfname`.

    `src_code`/`tgt_code` are mBART-50 locale codes. Restores a trailing
    newline in the output when the input file ended with one.
    """
    with open(infname, "r") as f:
        src_sents = f.read().split("\n")

    # A trailing newline yields a final empty entry; drop it for translation
    # and restore it when writing the output.
    add_new_line = False
    if src_sents[-1] == "":
        add_new_line = True
        src_sents = src_sents[:-1]

    # set the source language for tokenization
    tokenizer.src_lang = src_code
    bos_token_id = tokenizer.lang_code_to_id[tgt_code]

    # process sentences in batches and generate predictions
    hypothesis = []
    for i in tqdm(range(0, len(src_sents), batch_size)):
        hypothesis += predict(src_sents[i : i + batch_size], tokenizer, model, bos_token_id)

    assert len(hypothesis) == len(src_sents)

    # collapse internal whitespace so the output stays one sentence per line
    hypothesis = [
        re.sub(r"\s+", " ", x.replace("\n", " ").replace("\t", " ")).strip()
        for x in hypothesis
    ]
    # BUG FIX: the original `hypothesis = hypothesis` was a no-op; append an
    # empty entry so the written file ends with a newline again.
    if add_new_line:
        hypothesis.append("")

    with open(outfname, "w") as f:
        f.write("\n".join(hypothesis))


def main(devtest_data_dir, batch_size):
    """Run mBART-50 inference in both directions for every supported pair."""
    # English->XX and XX->English use separate mBART-50 checkpoints.
    enxx_model_name = "facebook/mbart-large-50-one-to-many-mmt"
    xxen_model_name = "facebook/mbart-large-50-many-to-one-mmt"
    tokenizers = {
        "enxx": AutoTokenizer.from_pretrained(enxx_model_name),
        "xxen": AutoTokenizer.from_pretrained(xxen_model_name),
    }
    models = {
        "enxx": AutoModelForSeq2SeqLM.from_pretrained(enxx_model_name).cuda(),
        "xxen": AutoModelForSeq2SeqLM.from_pretrained(xxen_model_name).cuda(),
    }

    # set the models to evaluation mode
    for model in models.values():
        model.eval()

    # iterate over a list of language pairs from `devtest_data_dir`
    for pair in sorted(os.listdir(devtest_data_dir)):
        if "-" not in pair:
            continue

        src_lang, tgt_lang = pair.split("-")

        # check if the source and target languages are supported
        if src_lang not in langs_supported or tgt_lang not in langs_supported:
            print(f"Skipping {src_lang}-{tgt_lang} ...")
            continue

        print(f"Evaluating {src_lang}-{tgt_lang} ...")

        # source to target evaluation (English -> XX checkpoint)
        _translate_file(
            os.path.join(devtest_data_dir, pair, f"test.{src_lang}"),
            os.path.join(devtest_data_dir, pair, f"test.{tgt_lang}.pred.mbart50"),
            tokenizers["enxx"],
            models["enxx"],
            langs_supported[src_lang],
            langs_supported[tgt_lang],
            batch_size,
        )

        # target to source evaluation (XX -> English checkpoint)
        _translate_file(
            os.path.join(devtest_data_dir, pair, f"test.{tgt_lang}"),
            os.path.join(devtest_data_dir, pair, f"test.{src_lang}.pred.mbart50"),
            tokenizers["xxen"],
            models["xxen"],
            langs_supported[tgt_lang],
            langs_supported[src_lang],
            batch_size,
        )
148
+
149
+
150
if __name__ == "__main__":
    # Usage: python mbart_inference.py <devtest_data_dir> <batch_size>
    # (expects En-X language-pair subdirectories within the devtest data dir)
    data_dir = sys.argv[1]
    bsz = int(sys.argv[2])

    if torch.cuda.is_available():
        main(data_dir, bsz)
    else:
        print("No GPU available")
        sys.exit(1)
IndicTrans2/baseline_eval/nllb_moe_cpu_inference.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import sys
4
+ from tqdm import tqdm
5
+ import torch
6
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
7
+
8
# FLORES-200 codes of the languages covered by this evaluation script.
# NOTE: `main` uses this both for membership tests (`lang in langs_supported.keys()`)
# and as a mapping (`langs_supported[lang]`); the original plain list raised
# AttributeError on `.keys()` and TypeError on string indexing. An identity
# mapping (code -> code) satisfies both usages without changing any lookup result.
langs_supported = {
    code: code
    for code in (
        "asm_Beng",
        "ben_Beng",
        "guj_Gujr",
        "eng_Latn",
        "hin_Deva",
        "kas_Deva",
        "kas_Arab",
        "kan_Knda",
        "mal_Mlym",
        "mai_Deva",
        "mar_Deva",
        "mni_Beng",
        "npi_Deva",
        "ory_Orya",
        "pan_Guru",
        "san_Deva",
        "snd_Arab",
        "sat_Olck",
        "tam_Taml",
        "tel_Telu",
        "urd_Arab",
    )
}
31
+
32
+
33
def predict(batch, tokenizer, model, bos_token_id):
    """Translate one batch of sentences with beam search.

    Args:
        batch: list of raw source-language sentences.
        tokenizer: HF tokenizer whose ``src_lang`` the caller has already set.
        model: HF seq2seq model used for generation.
        bos_token_id: token id forced as the first generated token
            (selects the output language for NLLB-style models).

    Returns:
        List of detokenized hypothesis strings, one per input sentence.
    """
    encoded_batch = tokenizer(batch, padding=True, return_tensors="pt").to(model.device)
    # inference_mode disables autograd bookkeeping, reducing memory and
    # compute during generation — this script never backpropagates.
    with torch.inference_mode():
        generated_tokens = model.generate(
            **encoded_batch,
            num_beams=5,
            max_length=256,
            min_length=0,
            forced_bos_token_id=bos_token_id,
        )
    hypothesis = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
    return hypothesis
44
+
45
+
46
def main(devtest_data_dir, batch_size):
    """Evaluate NLLB-MoE on every supported language pair in `devtest_data_dir`.

    For each `src-tgt` subdirectory, translates the test set in both
    directions (src->tgt and tgt->src) and writes the predictions next to
    the inputs as ``test.<lang>.pred.nllb_moe``.
    """
    # load the pre-trained NLLB tokenizer and model
    model_name = "facebook/nllb-moe-54b"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    model.eval()

    def _nllb_code(lang):
        # NLLB has no `sat_Olck` code; it ships Santali as `sat_Beng`.
        return "sat_Beng" if lang == "sat_Olck" else lang

    def _translate_file(infname, outfname, source_lang, target_lang):
        """Translate `infname` (in `source_lang`) into `target_lang` at `outfname`."""
        with open(infname, "r") as f:
            sents = f.read().split("\n")

        # remember whether the input ended with a trailing newline so the
        # output file can round-trip it exactly
        add_new_line = False
        if sents[-1] == "":
            add_new_line = True
            sents = sents[:-1]

        # set the source language for tokenization
        tokenizer.src_lang = _nllb_code(source_lang)
        bos_token_id = tokenizer.lang_code_to_id[_nllb_code(target_lang)]

        # process sentences in batches and generate predictions
        hypothesis = []
        for i in tqdm(range(0, len(sents), batch_size)):
            batch = sents[i : min(len(sents), i + batch_size)]
            hypothesis += predict(batch, tokenizer, model, bos_token_id)

        assert len(hypothesis) == len(sents)

        # collapse all internal whitespace so each hypothesis stays on one line
        hypothesis = [
            re.sub(r"\s+", " ", x.replace("\n", " ").replace("\t", " ")).strip()
            for x in hypothesis
        ]
        if add_new_line:
            # restore the trailing newline (the previous version assigned
            # `hypothesis = hypothesis`, a no-op that dropped it)
            hypothesis.append("")

        with open(outfname, "w") as f:
            f.write("\n".join(hypothesis))

    # iterate over a list of language pairs from `devtest_data_dir`
    for pair in sorted(os.listdir(devtest_data_dir)):
        if "-" not in pair:
            continue

        src_lang, tgt_lang = pair.split("-")

        # check if the source and target languages are supported
        # (`in` works whether `langs_supported` is a list or a mapping;
        # the previous `.keys()` call crashed on the list form)
        if src_lang not in langs_supported or tgt_lang not in langs_supported:
            print(f"Skipping {src_lang}-{tgt_lang} ...")
            continue

        print(f"Evaluating {src_lang}-{tgt_lang} ...")

        # -------------------------------------------------------------------
        # source to target evaluation
        # -------------------------------------------------------------------
        _translate_file(
            os.path.join(devtest_data_dir, pair, f"test.{src_lang}"),
            os.path.join(devtest_data_dir, pair, f"test.{tgt_lang}.pred.nllb_moe"),
            source_lang=src_lang,
            target_lang=tgt_lang,
        )

        # -------------------------------------------------------------------
        # target to source evaluation
        # -------------------------------------------------------------------
        # (the previous version indexed `langs_supported[src_lang]` here,
        # which failed on the list form; the language code is used directly)
        _translate_file(
            os.path.join(devtest_data_dir, pair, f"test.{tgt_lang}"),
            os.path.join(devtest_data_dir, pair, f"test.{src_lang}.pred.nllb_moe"),
            source_lang=tgt_lang,
            target_lang=src_lang,
        )
150
+
151
+
152
if __name__ == "__main__":
    # CLI: <devtest_data_dir> <batch_size>
    # expects En-X subdirectories pairs within the devtest data directory
    devtest_data_dir = sys.argv[1]
    batch_size = int(sys.argv[2])

    # NOTE: per the filename, this script is intended for CPU inference,
    # so no torch.cuda availability check is performed here.
    main(devtest_data_dir, batch_size)
IndicTrans2/compute_comet_score.sh ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash

# This script computes COMET metrics and also performs significance testing on the evaluation set
# where each subdirectory contains En-X pair
#
# NOTE(review): paths are deliberately expanded unquoted in several places so
# that shell globs reach the tools (see the significance section); as a
# consequence the evaluation directory path must not contain spaces.


echo `date`
devtest_data_dir=$1 # path to the evaluation directory
# `${2-...}`: default applies only when $2 is UNSET (an explicit empty string is kept)
model_name=${2-"Unbabel/wmt22-comet-da"} # name of the model checkpoint

# predefined list of languages supported by COMET
langs=(asm_Beng ben_Beng guj_Gujr hin_Deva kan_Knda mal_Mlym mar_Deva ory_Orya pan_Guru tam_Taml tel_Telu urd_Arab)

# we predefine a set of systems which we consider for evaluation
# feel free to change the below line in case you want to add or remove any system
system=(google azure nllb mbart50 m2m100 it1 it2)


# iterate over the list of predefined languages
for lang in "${langs[@]}"; do

    mkdir -p "$devtest_data_dir/eng_Latn-$lang/comet"

    # --------------------------------------------------------------
    # COMET score computation
    # --------------------------------------------------------------

    # iterate over the list of predefined systems
    for sys in "${system[@]}"; do

        echo "${sys}"

        # en - indic direction: score only when this system's prediction exists
        if [ -f "$devtest_data_dir/eng_Latn-$lang/test.$lang.pred.$sys" ]; then
            echo "eng_Latn-${lang}"

            src_fname=$devtest_data_dir/eng_Latn-$lang/test.eng_Latn
            pred_fname=$devtest_data_dir/eng_Latn-$lang/test.$lang.pred.$sys
            ref_fname=$devtest_data_dir/eng_Latn-$lang/test.$lang
            out_fname=$devtest_data_dir/eng_Latn-$lang/comet/eng_Latn_${lang}_${sys}_comet.txt

            # Compute COMET scores using the `comet-score`
            comet-score -s $src_fname -t $pred_fname -r $ref_fname --gpus 1 --model $model_name --quiet --only_system > $out_fname
        fi

        # indic - en direction: score only when this system's prediction exists
        if [ -f "$devtest_data_dir/eng_Latn-$lang/test.eng_Latn.pred.$sys" ]; then
            echo "${lang}-eng_Latn"

            src_fname=$devtest_data_dir/eng_Latn-$lang/test.$lang
            pred_fname=$devtest_data_dir/eng_Latn-$lang/test.eng_Latn.pred.$sys
            ref_fname=$devtest_data_dir/eng_Latn-$lang/test.eng_Latn
            out_fname=$devtest_data_dir/eng_Latn-$lang/comet/${lang}_eng_Latn_${sys}_comet.txt

            # Compute COMET scores using the `comet-score`
            comet-score -s $src_fname -t $pred_fname -r $ref_fname --gpus 1 --model $model_name --quiet --only_system > $out_fname
        fi

    done

    # --------------------------------------------------------------
    # COMET significance testing
    # --------------------------------------------------------------

    # en - indic direction
    src_fname=$devtest_data_dir/eng_Latn-$lang/test.eng_Latn
    # NOTE: the `*` is NOT expanded in the assignment below; it expands at the
    # unquoted use in `comet-compare -t`, yielding one file per system
    pred_fname=$devtest_data_dir/eng_Latn-$lang/test.$lang.pred.*
    ref_fname=$devtest_data_dir/eng_Latn-$lang/test.$lang
    out_fname=$devtest_data_dir/eng_Latn-$lang/comet/eng_Latn_${lang}_comet_stat.txt

    # Compute COMET significance scores using the `comet-compare`
    comet-compare -s $src_fname -t $pred_fname -r $ref_fname > $out_fname


    # indic-en direction
    src_fname=$devtest_data_dir/eng_Latn-$lang/test.$lang
    pred_fname=$devtest_data_dir/eng_Latn-$lang/test.eng_Latn.pred.*
    ref_fname=$devtest_data_dir/eng_Latn-$lang/test.eng_Latn
    out_fname=$devtest_data_dir/eng_Latn-$lang/comet/${lang}_eng_Latn_comet_stat.txt

    # Compute COMET significance scores using the `comet-compare`
    comet-compare -s $src_fname -t $pred_fname -r $ref_fname > $out_fname

done
IndicTrans2/compute_metrics.sh ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash

# This script computes the evaluation metrics such as BLEU, chrF, chrF++ using the
# detokenized predictions of the translation systems using sacrebleu (version 2.3.1).
# If the target language is:
#   English: directly use Moses tokenizer that is internally supported (`mteval-v13a`)
#   Indic: use IndicNLP tokenizers and skip tokenization step in sacrebleu.


echo "$(date)"
pred_fname=$1 # path to the prediction file
ref_fname=$2  # path to the reference file
tgt_lang=$3   # target language (FLORES code, e.g. eng_Latn, hin_Deva)


# `[[ ... ]]` is used so an unset/empty $tgt_lang does not break the test,
# and all path expansions are quoted so paths with spaces work
if [[ "$tgt_lang" == "eng_Latn" ]]; then
    # directly tokenize the prediction and reference files using sacrebleu and compute the metric
    sacrebleu "$ref_fname" < "$pred_fname" -m bleu chrf
    sacrebleu "$ref_fname" < "$pred_fname" -m chrf --chrf-word-order 2
else

    # indicnlp tokenize prediction and reference files before evaluation
    input_size=$(python scripts/preprocess_translate.py "$ref_fname" "$ref_fname.tok" "$tgt_lang" false false)
    input_size=$(python scripts/preprocess_translate.py "$pred_fname" "$pred_fname.tok" "$tgt_lang" false false)

    # since we are tokenizing with indicnlp separately, we are setting tokenize to none here
    sacrebleu --tokenize none "$ref_fname.tok" < "$pred_fname.tok" -m bleu chrf
    sacrebleu --tokenize none "$ref_fname.tok" < "$pred_fname.tok" -m chrf --chrf-word-order 2
fi
IndicTrans2/compute_metrics_significance.sh ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash

# This script performs significance testing for metrics such as BLEU, chrF++ using sacrebleu on the evaluation set
# where each subdirectory contains En-X pair


echo "$(date)"
devtest_data_dir=$1 # path to the evaluation directory

# we predefine a set of systems which we consider for evaluation
# feel free to change the below line in case you want to add or remove any system
system=(google azure nllb mbart50 m2m100 it1 it2)


# get a list of language pairs in the `devtest_data_dir`
pairs=$(ls -d $devtest_data_dir/eng_Latn-* | sort)


# iterate over each language pair
for pair in ${pairs[@]}; do
    # extract the source and target languages from the pair name
    pair=$(basename $pair)
    src_lang=$(echo "$pair" | cut -d "-" -f 1)
    tgt_lang=$(echo "$pair" | cut -d "-" -f 2)

    if [[ $src_lang == "eng_Latn" ]]; then

        # ----------------------------------------------------------------------
        # en - indic direction
        # ----------------------------------------------------------------------
        echo "${src_lang} - ${tgt_lang}"

        # find all the prediction files for different systems and tokenize them using IndicNLP
        pred_fnames=$devtest_data_dir/$pair/test.${tgt_lang}.pred.*
        ref_fname=$devtest_data_dir/$pair/test.${tgt_lang}

        # iterate over the glob directly; the previous
        # `find . -type f -name $pred_fnames` could never match because
        # `-name` takes a bare filename pattern, not a path pattern
        for pred_fname in $pred_fnames; do
            input_size=$(python scripts/preprocess_translate.py $pred_fname $pred_fname.tok $tgt_lang false false)
        done

        input_size=$(python scripts/preprocess_translate.py $ref_fname $ref_fname.tok $tgt_lang false false)

        ref_fname=$devtest_data_dir/$pair/test.${tgt_lang}.tok
        it2_fname=$devtest_data_dir/$pair/test.${tgt_lang}.pred.it2.tok
        sys_fnames=$devtest_data_dir/$pair/test.${tgt_lang}.pred.*.tok
        bleu_out_fname=$devtest_data_dir/$pair/${src_lang}_${tgt_lang}_bleu_significance.txt
        chrF_out_fname=$devtest_data_dir/$pair/${src_lang}_${tgt_lang}_chrF++_significance.txt

        # paired bootstrap significance tests with IT2 as the baseline system
        sacrebleu --tokenize none $ref_fname -i $it2_fname $sys_fnames --paired-bs -m bleu --format text > $bleu_out_fname
        # fixed: the chrF invocation previously omitted the reference file and
        # the `-i` flag, making sacrebleu treat the system outputs as references
        sacrebleu --tokenize none $ref_fname -i $it2_fname $sys_fnames --paired-bs -m chrf --chrf-word-order 2 --format text > $chrF_out_fname

        # ----------------------------------------------------------------------
        # indic - en direction
        # ----------------------------------------------------------------------
        echo "${tgt_lang} - ${src_lang}"

        # English references/predictions are scored untokenized (detokenized text)
        ref_fname=$devtest_data_dir/$pair/test.${src_lang}
        it2_fname=$devtest_data_dir/$pair/test.${src_lang}.pred.it2
        sys_fnames=$devtest_data_dir/$pair/test.${src_lang}.pred.*
        bleu_out_fname=$devtest_data_dir/$pair/${tgt_lang}_${src_lang}_bleu_significance.txt
        chrF_out_fname=$devtest_data_dir/$pair/${tgt_lang}_${src_lang}_chrF++_significance.txt

        sacrebleu --tokenize none $ref_fname -i $it2_fname $sys_fnames --paired-bs -m bleu --format text > $bleu_out_fname
        # fixed: same missing-reference bug as above
        sacrebleu --tokenize none $ref_fname -i $it2_fname $sys_fnames --paired-bs -m chrf --chrf-word-order 2 --format text > $chrF_out_fname

    fi
# fixed: the loop was previously missing its closing `done` (syntax error)
done
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash

# This script evaluates the performance of a machine translation system
# on a evaluation set in forward direction. For example, if the evaluation set
# consists of language pairs, such as En-X, where En represents the English language
# and X represents the target Indic language then this script accesses the translation
# system from the English language (En) to the target Indic language (X) direction.


echo `date`
devtest_data_dir=$1 # path to the evaluation directory
ckpt_dir=$2 # path to the checkpoint directory
system=${3:-"it2"} # name of the machine translation system


# get a list of language pairs in the `devtest_data_dir`
# (subdirectories are expected to be named `<src>-<tgt>`)
pairs=$(ls -d $devtest_data_dir/* | sort)


# iterate over each language pair
for pair in ${pairs[@]}; do
    # extract the source and target languages from the pair name
    pair=$(basename $pair)
    src_lang=$(echo "$pair" | cut -d "-" -f 1)
    tgt_lang=$(echo "$pair" | cut -d "-" -f 2)

    src_fname=$devtest_data_dir/$src_lang-$tgt_lang/test.$src_lang
    tgt_fname=$devtest_data_dir/$src_lang-$tgt_lang/test.$tgt_lang

    # check if the source and target files exists
    if [ -f "$src_fname" ] && [ -f "$tgt_fname" ]; then
        echo "Evaluating $src_lang-$tgt_lang ..."
    else
        echo "Skipping $src_lang-$tgt_lang ..."
        continue
    fi

    # generate translations if the system name contains "it2"
    # (predictions of external baselines are assumed to be precomputed)
    if [[ $system == *"it2"* ]]; then
        echo "Generating Translations"
        bash joint_translate.sh $src_fname $tgt_fname.pred.$system $src_lang $tgt_lang $ckpt_dir
    fi

    # compute automatic string-based metrics if the prediction exists for the system
    if [[ -f "${tgt_fname}.pred.${system}" ]]; then
        echo "Computing Metrics"
        bash compute_metrics.sh $tgt_fname.pred.$system $tgt_fname $tgt_lang > $devtest_data_dir/$src_lang-$tgt_lang/${src_lang}_${tgt_lang}_${system}_scores.txt
    fi

    # remove the intermediate files
    # NOTE: the glob `pred.$system.*` matches derived files (e.g. `.tok`)
    # but not the prediction file itself, which is kept
    rm -rf $tgt_fname.pred.$system.*
    rm -rf $devtest_data_dir/$src_lang-$tgt_lang/*.tok

done
IndicTrans2/eval_rev.sh ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash

# This script evaluates the performance of a machine translation system
# on a evaluation set in forward direction. For example, if the evaluation set
# consists of language pairs, such as En-X, where En represents the English language
# and X represents the target Indic language then this script accesses the translation
# system from the target Indic language (X) to the English language (En) direction.


echo `date`
devtest_data_dir=$1 # path to the evaluation directory
ckpt_dir=$2 # path to the checkpoint directory
system=${3:-"it2"} # name of the machine translation system


# get a list of language pairs in the `devtest_data_dir`
# (subdirectories are expected to be named `<src>-<tgt>`)
pairs=$(ls -d $devtest_data_dir/* | sort)


# iterate over each language pair
for pair in ${pairs[@]}; do
    # extract the source and target languages from the pair name
    pair=$(basename $pair)
    src_lang=$(echo "$pair" | cut -d "-" -f 1)
    tgt_lang=$(echo "$pair" | cut -d "-" -f 2)

    # reversed direction: the pair's target side is translated INTO the source side,
    # so `src_fname` here points at the X-language file and `tgt_fname` at English
    src_fname=$devtest_data_dir/$src_lang-$tgt_lang/test.$tgt_lang
    tgt_fname=$devtest_data_dir/$src_lang-$tgt_lang/test.$src_lang

    # check if the source and target files exists
    # in this case, we flip the actual target file as source and vice-versa
    if [ -f "$src_fname" ] && [ -f "$tgt_fname" ]; then
        echo "Evaluating $src_lang-$tgt_lang ..."
    else
        echo "Skipping $src_lang-$tgt_lang ..."
        continue
    fi

    # generate translations if the system name contains "it2"
    # (note the flipped language arguments: tgt_lang is the translation source)
    if [[ $system == *"it2"* ]]; then
        echo "Generating Translations"
        bash joint_translate.sh $src_fname $tgt_fname.pred.$system $tgt_lang $src_lang $ckpt_dir
    fi

    # compute automatic string-based metrics if the prediction exists for the system
    if [[ -f "${tgt_fname}.pred.${system}" ]]; then
        echo "Computing Metrics"
        bash compute_metrics.sh $tgt_fname.pred.$system $tgt_fname $src_lang > $devtest_data_dir/$src_lang-$tgt_lang/${tgt_lang}_${src_lang}_${system}_scores.txt
    fi

    # remove the intermediate files
    # NOTE: the glob `pred.$system.*` matches derived files (e.g. `.tok`)
    # but not the prediction file itself, which is kept
    rm -rf $tgt_fname.pred.$system.*
    rm -rf $devtest_data_dir/$src_lang-$tgt_lang/*.tok

done
IndicTrans2/finetune.sh ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # This script finetunes the pretrained translation model on the binarized data using fairseq.
4
+
5
+
6
+ echo `date`
7
+ exp_dir=$1 # path of the experiment directory
8
+ model_arch=${2:-"transformer_18_18"} # model architecture (defaults to `transformer_18_18`)
9
+ pretrained_ckpt=$3 # path to the pretrained checkpoint `.pt` file
10
+
11
+
12
+ fairseq-train $exp_dir/final_bin \
13
+ --max-source-positions=256 \
14
+ --max-target-positions=256 \
15
+ --source-lang=SRC \
16
+ --target-lang=TGT \
17
+ --max-update=1000000 \
18
+ --save-interval-updates=1000 \
19
+ --arch=$model_arch \
20
+ --activation-fn gelu \
21
+ --criterion=label_smoothed_cross_entropy \
22
+ --label-smoothing=0.1 \
23
+ --optimizer adam \
24
+ --adam-betas "(0.9, 0.98)" \
25
+ --lr-scheduler=inverse_sqrt \
26
+ --clip-norm 1.0 \
27
+ --warmup-init-lr 1e-07 \
28
+ --lr 3e-5 \
29
+ --warmup-updates 2000 \
30
+ --dropout 0.2 \
31
+ --save-dir $exp_dir/model \
32
+ --keep-last-epochs 5 \
33
+ --keep-interval-updates 3 \
34
+ --patience 10 \
35
+ --skip-invalid-size-inputs-valid-test \
36
+ --fp16 \
37
+ --user-dir model_configs \
38
+ --update-freq=4 \
39
+ --distributed-world-size 8 \
40
+ --num-workers 24 \
41
+ --max-tokens 1024 \
42
+ --eval-bleu \
43
+ --eval-bleu-args "{\"beam\": 1, \"lenpen\": 1.0, \"max_len_a\": 1.2, \"max_len_b\": 10}" \
44
+ --eval-bleu-detok moses \
45
+ --eval-bleu-remove-bpe sentencepiece \
46
+ --eval-bleu-print-samples \
47
+ --best-checkpoint-metric bleu \
48
+ --maximize-best-checkpoint-metric \
49
+ --restore-file $pretrained_ckpt \
50
+ --reset-lr-scheduler \
51
+ --reset-meters \
52
+ --reset-dataloader \
53
+ --reset-optimizer \
54
+ --task translation
IndicTrans2/huggingface_interface/.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ IndicTransTokenizer
IndicTrans2/huggingface_interface/IndicTransToolkit/.gitignore ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ dist/
2
+ build/
3
+ *.egg-info/
4
+ */*/__pycache__/
IndicTrans2/huggingface_interface/IndicTransToolkit/CHANGELOG.md ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Changelog
2
+
3
+ # 📢 Release v1.0.3
4
+ - 🚨 The `IndicProcessor` class has been re-written in [Cython](https://github.com/cython/cython) for faster implementation. This gives us at least `+10 lines/s`.
5
+ - A new `visualize` argument has been added to `preprocess_batch` to track the processing with a `tqdm` bar.
6
+
7
+ # 📢 Release v1.0.2
8
+ - The repository has been renamed to `IndicTransToolkit`.
9
+ - 🚨 The custom tokenizer is now **removed** from the repository. Please revert to a previous commit ([v1.0.1](https://github.com/VarunGumma/IndicTransToolkit/tree/0e68fb5872f4d821578a5252f90ad43c9649370f)) to use it **(strongly discouraged)**. The official _(and only tokenizer)_ is available on HF along with the models.
10
+
11
+ # 📢 Release v1.0.0
12
+ - The [PreTrainedTokenizer](https://huggingface.co/docs/transformers/main_classes/tokenizer) for IndicTrans2 is now available on HF 🎉🎉 Note that, you still need the `IndicProcessor` to pre-process the sentences before tokenization.
13
+ - 🚨 **In favor of the standard PreTrainedTokenizer, we deprecated the custom tokenizer. However, this custom tokenizer will still be available here for backward compatibility, but no further updates/bug-fixes will be provided.**
14
+ - The `indic_evaluate` function is now consolidated into a concrete `IndicEvaluator` class.
15
+ - The data collation function for training is consolidated into a concrete `IndicDataCollator` class.
16
+ - A simple batching method is now available in the `IndicProcessor`.
IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit.egg-info/PKG-INFO ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Metadata-Version: 2.2
2
+ Name: IndicTransToolkit
3
+ Version: 1.0.3
4
+ Summary: A simple, consistent, and extendable module for IndicTrans2 tokenizer compatible with HuggingFace models
5
+ Home-page: https://github.com/VarunGumma/IndicTransToolkit
6
+ Author: Varun Gumma
7
+ Author-email: [email protected]
8
+ License: MIT
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Operating System :: OS Independent
12
+ Requires-Python: >=3.8
13
+ Description-Content-Type: text/markdown
14
+ License-File: LICENSE
15
+ Requires-Dist: setuptools>=68.2.2
16
+ Requires-Dist: torch
17
+ Requires-Dist: cython
18
+ Requires-Dist: sacremoses
19
+ Requires-Dist: sentencepiece
20
+ Requires-Dist: transformers
21
+ Requires-Dist: sacrebleu
22
+ Requires-Dist: indic-nlp-library-IT2@ git+https://github.com/VarunGumma/indic_nlp_library.git
23
+ Dynamic: author
24
+ Dynamic: author-email
25
+ Dynamic: classifier
26
+ Dynamic: description
27
+ Dynamic: description-content-type
28
+ Dynamic: home-page
29
+ Dynamic: license
30
+ Dynamic: requires-dist
31
+ Dynamic: requires-python
32
+ Dynamic: summary
33
+
34
+ # IndicTransToolkit
35
+
36
+ ## About
37
+ The goal of this repository is to provide a simple, modular, and extendable toolkit for [IndicTrans2](https://github.com/AI4Bharat/IndicTrans2) and be compatible with the HuggingFace models released. Please refer to the `CHANGELOG.md` for latest developments.
38
+
39
+ ## Pre-requisites
40
+ - `Python 3.8+`
41
+ - [Indic NLP Library](https://github.com/VarunGumma/indic_nlp_library)
42
+ - Other requirements as listed in `requirements.txt`
43
+
44
+ ## Configuration
45
+ - Editable installation (Note, this may take a while):
46
+ ```bash
47
+ git clone https://github.com/VarunGumma/IndicTransToolkit
48
+ cd IndicTransToolkit
49
+
50
+ pip install --editable . --use-pep517 # required for pip >= 25.0
51
+
52
+ # in case it fails, try:
53
+ # pip install --editable . --use-pep517 --config-settings editable_mode=compat
54
+ ```
55
+
56
+ ## Examples
57
+ For the training usecase, please refer [here](https://github.com/AI4Bharat/IndicTrans2/tree/main/huggingface_interface).
58
+
59
+ ### PreTrainedTokenizer
60
+ ```python
61
+ import torch
62
+ from IndicTransToolkit.processor import IndicProcessor # NOW IMPLEMENTED IN CYTHON !!
63
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
64
+
65
+ ip = IndicProcessor(inference=True)
66
+ tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indictrans2-en-indic-dist-200M", trust_remote_code=True)
67
+ model = AutoModelForSeq2SeqLM.from_pretrained("ai4bharat/indictrans2-en-indic-dist-200M", trust_remote_code=True)
68
+
69
+ sentences = [
70
+ "This is a test sentence.",
71
+ "This is another longer different test sentence.",
72
+ "Please send an SMS to 9876543210 and an email on [email protected] by 15th October, 2023.",
73
+ ]
74
+
75
+ batch = ip.preprocess_batch(sentences, src_lang="eng_Latn", tgt_lang="hin_Deva", visualize=False) # set it to visualize=True to print a progress bar
76
+ batch = tokenizer(batch, padding="longest", truncation=True, max_length=256, return_tensors="pt")
77
+
78
+ with torch.inference_mode():
79
+ outputs = model.generate(**batch, num_beams=5, num_return_sequences=1, max_length=256)
80
+
81
+ with tokenizer.as_target_tokenizer():
82
+ # This scoping is absolutely necessary, as it will instruct the tokenizer to tokenize using the target vocabulary.
83
+ # Failure to use this scoping will result in gibberish/unexpected predictions as the output will be de-tokenized with the source vocabulary instead.
84
+ outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True, clean_up_tokenization_spaces=True)
85
+
86
+ outputs = ip.postprocess_batch(outputs, lang="hin_Deva")
87
+ print(outputs)
88
+
89
+ >>> ['यह एक परीक्षण वाक्य है।', 'यह एक और लंबा अलग परीक्षण वाक्य है।', 'कृपया 9876543210 पर एक एस. एम. एस. भेजें और 15 अक्टूबर, 2023 तक [email protected] पर एक ईमेल भेजें।']
90
+ ```
91
+
92
+ ### Evaluation
93
+ - `IndicEvaluator` is a python implementation of [compute_metrics.sh](https://github.com/AI4Bharat/IndicTrans2/blob/main/compute_metrics.sh).
94
+ - We have found that this python implementation gives slightly lower scores than the original `compute_metrics.sh`. So, please use this function cautiously, and feel free to raise a PR if you have found the bug/fix.
95
+ ```python
96
+ from IndicTransToolkit import IndicEvaluator
97
+
98
+ # this method returns a dictionary with BLEU and ChrF2++ scores with appropriate signatures
99
+ evaluator = IndicEvaluator()
100
+ scores = evaluator.evaluate(tgt_lang=tgt_lang, preds=pred_file, refs=ref_file)
101
+
102
+ # alternatively, you can pass the list of predictions and references instead of files
103
+ # scores = evaluator.evaluate(tgt_lang=tgt_lang, preds=preds, refs=refs)
104
+ ```
105
+
106
+ ## Authors
107
+ - Varun Gumma ([email protected])
108
+ - Jay Gala ([email protected])
109
+ - Pranjal Agadh Chitale ([email protected])
110
+ - Raj Dabre ([email protected])
111
+
112
+
113
+ ## Bugs and Contribution
114
+ Since this is a bleeding-edge module, you may encounter broken stuff and import issues once in a while. In case you encounter any bugs or want additional functionalities, please feel free to raise `Issues`/`Pull Requests` or contact the authors.
115
+
116
+
117
+ ## Citation
118
+ If you use our codebase, or models, please do cite the following paper:
119
+ ```bibtex
120
+ @article{
121
+ gala2023indictrans,
122
+ title={IndicTrans2: Towards High-Quality and Accessible Machine Translation Models for all 22 Scheduled Indian Languages},
123
+ author={Jay Gala and Pranjal A Chitale and A K Raghavan and Varun Gumma and Sumanth Doddapaneni and Aswanth Kumar M and Janki Atul Nawale and Anupama Sujatha and Ratish Puduppully and Vivek Raghavan and Pratyush Kumar and Mitesh M Khapra and Raj Dabre and Anoop Kunchukuttan},
124
+ journal={Transactions on Machine Learning Research},
125
+ issn={2835-8856},
126
+ year={2023},
127
+ url={https://openreview.net/forum?id=vfT4YuzAYA},
128
+ note={}
129
+ }
130
+ ```
IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ setup.py
5
+ IndicTransToolkit/__init__.py
6
+ IndicTransToolkit/collator.py
7
+ IndicTransToolkit/evaluator.py
8
+ IndicTransToolkit/processor.c
9
+ IndicTransToolkit/version.py
10
+ IndicTransToolkit.egg-info/PKG-INFO
11
+ IndicTransToolkit.egg-info/SOURCES.txt
12
+ IndicTransToolkit.egg-info/dependency_links.txt
13
+ IndicTransToolkit.egg-info/not-zip-safe
14
+ IndicTransToolkit.egg-info/requires.txt
15
+ IndicTransToolkit.egg-info/top_level.txt
IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
 
 
1
+
IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit.egg-info/not-zip-safe ADDED
@@ -0,0 +1 @@
 
 
1
+
IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit.egg-info/requires.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ setuptools>=68.2.2
2
+ torch
3
+ cython
4
+ sacremoses
5
+ sentencepiece
6
+ transformers
7
+ sacrebleu
8
+ indic-nlp-library-IT2@ git+https://github.com/VarunGumma/indic_nlp_library.git
IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit.egg-info/top_level.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ IndicTransToolkit
IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/__init__.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
"""Public API of IndicTransToolkit: evaluator, data collator, and processor."""

from .evaluator import IndicEvaluator
from .collator import IndicDataCollator
from .processor import IndicProcessor

# Names exported via `from IndicTransToolkit import *`.
__all__ = [
    "IndicEvaluator",
    "IndicDataCollator",
    "IndicProcessor",
]
IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (377 Bytes). View file
 
IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (391 Bytes). View file
 
IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/__pycache__/collator.cpython-310.pyc ADDED
Binary file (2.18 kB). View file
 
IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/__pycache__/collator.cpython-313.pyc ADDED
Binary file (3.21 kB). View file
 
IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/__pycache__/evaluator.cpython-310.pyc ADDED
Binary file (4.19 kB). View file
 
IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/__pycache__/evaluator.cpython-313.pyc ADDED
Binary file (6.38 kB). View file
 
IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/__pycache__/processor.cpython-310.pyc ADDED
Binary file (11.7 kB). View file
 
IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/collator.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from dataclasses import dataclass
3
+ from typing import Any, Optional, Union
4
+
5
+ from transformers.utils import PaddingStrategy
6
+ from transformers.tokenization_utils import PreTrainedTokenizerBase
7
+ from transformers.data.data_collator import pad_without_fast_tokenizer_warning
8
+
9
+
10
+ @dataclass
11
+ class IndicDataCollator:
12
+ tokenizer: PreTrainedTokenizerBase
13
+ model: Optional[Any] = None
14
+ padding: Union[bool, str, PaddingStrategy] = True
15
+ max_length: Optional[int] = None
16
+ pad_to_multiple_of: Optional[int] = None
17
+ label_pad_token_id: int = -100
18
+ return_tensors: str = "pt"
19
+
20
+ def __call__(self, features, return_tensors=None):
21
+
22
+ if return_tensors is None:
23
+ return_tensors = self.return_tensors
24
+
25
+ labels = (
26
+ [feature["labels"] for feature in features]
27
+ if "labels" in features[0].keys()
28
+ else None
29
+ )
30
+ # We have to pad the labels before calling `tokenizer.pad` as
31
+ # this method won't pad them and needs them of the same length to return tensors.
32
+ if labels is not None:
33
+ max_label_length = max(len(l) for l in labels)
34
+ if self.pad_to_multiple_of is not None:
35
+ max_label_length = (
36
+ (max_label_length + self.pad_to_multiple_of - 1)
37
+ // self.pad_to_multiple_of
38
+ * self.pad_to_multiple_of
39
+ )
40
+
41
+ # fairseq by defaults right pad the labels for seq2seq tasks
42
+ for feature in features:
43
+ remainder = [self.label_pad_token_id] * (
44
+ max_label_length - len(feature["labels"])
45
+ )
46
+ if isinstance(feature["labels"], list):
47
+ feature["labels"] = feature["labels"] + remainder
48
+ else:
49
+ feature["labels"] = np.concatenate(
50
+ [feature["labels"], remainder]
51
+ ).astype(np.int64)
52
+
53
+ self.tokenizer.padding_side = "left"
54
+ features = pad_without_fast_tokenizer_warning(
55
+ self.tokenizer,
56
+ features,
57
+ padding=self.padding,
58
+ max_length=self.max_length,
59
+ return_tensors=return_tensors,
60
+ pad_to_multiple_of=self.pad_to_multiple_of,
61
+ )
62
+
63
+ # prepare decoder_input_ids
64
+ if (
65
+ labels is not None
66
+ and self.model is not None
67
+ and hasattr(self.model, "prepare_decoder_input_ids_from_labels")
68
+ ):
69
+ decoder_input_ids = self.model.prepare_decoder_input_ids_from_labels(
70
+ labels=features["labels"]
71
+ )
72
+ features["decoder_input_ids"] = decoder_input_ids
73
+
74
+ return features
IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/evaluator.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Union
2
+ from sacrebleu.metrics import CHRF, BLEU
3
+
4
+ from indicnlp.tokenize import indic_tokenize
5
+ from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
6
+
7
+
8
+ class IndicEvaluator:
9
+ def __init__(self):
10
+ # === Metrics ===
11
+ self._chrf2_metric = CHRF(word_order=2)
12
+ self._bleu_metric_13a = BLEU(tokenize="13a")
13
+ self._bleu_metric_none = BLEU(tokenize="none")
14
+
15
+ # === Normalizer factory and cache ===
16
+ self._indic_norm_factory = IndicNormalizerFactory()
17
+ self._normalizer_cache = {} # Cache normalizers by iso_lang
18
+
19
+ # === FLORES -> ISO codes ===
20
+ self._flores_codes = {
21
+ "asm_Beng": "as",
22
+ "awa_Deva": "hi",
23
+ "ben_Beng": "bn",
24
+ "bho_Deva": "hi",
25
+ "brx_Deva": "hi",
26
+ "doi_Deva": "hi",
27
+ "eng_Latn": "en",
28
+ "gom_Deva": "kK",
29
+ "gon_Deva": "hi",
30
+ "guj_Gujr": "gu",
31
+ "hin_Deva": "hi",
32
+ "hne_Deva": "hi",
33
+ "kan_Knda": "kn",
34
+ "kas_Arab": "ur",
35
+ "kas_Deva": "hi",
36
+ "kha_Latn": "en",
37
+ "lus_Latn": "en",
38
+ "mag_Deva": "hi",
39
+ "mai_Deva": "hi",
40
+ "mal_Mlym": "ml",
41
+ "mar_Deva": "mr",
42
+ "mni_Beng": "bn",
43
+ "mni_Mtei": "hi",
44
+ "npi_Deva": "ne",
45
+ "ory_Orya": "or",
46
+ "pan_Guru": "pa",
47
+ "san_Deva": "hi",
48
+ "sat_Olck": "or",
49
+ "snd_Arab": "ur",
50
+ "snd_Deva": "hi",
51
+ "tam_Taml": "ta",
52
+ "tel_Telu": "te",
53
+ "urd_Arab": "ur",
54
+ "unr_Deva": "hi",
55
+ }
56
+
57
+ def _get_normalizer(self, iso_lang: str):
58
+ """
59
+ Return a cached normalizer for a given iso_lang.
60
+ """
61
+ if iso_lang not in self._normalizer_cache:
62
+ self._normalizer_cache[iso_lang] = self._indic_norm_factory.get_normalizer(iso_lang)
63
+ return self._normalizer_cache[iso_lang]
64
+
65
+ def _preprocess(self, sentences: List[str], lang: str) -> List[str]:
66
+ """
67
+ Preprocess the sentences using IndicNLP:
68
+ 1) Normalization (using a cached normalizer),
69
+ 2) Trivial tokenization.
70
+ """
71
+ iso_lang = self._flores_codes.get(lang, "hi")
72
+ # Fetch from cache to avoid reconstructing the normalizer
73
+ normalizer = self._get_normalizer(iso_lang)
74
+
75
+ # Local references for speed
76
+ trivial_tokenize = indic_tokenize.trivial_tokenize
77
+ normalize_fn = normalizer.normalize
78
+
79
+ processed_sentences = []
80
+ for line in sentences:
81
+ # single .strip() before normalizing
82
+ line = line.strip()
83
+ norm_line = normalize_fn(line)
84
+ tokens = trivial_tokenize(norm_line, iso_lang)
85
+ processed_sentences.append(" ".join(tokens))
86
+
87
+ return processed_sentences
88
+
89
+ def evaluate(
90
+ self,
91
+ tgt_lang: str,
92
+ preds: Union[List[str], str],
93
+ refs: Union[List[str], str],
94
+ ):
95
+ """
96
+ Evaluate BLEU and chrF2++ scores for the given predictions and references.
97
+ - If preds/refs are strings (filenames), read them from disk.
98
+ - If they are lists, evaluate them directly.
99
+ - For non-English languages, applies Indic NLP preprocessing before scoring.
100
+ """
101
+ assert preds is not None and refs is not None, "Predictions and References cannot be None"
102
+
103
+ # Convert file paths to lists if needed
104
+ if isinstance(preds, str):
105
+ with open(preds, "r", encoding="utf-8") as fp:
106
+ preds = [line.strip() for line in fp]
107
+ if isinstance(refs, str):
108
+ with open(refs, "r", encoding="utf-8") as fr:
109
+ refs = [line.strip() for line in fr]
110
+
111
+ assert len(preds) == len(refs), "Number of predictions and references do not match"
112
+
113
+ # Local references to metrics for speed
114
+ bleu_none = self._bleu_metric_none
115
+ bleu_13a = self._bleu_metric_13a
116
+ chrf2 = self._chrf2_metric
117
+
118
+ scores = {}
119
+
120
+ # For English (eng_Latn), skip Indic NLP normalization
121
+ if tgt_lang != "eng_Latn":
122
+ preds_ = self._preprocess(preds, tgt_lang)
123
+ refs_ = self._preprocess(refs, tgt_lang)
124
+
125
+ bleu_score = bleu_none.corpus_score(preds_, [refs_])
126
+ chrf_score = chrf2.corpus_score(preds_, [refs_])
127
+
128
+ scores["bleu"] = {
129
+ "score": round(bleu_score.score, 1),
130
+ "signature": bleu_none.get_signature().format(),
131
+ }
132
+ scores["chrF2++"] = {
133
+ "score": round(chrf_score.score, 1),
134
+ "signature": chrf2.get_signature().format(),
135
+ }
136
+
137
+ else:
138
+ # For English, 13a tokenization is standard
139
+ bleu_score = bleu_13a.corpus_score(preds, [refs])
140
+ chrf_score = chrf2.corpus_score(preds, [refs])
141
+
142
+ scores["bleu"] = {
143
+ "score": round(bleu_score.score, 1),
144
+ "signature": bleu_13a.get_signature().format(),
145
+ }
146
+ scores["chrF2++"] = {
147
+ "score": round(chrf_score.score, 1),
148
+ "signature": chrf2.get_signature().format(),
149
+ }
150
+
151
+ return scores
IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/processor.c ADDED
The diff for this file is too large to render. See raw diff
 
IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/processor.cp310-win_amd64.pyd ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ba760d57f3accf3b24d9cc331dcda273d0612998a034d9250eb8c9db5b9f908a
3
+ size 141312
IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/processor.cpython-310-x86_64-linux-gnu.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c1a101ecb27adaf367f00c90b3f8e96e7fbda3bf0560d48c368fec3750a040a4
3
+ size 229200
IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/processor.pyx ADDED
@@ -0,0 +1,503 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # cython: language_level=3, boundscheck=False, cdivision=True, wraparound=False
2
+ """
3
+ Cython version of the IndicProcessor class with optimizations for performance.
4
+ Only preprocess_batch and postprocess_batch are exposed as cpdef methods.
5
+ All other methods are internal (cdef) for optimized Cython usage.
6
+ """
7
+
8
+ import regex as re
9
+ from tqdm import tqdm
10
+ from queue import Queue
11
+ from typing import List, Dict, Union
12
+
13
+ # Importing Python objects since these libraries don't offer C-extensions
14
+ from indicnlp.tokenize import indic_tokenize, indic_detokenize
15
+ from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
16
+ from sacremoses import MosesPunctNormalizer, MosesTokenizer, MosesDetokenizer
17
+ from indicnlp.transliterate.unicode_transliterate import UnicodeIndicTransliterator
18
+
19
+
20
+ cdef class IndicProcessor:
21
+ cdef public bint inference
22
+
23
+ # Precompiled regex patterns and placeholders
24
+ cdef object _MULTISPACE_REGEX
25
+ cdef object _DIGIT_SPACE_PERCENT
26
+ cdef object _DOUBLE_QUOT_PUNC
27
+ cdef object _DIGIT_NBSP_DIGIT
28
+ cdef object _END_BRACKET_SPACE_PUNC_REGEX
29
+
30
+ cdef object _URL_PATTERN
31
+ cdef object _NUMERAL_PATTERN
32
+ cdef object _EMAIL_PATTERN
33
+ cdef object _OTHER_PATTERN
34
+
35
+ cdef list _PUNC_REPLACEMENTS
36
+ cdef list _INDIC_FAILURE_CASES
37
+
38
+ cdef dict _flores_codes
39
+ cdef dict _digits_translation_table
40
+
41
+ # Placeholder maps stored in a Python Queue (treated as `object` for Cython)
42
+ cdef object _placeholder_entity_maps
43
+
44
+ # Tools (also Python objects)
45
+ cdef object _en_tok
46
+ cdef object _en_normalizer
47
+ cdef object _en_detok
48
+ cdef object _xliterator
49
+
50
+ def __cinit__(self, bint inference=True):
51
+ """
52
+ Constructor for IndicProcessor. Initializes all necessary components.
53
+ """
54
+ self.inference = inference
55
+
56
+ ##############################
57
+ # FLORES -> ISO CODES
58
+ ##############################
59
+ self._flores_codes = {
60
+ "asm_Beng": "as",
61
+ "awa_Deva": "hi",
62
+ "ben_Beng": "bn",
63
+ "bho_Deva": "hi",
64
+ "brx_Deva": "hi",
65
+ "doi_Deva": "hi",
66
+ "eng_Latn": "en",
67
+ "gom_Deva": "kK",
68
+ "gon_Deva": "hi",
69
+ "guj_Gujr": "gu",
70
+ "hin_Deva": "hi",
71
+ "hne_Deva": "hi",
72
+ "kan_Knda": "kn",
73
+ "kas_Arab": "ur",
74
+ "kas_Deva": "hi",
75
+ "kha_Latn": "en",
76
+ "lus_Latn": "en",
77
+ "mag_Deva": "hi",
78
+ "mai_Deva": "hi",
79
+ "mal_Mlym": "ml",
80
+ "mar_Deva": "mr",
81
+ "mni_Beng": "bn",
82
+ "mni_Mtei": "hi",
83
+ "npi_Deva": "ne",
84
+ "ory_Orya": "or",
85
+ "pan_Guru": "pa",
86
+ "san_Deva": "hi",
87
+ "sat_Olck": "or",
88
+ "snd_Arab": "ur",
89
+ "snd_Deva": "hi",
90
+ "tam_Taml": "ta",
91
+ "tel_Telu": "te",
92
+ "urd_Arab": "ur",
93
+ "unr_Deva": "hi",
94
+ }
95
+
96
+ ##############################
97
+ # INDIC DIGIT TRANSLATION (str.translate)
98
+ ##############################
99
+ self._digits_translation_table = {}
100
+ cdef dict digits_dict = {
101
+ "\u09e6": "0", "\u0ae6": "0", "\u0ce6": "0", "\u0966": "0",
102
+ "\u0660": "0", "\uabf0": "0", "\u0b66": "0", "\u0a66": "0",
103
+ "\u1c50": "0", "\u06f0": "0",
104
+
105
+ "\u09e7": "1", "\u0ae7": "1", "\u0967": "1", "\u0ce7": "1",
106
+ "\u06f1": "1", "\uabf1": "1", "\u0b67": "1", "\u0a67": "1",
107
+ "\u1c51": "1", "\u0c67": "1",
108
+
109
+ "\u09e8": "2", "\u0ae8": "2", "\u0968": "2", "\u0ce8": "2",
110
+ "\u06f2": "2", "\uabf2": "2", "\u0b68": "2", "\u0a68": "2",
111
+ "\u1c52": "2", "\u0c68": "2",
112
+
113
+ "\u09e9": "3", "\u0ae9": "3", "\u0969": "3", "\u0ce9": "3",
114
+ "\u06f3": "3", "\uabf3": "3", "\u0b69": "3", "\u0a69": "3",
115
+ "\u1c53": "3", "\u0c69": "3",
116
+
117
+ "\u09ea": "4", "\u0aea": "4", "\u096a": "4", "\u0cea": "4",
118
+ "\u06f4": "4", "\uabf4": "4", "\u0b6a": "4", "\u0a6a": "4",
119
+ "\u1c54": "4", "\u0c6a": "4",
120
+
121
+ "\u09eb": "5", "\u0aeb": "5", "\u096b": "5", "\u0ceb": "5",
122
+ "\u06f5": "5", "\uabf5": "5", "\u0b6b": "5", "\u0a6b": "5",
123
+ "\u1c55": "5", "\u0c6b": "5",
124
+
125
+ "\u09ec": "6", "\u0aec": "6", "\u096c": "6", "\u0cec": "6",
126
+ "\u06f6": "6", "\uabf6": "6", "\u0b6c": "6", "\u0a6c": "6",
127
+ "\u1c56": "6", "\u0c6c": "6",
128
+
129
+ "\u09ed": "7", "\u0aed": "7", "\u096d": "7", "\u0ced": "7",
130
+ "\u06f7": "7", "\uabf7": "7", "\u0b6d": "7", "\u0a6d": "7",
131
+ "\u1c57": "7", "\u0c6d": "7",
132
+
133
+ "\u09ee": "8", "\u0aee": "8", "\u096e": "8", "\u0cee": "8",
134
+ "\u06f8": "8", "\uabf8": "8", "\u0b6e": "8", "\u0a6e": "8",
135
+ "\u1c58": "8", "\u0c6e": "8",
136
+
137
+ "\u09ef": "9", "\u0aef": "9", "\u096f": "9", "\u0cef": "9",
138
+ "\u06f9": "9", "\uabf9": "9", "\u0b6f": "9", "\u0a6f": "9",
139
+ "\u1c59": "9", "\u0c6f": "9",
140
+ }
141
+ for k, v in digits_dict.items():
142
+ self._digits_translation_table[ord(k)] = v
143
+
144
+ # Also map ASCII '0'-'9'
145
+ for c in range(ord('0'), ord('9') + 1):
146
+ self._digits_translation_table[c] = chr(c)
147
+
148
+ ##############################
149
+ # PLACEHOLDER MAP QUEUE
150
+ ##############################
151
+ self._placeholder_entity_maps = Queue()
152
+
153
+ ##############################
154
+ # MOSES (as Python objects)
155
+ ##############################
156
+ self._en_tok = MosesTokenizer(lang="en")
157
+ self._en_normalizer = MosesPunctNormalizer()
158
+ self._en_detok = MosesDetokenizer(lang="en")
159
+
160
+ ##############################
161
+ # TRANSLITERATOR (Python object)
162
+ ##############################
163
+ self._xliterator = UnicodeIndicTransliterator()
164
+
165
+ ##############################
166
+ # Precompiled Patterns
167
+ ##############################
168
+ self._MULTISPACE_REGEX = re.compile(r"[ ]{2,}")
169
+ self._DIGIT_SPACE_PERCENT = re.compile(r"(\d) %")
170
+ self._DOUBLE_QUOT_PUNC = re.compile(r"\"([,\.]+)")
171
+ self._DIGIT_NBSP_DIGIT = re.compile(r"(\d) (\d)")
172
+ self._END_BRACKET_SPACE_PUNC_REGEX = re.compile(r"\) ([\.!:?;,])")
173
+
174
+ self._URL_PATTERN = re.compile(
175
+ r"\b(?<![\w/.])(?:(?:https?|ftp)://)?(?:(?:[\w-]+\.)+(?!\.))(?:[\w/\-?#&=%.]+)+(?!\.\w+)\b"
176
+ )
177
+ self._NUMERAL_PATTERN = re.compile(
178
+ r"(~?\d+\.?\d*\s?%?\s?-?\s?~?\d+\.?\d*\s?%|~?\d+%|\d+[-\/.,:']\d+[-\/.,:'+]\d+(?:\.\d+)?|\d+[-\/.:'+]\d+(?:\.\d+)?)"
179
+ )
180
+ self._EMAIL_PATTERN = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}")
181
+ self._OTHER_PATTERN = re.compile(r"[A-Za-z0-9]*[#|@]\w+")
182
+
183
+ # Combined punctuation replacements
184
+ self._PUNC_REPLACEMENTS = [
185
+ (re.compile(r"\r"), ""),
186
+ (re.compile(r"\(\s*"), "("),
187
+ (re.compile(r"\s*\)"), ")"),
188
+ (re.compile(r"\s:\s?"), ":"),
189
+ (re.compile(r"\s;\s?"), ";"),
190
+ (re.compile(r"[`´‘‚’]"), "'"),
191
+ (re.compile(r"[„“”«»]"), '"'),
192
+ (re.compile(r"[–—]"), "-"),
193
+ (re.compile(r"\.\.\."), "..."),
194
+ (re.compile(r" %"), "%"),
195
+ (re.compile(r"nº "), "nº "),
196
+ (re.compile(r" ºC"), " ºC"),
197
+ (re.compile(r" [?!;]"), lambda m: m.group(0).strip()),
198
+ (re.compile(r", "), ", "),
199
+ ]
200
+
201
+ self._INDIC_FAILURE_CASES = [
202
+ "آی ڈی ",
203
+ "ꯑꯥꯏꯗꯤ",
204
+ "आईडी",
205
+ "आई . डी . ",
206
+ "आई . डी .",
207
+ "आई. डी. ",
208
+ "आई. डी.",
209
+ "आय. डी. ",
210
+ "आय. डी.",
211
+ "आय . डी . ",
212
+ "आय . डी .",
213
+ "ऐटि",
214
+ "آئی ڈی ",
215
+ "ᱟᱭᱰᱤ ᱾",
216
+ "आयडी",
217
+ "ऐडि",
218
+ "आइडि",
219
+ "ᱟᱭᱰᱤ",
220
+ ]
221
+
222
+ # Internal Method: Apply punctuation replacements
223
+ cdef str _apply_punc_replacements(self, str text, list replacements) except *:
224
+ """
225
+ Apply a list of (pattern, replacement) in sequence to text.
226
+ """
227
+ cdef int i
228
+ cdef tuple pair
229
+ for i in range(len(replacements)):
230
+ pair = replacements[i]
231
+ text = pair[0].sub(pair[1], text)
232
+ return text
233
+
234
+ # Internal Method: Punctuation Normalization
235
+ cdef str _punc_norm(self, str text) except *:
236
+ """
237
+ Consolidate punctuation normalization in fewer passes.
238
+ """
239
+ # 1) Apply replacements
240
+ text = self._apply_punc_replacements(text, self._PUNC_REPLACEMENTS)
241
+
242
+ # 2) Additional patterns
243
+ text = self._MULTISPACE_REGEX.sub(" ", text)
244
+ text = self._END_BRACKET_SPACE_PUNC_REGEX.sub(r")\1", text)
245
+ text = self._DIGIT_SPACE_PERCENT.sub(r"\1%", text)
246
+ text = self._DOUBLE_QUOT_PUNC.sub(r'\1"', text)
247
+ text = self._DIGIT_NBSP_DIGIT.sub(r"\1.\2", text)
248
+ return text.strip()
249
+
250
+ # Internal Method: Wrap Text with Placeholders
251
+ cdef str _wrap_with_placeholders(self, str text) except *:
252
+ """
253
+ Wrap substrings with matched patterns in the text with placeholders.
254
+ Store the placeholder map in the queue for retrieval in postprocessing.
255
+ """
256
+ cdef int serial_no = 1
257
+ cdef dict placeholder_entity_map = {}
258
+ cdef list patterns = [
259
+ self._EMAIL_PATTERN,
260
+ self._URL_PATTERN,
261
+ self._NUMERAL_PATTERN,
262
+ self._OTHER_PATTERN,
263
+ ]
264
+ cdef object pattern
265
+ cdef set matches
266
+ cdef str match
267
+ cdef str base_placeholder
268
+ cdef int i
269
+
270
+ for pattern in patterns:
271
+ matches = set(pattern.findall(text))
272
+ for match in matches:
273
+ # Additional checks
274
+ if pattern is self._URL_PATTERN:
275
+ if len(match.replace(".", "")) < 4:
276
+ continue
277
+ if pattern is self._NUMERAL_PATTERN:
278
+ if len(match.replace(" ", "").replace(".", "").replace(":", "")) < 4:
279
+ continue
280
+
281
+ base_placeholder = f"<ID{serial_no}>"
282
+ # Map various placeholder formats to the matched text
283
+ placeholder_entity_map[f"<ID{serial_no}>"] = match
284
+ placeholder_entity_map[f"< ID{serial_no} >"] = match
285
+ placeholder_entity_map[f"[ID{serial_no}]"] = match
286
+ placeholder_entity_map[f"[ ID{serial_no} ]"] = match
287
+ placeholder_entity_map[f"[ID {serial_no}]"] = match
288
+ placeholder_entity_map[f"<ID{serial_no}]"] = match
289
+ placeholder_entity_map[f"< ID{serial_no}]"] = match
290
+ placeholder_entity_map[f"<ID{serial_no} ]"] = match
291
+
292
+ # Handle Indic failure cases
293
+ for i in range(len(self._INDIC_FAILURE_CASES)):
294
+ indic_case = self._INDIC_FAILURE_CASES[i]
295
+ placeholder_entity_map[f"<{indic_case}{serial_no}>"] = match
296
+ placeholder_entity_map[f"< {indic_case}{serial_no} >"] = match
297
+ placeholder_entity_map[f"< {indic_case} {serial_no} >"] = match
298
+ placeholder_entity_map[f"<{indic_case} {serial_no}]"] = match
299
+ placeholder_entity_map[f"< {indic_case} {serial_no} ]"] = match
300
+ placeholder_entity_map[f"[{indic_case}{serial_no}]"] = match
301
+ placeholder_entity_map[f"[{indic_case} {serial_no}]"] = match
302
+ placeholder_entity_map[f"[ {indic_case}{serial_no} ]"] = match
303
+ placeholder_entity_map[f"[ {indic_case} {serial_no} ]"] = match
304
+ placeholder_entity_map[f"{indic_case} {serial_no}"] = match
305
+ placeholder_entity_map[f"{indic_case}{serial_no}"] = match
306
+
307
+ # Replace the match with the base placeholder
308
+ text = text.replace(match, base_placeholder)
309
+ serial_no += 1
310
+
311
+ # Clean up any remaining placeholder artifacts
312
+ text = re.sub(r"\s+", " ", text).replace(">/", ">").replace("]/", "]")
313
+ self._placeholder_entity_maps.put(placeholder_entity_map)
314
+ return text
315
+
316
+ # Internal Method: Normalize Text
317
+ cdef str _normalize(self, str text) except *:
318
+ """
319
+ Normalizes numerals and optionally wraps placeholders.
320
+ """
321
+ # Single-pass digit translation
322
+ text = text.translate(self._digits_translation_table)
323
+
324
+ if self.inference:
325
+ text = self._wrap_with_placeholders(text)
326
+ return text
327
+
328
+ # Internal Method: Indic Tokenize and Transliterate
329
+ cdef str _do_indic_tokenize_and_transliterate(
330
+ self,
331
+ str sentence,
332
+ object normalizer,
333
+ str iso_lang,
334
+ bint transliterate
335
+ ) except *:
336
+ """
337
+ Helper method: normalizes, tokenizes, optionally transliterates from iso_lang -> 'hi'.
338
+ """
339
+ cdef str normed
340
+ cdef list tokens
341
+ cdef str joined
342
+ cdef str xlated
343
+
344
+ normed = normalizer.normalize(sentence.strip())
345
+ tokens = indic_tokenize.trivial_tokenize(normed, iso_lang)
346
+ joined = " ".join(tokens)
347
+ xlated = joined
348
+ if transliterate:
349
+ xlated = self._xliterator.transliterate(joined, iso_lang, "hi")
350
+ xlated = xlated.replace(" ् ", "्")
351
+ return xlated
352
+
353
+ # Internal Method: Preprocess a Single Sentence
354
+ cdef str _preprocess(
355
+ self,
356
+ str sent,
357
+ str src_lang,
358
+ str tgt_lang,
359
+ object normalizer,
360
+ bint is_target
361
+ ) except *:
362
+ """
363
+ Preprocess a single sentence: punctuation normalization, numeral normalization,
364
+ tokenization, transliteration, and adding language tags if necessary.
365
+ """
366
+ cdef str iso_lang = self._flores_codes.get(src_lang, "hi")
367
+ cdef str script_part = src_lang.split("_")[1]
368
+ cdef bint do_transliterate = True
369
+ cdef str e_strip
370
+ cdef str e_norm
371
+ cdef list e_tokens
372
+ cdef str processed_sent
373
+
374
+ # 1) Punctuation normalization
375
+ sent = self._punc_norm(sent)
376
+
377
+ # 2) Numerals & placeholders
378
+ sent = self._normalize(sent)
379
+
380
+ if script_part in ["Arab", "Aran", "Olck", "Mtei", "Latn"]:
381
+ do_transliterate = False
382
+
383
+ if iso_lang == "en":
384
+ # English path
385
+ e_strip = sent.strip()
386
+ e_norm = self._en_normalizer.normalize(e_strip)
387
+ e_tokens = self._en_tok.tokenize(e_norm, escape=False)
388
+ processed_sent = " ".join(e_tokens)
389
+ else:
390
+ # Indic path
391
+ processed_sent = self._do_indic_tokenize_and_transliterate(sent, normalizer, iso_lang, do_transliterate)
392
+
393
+ processed_sent = processed_sent.strip()
394
+ if not is_target:
395
+ return f"{src_lang} {tgt_lang} {processed_sent}"
396
+ else:
397
+ return processed_sent
398
+
399
+ # Internal Method: Postprocess a Single Sentence
400
+ cdef str _postprocess(self, object sent, str lang) except *:
401
+ """
402
+ Postprocess a single sentence:
403
+ 1) Pull placeholder map from queue
404
+ 2) Fix scripts for Perso-Arabic
405
+ 3) Restore placeholders
406
+ 4) Detokenize
407
+ """
408
+ cdef dict placeholder_entity_map
409
+ cdef str lang_code
410
+ cdef str script_code
411
+ cdef str iso_lang
412
+ cdef str k
413
+ cdef str v
414
+ cdef str xlated
415
+
416
+ # Unwrap if sent is a tuple or list
417
+ if isinstance(sent, (tuple, list)):
418
+ sent = sent[0]
419
+
420
+ placeholder_entity_map = self._placeholder_entity_maps.get()
421
+ lang_code, script_code = lang.split("_", 1)
422
+ iso_lang = self._flores_codes.get(lang, "hi")
423
+
424
+ # Fix for Perso-Arabic scripts
425
+ if script_code in ["Arab", "Aran"]:
426
+ sent = (
427
+ sent.replace(" ؟", "؟")
428
+ .replace(" ۔", "۔")
429
+ .replace(" ،", "،")
430
+ .replace("ٮ۪", "ؠ")
431
+ )
432
+
433
+ # Oriya fix
434
+ if lang_code == "ory":
435
+ sent = sent.replace("ଯ଼", "ୟ")
436
+
437
+ # Restore placeholders
438
+ for k, v in placeholder_entity_map.items():
439
+ sent = sent.replace(k, v)
440
+
441
+ # Detokenize
442
+ if lang == "eng_Latn":
443
+ return self._en_detok.detokenize(sent.split(" "))
444
+ else:
445
+ xlated = self._xliterator.transliterate(sent, "hi", iso_lang)
446
+ return indic_detokenize.trivial_detokenize(xlated, iso_lang)
447
+
448
+ # Exposed Method: Preprocess a Batch of Sentences
449
+ cpdef list preprocess_batch(
450
+ self,
451
+ List[str] batch,
452
+ str src_lang,
453
+ str tgt_lang=None,
454
+ bint is_target=False,
455
+ bint visualize=False
456
+ ):
457
+ """
458
+ Preprocess an array of sentences (normalize, tokenize, transliterate).
459
+ This is exposed for external use.
460
+ """
461
+ cdef object normalizer = None
462
+ cdef str iso_code = self._flores_codes.get(src_lang, "hi")
463
+ cdef object iterator
464
+ cdef list results
465
+ cdef int i
466
+ cdef int n = len(batch)
467
+
468
+ if src_lang != "eng_Latn":
469
+ normalizer = IndicNormalizerFactory().get_normalizer(iso_code)
470
+
471
+ if visualize:
472
+ iterator = tqdm(batch, total=n, desc=f" | > Pre-processing {src_lang}", unit="line")
473
+ else:
474
+ iterator = batch
475
+
476
+ return [self._preprocess(s, src_lang, tgt_lang, normalizer, is_target) for s in iterator]
477
+
478
+ # Exposed Method: Postprocess a Batch of Sentences
479
+ cpdef list postprocess_batch(
480
+ self,
481
+ List[str] sents,
482
+ str lang="hin_Deva",
483
+ bint visualize=False
484
+ ):
485
+ """
486
+ Postprocess a batch of sentences:
487
+ Restore placeholders, fix script issues, and detokenize.
488
+ This is exposed for external use.
489
+ """
490
+ cdef object iterator
491
+ cdef list results
492
+ cdef int i
493
+ cdef int n = len(sents)
494
+
495
+ if visualize:
496
+ iterator = tqdm(sents, total=n, desc=f" | > Post-processing {lang}", unit="line")
497
+ else:
498
+ iterator = sents
499
+
500
+ results = [self._postprocess(s, lang) for s in iterator]
501
+ self._placeholder_entity_maps.queue.clear()
502
+
503
+ return results
IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/version.py ADDED
@@ -0,0 +1 @@
 
 
1
+ __version__ = "1.0.3"
IndicTrans2/huggingface_interface/IndicTransToolkit/IndicTransToolkit/version.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ 1.0.3
IndicTrans2/huggingface_interface/IndicTransToolkit/LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) Varun Gumma.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
IndicTrans2/huggingface_interface/IndicTransToolkit/README.md ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # IndicTransToolkit
2
+
3
+ ## About
4
+ The goal of this repository is to provide a simple, modular, and extendable toolkit for [IndicTrans2](https://github.com/AI4Bharat/IndicTrans2) and be compatible with the HuggingFace models released. Please refer to the `CHANGELOG.md` for latest developments.
5
+
6
+ ## Pre-requisites
7
+ - `Python 3.8+`
8
+ - [Indic NLP Library](https://github.com/VarunGumma/indic_nlp_library)
9
+ - Other requirements as listed in `requirements.txt`
10
+
11
+ ## Configuration
12
+ - Editable installation (Note, this may take a while):
13
+ ```bash
14
+ git clone https://github.com/VarunGumma/IndicTransToolkit
15
+ cd IndicTransToolkit
16
+
17
+ pip install --editable . --use-pep517 # required for pip >= 25.0
18
+
19
+ # in case it fails, try:
20
+ # pip install --editable . --use-pep517 --config-settings editable_mode=compat
21
+ ```
22
+
23
+ ## Examples
24
+ For the training use case, please refer [here](https://github.com/AI4Bharat/IndicTrans2/tree/main/huggingface_interface).
25
+
26
+ ### PreTrainedTokenizer
27
+ ```python
28
+ import torch
29
+ from IndicTransToolkit.processor import IndicProcessor # NOW IMPLEMENTED IN CYTHON !!
30
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
31
+
32
+ ip = IndicProcessor(inference=True)
33
+ tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indictrans2-en-indic-dist-200M", trust_remote_code=True)
34
+ model = AutoModelForSeq2SeqLM.from_pretrained("ai4bharat/indictrans2-en-indic-dist-200M", trust_remote_code=True)
35
+
36
+ sentences = [
37
+ "This is a test sentence.",
38
+ "This is another longer different test sentence.",
39
+ "Please send an SMS to 9876543210 and an email on [email protected] by 15th October, 2023.",
40
+ ]
41
+
42
+ batch = ip.preprocess_batch(sentences, src_lang="eng_Latn", tgt_lang="hin_Deva", visualize=False) # set it to visualize=True to print a progress bar
43
+ batch = tokenizer(batch, padding="longest", truncation=True, max_length=256, return_tensors="pt")
44
+
45
+ with torch.inference_mode():
46
+ outputs = model.generate(**batch, num_beams=5, num_return_sequences=1, max_length=256)
47
+
48
+ with tokenizer.as_target_tokenizer():
49
+ # This scoping is absolutely necessary, as it will instruct the tokenizer to tokenize using the target vocabulary.
50
+ # Failure to use this scoping will result in gibberish/unexpected predictions as the output will be de-tokenized with the source vocabulary instead.
51
+ outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True, clean_up_tokenization_spaces=True)
52
+
53
+ outputs = ip.postprocess_batch(outputs, lang="hin_Deva")
54
+ print(outputs)
55
+
56
+ >>> ['यह एक परीक्षण वाक्य है।', 'यह एक और लंबा अलग परीक्षण वाक्य है।', 'कृपया 9876543210 पर एक एस. एम. एस. भेजें और 15 अक्टूबर, 2023 तक [email protected] पर एक ईमेल भेजें।']
57
+ ```
58
+
59
+ ### Evaluation
60
+ - `IndicEvaluator` is a python implementation of [compute_metrics.sh](https://github.com/AI4Bharat/IndicTrans2/blob/main/compute_metrics.sh).
61
+ - We have found that this python implementation gives slightly lower scores than the original `compute_metrics.sh`. So, please use this function cautiously, and feel free to raise a PR if you have found the bug/fix.
62
+ ```python
63
+ from IndicTransToolkit import IndicEvaluator
64
+
65
+ # this method returns a dictionary with BLEU and ChrF2++ scores with appropriate signatures
66
+ evaluator = IndicEvaluator()
67
+ scores = evaluator.evaluate(tgt_lang=tgt_lang, preds=pred_file, refs=ref_file)
68
+
69
+ # alternatively, you can pass the list of predictions and references instead of files
70
+ # scores = evaluator.evaluate(tgt_lang=tgt_lang, preds=preds, refs=refs)
71
+ ```
72
+
73
+ ## Authors
74
+ - Varun Gumma ([email protected])
75
+ - Jay Gala ([email protected])
76
+ - Pranjal Agadh Chitale ([email protected])
77
+ - Raj Dabre ([email protected])
78
+
79
+
80
+ ## Bugs and Contribution
81
+ Since this is a bleeding-edge module, you may encounter broken stuff and import issues once in a while. In case you encounter any bugs or want additional functionalities, please feel free to raise `Issues`/`Pull Requests` or contact the authors.
82
+
83
+
84
+ ## Citation
85
+ If you use our codebase, or models, please do cite the following paper:
86
+ ```bibtex
87
+ @article{
88
+ gala2023indictrans,
89
+ title={IndicTrans2: Towards High-Quality and Accessible Machine Translation Models for all 22 Scheduled Indian Languages},
90
+ author={Jay Gala and Pranjal A Chitale and A K Raghavan and Varun Gumma and Sumanth Doddapaneni and Aswanth Kumar M and Janki Atul Nawale and Anupama Sujatha and Ratish Puduppully and Vivek Raghavan and Pratyush Kumar and Mitesh M Khapra and Raj Dabre and Anoop Kunchukuttan},
91
+ journal={Transactions on Machine Learning Research},
92
+ issn={2835-8856},
93
+ year={2023},
94
+ url={https://openreview.net/forum?id=vfT4YuzAYA},
95
+ note={}
96
+ }
97
+ ```
IndicTrans2/huggingface_interface/IndicTransToolkit/app.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ from transformers import AutoModelForSeq2SeqLM, BitsAndBytesConfig, AutoTokenizer
4
+ from IndicTransToolkit import IndicProcessor
5
+ import speech_recognition as sr
6
+
7
+ # Constants
8
+ BATCH_SIZE = 4
9
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
10
+ quantization = None
11
+
12
+ # ---- IndicTrans2 Model Initialization ----
13
def initialize_model_and_tokenizer(ckpt_dir, quantization):
    """Load the IndicTrans2 tokenizer and seq2seq model from ``ckpt_dir``.

    ``quantization`` selects an optional bitsandbytes setup: ``"4-bit"``,
    ``"8-bit"``, or anything else for full precision. Returns a
    ``(tokenizer, model)`` pair with the model already in eval mode.
    """
    # Translate the quantization flag into a BitsAndBytesConfig
    # (None means no quantization at all).
    qconfig = None
    if quantization == "4-bit":
        qconfig = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
    elif quantization == "8-bit":
        qconfig = BitsAndBytesConfig(
            load_in_8bit=True,
            bnb_8bit_use_double_quant=True,
            bnb_8bit_compute_dtype=torch.bfloat16,
        )

    tokenizer = AutoTokenizer.from_pretrained(ckpt_dir, trust_remote_code=True)
    model = AutoModelForSeq2SeqLM.from_pretrained(
        ckpt_dir,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        quantization_config=qconfig,
    )

    # Quantized weights are placed by bitsandbytes; only move/halve the
    # model ourselves when running unquantized.
    if qconfig is None:
        model = model.to(DEVICE)
        if DEVICE == "cuda":
            model.half()

    model.eval()
    return tokenizer, model
44
+
45
def batch_translate(input_sentences, src_lang, tgt_lang, model, tokenizer, ip):
    """Translate ``input_sentences`` from ``src_lang`` to ``tgt_lang``.

    Sentences are processed in chunks of ``BATCH_SIZE``; returns the list of
    translated sentences in input order.
    """
    translations = []
    for start in range(0, len(input_sentences), BATCH_SIZE):
        # Preprocess the current slice with the Indic processor.
        chunk = ip.preprocess_batch(
            input_sentences[start : start + BATCH_SIZE],
            src_lang=src_lang,
            tgt_lang=tgt_lang,
        )

        # Encode the chunk into padded tensors on the target device.
        encoded = tokenizer(
            chunk,
            truncation=True,
            padding="longest",
            return_tensors="pt",
            return_attention_mask=True,
        ).to(DEVICE)

        # Beam-search decode without gradient tracking.
        with torch.no_grad():
            output_ids = model.generate(
                **encoded,
                use_cache=True,
                min_length=0,
                max_length=256,
                num_beams=5,
                num_return_sequences=1,
            )

        # Convert token ids back to text in target-tokenizer mode.
        with tokenizer.as_target_tokenizer():
            decoded = tokenizer.batch_decode(
                output_ids.detach().cpu().tolist(),
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True,
            )

        translations.extend(ip.postprocess_batch(decoded, lang=tgt_lang))

        # Drop the encodings and release cached GPU memory between chunks.
        del encoded
        torch.cuda.empty_cache()

    return translations
80
+
81
+ # Initialize IndicTrans2
82
# Load the IndicTrans2 checkpoint and text processor once at startup.
# NOTE(review): the variable names say "en_indic" but the checkpoint id is
# indictrans2-indic-en (Indic -> English); names kept for compatibility.
en_indic_ckpt_dir = "ai4bharat/indictrans2-indic-en-1B"
en_indic_tokenizer, en_indic_model = initialize_model_and_tokenizer(
    en_indic_ckpt_dir, quantization
)
ip = IndicProcessor(inference=True)
85
+
86
+ # ---- Gradio Function ----
87
def transcribe_and_translate(audio):
    """Transcribe a Malayalam audio file and translate it to English.

    Returns a ``(malayalam_text, english_translation)`` pair. On a
    recognition failure the first element carries the error message and the
    second element is an empty string.
    """
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio) as source:
        audio_data = recognizer.record(source)

    try:
        # Malayalam transcription via the Google Web Speech API.
        malayalam_text = recognizer.recognize_google(audio_data, language="ml-IN")
    except sr.UnknownValueError:
        return "Could not understand audio", ""
    except sr.RequestError as e:
        return f"Google API Error: {e}", ""

    # Translate the transcribed text Malayalam -> English with IndicTrans2.
    src_lang, tgt_lang = "mal_Mlym", "eng_Latn"
    translated = batch_translate(
        [malayalam_text], src_lang, tgt_lang, en_indic_model, en_indic_tokenizer, ip
    )
    return malayalam_text, translated[0]
105
+
106
+ # ---- Gradio Interface ----
107
# Wire the pipeline into a Gradio UI: audio in (mic or upload), two text
# boxes out (transcription and translation).
output_boxes = [
    gr.Textbox(label="Malayalam Transcription"),
    gr.Textbox(label="English Translation"),
]
iface = gr.Interface(
    fn=transcribe_and_translate,
    inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
    outputs=output_boxes,
    title="Malayalam Speech Recognition & Translation",
    description="Speak in Malayalam → Transcribe using Google Speech Recognition → Translate to English using IndicTrans2.",
)

iface.launch(debug=True)
IndicTrans2/huggingface_interface/IndicTransToolkit/build/lib.linux-x86_64-cpython-310/IndicTransToolkit/fast_processor.cpython-310-x86_64-linux-gnu.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d57d4239b3638a272e4b70292f10494ee4a0fee201a9d74c62fc35a3d263a45
3
+ size 260304
IndicTrans2/huggingface_interface/IndicTransToolkit/build/lib.linux-x86_64-cpython-310/IndicTransToolkit/processor.cpython-310-x86_64-linux-gnu.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c1a101ecb27adaf367f00c90b3f8e96e7fbda3bf0560d48c368fec3750a040a4
3
+ size 229200
IndicTrans2/huggingface_interface/IndicTransToolkit/build/temp.linux-x86_64-cpython-310/IndicTransToolkit/fast_processor.o ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f9e82df38b208dc0a9b468ff669c9da159c7deaabcb389fcfacd43e038504fec
3
+ size 347184
IndicTrans2/huggingface_interface/IndicTransToolkit/build/temp.linux-x86_64-cpython-310/IndicTransToolkit/processor.o ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d27c2cc00c97a89f97f7c28bc9175c5c403a0e2a372a0b39f1c5fe8609adda09
3
+ size 303696
IndicTrans2/huggingface_interface/IndicTransToolkit/main.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import AutoModelForSeq2SeqLM, BitsAndBytesConfig, AutoTokenizer
3
+ from IndicTransToolkit import IndicProcessor
4
+
5
+ # Constants
6
+ BATCH_SIZE = 4
7
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
8
+ quantization = None
9
+
10
def initialize_model_and_tokenizer(ckpt_dir, quantization):
    """Initialize the model and tokenizer with optional quantization.

    ``quantization`` may be ``"4-bit"``, ``"8-bit"``, or any other value for
    full precision. Returns ``(tokenizer, model)`` with the model in eval mode.
    """
    # Lookup table of bitsandbytes keyword sets; an unknown flag maps to None.
    quant_kwargs = {
        "4-bit": dict(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
        ),
        "8-bit": dict(
            load_in_8bit=True,
            bnb_8bit_use_double_quant=True,
            bnb_8bit_compute_dtype=torch.bfloat16,
        ),
    }.get(quantization)
    qconfig = BitsAndBytesConfig(**quant_kwargs) if quant_kwargs is not None else None

    tokenizer = AutoTokenizer.from_pretrained(ckpt_dir, trust_remote_code=True)
    model = AutoModelForSeq2SeqLM.from_pretrained(
        ckpt_dir,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        quantization_config=qconfig,
    )

    # Only relocate/halve the model ourselves when it is not quantized.
    if qconfig is None:
        model = model.to(DEVICE)
        if DEVICE == "cuda":
            model.half()

    model.eval()
    return tokenizer, model
42
+
43
+
44
def batch_translate(input_sentences, src_lang, tgt_lang, model, tokenizer, ip):
    """Batch translate sentences from src_lang to tgt_lang.

    Sentences are consumed in chunks of ``BATCH_SIZE``; the translated
    sentences are returned in input order.
    """
    results = []
    cursor = 0
    total = len(input_sentences)

    while cursor < total:
        # Preprocess the current slice (entity mappings are handled by ip).
        prepared = ip.preprocess_batch(
            input_sentences[cursor : cursor + BATCH_SIZE],
            src_lang=src_lang,
            tgt_lang=tgt_lang,
        )

        # Tokenize the slice into padded tensors on the target device.
        model_inputs = tokenizer(
            prepared,
            truncation=True,
            padding="longest",
            return_tensors="pt",
            return_attention_mask=True,
        ).to(DEVICE)

        # Generate translations with beam search, no gradients needed.
        with torch.no_grad():
            token_ids = model.generate(
                **model_inputs,
                use_cache=True,
                min_length=0,
                max_length=256,
                num_beams=5,
                num_return_sequences=1,
            )

        # Decode the generated ids into text.
        with tokenizer.as_target_tokenizer():
            decoded = tokenizer.batch_decode(
                token_ids.detach().cpu().tolist(),
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True,
            )

        # Postprocess the translations, including entity replacement.
        results += ip.postprocess_batch(decoded, lang=tgt_lang)

        del model_inputs
        torch.cuda.empty_cache()
        cursor += BATCH_SIZE

    return results
89
+
90
+
91
+ # Initialize the model and processor
92
# Load the IndicTrans2 checkpoint and text processor.
# NOTE(review): the variable names say "en_indic" but the checkpoint id is
# indictrans2-indic-en (Indic -> English); names kept for compatibility.
en_indic_ckpt_dir = "ai4bharat/indictrans2-indic-en-1B"
en_indic_tokenizer, en_indic_model = initialize_model_and_tokenizer(
    en_indic_ckpt_dir, quantization
)
ip = IndicProcessor(inference=True)

# Sample Malayalam input: a single long passage.
input_sents = [
    """ഹലോ ഫ്രണ്ട്സ് കോളേജ് സ്കൂളിൻറെ മറ്റൊരു അധ്യായത്തിലേക്ക് ഏവർക്കും സ്വാഗതം ഇന്ന് ഞാൻ വന്നിരിക്കുന്നത് ചെറിയ കുട്ടികൾക്കായുള്ള ഒരു മലയാളം പ്രസംഗവും ആയിട്ടാണ് പ്രസംഗ വിഷയം ഇന്ത്യ എൻറെ രാജ്യം ആയിരക്കണക്കിന് വർഷങ്ങളുടെ പാരമ്പര്യം പേറുന്ന മഹത്തായ രാജ്യമാണ് ഇന്ത്യ 1947 ൽ ബ്രിട്ടീഷുകാരിൽ നിന്നും സ്വാതന്ത്ര്യം നേടിയ നമ്മുടെ ഭാരതം അനേകം നാട്ടുരാജ്യങ്ങൾ ചേർന്ന് ഏറ്റവും വലിയ ജനാധിപത്യ രാജ്യമായി ആശയുടെ അടിസ്ഥാനത്തിൽ നല്ല ഭരണത്തിന് സഹായകമാകും വിധം സംസ്ഥാനങ്ങൾ രൂപം കൊണ്ടും എന്ന് 28 സംസ്ഥാനങ്ങൾ ആണ് ഇന്ത്യയിൽ ഉള്ളത് നാനാത്വത്തിലെ ഏകത്വം എന്ന ചിന്ത വിവിധ ഭാഷകളും ജാതികളും മതങ്ങളും ആചാരങ്ങളും ജീവിതരീതികളും ഉള്ള ഒരു വലിയ ജനതയെ ഒറ്റക്കെട്ടായി നിർത്തുന്നു അതാണ് ഭാരതത്തിൻറെ വിജയം നേടിയ ലോകമേ തറവാട് എന്നതാണ് ഭാരത സംസ്കാരം അതുകൊണ്ട് തന്നെ ഇന്ത്യക്കാരെ മാത്രമല്ല ലോകം മുഴുവനും ഉള്ള എല്ലാവരെയും ഭാരതം സന്തോഷത്തോടെ ഉൾക്കൊള്ളുകയും സ്നേഹിക്കുകയും ചെയ്യുന്ന പ്രസിഡണ്ടും പ്രധാനമന്ത്രിയും മന്ത്രിമാരും ചേർന്ന് നമ്മുടെ രാജ്യം ഭരിക്കുന്നു മുഖ്യമന്ത്രിയും മന്ത്രിമാരും ചേർന്ന് സംസ്ഥാനങ്ങളെയും പരിപാലിക്കുന്നു എൻറെ ഇന്ത്യ അഭിമാനമാണ് സംസ്കാരങ്ങൾ ചേർന്ന് മനോഹരിയായി പുഞ്ചിരിക്കുന്ന എൻറെ അമ്മ ഭാരതമെന്നു കേട്ടാൽ തിളക്കണം ചോര നമുക്ക് ഞരമ്പുകളിൽ"""
]

# Translate Malayalam -> English.
src_lang, tgt_lang = "mal_Mlym", "eng_Latn"
translations = batch_translate(
    input_sents, src_lang, tgt_lang, en_indic_model, en_indic_tokenizer, ip
)

# Print source/translation pairs.
print(f"\n{src_lang} - {tgt_lang}")
for source_sentence, translated_sentence in zip(input_sents, translations):
    print(f"{src_lang}: {source_sentence}")
    print(f"{tgt_lang}: {translated_sentence}")

# Free GPU memory.
del en_indic_tokenizer, en_indic_model
torch.cuda.empty_cache()
IndicTrans2/huggingface_interface/IndicTransToolkit/pyproject.toml ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [build-system]
2
+ requires = [
3
+ "setuptools>=68.2.2",
4
+ "wheel",
5
+ "Cython",
6
+ ]
7
+ build-backend = "setuptools.build_meta"
8
+
9
+ [tool.black]
10
+ # Black configuration for code formatting
11
+ line-length = 88
12
+ target-version = ['py38']
13
+ exclude = '''
14
+ /(
15
+ \.git
16
+ | \.hg
17
+ | \.mypy_cache
18
+ | \.tox
19
+ | \.venv
20
+ | _build
21
+ | buck-out
22
+ | build
23
+ | dist
24
+ )/
25
+ '''