stt-ml / IndicTrans2 /scripts /convert_to_flores_codes.py
nitikdias's picture
Upload 114 files
74ee63f verified
import os
import sys
from flores_codes_map_indic import flores_to_iso
def convert_iso_to_flores(data_dir: str):
"""
Converts ISO language code to flores language code for a given directory of language pairs.
Assumes that each subdirectory of the given directory corresponds to a language pair, and
that each subdirectory contains files named according to the ISO language codes of the source
and target languages.
Args:
data_dir (str): path of the directory containing the data files for language pairs in ISO language code.
"""
pairs = os.listdir(data_dir)
iso_to_flores = {v:k for k, v in flores_to_iso.items()}
for pair in pairs:
print(pair)
path = os.path.join(data_dir, pair)
src_lang_iso, tgt_lang_iso = pair.split('-')
src_lang = iso_to_flores[src_lang_iso]
tgt_lang = iso_to_flores[tgt_lang_iso]
for fname in os.listdir(os.path.join(data_dir, pair)):
if fname.endswith(src_lang_iso):
old_fname = os.path.join(path, fname)
new_fname = os.path.join(path, fname.replace(src_lang_iso, src_lang))
os.rename(old_fname, new_fname)
if fname.endswith(tgt_lang_iso):
old_fname = os.path.join(path, fname)
new_fname = os.path.join(path, fname.replace(tgt_lang_iso, tgt_lang))
os.rename(old_fname, new_fname)
new_pair ="{}-{}".format(src_lang, tgt_lang)
new_path = os.path.join(data_dir, new_pair)
os.rename(path, new_path)
if __name__ == "__main__":
data_dir = sys.argv[1]
convert_iso_to_flores(data_dir)