|
import os
|
|
from sys import argv
|
|
import multiprocessing as mp
|
|
|
|
|
|
def process_language(lang):
    """Merge and de-duplicate the English-``lang`` training pairs of all domains.

    For every entry of the module-level ``domains`` list, the parallel files
    ``{base_path}/{domain}/eng_Latn-{lang}/train.eng_Latn`` and
    ``{base_path}/{domain}/eng_Latn-{lang}/train.{lang}`` are read and paired
    line by line. Duplicate (source, target) pairs are dropped, keeping the
    first occurrence, and the result is written to
    ``{out_dir}/eng_Latn-{lang}/train.eng_Latn`` and ``train.{lang}``.

    The module-level names ``base_path``, ``out_dir`` and ``domains`` must be
    defined before this function is called.

    Args:
        lang: Target language code, e.g. ``"hin_Deva"``.

    Returns:
        None. If no pair is found for ``lang`` nothing is written.
    """
    print(f"lang: {lang}")

    # Dict keys de-duplicate while preserving first-seen order, so the output
    # is reproducible from run to run (a set would shuffle it).
    unique_pairs = {}

    for domain in domains:
        pair_dir = f"{base_path}/{domain}/eng_Latn-{lang}"
        src_fname = f"{pair_dir}/train.eng_Latn"
        tgt_fname = f"{pair_dir}/train.{lang}"

        try:
            with open(src_fname, "r", encoding="utf-8") as f1, open(
                tgt_fname, "r", encoding="utf-8"
            ) as f2:
                src_sents = [x.strip() for x in f1]
                tgt_sents = [x.strip() for x in f2]
        except (FileNotFoundError, NotADirectoryError):
            # Expected: this domain simply has no data for this language pair.
            continue
        except (OSError, UnicodeDecodeError) as e:
            # Unexpected but non-fatal: report it instead of hiding it, and
            # keep going with the remaining domains.
            print(f"warning: skipping {pair_dir}: {e}")
            continue

        if len(src_sents) != len(tgt_sents):
            # zip() below stops at the shorter file; make that visible.
            print(
                f"warning: {src_fname} has {len(src_sents)} lines but "
                f"{tgt_fname} has {len(tgt_sents)}; extra lines are ignored"
            )

        unique_pairs.update(dict.fromkeys(zip(src_sents, tgt_sents)))

    if not unique_pairs:
        # zip(*...) on an empty collection cannot be unpacked into two names.
        print(f"warning: no training pairs found for {lang}; nothing written")
        return

    src_sents, tgt_sents = zip(*unique_pairs)

    os.makedirs(f"{out_dir}/eng_Latn-{lang}", exist_ok=True)
    with open(
        f"{out_dir}/eng_Latn-{lang}/train.eng_Latn", "w", encoding="utf-8"
    ) as f1, open(
        f"{out_dir}/eng_Latn-{lang}/train.{lang}", "w", encoding="utf-8"
    ) as f2:
        f1.write("\n".join(src_sents))
        f2.write("\n".join(tgt_sents))
|
|
|
|
|
|
def init_worker(base, out, domain_names):
    """Publish the run configuration as module globals inside a pool worker.

    ``process_language`` reads ``base_path``, ``out_dir`` and ``domains`` as
    module-level names, but they are only assigned under the ``__main__``
    guard. Workers created with the "spawn" or "forkserver" start methods
    re-import this module without running that guard, so the names would be
    undefined there. Running this function as the pool initializer defines
    them in every worker regardless of the start method.

    Args:
        base: Root directory containing one sub-directory per domain.
        out: Directory the merged corpora are written to.
        domain_names: Names of the entries found directly under ``base``.
    """
    global base_path, out_dir, domains
    base_path = base
    out_dir = out
    domains = domain_names


if __name__ == "__main__":
    # Usage: <script> <base_path> <out_dir>
    if len(argv) < 3:
        raise SystemExit(f"usage: {argv[0]} <base_path> <out_dir>")

    base_path = argv[1]
    out_dir = argv[2]

    language_codes = [
        'asm_Beng', 'ben_Beng', 'brx_Deva', 'doi_Deva', 'gom_Deva',
        'guj_Gujr', 'hin_Deva', 'kan_Knda', 'kas_Arab', 'kas_Deva',
        'mai_Deva', 'mal_Mlym', 'mar_Deva', 'mni_Beng', 'mni_Mtei',
        'npi_Deva', 'ory_Orya', 'pan_Guru', 'san_Deva', 'sat_Olck',
        'snd_Arab', 'snd_Deva', 'tam_Taml', 'tel_Telu', 'urd_Arab'
    ]

    # Every entry directly under base_path is treated as a domain; sorting
    # makes the processing order independent of the filesystem listing order.
    domains = sorted(os.listdir(base_path))

    # One task per language, so more workers than languages would sit idle.
    n_workers = min(mp.cpu_count(), len(language_codes))

    with mp.Pool(
        n_workers,
        initializer=init_worker,
        initargs=(base_path, out_dir, domains),
    ) as pool:
        pool.map(process_language, language_codes)
|
|
|