|
import os
|
|
import sys
|
|
from tqdm import tqdm
|
|
from typing import List
|
|
|
|
|
|
def concat_data(
    data_dir: str,
    out_dir: str,
    lang_pair_list: List[List[str]],
    out_src_lang: str = "SRC",
    out_tgt_lang: str = "TGT",
    split: str = "train",
):
    """
    Concatenate the data files of several language pairs into one source file and one target file.

    For every `[src_lang, tgt_lang]` pair, the files
    `{data_dir}/{src_lang}-{tgt_lang}/{split}.{src_lang}` and
    `{data_dir}/{src_lang}-{tgt_lang}/{split}.{tgt_lang}` are appended, in the order
    given by `lang_pair_list`, to `{out_dir}/{split}.{out_src_lang}` and
    `{out_dir}/{split}.{out_tgt_lang}` respectively. Pairs for which either input
    file is missing are skipped. Pre-existing output files are removed first, and
    `corpus_stats` is called once all pairs have been processed.

    Args:
        data_dir (str): path of the directory containing the data files for language pairs.
        out_dir (str): path of the directory where the output files will be saved.
        lang_pair_list (List[List[str]]): a list of language pairs, where each pair is a list of two strings.
        out_src_lang (str, optional): suffix to use for the source language (default: "SRC").
        out_tgt_lang (str, optional): suffix to use for the target language (default: "TGT").
        split (str, optional): name of the split (e.g. "train", "dev", "test") to concatenate (default: "train").

    Raises:
        OSError: if an input file cannot be read or an output file cannot be written.
    """
    # Imported locally so this function does not rely on a module-level import of shutil.
    import shutil

    def _append_file(in_fname: str, out_fname: str) -> None:
        # Byte-for-byte append (what `cat in >> out` did), but without going through
        # a shell: paths containing spaces or shell metacharacters are handled
        # correctly and I/O failures surface as exceptions instead of being ignored.
        with open(in_fname, "rb") as infile, open(out_fname, "ab") as outfile:
            shutil.copyfileobj(infile, outfile)

    os.makedirs(out_dir, exist_ok=True)

    out_src_fname = os.path.join(out_dir, f"{split}.{out_src_lang}")
    out_tgt_fname = os.path.join(out_dir, f"{split}.{out_tgt_lang}")

    print()
    print(out_src_fname)
    print(out_tgt_fname)

    # Start from a clean slate: the files below are opened in append mode, so
    # leftovers from a previous run must be removed first.
    if os.path.isfile(out_src_fname):
        os.unlink(out_src_fname)
    if os.path.isfile(out_tgt_fname):
        os.unlink(out_tgt_fname)

    for src_lang, tgt_lang in tqdm(lang_pair_list):
        print(f"src: {src_lang}, tgt:{tgt_lang}")

        pair_dir = os.path.join(data_dir, f"{src_lang}-{tgt_lang}")
        in_src_fname = os.path.join(pair_dir, f"{split}.{src_lang}")
        in_tgt_fname = os.path.join(pair_dir, f"{split}.{tgt_lang}")

        # Both sides are required; appending only one would misalign the
        # concatenated source and target files.
        if not os.path.exists(in_src_fname) or not os.path.exists(in_tgt_fname):
            continue

        print(in_src_fname)
        _append_file(in_src_fname, out_src_fname)

        print(in_tgt_fname)
        _append_file(in_tgt_fname, out_tgt_fname)

    corpus_stats(data_dir, out_dir, lang_pair_list, split)
|
|
|
|
|
|
def corpus_stats(data_dir: str, out_dir: str, lang_pair_list: List[List[str]], split: str):
    """
    Computes statistics for the given language pairs in a corpus and
    writes the results to a file in the output directory.

    One tab-separated line `src_lang<TAB>tgt_lang<TAB>line_count` is written to
    `{out_dir}/{split}_lang_pairs.txt` for every pair whose source and target
    files both exist, where `line_count` is the number of lines in
    `{data_dir}/{src_lang}-{tgt_lang}/{split}.{src_lang}`. Pairs with a missing
    file are skipped, so the listing matches the pairs that `concat_data`
    actually concatenates.

    Args:
        data_dir (str): path of the directory containing the corpus data.
        out_dir (str): path of the directory where the output file should be written.
        lang_pair_list (List[List[str]]): a list of language pairs as lists of strings in the form "`[src_lang, tgt_lang]`".
        split (str): a string indicating the split (e.g. 'train', 'dev', 'test') of the corpus to consider.
    """
    meta_fname = os.path.join(out_dir, f"{split}_lang_pairs.txt")

    with open(meta_fname, "w", encoding="utf-8") as lp_file:
        for src_lang, tgt_lang in tqdm(lang_pair_list):
            print(f"src: {src_lang}, tgt:{tgt_lang}")

            pair_dir = os.path.join(data_dir, f"{src_lang}-{tgt_lang}")
            in_src_fname = os.path.join(pair_dir, f"{split}.{src_lang}")
            in_tgt_fname = os.path.join(pair_dir, f"{split}.{tgt_lang}")

            # Same skip condition as the concatenation step: a pair only counts
            # when both of its files are present, otherwise the recorded sizes
            # would not line up with the concatenated output.
            if not os.path.exists(in_src_fname) or not os.path.exists(in_tgt_fname):
                continue

            print(in_src_fname)

            # Stream the file; only the number of lines is needed.
            with open(in_src_fname, "r", encoding="utf-8") as infile:
                corpus_size = sum(1 for _ in infile)

            lp_file.write(f"{src_lang}\t{tgt_lang}\t{corpus_size}\n")
|
|
|
|
|
|
if __name__ == "__main__":
    # Command line: <in_dir> <out_dir> <split>
    #   in_dir  - directory whose entries are named "<src_lang>-<tgt_lang>"
    #   out_dir - directory that receives the concatenated files
    #   split   - name of the split to concatenate (e.g. "train")
    if len(sys.argv) < 4:
        sys.exit(f"usage: {sys.argv[0]} <in_dir> <out_dir> <split>")

    in_dir = sys.argv[1]
    out_dir = sys.argv[2]
    split = sys.argv[3]
    lang_pair_list = []

    # os.listdir returns entries in arbitrary order; sorting makes the order of
    # the concatenated output reproducible across runs and machines.
    pairs = sorted(os.listdir(in_dir))
    for pair in pairs:
        parts = pair.split("-")
        # Ignore stray entries (hidden files, logs, ...) that are not named
        # "<src_lang>-<tgt_lang>" instead of crashing on the unpacking.
        if len(parts) != 2:
            print(f"skipping {pair!r}: not of the form <src_lang>-<tgt_lang>", file=sys.stderr)
            continue
        src_lang, tgt_lang = parts
        lang_pair_list.append([src_lang, tgt_lang])

    concat_data(in_dir, out_dir, lang_pair_list, split=split)
|
|
|