import sys


def clean_vocab(in_vocab_fname: str, out_vocab_fname: str):
    """
    Cleans a vocabulary file by filtering out invalid lines.

    A line is valid when, after stripping surrounding spaces and line-ending
    characters, splitting it on single spaces yields exactly two fields.
    Valid lines are copied to the output file unchanged. Every other line is
    dropped and reported on stdout: first its zero-based index and stripped
    text, then each of its characters with the hex value of its code point,
    so that invisible or unexpected characters can be spotted.

    Both files are opened as UTF-8 text; the output file is created or
    truncated.

    Args:
        in_vocab_fname (str): path of the input vocabulary file.
        out_vocab_fname (str): path of the output vocabulary file.
    """
    with open(in_vocab_fname, "r", encoding="utf-8") as infile, open(
        out_vocab_fname, "w", encoding="utf-8"
    ) as outfile:
        for i, line in enumerate(infile):
            # Splitting on a single space (not arbitrary whitespace) means
            # consecutive spaces produce empty fields and tabs never separate
            # fields, so such lines are treated as invalid.
            fields = line.strip("\r\n ").split(" ")
            if len(fields) == 2:
                # Write the line as read, not the stripped copy.
                outfile.write(line)
            else:
                print(f"{i}: {line.strip()}")
                # Dump every character (including the line ending) with its
                # code point to expose what made the line invalid.
                for c in line:
                    print(f"{c}:{hex(ord(c))}")


def main(argv=None):
    """
    Command-line entry point: clean the vocabulary file named in argv.

    Args:
        argv: argument list shaped like ``sys.argv`` (program name first,
            then the input path and the output path). Defaults to
            ``sys.argv``. Arguments after the first two paths are ignored.

    Returns:
        int: 0 after a successful run, 1 if fewer than two paths were given
        (a usage message is printed to stderr in that case).
    """
    args = sys.argv if argv is None else argv
    if len(args) < 3:
        prog = args[0] if args else "clean_vocab"
        print(f"usage: {prog} IN_VOCAB OUT_VOCAB", file=sys.stderr)
        return 1
    clean_vocab(args[1], args[2])
    return 0


if __name__ == "__main__":
    sys.exit(main())