import sys

from transformers import AutoTokenizer
dataset = sys.argv[1]
model_name_or_path = sys.argv[2]
max_len = int(sys.argv[3])
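
# Example invocation (a sketch; the script name, data file, and model are placeholders):
#   python preprocess.py train.txt.tmp bert-base-multilingual-cased 128 > train.txt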

subword_len_counter = 0

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
# Reserve room for the special tokens the tokenizer adds around each sequence.
max_len -= tokenizer.num_special_tokens_to_add()

with open(dataset, "rt") as f_p:
    for line in f_p:
        line = line.rstrip()

        if not line:
            print(line)
            subword_len_counter = 0
            continue

        token = line.split()[0]

        current_subwords_len = len(tokenizer.tokenize(token))

        # Skip lines whose token yields no subwords
        # (e.g. tokens made up only of control characters).
        if current_subwords_len == 0:
            continue

        # Adding this token would exceed max_len: emit a sentence break
        # and carry the token over to the new block.
        if (subword_len_counter + current_subwords_len) > max_len:
            print("")
            print(line)
            subword_len_counter = current_subwords_len
            continue

        subword_len_counter += current_subwords_len

        print(line)
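
# The input is expected in CoNLL-style "token label" lines with blank lines between
# sentences; the output keeps that format but inserts an extra blank line whenever the
# running subword count would exceed max_len, so later tokenization does not truncate examples.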