Update train.py
train.py CHANGED
@@ -126,24 +126,24 @@ def configure_tokenizer(tokenizer):
 
 def update_tokenizer(tokenizer, dataset, batch_size=1000):
     existing_vocab = tokenizer.get_vocab()
-
     oov_tokens = set()
-
+
     for i in range(0, len(dataset['text']), batch_size):
-        batch = dataset['text'][i
-
+        batch = dataset['text'][i:i + batch_size]
+
         for text in batch:
-
-
-            for
-
-
-
-            for token in tokens:
-                if token not in existing_vocab:
+            token_ids = tokenizer.encode(text, add_special_tokens=False)
+
+            for token_id in token_ids:
+                token = tokenizer.decode([token_id])
+                if token.strip() and token not in existing_vocab:
                     oov_tokens.add(token)
-
-
+
+    if oov_tokens:
+        num_added = tokenizer.add_tokens(list(oov_tokens))
+        return num_added
+
+    return 0
 
 def train_model(model, tokenizer, dataset, push, isinst):
     args = TrainingArguments(
@@ -215,8 +215,8 @@ def main(push_to_hub=True, is_inst_finetune=False):
     print("Loaded Tokenizer.")
 
     print("Adding Tokens..")
-    update_tokenizer(tokenizer, dataset)
-    print("Added
+    num_new_tokens = update_tokenizer(tokenizer, dataset)
+    print(f"Added {num_new_tokens} new tokens to the vocabulary")
 
 
     if INIT == 0:
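For context, the sketch below shows how the updated helper might be wired up. It assumes update_tokenizer() from the diff above is in scope and uses placeholder checkpoint and dataset names (gpt2, wikitext) that are not from this repository. The key point is that after tokenizer.add_tokens() grows the vocabulary, the model's embedding matrix also has to be resized with model.resize_token_embeddings(len(tokenizer)) before training, otherwise the new token ids would index past the end of the embedding table.

# Hypothetical usage sketch -- not the repository's actual main().
# Checkpoint and dataset names are placeholders; update_tokenizer() is the
# helper defined in the diff above.
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset

tokenizer = AutoTokenizer.from_pretrained("gpt2")       # placeholder checkpoint
model = AutoModelForCausalLM.from_pretrained("gpt2")    # placeholder checkpoint
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")  # placeholder corpus with a "text" column

num_new_tokens = update_tokenizer(tokenizer, dataset)
print(f"Added {num_new_tokens} new tokens to the vocabulary")

# add_tokens() only changes the tokenizer; the model's input/output embeddings
# still have the old vocabulary size, so resize them before training.
if num_new_tokens > 0:
    model.resize_token_embeddings(len(tokenizer))

Whether train.py performs this resize inside main() or elsewhere is not visible in this diff; the sketch only illustrates why returning the count of added tokens is useful.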