nroggendorff committed (verified)
Commit: f3687e0
Parent: b225b76

Update train.py

Files changed (1):
  train.py: +16 -16
train.py CHANGED
@@ -126,24 +126,24 @@ def configure_tokenizer(tokenizer):
 
 def update_tokenizer(tokenizer, dataset, batch_size=1000):
     existing_vocab = tokenizer.get_vocab()
-
     oov_tokens = set()
-
+
     for i in range(0, len(dataset['text']), batch_size):
-        batch = dataset['text'][i : i + batch_size]
-
+        batch = dataset['text'][i:i + batch_size]
+
         for text in batch:
-            tokens = []
-
-            for textier_text in tokenizer.encode(text):
-                stringified_token = tokenizer.decode(textier_text)
-                tokens.append(stringified_token)
-
-            for token in tokens:
-                if token not in existing_vocab:
+            token_ids = tokenizer.encode(text, add_special_tokens=False)
+
+            for token_id in token_ids:
+                token = tokenizer.decode([token_id])
+                if token.strip() and token not in existing_vocab:
                     oov_tokens.add(token)
-
-    tokenizer.add_tokens(list(oov_tokens))
+
+    if oov_tokens:
+        num_added = tokenizer.add_tokens(list(oov_tokens))
+        return num_added
+
+    return 0
 
 def train_model(model, tokenizer, dataset, push, isinst):
     args = TrainingArguments(
@@ -215,8 +215,8 @@ def main(push_to_hub=True, is_inst_finetune=False):
     print("Loaded Tokenizer.")
 
     print("Adding Tokens..")
-    update_tokenizer(tokenizer, dataset)
-    print("Added Tokens.")
+    num_new_tokens = update_tokenizer(tokenizer, dataset)
+    print(f"Added {num_new_tokens} new tokens to the vocabulary")
 
 
     if INIT == 0:
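
For context, a minimal sketch of how the reworked helper might be driven, assuming the standard Hugging Face transformers API. The "gpt2" checkpoint and the toy dataset dict are placeholders rather than values taken from this repository's train.py, and the embedding resize at the end is a common follow-up step that this commit does not itself perform.

# Sketch only: exercising update_tokenizer() from the diff above.
# Assumes the Hugging Face transformers library; "gpt2" and the toy
# dataset below are stand-ins, not values from this repository.
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

# Same {'text': [...]} shape that update_tokenizer() iterates over.
dataset = {"text": ["hello world", "domain-specific jargon goes here"]}

num_new_tokens = update_tokenizer(tokenizer, dataset)
print(f"Added {num_new_tokens} new tokens to the vocabulary")

# add_tokens() grows the tokenizer but not the model, so the embedding
# matrix has to be resized before training can use the new token ids.
if num_new_tokens > 0:
    model.resize_token_embeddings(len(tokenizer))

Returning the number of added tokens (instead of returning nothing, as before) is what enables the new log line in main(), and it would also let a caller decide whether an embedding resize like the one sketched above is needed.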