nroggendorff committed
Commit c7feb81 (verified)
Parent: 298affb

Update train.py

Files changed (1)
  1. train.py +19 -9
train.py CHANGED
@@ -117,13 +117,23 @@ def configure_tokenizer(tokenizer):
     chat_template = "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '<|user|>\n' + message['content'] + '<|end|>\n' }}{% elif message['role'] == 'assistant' %}{{ '<|bot|>\n' + message['content'] + '<|end|>\n' + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"
     tokenizer.chat_template = chat_template
 
-def update_tokenizer(tokenizer, corpus):
-    tokens = tokenizer.encode(corpus).tokens
-
-    pre_vocab = tokenizer.get_vocab()
-
-    oov_tokens = [token for token in tokens if token not in pre_vocab]
-    tokenizer.add_tokens(oov_tokens)
+def update_tokenizer(tokenizer, dataset, batch_size=1000):
+    existing_vocab = tokenizer.get_vocab()
+
+    oov_tokens = set()
+
+    for i in range(0, len(dataset['text']), batch_size):
+        batch = dataset['text'][i : i + batch_size]
+
+        batch_tokens = tokenizer.encode_batch(batch)
+
+        for encoded in batch_tokens:
+            for token in encoded.tokens:
+                if token not in existing_vocab:
+                    oov_tokens.add(token)
+
+    tokenizer.add_tokens(list(oov_tokens))
+
 
 def train_model(model, tokenizer, dataset, push, isinst):
     args = TrainingArguments(
@@ -180,13 +190,13 @@ def train_model(model, tokenizer, dataset, push, isinst):
 
 def main(push_to_hub=True, is_inst_finetune=False):
     dataset = load_data()
-    training_corpus = get_training_corpus(dataset)
 
     if not is_inst_finetune and INIT == 0:
+        training_corpus = get_training_corpus(dataset)
         tokenizer = create_tokenizer(training_corpus)
     else:
         tokenizer = load_tokenizer()
-    update_tokenizer(tokenizer, training_corpus)
+    update_tokenizer(tokenizer, dataset)
 
     configure_tokenizer(tokenizer)
 
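For reference, the batched out-of-vocabulary scan introduced by this commit can be exercised on its own. The sketch below is illustrative only: it assumes a fast tokenizer from the `tokenizers` library (which exposes `get_vocab`, `encode_batch`, and `add_tokens`); the `ByteLevelBPETokenizer`, the toy training text, and the `sample_dataset` dict are stand-ins, not objects from this repo.

```python
# Minimal, self-contained sketch of the batching pattern used by the new
# update_tokenizer(). Assumption: the tokenizer is a `tokenizers` fast
# tokenizer, so encode_batch() returns Encoding objects with a .tokens list.
from tokenizers import ByteLevelBPETokenizer

def update_tokenizer(tokenizer, dataset, batch_size=1000):
    existing_vocab = tokenizer.get_vocab()
    oov_tokens = set()

    # Encode the text column in fixed-size slices rather than as one giant
    # string, so memory use stays proportional to batch_size.
    for i in range(0, len(dataset["text"]), batch_size):
        batch = dataset["text"][i : i + batch_size]
        for encoded in tokenizer.encode_batch(batch):
            for token in encoded.tokens:
                if token not in existing_vocab:
                    oov_tokens.add(token)

    tokenizer.add_tokens(list(oov_tokens))

# Toy usage (illustrative data, not the repo's dataset). Note that a
# byte-level BPE model only ever emits tokens that are already in its vocab,
# so this toy run will usually add nothing; the batching pattern is the point.
tokenizer = ByteLevelBPETokenizer()
tokenizer.train_from_iterator(["hello world", "hugging face"], vocab_size=300)
sample_dataset = {"text": ["hello world", "some brand new text"]}
update_tokenizer(tokenizer, sample_dataset, batch_size=2)
print(len(tokenizer.get_vocab()))
```

Compared with the previous version, which encoded the whole corpus in a single `tokenizer.encode(corpus)` call, scanning in `batch_size` slices keeps peak memory bounded and lets `encode_batch` parallelize each slice.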