nroggendorff committed
Commit 871e408 · verified · Parent: a2692eb

Update train.py

Files changed (1)
  1. train.py +12 -6
train.py CHANGED
@@ -110,10 +110,6 @@ def configure_tokenizer(tokenizer):
     special_tokens["additional_special_tokens"] = ["<|user|>", "<|bot|>", "<|end|>"]
     tokenizer.add_special_tokens(special_tokens)
 
-    tokenizer.pad_token_id = MAX_SEQ_LENGTH - 1
-    tokenizer.bos_token_id = MAX_SEQ_LENGTH - 2
-    tokenizer.eos_token_id = MAX_SEQ_LENGTH - 3
-
     if INSTRUCT_FINETUNE_BOOL:
         tokenizer.user_token_id = tokenizer.convert_tokens_to_ids("<|user|>")
         tokenizer.assistant_token_id = tokenizer.convert_tokens_to_ids("<|bot|>")
@@ -121,6 +117,14 @@ def configure_tokenizer(tokenizer):
     chat_template = "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '<|user|>\n' + message['content'] + '<|end|>\n' }}{% elif message['role'] == 'assistant' %}{{ '<|bot|>\n' + message['content'] + '<|end|>\n' + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"
     tokenizer.chat_template = chat_template
 
+def update_tokenizer(tokenizer, corpus):
+    tokens = tokenizer.encode(corpus).tokens
+
+    pre_vocab = tokenizer.get_vocab()
+
+    oov_tokens = [token for token in tokens if token not in existing_vocab]
+    tokenizer.add_tokens(oov_tokens)
+
 def train_model(model, tokenizer, dataset, push, isinst):
     args = TrainingArguments(
         output_dir="model",
@@ -176,12 +180,14 @@ def train_model(model, tokenizer, dataset, push, isinst):
 
 def main(push_to_hub=True, is_inst_finetune=False):
     dataset = load_data()
+    training_corpus = get_training_corpus(dataset)
+
     if not is_inst_finetune and INIT == 0:
-        training_corpus = get_training_corpus(dataset)
         tokenizer = create_tokenizer(training_corpus)
     else:
         tokenizer = load_tokenizer()
-
+        update_tokenizer(tokenizer, training_corpus)
+
     configure_tokenizer(tokenizer)
 
     if is_inst_finetune:
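Note on the new helper: as committed, update_tokenizer filters against existing_vocab while the name bound two lines earlier is pre_vocab, so it looks like it would raise a NameError unless existing_vocab is defined elsewhere in train.py; and if the tokenizer is a transformers PreTrainedTokenizerFast (as the convert_tokens_to_ids and chat_template calls suggest), encode() returns a list of ids with no .tokens attribute. A minimal runnable sketch of the same idea, assuming a transformers fast tokenizer and that get_training_corpus() yields plain text strings (both assumptions, not taken from this commit):

def update_tokenizer(tokenizer, corpus):
    # Sketch only: collect token strings unseen in the current vocab and add them.
    pre_vocab = tokenizer.get_vocab()               # maps token string -> id
    oov_tokens = set()
    for text in corpus:                             # assumes an iterable of strings
        for token in tokenizer.tokenize(text):      # string tokens, not ids
            if token not in pre_vocab:
                oov_tokens.add(token)
    if oov_tokens:
        tokenizer.add_tokens(sorted(oov_tokens))    # extend the vocabulary

With a subword tokenizer trained on similar data this usually adds few or no tokens; main() only calls it on the load_tokenizer() path, where the loaded vocabulary may predate the current corpus. If the vocabulary does grow, the model's embedding matrix would also need resizing (e.g. model.resize_token_embeddings(len(tokenizer))) before training.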