Update train.py
train.py
CHANGED
@@ -110,10 +110,6 @@ def configure_tokenizer(tokenizer):
     special_tokens["additional_special_tokens"] = ["<|user|>", "<|bot|>", "<|end|>"]
     tokenizer.add_special_tokens(special_tokens)
 
-    tokenizer.pad_token_id = MAX_SEQ_LENGTH - 1
-    tokenizer.bos_token_id = MAX_SEQ_LENGTH - 2
-    tokenizer.eos_token_id = MAX_SEQ_LENGTH - 3
-
     if INSTRUCT_FINETUNE_BOOL:
         tokenizer.user_token_id = tokenizer.convert_tokens_to_ids("<|user|>")
         tokenizer.assistant_token_id = tokenizer.convert_tokens_to_ids("<|bot|>")
@@ -121,6 +117,14 @@ def configure_tokenizer(tokenizer):
         chat_template = "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '<|user|>\n' + message['content'] + '<|end|>\n' }}{% elif message['role'] == 'assistant' %}{{ '<|bot|>\n' + message['content'] + '<|end|>\n' + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"
         tokenizer.chat_template = chat_template
 
+def update_tokenizer(tokenizer, corpus):
+    tokens = tokenizer.encode(corpus).tokens
+
+    pre_vocab = tokenizer.get_vocab()
+
+    oov_tokens = [token for token in tokens if token not in pre_vocab]
+    tokenizer.add_tokens(oov_tokens)
+
 def train_model(model, tokenizer, dataset, push, isinst):
     args = TrainingArguments(
         output_dir="model",
@@ -176,12 +180,14 @@ def train_model(model, tokenizer, dataset, push, isinst):
 
 def main(push_to_hub=True, is_inst_finetune=False):
     dataset = load_data()
+    training_corpus = get_training_corpus(dataset)
+
     if not is_inst_finetune and INIT == 0:
-        training_corpus = get_training_corpus(dataset)
         tokenizer = create_tokenizer(training_corpus)
     else:
         tokenizer = load_tokenizer()
-
+        update_tokenizer(tokenizer, training_corpus)
+
     configure_tokenizer(tokenizer)
 
     if is_inst_finetune:
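
As a rough, hypothetical illustration of how the new update_tokenizer step slots into the revised main() flow, the sketch below uses the standard transformers APIs (get_vocab, add_tokens, resize_token_embeddings). The checkpoint name, the sample corpus, and the embedding-resize call are assumptions for illustration, not part of this commit; note also that the diff's encode(corpus).tokens call presumes a raw tokenizers.Tokenizer-style Encoding, whereas a transformers tokenizer exposes string tokens via tokenize().

```python
# Hypothetical usage sketch -- not the Space's train.py. The checkpoint name,
# sample corpus, and the resize step are illustrative assumptions.
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

# As in configure_tokenizer(): register the chat-control tokens.
tokenizer.add_special_tokens(
    {"additional_special_tokens": ["<|user|>", "<|bot|>", "<|end|>"]}
)

# As in update_tokenizer(): collect corpus tokens missing from the current
# vocabulary and add them. tokenize() returns string tokens on a transformers
# tokenizer; the diff's encode(corpus).tokens assumes the raw tokenizers API.
corpus = "an example training corpus line"
pre_vocab = tokenizer.get_vocab()
oov_tokens = [t for t in tokenizer.tokenize(corpus) if t not in pre_vocab]
if oov_tokens:
    tokenizer.add_tokens(oov_tokens)

# Added tokens enlarge the vocabulary, so the model's embedding matrix
# has to be resized to match before training starts.
model.resize_token_embeddings(len(tokenizer))
```

Once configure_tokenizer() has set tokenizer.chat_template, conversations can be rendered with the standard tokenizer.apply_chat_template(messages, tokenize=False), which produces the <|user|>/<|bot|>/<|end|> layout defined in the template above.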