Update train.py

train.py CHANGED
```diff
@@ -117,13 +117,23 @@ def configure_tokenizer(tokenizer):
     chat_template = "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '<|user|>\n' + message['content'] + '<|end|>\n' }}{% elif message['role'] == 'assistant' %}{{ '<|bot|>\n' + message['content'] + '<|end|>\n' + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"
     tokenizer.chat_template = chat_template
 
-def update_tokenizer(tokenizer,
-
-
-
-
-
-
+def update_tokenizer(tokenizer, dataset, batch_size=1000):
+    existing_vocab = tokenizer.get_vocab()
+
+    oov_tokens = set()
+
+    for i in range(0, len(dataset['text']), batch_size):
+        batch = dataset['text'][i : i + batch_size]
+
+        batch_tokens = tokenizer.encode_batch(batch)
+
+        for encoded in batch_tokens:
+            for token in encoded.tokens:
+                if token not in existing_vocab:
+                    oov_tokens.add(token)
+
+    tokenizer.add_tokens(list(oov_tokens))
+
 
 def train_model(model, tokenizer, dataset, push, isinst):
     args = TrainingArguments(
```
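The new `update_tokenizer` walks the dataset's `text` column in batches of 1000, collects every token string missing from the current vocabulary, and registers the whole set with a single `add_tokens` call. Note that `encode_batch` returning `Encoding` objects with a `.tokens` attribute is the API of a raw `tokenizers.Tokenizer`, not of the `transformers` wrapper classes, so the commit presumably runs against such a backing tokenizer. The sketch below is a minimal illustration of the same batched OOV scan, not the repo's exact code: it swaps `encode_batch` for `pre_tokenizer.pre_tokenize_str`, since a tokenizer with an UNK token reports out-of-vocabulary words as the UNK string in `.tokens`, which would hide them from the scan. The toy `WordLevel` vocabulary is purely illustrative.

```python
# Minimal sketch of the batched OOV scan, assuming a raw `tokenizers.Tokenizer`
# and a dataset exposing a "text" column. Not the repo's exact code.
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace

# Toy tokenizer with a deliberately tiny vocabulary.
tokenizer = Tokenizer(WordLevel({"hello": 0, "[UNK]": 1}, unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()

dataset = {"text": ["hello world", "hello there"]}
batch_size = 1000

existing_vocab = tokenizer.get_vocab()
oov_tokens = set()

for i in range(0, len(dataset["text"]), batch_size):
    batch = dataset["text"][i : i + batch_size]
    for text in batch:
        # pre_tokenize_str yields (word, (start, end)) pairs; comparing raw
        # words against the vocab avoids the UNK substitution that encoding
        # would apply to out-of-vocabulary words.
        for word, _span in tokenizer.pre_tokenizer.pre_tokenize_str(text):
            if word not in existing_vocab:
                oov_tokens.add(word)

tokenizer.add_tokens(sorted(oov_tokens))
print(tokenizer.encode("hello world there").tokens)
# ['hello', 'world', 'there']: the new words are now first-class tokens
```

Collecting into a set and calling `add_tokens` once also means the vocabulary, and anything sized off `len(tokenizer)`, changes at a single point rather than once per batch.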
```diff
@@ -180,13 +190,13 @@ def train_model(model, tokenizer, dataset, push, isinst):
 
 def main(push_to_hub=True, is_inst_finetune=False):
     dataset = load_data()
-    training_corpus = get_training_corpus(dataset)
 
     if not is_inst_finetune and INIT == 0:
+        training_corpus = get_training_corpus(dataset)
         tokenizer = create_tokenizer(training_corpus)
     else:
         tokenizer = load_tokenizer()
-    update_tokenizer(tokenizer,
+    update_tokenizer(tokenizer, dataset)
 
     configure_tokenizer(tokenizer)
 
```
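In `main`, building the training corpus now only happens on a fresh run (`INIT == 0`) where a tokenizer is trained from scratch; a resumed or instruction-finetuning run loads the saved tokenizer instead, and `update_tokenizer(tokenizer, dataset)` then folds in whatever tokens the current dataset introduces before `configure_tokenizer` applies the chat template. One consequence worth noting: once the vocabulary grows past what a saved checkpoint was trained with, the model's embedding matrix has to grow to match. Whether `train_model` already handles this is not visible in the diff, so the sketch below, with a placeholder checkpoint name, is an assumption about the surrounding code using the standard `transformers` call:

```python
# Hedged sketch: resizing embeddings after the vocabulary grows. The checkpoint
# name is hypothetical, and whether train.py already does this is not shown
# in the diff above.
from transformers import AutoModelForCausalLM, AutoTokenizer

checkpoint = "my-org/my-model"  # placeholder for the repo's actual checkpoint
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint)

# e.g. the markers used by the chat template in this commit
num_added = tokenizer.add_tokens(["<|user|>", "<|bot|>", "<|end|>"])

if num_added > 0:
    # New token ids would otherwise index past the embedding matrix.
    model.resize_token_embeddings(len(tokenizer))
```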