Update train.py
train.py CHANGED
@@ -5,12 +5,12 @@ from transformers import (
     AutoTokenizer, LlamaConfig, AutoModelForCausalLM, LlamaForCausalLM,
     TrainingArguments, PreTrainedTokenizerFast, AdamW, get_cosine_schedule_with_warmup
 )
-from datasets import load_dataset
+from datasets import load_dataset
 from tokenizers import ByteLevelBPETokenizer
 from torch.utils.data import DataLoader
 from torch.cuda.amp import autocast, GradScaler
 
-BATCH_SIZE =
+BATCH_SIZE = 32
 EPOCHS = 1
 LEARNING_RATE = 1e-4
 FACTOR = 768
@@ -32,9 +32,9 @@ NUM_WORKERS = 4
 def load_data():
     if not INSTRUCT_FINETUNE_BOOL:
         dataset = load_dataset(INPUT_DATASET, "cosmopedia-v2", split="train", streaming=True)
-        dataset =
+        dataset = dataset.take(int(8e+6)) # Keep streaming, no conversion to in-memory dataset
     else:
-        dataset = load_dataset(INSTRUCT_DATASET, split="train")
+        dataset = load_dataset(INSTRUCT_DATASET, split="train", streaming=True)
     return dataset
 
 def create_tokenizer(training_corpus):
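In streaming mode, load_dataset() returns an IterableDataset, and .take(n) lazily caps the stream at n examples instead of materialising them in memory, which is what the updated load_data() relies on. A minimal sketch of that behaviour; the repository id is only an illustrative stand-in for INPUT_DATASET:

from datasets import load_dataset

# Streaming returns an IterableDataset; nothing is downloaded up front.
stream = load_dataset("HuggingFaceTB/smollm-corpus", "cosmopedia-v2",
                      split="train", streaming=True)

# .take() lazily limits iteration to the first N examples, mirroring
# dataset.take(int(8e+6)) in the hunk above.
subset = stream.take(1000)

for i, example in enumerate(subset):
    if i >= 3:  # peek at a few records only
        break
    print(example["text"][:80])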
@@ -54,10 +54,6 @@ def create_tokenizer(training_corpus):
 def load_tokenizer():
     return AutoTokenizer.from_pretrained(OUTPUT_REPO)
 
-def get_training_corpus(dataset):
-    for i in range(0, len(dataset['text']), 1000):
-        yield dataset['text'][i : i + 1000]
-
 def format_prompts(examples, tokenizer, isinst):
     texts = []
     for text in examples['text']:
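The removed get_training_corpus() indexes dataset['text'] and calls len(), which only works on map-style datasets; with the streaming dataset now returned by load_data(), an equivalent generator would have to iterate the stream instead. A sketch of such a replacement, a hypothetical helper that is not part of this commit:

def get_training_corpus_streaming(dataset, chunk_size=1000):
    # Yield lists of texts from an IterableDataset for tokenizer training,
    # avoiding len() and random access, which streaming datasets do not support.
    chunk = []
    for example in dataset:
        chunk.append(example["text"])
        if len(chunk) == chunk_size:
            yield chunk
            chunk = []
    if chunk:  # flush the trailing partial chunk
        yield chunk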
@@ -139,53 +135,36 @@ def train_model(model, tokenizer, dataset, push, isinst):
     scheduler = get_cosine_schedule_with_warmup(
         optimizer,
         num_warmup_steps=args.warmup_steps,
-        num_training_steps=(
-    )
-
-    dataset = dataset.map(lambda examples: format_prompts(examples, tokenizer, isinst), batched=True, remove_columns=dataset.column_names)
-
-    trainer = trl.SFTTrainer(
-        model=model,
-        tokenizer=tokenizer,
-        args=args,
-        train_dataset=dataset,
-        dataset_text_field='text',
-        max_seq_length=MAX_SEQ_LENGTH,
-        optimizers=(optimizer, scheduler)
+        num_training_steps=(SHARD_SIZE // args.per_device_train_batch_size) * args.num_train_epochs
     )
 
-
+    dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS)
+
+    for batch in dataloader:
+        batch = format_prompts(batch, tokenizer, isinst)
+        trainer = trl.SFTTrainer(
+            model=model,
+            tokenizer=tokenizer,
+            args=args,
+            train_dataset=batch,
+            dataset_text_field='text',
+            max_seq_length=MAX_SEQ_LENGTH,
+            optimizers=(optimizer, scheduler)
+        )
+        trainer.train()
 
     trained_model = trainer.model
     trained_tokenizer = trainer.tokenizer
 
     if push:
         repo_id = OUTPUT_REPO + "-it" if INSTRUCT_FINETUNE_BOOL else OUTPUT_REPO
-        msg =
+        msg = "Training completed."
         trained_model.push_to_hub(repo_id, commit_message=msg, force=True)
         trained_tokenizer.push_to_hub(repo_id, commit_message=msg, force=True)
     else:
         trained_model.save_pretrained("model")
         trained_tokenizer.save_pretrained("tokenizer")
 
-def main(push_to_hub=True, is_inst_finetune=False):
-    dataset = load_data()
-    if not is_inst_finetune and INIT == 0:
-        training_corpus = get_training_corpus(dataset)
-        tokenizer = create_tokenizer(training_corpus)
-    else:
-        tokenizer = load_tokenizer()
-
-    configure_tokenizer(tokenizer)
-
-    if is_inst_finetune:
-        model = load_model()
-        model.resize_token_embeddings(len(tokenizer))
-    else:
-        model = create_model(tokenizer) if INIT == 0 else load_model()
-
-    train_model(model, tokenizer, dataset, push_to_hub, is_inst_finetune)
-
 if __name__ == "__main__":
     main(PUSH_TO_HUB, INSTRUCT_FINETUNE_BOOL)
     raise Exception("Done baking!")
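The new num_training_steps expression sizes the cosine schedule from SHARD_SIZE rather than from the dataset length, which a streaming dataset cannot report. A worked sketch of the same computation with illustrative values; SHARD_SIZE, the batch size, and the epoch count here are assumptions, not values taken from this Space:

import torch
from torch.optim import AdamW
from transformers import get_cosine_schedule_with_warmup

SHARD_SIZE = 100_000                 # assumed shard size
per_device_train_batch_size = 32     # assumed batch size
num_train_epochs = 1
warmup_steps = 100

# (100_000 // 32) * 1 = 3125 optimizer steps of cosine decay after warmup
num_training_steps = (SHARD_SIZE // per_device_train_batch_size) * num_train_epochs

optimizer = AdamW([torch.nn.Parameter(torch.zeros(1))], lr=1e-4)
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=num_training_steps,
)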
|