nroggendorff committed (verified)
Commit 5ad337b · Parent(s): 253a653

Update train.py

Files changed (1):
  1. train.py +20 -41
train.py CHANGED
@@ -5,12 +5,12 @@ from transformers import (
     AutoTokenizer, LlamaConfig, AutoModelForCausalLM, LlamaForCausalLM,
     TrainingArguments, PreTrainedTokenizerFast, AdamW, get_cosine_schedule_with_warmup
 )
-from datasets import load_dataset, Dataset
+from datasets import load_dataset
 from tokenizers import ByteLevelBPETokenizer
 from torch.utils.data import DataLoader
 from torch.cuda.amp import autocast, GradScaler

-BATCH_SIZE = 8
+BATCH_SIZE = 32
 EPOCHS = 1
 LEARNING_RATE = 1e-4
 FACTOR = 768
@@ -32,9 +32,9 @@ NUM_WORKERS = 4
 def load_data():
     if not INSTRUCT_FINETUNE_BOOL:
         dataset = load_dataset(INPUT_DATASET, "cosmopedia-v2", split="train", streaming=True)
-        dataset = Dataset.from_generator(lambda: dataset.take(int(8e+6)))
+        dataset = dataset.take(int(8e+6))  # Keep streaming, no conversion to in-memory dataset
     else:
-        dataset = load_dataset(INSTRUCT_DATASET, split="train")
+        dataset = load_dataset(INSTRUCT_DATASET, split="train", streaming=True)
     return dataset

 def create_tokenizer(training_corpus):
@@ -54,10 +54,6 @@ def create_tokenizer(training_corpus):
 def load_tokenizer():
     return AutoTokenizer.from_pretrained(OUTPUT_REPO)

-def get_training_corpus(dataset):
-    for i in range(0, len(dataset['text']), 1000):
-        yield dataset['text'][i : i + 1000]
-
 def format_prompts(examples, tokenizer, isinst):
     texts = []
     for text in examples['text']:
@@ -139,53 +135,36 @@ def train_model(model, tokenizer, dataset, push, isinst):
     scheduler = get_cosine_schedule_with_warmup(
         optimizer,
         num_warmup_steps=args.warmup_steps,
-        num_training_steps=(len(dataset) // args.per_device_train_batch_size) * args.num_train_epochs
-    )
-
-    dataset = dataset.map(lambda examples: format_prompts(examples, tokenizer, isinst), batched=True, remove_columns=dataset.column_names)
-
-    trainer = trl.SFTTrainer(
-        model=model,
-        tokenizer=tokenizer,
-        args=args,
-        train_dataset=dataset,
-        dataset_text_field='text',
-        max_seq_length=MAX_SEQ_LENGTH,
-        optimizers=(optimizer, scheduler)
+        num_training_steps=(SHARD_SIZE // args.per_device_train_batch_size) * args.num_train_epochs
     )

-    train = trainer.train()
+    dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS)
+
+    for batch in dataloader:
+        batch = format_prompts(batch, tokenizer, isinst)
+        trainer = trl.SFTTrainer(
+            model=model,
+            tokenizer=tokenizer,
+            args=args,
+            train_dataset=batch,
+            dataset_text_field='text',
+            max_seq_length=MAX_SEQ_LENGTH,
+            optimizers=(optimizer, scheduler)
+        )
+        trainer.train()

     trained_model = trainer.model
     trained_tokenizer = trainer.tokenizer

     if push:
         repo_id = OUTPUT_REPO + "-it" if INSTRUCT_FINETUNE_BOOL else OUTPUT_REPO
-        msg = f"Training loss: {train.training_loss:.4f}"
+        msg = "Training completed."
         trained_model.push_to_hub(repo_id, commit_message=msg, force=True)
         trained_tokenizer.push_to_hub(repo_id, commit_message=msg, force=True)
     else:
         trained_model.save_pretrained("model")
         trained_tokenizer.save_pretrained("tokenizer")

-def main(push_to_hub=True, is_inst_finetune=False):
-    dataset = load_data()
-    if not is_inst_finetune and INIT == 0:
-        training_corpus = get_training_corpus(dataset)
-        tokenizer = create_tokenizer(training_corpus)
-    else:
-        tokenizer = load_tokenizer()
-
-    configure_tokenizer(tokenizer)
-
-    if is_inst_finetune:
-        model = load_model()
-        model.resize_token_embeddings(len(tokenizer))
-    else:
-        model = create_model(tokenizer) if INIT == 0 else load_model()
-
-    train_model(model, tokenizer, dataset, push_to_hub, is_inst_finetune)
-
 if __name__ == "__main__":
     main(PUSH_TO_HUB, INSTRUCT_FINETUNE_BOOL)
     raise Exception("Done baking!")
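
Note (not part of the commit): the change to load_data() keeps the corpus as a streaming IterableDataset and caps it with .take() instead of materializing it via Dataset.from_generator, so examples are fetched lazily. A minimal sketch of that pattern, with a placeholder dataset id standing in for INPUT_DATASET and toy sizes:

# Minimal sketch of the streaming pattern used in load_data(); the dataset id,
# cap, and batch size are illustrative placeholders, not the script's configuration.
from datasets import load_dataset
from torch.utils.data import DataLoader

stream = load_dataset("user/some-corpus", "cosmopedia-v2", split="train", streaming=True)
stream = stream.take(1_000)  # IterableDataset: only the first 1,000 examples are ever fetched

loader = DataLoader(stream, batch_size=8, num_workers=0)
for batch in loader:
    # Default collation turns a list of example dicts into a dict of lists,
    # so batch["text"] is a list of strings, matching what format_prompts expects.
    print(len(batch["text"]))
    break

Because the stream is never converted to an in-memory Dataset, len(dataset) is no longer available, which is why the scheduler's step count is derived from SHARD_SIZE instead (see the sketch below).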
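A streaming dataset has no __len__, so the cosine schedule's total step count now comes from SHARD_SIZE, a constant defined in an unchanged part of the script and not shown in this diff. A toy calculation with assumed values, purely to illustrate the formula:

# Hypothetical numbers for illustration; SHARD_SIZE and the batch size come
# from the script's own constants, which this commit does not show.
SHARD_SIZE = 100_000
per_device_train_batch_size = 32
num_train_epochs = 1

num_training_steps = (SHARD_SIZE // per_device_train_batch_size) * num_train_epochs
print(num_training_steps)  # 3125 optimizer steps for get_cosine_schedule_with_warmup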