nroggendorff committed · verified
Commit ab391c2 · 1 Parent(s): 721bf9a

Update train.py

Files changed (1):
  1. train.py +31 -30
train.py CHANGED
@@ -5,11 +5,13 @@ from transformers import (
     AutoTokenizer, LlamaConfig, AutoModelForCausalLM, LlamaForCausalLM,
     TrainingArguments, PreTrainedTokenizerFast, AdamW, get_cosine_schedule_with_warmup
 )
-from datasets import load_dataset
+from datasets import load_dataset, Dataset
 from tokenizers import ByteLevelBPETokenizer
 from torch.utils.data import DataLoader
+from torch.cuda.amp import autocast, GradScaler
+from itertools import islice
 
-BATCH_SIZE = 8
+BATCH_SIZE = 32
 EPOCHS = 1
 LEARNING_RATE = 1e-4
 FACTOR = 768
@@ -19,7 +21,8 @@ INPUT_DATASET = "HuggingFaceTB/smollm-corpus"
 INSTRUCT_DATASET = "nroggendorff/elephant"
 OUTPUT_REPO = "nroggendorff/smallama"
 INSTRUCT_FINETUNE_BOOL = False
-INIT = 0
+INIT = 1#/16
+SHARD_SIZE = int(5e+5)
 FP16 = True
 WARMUP_STEPS = 1000
 WEIGHT_DECAY = 0.01
@@ -30,20 +33,12 @@ NUM_WORKERS = 4
 def load_data():
     if not INSTRUCT_FINETUNE_BOOL:
         dataset = load_dataset(INPUT_DATASET, "cosmopedia-v2", split="train", streaming=True)
-        dataset = custom_shard_stream(dataset)
+        start = INIT * SHARD_SIZE
+        dataset = Dataset.from_dict({'text': [example['text'] for example in islice(dataset, start, start + SHARD_SIZE)]})
     else:
         dataset = load_dataset(INSTRUCT_DATASET, split="train")
     return dataset
 
-def custom_shard_stream(dataset, shard_size=5e5, shard_index=0):
-    def shard_generator():
-        count = 0
-        for example in dataset:
-            if count % shard_size == shard_index:
-                yield example
-            count += 1
-    return shard_generator()
-
 def create_tokenizer(training_corpus):
     tokenizer = ByteLevelBPETokenizer()
     special_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]
@@ -59,11 +54,11 @@ def create_tokenizer(training_corpus):
     return fast_tokenizer
 
 def load_tokenizer():
-    return AutoTokenizer.from_pretrained(OUTPUT_REPO)
+    return AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B")#OUTPUT_REPO)
 
 def get_training_corpus(dataset):
-    for example in dataset:
-        yield example['text']
+    for i in range(0, len(dataset['text']), 1000):
+        yield dataset['text'][i : i + 1000]
 
 def format_prompts(examples, tokenizer, isinst):
     texts = []
@@ -140,38 +135,44 @@ def train_model(model, tokenizer, dataset, push, isinst):
         save_total_limit=2,
     )
 
+    dataset = dataset.shard(num_shards=len(dataset) // SHARD_SIZE, index=INIT)
+
     optimizer = AdamW(model.parameters(), lr=args.learning_rate, weight_decay=WEIGHT_DECAY)
     scheduler = get_cosine_schedule_with_warmup(
         optimizer,
         num_warmup_steps=args.warmup_steps,
-        num_training_steps=args.num_train_epochs
+        num_training_steps=(len(dataset) // args.per_device_train_batch_size) * args.num_train_epochs
     )
-
+
     dataset = dataset.map(lambda examples: format_prompts(examples, tokenizer, isinst), batched=True, remove_columns=dataset.column_names)
-
+
     trainer = trl.SFTTrainer(
         model=model,
         tokenizer=tokenizer,
         args=args,
         train_dataset=dataset,
-        optimizers=(optimizer, scheduler),
-        max_seq_length=MAX_SEQ_LENGTH
+        dataset_text_field='text',
+        max_seq_length=MAX_SEQ_LENGTH,
+        optimizers=(optimizer, scheduler)
     )
-
-    train_result = trainer.train()
-
+
+    train = trainer.train()
+
+    trained_model = trainer.model
+    trained_tokenizer = trainer.tokenizer
+
     if push:
         repo_id = OUTPUT_REPO + "-it" if INSTRUCT_FINETUNE_BOOL else OUTPUT_REPO
-        msg = f"Training loss: {train_result.training_loss:.4f}"
-        trainer.model.push_to_hub(repo_id, commit_message=msg, force=True)
-        trainer.tokenizer.push_to_hub(repo_id, commit_message=msg, force=True)
+        msg = f"Training loss: {train.training_loss:.4f}"
+        trained_model.push_to_hub(repo_id, commit_message=msg, force=True)
+        trained_tokenizer.push_to_hub(repo_id, commit_message=msg, force=True)
     else:
-        trainer.model.save_pretrained("model")
-        trainer.tokenizer.save_pretrained("tokenizer")
+        trained_model.save_pretrained("model")
+        trained_tokenizer.save_pretrained("tokenizer")
 
 def main(push_to_hub=True, is_inst_finetune=False):
     dataset = load_data()
-    if not is_inst_finetune and INIT == 0:
+    if not is_inst_finetune and INIT == 0 and False:
         training_corpus = get_training_corpus(dataset)
         tokenizer = create_tokenizer(training_corpus)
     else:
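
Note on the load_data() change: the commit drops the strided custom_shard_stream() generator and instead materializes one contiguous window of SHARD_SIZE streamed examples, selected by INIT. A minimal runnable sketch of that windowing logic follows; fake_stream() and the tiny SHARD_SIZE are stand-ins for the real cosmopedia-v2 stream and int(5e+5), used only so the example runs offline.

# Sketch of the contiguous-window sharding used in the new load_data().
# Assumptions: examples are dicts with a 'text' key (as in cosmopedia-v2);
# fake_stream() stands in for load_dataset(..., streaming=True).
from itertools import islice
from datasets import Dataset

INIT = 1
SHARD_SIZE = 4  # int(5e+5) in train.py; kept tiny for illustration

def fake_stream():
    for i in range(100):
        yield {"text": f"example {i}"}

start = INIT * SHARD_SIZE
shard = Dataset.from_dict(
    {"text": [ex["text"] for ex in islice(fake_stream(), start, start + SHARD_SIZE)]}
)
print(shard["text"])  # ['example 4', 'example 5', 'example 6', 'example 7']

islice still has to iterate past the first INIT * SHARD_SIZE examples before yielding, so later shards pay a linear skip cost over the stream; the old modulo-based generator also walked the full stream but produced a strided sample rather than a contiguous block.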
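Note on the scheduler change: num_training_steps was previously set to args.num_train_epochs, which with EPOCHS = 1 told the cosine schedule there was a single step in total; it is now steps-per-epoch times epochs. A rough sketch of that arithmetic, assuming one device and no gradient accumulation (neither is configured in train.py):

# Rough step count for get_cosine_schedule_with_warmup, assuming one device
# and no gradient accumulation.
SHARD_SIZE = int(5e+5)   # examples in the training shard
BATCH_SIZE = 32          # per_device_train_batch_size
EPOCHS = 1

steps_per_epoch = SHARD_SIZE // BATCH_SIZE       # 15625
num_training_steps = steps_per_epoch * EPOCHS    # 15625 optimizer steps
print(num_training_steps)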