nroggendorff committed on
Commit
a1b6148
·
verified ·
1 Parent(s): fdff0a0

Update train.py

Browse files
Files changed (1) hide show
  1. train.py +6 -2
train.py CHANGED
@@ -17,7 +17,7 @@ INPUT_DATASET = "HuggingFaceTB/smollm-corpus"
17
  INSTRUCT_DATASET = "nroggendorff/elephant"
18
  OUTPUT_REPO = "nroggendorff/smallama"
19
  INSTRUCT_FINETUNE_BOOL = False
20
- INIT = 1#/13
21
  SHARD_SIZE = int(3e+6)
22
  FP16 = True
23
  WARMUP_STEPS = 0
@@ -29,7 +29,7 @@ def load_data():
29
  if not INSTRUCT_FINETUNE_BOOL:
30
  dataset = load_dataset(INPUT_DATASET, "cosmopedia-v2", split="train")#, streaming=True)
31
  # dataset = Dataset.from_generator(lambda: dataset.take(int(5e+6)))
32
- dataset = dataset.shard(num_shards=len(dataset) // SHARD_SIZE, index=INIT)
33
  else:
34
  dataset = load_dataset(INSTRUCT_DATASET, split="train")#, streaming=True)
35
  # dataset = Dataset.from_generator(lambda: dataset.take(int(5e+6)))
@@ -138,13 +138,17 @@ def train_model(model, tokenizer, dataset, push, isinst):
138
  logging_steps=10
139
  )
140
 
 
 
141
  optimizer = AdamW(model.parameters(), lr=args.learning_rate)
142
  scheduler = get_cosine_schedule_with_warmup(
143
  optimizer,
144
  num_warmup_steps=args.warmup_steps,
145
  num_training_steps=(len(dataset) // args.per_device_train_batch_size) * args.num_train_epochs
146
  )
 
147
  dataset = dataset.map(lambda examples: format_prompts(examples, tokenizer, isinst), batched=True, remove_columns=dataset.column_names)
 
148
  trainer = trl.SFTTrainer(
149
  model=model,
150
  tokenizer=tokenizer,
 
17
  INSTRUCT_DATASET = "nroggendorff/elephant"
18
  OUTPUT_REPO = "nroggendorff/smallama"
19
  INSTRUCT_FINETUNE_BOOL = False
20
+ INIT = 0#/13
21
  SHARD_SIZE = int(3e+6)
22
  FP16 = True
23
  WARMUP_STEPS = 0
 
29
  if not INSTRUCT_FINETUNE_BOOL:
30
  dataset = load_dataset(INPUT_DATASET, "cosmopedia-v2", split="train")#, streaming=True)
31
  # dataset = Dataset.from_generator(lambda: dataset.take(int(5e+6)))
32
+ # dataset = dataset.shard(num_shards=len(dataset) // SHARD_SIZE, index=INIT)
33
  else:
34
  dataset = load_dataset(INSTRUCT_DATASET, split="train")#, streaming=True)
35
  # dataset = Dataset.from_generator(lambda: dataset.take(int(5e+6)))
 
138
  logging_steps=10
139
  )
140
 
141
+ dataset = dataset.shard(num_shards=len(dataset) // SHARD_SIZE, index=INIT)
142
+
143
  optimizer = AdamW(model.parameters(), lr=args.learning_rate)
144
  scheduler = get_cosine_schedule_with_warmup(
145
  optimizer,
146
  num_warmup_steps=args.warmup_steps,
147
  num_training_steps=(len(dataset) // args.per_device_train_batch_size) * args.num_train_epochs
148
  )
149
+
150
  dataset = dataset.map(lambda examples: format_prompts(examples, tokenizer, isinst), batched=True, remove_columns=dataset.column_names)
151
+
152
  trainer = trl.SFTTrainer(
153
  model=model,
154
  tokenizer=tokenizer,