nroggendorff committed on
Commit 4be0152 · verified · 1 Parent(s): 861cd57

Update train.py

Files changed (1): train.py (+3 −2)

train.py CHANGED
@@ -15,7 +15,7 @@ BATCH_SIZE = 32
 EPOCHS = 1
 LEARNING_RATE = 1e-4
 FACTOR = 768
-MAX_SEQ_LENGTH = 128
+MAX_SEQ_LENGTH = 512
 VOCAB_SIZE = 32000
 INPUT_DATASET = "HuggingFaceTB/smollm-corpus"
 INSTRUCT_DATASET = "nroggendorff/elephant"
@@ -110,6 +110,8 @@ def configure_tokenizer(tokenizer):
     special_tokens["additional_special_tokens"] = ["<|user|>", "<|bot|>", "<|end|>"]
     tokenizer.add_special_tokens(special_tokens)
 
+    tokenizer.pad_token_id = MAX_SEQ_LENGTH - 1
+
     if INSTRUCT_FINETUNE_BOOL:
         tokenizer.user_token_id = tokenizer.convert_tokens_to_ids("<|user|>")
         tokenizer.assistant_token_id = tokenizer.convert_tokens_to_ids("<|bot|>")
@@ -185,7 +187,6 @@ def main(push_to_hub=True, is_inst_finetune=False):
         model.resize_token_embeddings(len(tokenizer))
     else:
         model = create_model(tokenizer) if INIT == 0 else load_model()
-        model.resize_token_embeddings(len(tokenizer))
 
     train_model(model, tokenizer, dataset, push_to_hub, is_inst_finetune)
 
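For context on the new pad-token assignment: with MAX_SEQ_LENGTH = 512, the line tokenizer.pad_token_id = MAX_SEQ_LENGTH - 1 maps padding to whatever token happens to sit at id 511 in the 32,000-entry vocabulary, rather than to a dedicated pad token. The snippet below is a minimal, self-contained sketch of that behaviour, not the project's training code: the only pieces taken from this commit are the MAX_SEQ_LENGTH constant and the pad_token_id assignment; AutoTokenizer and the stock "gpt2" checkpoint are stand-ins, since train.py builds its own tokenizer elsewhere.

# Sketch only: illustrates the pad_token_id assignment from this commit on a
# stand-in tokenizer ("gpt2" is an assumption, not the tokenizer used by train.py).
from transformers import AutoTokenizer

MAX_SEQ_LENGTH = 512  # value introduced by this commit

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token_id = MAX_SEQ_LENGTH - 1  # pad id 511, an ordinary vocab entry

batch = tokenizer(
    ["hello world"],
    padding="max_length",
    truncation=True,
    max_length=MAX_SEQ_LENGTH,
)

# Padding positions are filled with id 511; the attention_mask still zeroes them
# out, so attention ignores them, but id 511 is shared with a regular token.
print(batch["input_ids"][0][-5:])       # e.g. [511, 511, 511, 511, 511]
print(batch["attention_mask"][0][-5:])  # e.g. [0, 0, 0, 0, 0]

One consequence of this choice is that the pad id tracks the MAX_SEQ_LENGTH constant: changing the sequence length again would silently change which vocabulary entry is used for padding.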