Vishwas1 committed on
Commit
88dbd55
·
verified ·
1 Parent(s): 8e48af5

Update train_model.py

Browse files
Files changed (1) hide show
  1. train_model.py +48 -23
train_model.py CHANGED
@@ -12,12 +12,14 @@ from transformers import (
12
  DataCollatorForLanguageModeling,
13
  DataCollatorWithPadding,
14
  )
15
- from datasets import load_dataset
16
  import torch
17
  import os
18
  from huggingface_hub import login, HfApi, HfFolder
19
  import logging
20
 
 
 
21
  def setup_logging(log_file_path):
22
  """
23
  Sets up logging to both console and a file.
@@ -65,21 +67,21 @@ def load_and_prepare_dataset(task, dataset_name, tokenizer, sequence_length):
65
  logging.info(f"Loading dataset '{dataset_name}' for task '{task}'...")
66
  try:
67
  if task == "generation":
68
- # Check if dataset_name includes config
69
  if '/' in dataset_name:
70
  dataset, config = dataset_name.split('/', 1)
71
- dataset = load_dataset("Salesforce/wikitext", "wikitext-103-raw-v1", split='train[:1%]')
72
  else:
73
- dataset = load_dataset("Salesforce/wikitext", "wikitext-103-raw-v1", split='train[:1%]')
74
  logging.info("Dataset loaded successfully for generation task.")
75
  def tokenize_function(examples):
76
  return tokenizer(examples['text'], truncation=True, max_length=sequence_length)
77
  elif task == "classification":
78
  if '/' in dataset_name:
79
  dataset, config = dataset_name.split('/', 1)
80
- dataset = load_dataset("stanfordnlp/imdb", split='train[:1%]')
81
  else:
82
- dataset = load_dataset("stanfordnlp/imdb", split='train[:1%]')
83
  logging.info("Dataset loaded successfully for classification task.")
84
  # Assuming the dataset has 'text' and 'label' columns
85
  def tokenize_function(examples):
@@ -136,6 +138,12 @@ def initialize_model(task, model_name, vocab_size, sequence_length, hidden_size,
136
  logging.error(f"Error initializing model: {str(e)}")
137
  raise e
138
 
 
 
 
 
 
 
139
  def main():
140
  # Parse arguments
141
  args = parse_arguments()
@@ -172,6 +180,31 @@ def main():
172
  else:
173
  raise ValueError("Unsupported task type")
174
  logging.info("Tokenizer initialized successfully.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
  except Exception as e:
176
  logging.error(f"Error initializing tokenizer: {str(e)}")
177
  raise e
@@ -188,20 +221,8 @@ def main():
188
  logging.error("Failed to load and prepare dataset.")
189
  raise e
190
 
191
- # Initialize model
192
- try:
193
- model = initialize_model(
194
- task=args.task,
195
- model_name=args.model_name,
196
- vocab_size=args.vocab_size,
197
- sequence_length=args.sequence_length,
198
- hidden_size=args.hidden_size,
199
- num_layers=args.num_layers,
200
- attention_heads=args.attention_heads
201
- )
202
- except Exception as e:
203
- logging.error("Failed to initialize model.")
204
- raise e
205
 
206
  # Define data collator
207
  if args.task == "generation":
@@ -223,7 +244,8 @@ def main():
223
  logging_steps=500,
224
  learning_rate=5e-4,
225
  remove_unused_columns=False,
226
- push_to_hub=False # We'll handle pushing manually
 
227
  )
228
  elif args.task == "classification":
229
  training_args = TrainingArguments(
@@ -236,18 +258,20 @@ def main():
236
  logging_steps=500,
237
  learning_rate=5e-5,
238
  remove_unused_columns=False,
239
- push_to_hub=False # We'll handle pushing manually
 
240
  )
241
  else:
242
  logging.error("Unsupported task type for training arguments.")
243
  raise ValueError("Unsupported task type for training arguments.")
244
 
245
- # Initialize Trainer
246
  trainer = Trainer(
247
  model=model,
248
  args=training_args,
249
  train_dataset=tokenized_datasets,
250
  data_collator=data_collator,
 
251
  )
252
 
253
  # Start training
@@ -293,3 +317,4 @@ if __name__ == "__main__":
293
 
294
 
295
 
 
 
12
  DataCollatorForLanguageModeling,
13
  DataCollatorWithPadding,
14
  )
15
+ from datasets import load_dataset, Dataset
16
  import torch
17
  import os
18
  from huggingface_hub import login, HfApi, HfFolder
19
  import logging
20
 
21
+ from torch.optim import AdamW # Import PyTorch's AdamW
22
+
23
  def setup_logging(log_file_path):
24
  """
25
  Sets up logging to both console and a file.
 
67
  logging.info(f"Loading dataset '{dataset_name}' for task '{task}'...")
68
  try:
69
  if task == "generation":
70
+ # Check if dataset_name includes a configuration
71
  if '/' in dataset_name:
72
  dataset, config = dataset_name.split('/', 1)
73
+ dataset = load_dataset("Salesforce/wikitext", "wikitext-103-raw-v1", split='train')
74
  else:
75
+ dataset = load_dataset("Salesforce/wikitext", "wikitext-103-raw-v1", split='train')
76
  logging.info("Dataset loaded successfully for generation task.")
77
  def tokenize_function(examples):
78
  return tokenizer(examples['text'], truncation=True, max_length=sequence_length)
79
  elif task == "classification":
80
  if '/' in dataset_name:
81
  dataset, config = dataset_name.split('/', 1)
82
+ dataset = load_dataset(dataset, config, split='train', use_auth_token=True)
83
  else:
84
+ dataset = load_dataset(dataset_name, split='train', use_auth_token=True)
85
  logging.info("Dataset loaded successfully for classification task.")
86
  # Assuming the dataset has 'text' and 'label' columns
87
  def tokenize_function(examples):
 
138
  logging.error(f"Error initializing model: {str(e)}")
139
  raise e
140
 
141
+ def get_optimizer(model, learning_rate):
142
+ """
143
+ Returns the AdamW optimizer from PyTorch.
144
+ """
145
+ return AdamW(model.parameters(), lr=learning_rate)
146
+
147
  def main():
148
  # Parse arguments
149
  args = parse_arguments()
 
180
  else:
181
  raise ValueError("Unsupported task type")
182
  logging.info("Tokenizer initialized successfully.")
183
+
184
+ # Set pad_token to eos_token if not already set
185
+ if tokenizer.pad_token is None:
186
+ logging.info("Setting pad_token to eos_token.")
187
+ tokenizer.pad_token = tokenizer.eos_token
188
+ model = initialize_model(
189
+ task=args.task,
190
+ model_name=args.model_name,
191
+ vocab_size=args.vocab_size,
192
+ sequence_length=args.sequence_length,
193
+ hidden_size=args.hidden_size,
194
+ num_layers=args.num_layers,
195
+ attention_heads=args.attention_heads
196
+ )
197
+ model.resize_token_embeddings(len(tokenizer))
198
+ else:
199
+ model = initialize_model(
200
+ task=args.task,
201
+ model_name=args.model_name,
202
+ vocab_size=args.vocab_size,
203
+ sequence_length=args.sequence_length,
204
+ hidden_size=args.hidden_size,
205
+ num_layers=args.num_layers,
206
+ attention_heads=args.attention_heads
207
+ )
208
  except Exception as e:
209
  logging.error(f"Error initializing tokenizer: {str(e)}")
210
  raise e
 
221
  logging.error("Failed to load and prepare dataset.")
222
  raise e
223
 
224
+ # Initialize model (Already initialized above)
225
+ # model = initialize_model(...) # Moved above to handle pad_token
 
 
 
 
 
 
 
 
 
 
 
 
226
 
227
  # Define data collator
228
  if args.task == "generation":
 
244
  logging_steps=500,
245
  learning_rate=5e-4,
246
  remove_unused_columns=False,
247
+ push_to_hub=False, # We'll handle pushing manually
248
+ no_deprecation_warning=True # Suppress FutureWarning
249
  )
250
  elif args.task == "classification":
251
  training_args = TrainingArguments(
 
258
  logging_steps=500,
259
  learning_rate=5e-5,
260
  remove_unused_columns=False,
261
+ push_to_hub=False, # We'll handle pushing manually
262
+ no_deprecation_warning=True # Suppress FutureWarning
263
  )
264
  else:
265
  logging.error("Unsupported task type for training arguments.")
266
  raise ValueError("Unsupported task type for training arguments.")
267
 
268
+ # Initialize Trainer with PyTorch's AdamW optimizer
269
  trainer = Trainer(
270
  model=model,
271
  args=training_args,
272
  train_dataset=tokenized_datasets,
273
  data_collator=data_collator,
274
+ optimizers=(get_optimizer(model, training_args.learning_rate), None) # None for scheduler
275
  )
276
 
277
  # Start training
 
317
 
318
 
319
 
320
+