Vishwas1 committed
Commit 870245a · verified · 1 Parent(s): 2ac79ea

Update train_model.py

Files changed (1)
  1. train_model.py +213 -98
train_model.py CHANGED
@@ -2,10 +2,15 @@
 
 import argparse
 from transformers import (
-    GPT2Config, GPT2LMHeadModel,
-    BertConfig, BertForSequenceClassification,
-    Trainer, TrainingArguments, AutoTokenizer,
-    DataCollatorForLanguageModeling, DataCollatorWithPadding
+    GPT2Config,
+    GPT2LMHeadModel,
+    BertConfig,
+    BertForSequenceClassification,
+    Trainer,
+    TrainingArguments,
+    AutoTokenizer,
+    DataCollatorForLanguageModeling,
+    DataCollatorWithPadding,
 )
 from datasets import load_dataset, Dataset
 import torch
@@ -13,110 +18,196 @@ import os
 from huggingface_hub import HfApi, HfFolder
 import logging
 
-def main():
-    # ... existing code ...
-    if args.task == "generation":
-        dataset = load_dataset(args.dataset_name, split='train')  # Load dataset by name
-    elif args.task == "classification":
-        dataset = load_dataset(args.dataset_name, split='train')  # Adjust if necessary
-    else:
-        raise ValueError("Unsupported task type")
-    # ... existing code ...
+def setup_logging(log_file_path):
+    """
+    Sets up logging to both console and a file.
+    """
+    logger = logging.getLogger()
+    logger.setLevel(logging.INFO)
+
+    # Create handlers
+    c_handler = logging.StreamHandler()
+    f_handler = logging.FileHandler(log_file_path)
+    c_handler.setLevel(logging.INFO)
+    f_handler.setLevel(logging.INFO)
 
+    # Create formatters and add to handlers
+    c_format = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
+    f_format = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
+    c_handler.setFormatter(c_format)
+    f_handler.setFormatter(f_format)
 
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--task", type=str, required=True, help="Task type: generation or classification")
+    # Add handlers to the logger
+    logger.addHandler(c_handler)
+    logger.addHandler(f_handler)
+
+def parse_arguments():
+    """
+    Parses command-line arguments.
+    """
+    parser = argparse.ArgumentParser(description="Train a custom LLM.")
+    parser.add_argument("--task", type=str, required=True, choices=["generation", "classification"],
+                        help="Task type: 'generation' or 'classification'")
     parser.add_argument("--model_name", type=str, required=True, help="Name of the model")
-    parser.add_argument("--dataset_name", type=str, required=True, help="Name of the Hugging Face dataset")
-    parser.add_argument("--num_layers", type=int, default=12)
-    parser.add_argument("--attention_heads", type=int, default=1)
-    parser.add_argument("--hidden_size", type=int, default=64)
-    parser.add_argument("--vocab_size", type=int, default=30000)
-    parser.add_argument("--sequence_length", type=int, default=512)
+    parser.add_argument("--dataset_name", type=str, required=True, help="Name of the Hugging Face dataset (e.g., 'username/dataset')")
+    parser.add_argument("--num_layers", type=int, default=12, help="Number of hidden layers")
+    parser.add_argument("--attention_heads", type=int, default=1, help="Number of attention heads")
+    parser.add_argument("--hidden_size", type=int, default=64, help="Hidden size of the model")
+    parser.add_argument("--vocab_size", type=int, default=30000, help="Vocabulary size")
+    parser.add_argument("--sequence_length", type=int, default=512, help="Maximum sequence length")
     args = parser.parse_args()
-
-    logging.info(f"Starting training for model: {args.model_name}, Task: {args.task}")
+    return args
+
+def load_and_prepare_dataset(task, dataset_name, tokenizer, sequence_length):
+    """
+    Loads and tokenizes the dataset based on the task.
+    """
+    logging.info(f"Loading dataset '{dataset_name}' for task '{task}'...")
+    try:
+        if task == "generation":
+            dataset = load_dataset(dataset_name, split='train')
+            logging.info("Dataset loaded successfully for generation task.")
+            def tokenize_function(examples):
+                return tokenizer(examples['text'], truncation=True, max_length=sequence_length)
+        elif task == "classification":
+            dataset = load_dataset(dataset_name, split='train')
+            logging.info("Dataset loaded successfully for classification task.")
+            # Assuming the dataset has 'text' and 'label' columns
+            def tokenize_function(examples):
+                return tokenizer(examples['text'], truncation=True, max_length=sequence_length)
+        else:
+            raise ValueError("Unsupported task type")
+
+        tokenized_datasets = dataset.map(tokenize_function, batched=True)
+        logging.info("Dataset tokenization complete.")
+        return tokenized_datasets
+    except Exception as e:
+        logging.error(f"Error loading or tokenizing dataset: {str(e)}")
+        raise e
+
+def initialize_model(task, model_name, vocab_size, sequence_length, hidden_size, num_layers, attention_heads):
+    """
+    Initializes the model configuration and model based on the task.
+    """
+    logging.info(f"Initializing model for task '{task}'...")
+    try:
+        if task == "generation":
+            config = GPT2Config(
+                vocab_size=vocab_size,
+                n_positions=sequence_length,
+                n_ctx=sequence_length,
+                n_embd=hidden_size,
+                num_hidden_layers=num_layers,
+                num_attention_heads=attention_heads,
+                intermediate_size=4 * hidden_size,
+                hidden_act='gelu',
+                use_cache=True
+            )
+            model = GPT2LMHeadModel(config)
+            logging.info("GPT2LMHeadModel initialized successfully.")
+        elif task == "classification":
+            config = BertConfig(
+                vocab_size=vocab_size,
+                max_position_embeddings=sequence_length,
+                hidden_size=hidden_size,
+                num_hidden_layers=num_layers,
+                num_attention_heads=attention_heads,
+                intermediate_size=4 * hidden_size,
+                hidden_act='gelu',
+                num_labels=2  # Adjust based on your classification task
+            )
+            model = BertForSequenceClassification(config)
+            logging.info("BertForSequenceClassification initialized successfully.")
+        else:
+            raise ValueError("Unsupported task type")
+
+        return model
+    except Exception as e:
+        logging.error(f"Error initializing model: {str(e)}")
+        raise e
+
+def main():
+    # Parse arguments
+    args = parse_arguments()
+
+    # Setup logging
+    log_file = "training.log"
+    setup_logging(log_file)
+    logging.info("Training script started.")
 
-    # Define output directory
-    output_dir = f"./models/{args.model_name}"
-    os.makedirs(output_dir, exist_ok=True)
-
     # Initialize Hugging Face API
     api = HfApi()
     hf_token = HfFolder.get_token()
-
-    # Initialize tokenizer (adjust based on task)
-    if args.task == "generation":
-        tokenizer = AutoTokenizer.from_pretrained("gpt2")
-    elif args.task == "classification":
-        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
-    else:
-        raise ValueError("Unsupported task type")
-
+    if not hf_token:
+        logging.error("HF_API_TOKEN is not set. Please set it as an environment variable.")
+        raise ValueError("HF_API_TOKEN is not set.")
+
+    # Initialize tokenizer
+    try:
+        logging.info("Initializing tokenizer...")
+        if args.task == "generation":
+            tokenizer = AutoTokenizer.from_pretrained("gpt2")
+        elif args.task == "classification":
+            tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+        else:
+            raise ValueError("Unsupported task type")
+        logging.info("Tokenizer initialized successfully.")
+    except Exception as e:
+        logging.error(f"Error initializing tokenizer: {str(e)}")
+        raise e
+
     # Load and prepare dataset
-    if args.task == "generation":
-        dataset = load_dataset(args.dataset_name, split='train')
-        def tokenize_function(examples):
-            return tokenizer(examples['text'], truncation=True, max_length=args.sequence_length)
-    elif args.task == "classification":
-        dataset = load_dataset(args.dataset_name, split='train')
-        # Assuming the dataset has 'text' and 'label' columns
-        def tokenize_function(examples):
-            return tokenizer(examples['text'], truncation=True, max_length=args.sequence_length)
-    else:
-        raise ValueError("Unsupported task type")
+    try:
+        tokenized_datasets = load_and_prepare_dataset(
+            task=args.task,
+            dataset_name=args.dataset_name,
+            tokenizer=tokenizer,
+            sequence_length=args.sequence_length
+        )
+    except Exception as e:
+        logging.error("Failed to load and prepare dataset.")
+        raise e
 
-    tokenized_datasets = dataset.map(tokenize_function, batched=True)
-
+    # Initialize model
+    try:
+        model = initialize_model(
+            task=args.task,
+            model_name=args.model_name,
+            vocab_size=args.vocab_size,
+            sequence_length=args.sequence_length,
+            hidden_size=args.hidden_size,
+            num_layers=args.num_layers,
+            attention_heads=args.attention_heads
+        )
+    except Exception as e:
+        logging.error("Failed to initialize model.")
+        raise e
+
+    # Define data collator
     if args.task == "generation":
         data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
     elif args.task == "classification":
         data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
-
-    # Initialize model based on task
-    if args.task == "generation":
-        config = GPT2Config(
-            vocab_size=args.vocab_size,
-            n_positions=args.sequence_length,
-            n_ctx=args.sequence_length,
-            n_embd=args.hidden_size,
-            num_hidden_layers=args.num_layers,
-            num_attention_heads=args.attention_heads,
-            intermediate_size=4 * args.hidden_size,
-            hidden_act='gelu',
-            use_cache=True
-        )
-        model = GPT2LMHeadModel(config)
-    elif args.task == "classification":
-        config = BertConfig(
-            vocab_size=args.vocab_size,
-            max_position_embeddings=args.sequence_length,
-            hidden_size=args.hidden_size,
-            num_hidden_layers=args.num_layers,
-            num_attention_heads=args.attention_heads,
-            intermediate_size=4 * args.hidden_size,
-            hidden_act='gelu',
-            num_labels=2  # Adjust based on your classification task
-        )
-        model = BertForSequenceClassification(config)
     else:
-        raise ValueError("Unsupported task type")
-
+        logging.error("Unsupported task type for data collator.")
+        raise ValueError("Unsupported task type for data collator.")
+
     # Define training arguments
     if args.task == "generation":
         training_args = TrainingArguments(
-            output_dir=output_dir,
+            output_dir=f"./models/{args.model_name}",
             num_train_epochs=3,
             per_device_train_batch_size=8,
             save_steps=5000,
             save_total_limit=2,
             logging_steps=500,
             learning_rate=5e-4,
-            remove_unused_columns=False
+            remove_unused_columns=False,
+            push_to_hub=False  # We'll handle pushing manually
         )
     elif args.task == "classification":
         training_args = TrainingArguments(
-            output_dir=output_dir,
+            output_dir=f"./models/{args.model_name}",
             num_train_epochs=3,
             per_device_train_batch_size=16,
             evaluation_strategy="epoch",
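
A note on the GPT2Config call in initialize_model above: recent transformers releases alias num_hidden_layers and num_attention_heads to GPT-2's native n_layer and n_head through the config's attribute_map, but intermediate_size and hidden_act are BERT-style names with no GPT-2 alias, so those two values are stored on the config without changing the architecture (GPT-2 then falls back to n_inner=None, i.e. 4x n_embd, and activation_function="gelu_new"). A minimal sketch, not part of the commit, spelling the same shape with GPT-2's own parameter names, mirroring the script's argparse defaults:

from transformers import GPT2Config, GPT2LMHeadModel

# Sketch only: every value below demonstrably reaches the model because it
# uses GPT-2's native parameter names. Values mirror the script's defaults.
config = GPT2Config(
    vocab_size=30000,            # --vocab_size default
    n_positions=512,             # --sequence_length default
    n_embd=64,                   # --hidden_size default
    n_layer=12,                  # --num_layers default
    n_head=1,                    # --attention_heads default
    n_inner=4 * 64,              # GPT-2's name for the feed-forward width
    activation_function="gelu",  # GPT-2's name for the activation
    use_cache=True,
)
model = GPT2LMHeadModel(config)
print(f"{model.num_parameters():,} parameters")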
@@ -124,37 +215,61 @@ def main():
             save_total_limit=2,
             logging_steps=500,
             learning_rate=5e-5,
-            remove_unused_columns=False
+            remove_unused_columns=False,
+            push_to_hub=False  # We'll handle pushing manually
         )
-
+    else:
+        logging.error("Unsupported task type for training arguments.")
+        raise ValueError("Unsupported task type for training arguments.")
+
     # Initialize Trainer
     trainer = Trainer(
         model=model,
         args=training_args,
-        train_dataset=tokenized_datasets['train'],
+        train_dataset=tokenized_datasets,
         data_collator=data_collator,
     )
-
+
     # Start training
-    trainer.train()
-
-    # Save the final model
-    trainer.save_model(output_dir)
-    tokenizer.save_pretrained(output_dir)
-
-    # Push to Hugging Face Hub
-    model_repo = f"your-username/{args.model_name}"  # Replace 'your-username' with your actual username
+    logging.info("Starting training...")
     try:
+        trainer.train()
+        logging.info("Training completed successfully.")
+    except Exception as e:
+        logging.error(f"Error during training: {str(e)}")
+        raise e
+
+    # Save the final model and tokenizer
+    try:
+        trainer.save_model(training_args.output_dir)
+        tokenizer.save_pretrained(training_args.output_dir)
+        logging.info(f"Model and tokenizer saved to '{training_args.output_dir}'.")
+    except Exception as e:
+        logging.error(f"Error saving model or tokenizer: {str(e)}")
+        raise e
+
+    # Push the model to Hugging Face Hub
+    model_repo = f"{api.whoami(token=hf_token)['name']}/{args.model_name}"
+    try:
+        logging.info(f"Pushing model to Hugging Face Hub at '{model_repo}'...")
         api.create_repo(repo_id=model_repo, private=False, token=hf_token)
+        logging.info(f"Repository '{model_repo}' created successfully.")
     except Exception as e:
-        logging.warning(f"Repository might already exist: {e}")
-    model.push_to_hub(model_repo, use_auth_token=hf_token)
-    tokenizer.push_to_hub(model_repo, use_auth_token=hf_token)
-
-    logging.info(f"Model '{args.model_name}' trained and pushed to Hugging Face Hub at '{model_repo}'.")
+        logging.warning(f"Repository might already exist: {str(e)}")
 
+    try:
+        model.push_to_hub(model_repo, use_auth_token=hf_token)
+        tokenizer.push_to_hub(model_repo, use_auth_token=hf_token)
+        logging.info(f"Model and tokenizer pushed to Hugging Face Hub at '{model_repo}'.")
+    except Exception as e:
+        logging.error(f"Error pushing model to Hugging Face Hub: {str(e)}")
+        raise e
+
+    logging.info("Training script finished successfully.")
+
 if __name__ == "__main__":
     main()
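The push step above resolves the target namespace with api.whoami(token=hf_token)['name'] only after training completes, so while the new startup check catches a missing token, an expired or revoked one still passes HfFolder.get_token() and would surface hours late. A small pre-flight sketch (an assumption, not part of the commit; it presumes huggingface_hub is installed and a token was cached via huggingface-cli login):

from huggingface_hub import HfApi, HfFolder

# Sketch only: validate the cached Hub token and resolve the namespace
# train_model.py would push to, before any training time is spent.
token = HfFolder.get_token()
if token is None:
    raise SystemExit("No Hugging Face token found; run `huggingface-cli login` first.")
username = HfApi().whoami(token=token)["name"]  # should raise if the token is invalid
print(f"Models will be pushed under: {username}/<model_name>")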
 
 
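For reference, a hypothetical invocation of the updated script would look like: python train_model.py --task generation --model_name tiny-gpt2-demo --dataset_name username/dataset (the model and dataset names here are placeholders). Because the refactor moves argument handling into parse_arguments, it can also be smoke-tested without training (a sketch, assuming train_model.py is importable from the working directory):

import sys
import train_model

# Simulate a command line; every name below is a placeholder.
sys.argv = [
    "train_model.py",
    "--task", "generation",
    "--model_name", "tiny-gpt2-demo",
    "--dataset_name", "username/dataset",
]
args = train_model.parse_arguments()
assert args.num_layers == 12 and args.sequence_length == 512  # argparse defaults
print(args)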