# training_space/train_model.py (Training Script)
import argparse
from transformers import (
    GPT2Config, GPT2LMHeadModel,
    BertConfig, BertForSequenceClassification,
    Trainer, TrainingArguments, AutoTokenizer,
    DataCollatorForLanguageModeling, DataCollatorWithPadding
)
from datasets import load_dataset, Dataset, DatasetDict
import torch
import os
from huggingface_hub import HfApi, HfFolder


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--task", type=str, required=True, help="Task type: generation or classification")
    parser.add_argument("--model_name", type=str, required=True, help="Name of the model")
    parser.add_argument("--dataset", type=str, required=True, help="Path to the dataset")
    parser.add_argument("--num_layers", type=int, default=12)
    parser.add_argument("--attention_heads", type=int, default=1)
    parser.add_argument("--hidden_size", type=int, default=64)
    parser.add_argument("--vocab_size", type=int, default=30000)
    parser.add_argument("--sequence_length", type=int, default=512)
    args = parser.parse_args()
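    # Note: hidden_size must be divisible by attention_heads for the attention
    # layers in both GPT-2 and BERT; the defaults above (64 / 1) satisfy this.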

    # Define output directory
    output_dir = f"./models/{args.model_name}"
    os.makedirs(output_dir, exist_ok=True)

    # Initialize Hugging Face API
    api = HfApi()
    hf_token = HfFolder.get_token()

    # Initialize tokenizer (adjust based on task)
    if args.task == "generation":
        tokenizer = AutoTokenizer.from_pretrained("gpt2")
        # GPT-2 has no pad token by default; reuse EOS so the data collator can pad batches
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
    elif args.task == "classification":
        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    else:
        raise ValueError("Unsupported task type")

    # Load and prepare dataset
    if args.task == "generation":
        dataset = load_dataset('text', data_files={'train': args.dataset})

        def tokenize_function(examples):
            return tokenizer(examples['text'], truncation=True, max_length=args.sequence_length)
    elif args.task == "classification":
        # For classification, assume the dataset is a simple text file with "text\tlabel" per line
        texts = []
        labels = []
        with open(args.dataset, "r", encoding="utf-8") as f:
            for line in f:
                parts = line.strip().split("\t")
                if len(parts) == 2:
                    texts.append(parts[0])
                    labels.append(int(parts[1]))
        # Wrap in a DatasetDict so both tasks expose a 'train' split
        dataset = DatasetDict({"train": Dataset.from_dict({"text": texts, "label": labels})})

        def tokenize_function(examples):
            return tokenizer(examples['text'], truncation=True, max_length=args.sequence_length)
    else:
        raise ValueError("Unsupported task type")

    # Drop the raw 'text' column so the collators only receive token ids (and labels)
    tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=['text'])

    if args.task == "generation":
        data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    elif args.task == "classification":
        data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Initialize model based on task; make sure the embedding matrix covers
    # every id the pretrained tokenizer can produce
    vocab_size = max(args.vocab_size, len(tokenizer))
    if args.task == "generation":
        config = GPT2Config(
            vocab_size=vocab_size,
            n_positions=args.sequence_length,
            n_embd=args.hidden_size,
            n_layer=args.num_layers,
            n_head=args.attention_heads,
            n_inner=4 * args.hidden_size,
            activation_function='gelu',
            use_cache=True
        )
        model = GPT2LMHeadModel(config)
    elif args.task == "classification":
        config = BertConfig(
            vocab_size=vocab_size,
            max_position_embeddings=args.sequence_length,
            hidden_size=args.hidden_size,
            num_hidden_layers=args.num_layers,
            num_attention_heads=args.attention_heads,
            intermediate_size=4 * args.hidden_size,
            hidden_act='gelu',
            num_labels=2  # Adjust based on your classification task
        )
        model = BertForSequenceClassification(config)
    else:
        raise ValueError("Unsupported task type")

    # Define training arguments (no eval_dataset is passed to the Trainer,
    # so evaluation stays disabled for both tasks)
    if args.task == "generation":
        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=3,
            per_device_train_batch_size=8,
            save_steps=5000,
            save_total_limit=2,
            logging_steps=500,
            learning_rate=5e-4,
            remove_unused_columns=False
        )
    elif args.task == "classification":
        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=3,
            per_device_train_batch_size=16,
            save_steps=5000,
            save_total_limit=2,
            logging_steps=500,
            learning_rate=5e-5,
            remove_unused_columns=False
        )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets['train'],
        data_collator=data_collator,
    )

    # Start training
    trainer.train()

    # Save the final model
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

    # Push to Hugging Face Hub
    model_repo = f"your-username/{args.model_name}"  # Replace 'your-username' with your actual username
    try:
        api.create_repo(repo_id=model_repo, private=False, token=hf_token)
    except Exception as e:
        print(f"Repository might already exist: {e}")
    model.push_to_hub(model_repo, token=hf_token)
    tokenizer.push_to_hub(model_repo, token=hf_token)
    print(f"Model '{args.model_name}' trained and pushed to Hugging Face Hub at '{model_repo}'.")


if __name__ == "__main__":
    main()
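
# Example invocations (illustrative only; the dataset paths and size settings
# below are placeholders, not part of the original script):
#
#   python train_model.py --task generation --model_name my-gpt2-small \
#       --dataset data/train.txt --num_layers 6 --attention_heads 4 --hidden_size 256
#
#   python train_model.py --task classification --model_name my-bert-tiny \
#       --dataset data/train.tsv --num_layers 4 --attention_heads 4 --hidden_size 128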