# training_space/train_model.py (Training Script)

import argparse
import os

import torch
from datasets import Dataset, DatasetDict, load_dataset
from huggingface_hub import HfApi, HfFolder
from transformers import (
    AutoTokenizer,
    BertConfig,
    BertForSequenceClassification,
    DataCollatorForLanguageModeling,
    DataCollatorWithPadding,
    GPT2Config,
    GPT2LMHeadModel,
    Trainer,
    TrainingArguments,
)


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--task", type=str, required=True, help="Task type: generation or classification")
    parser.add_argument("--model_name", type=str, required=True, help="Name of the model")
    parser.add_argument("--dataset", type=str, required=True, help="Path to the dataset")
    parser.add_argument("--num_layers", type=int, default=12)
    parser.add_argument("--attention_heads", type=int, default=1)
    parser.add_argument("--hidden_size", type=int, default=64)
    parser.add_argument("--vocab_size", type=int, default=30000)
    parser.add_argument("--sequence_length", type=int, default=512)
    args = parser.parse_args()

    # Define output directory
    output_dir = f"./models/{args.model_name}"
    os.makedirs(output_dir, exist_ok=True)

    # Initialize Hugging Face API (a token must already be stored, e.g. via `huggingface-cli login`)
    api = HfApi()
    hf_token = HfFolder.get_token()

    # Initialize tokenizer (adjust based on task)
    if args.task == "generation":
        tokenizer = AutoTokenizer.from_pretrained("gpt2")
        # GPT-2 has no pad token; reuse the EOS token so the collator can pad batches
        tokenizer.pad_token = tokenizer.eos_token
    elif args.task == "classification":
        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    else:
        raise ValueError("Unsupported task type")

    # The embedding matrix must cover every id the pretrained tokenizer can emit
    vocab_size = max(args.vocab_size, len(tokenizer))

    # Load and prepare dataset
    if args.task == "generation":
        dataset = load_dataset("text", data_files={"train": args.dataset})
    elif args.task == "classification":
        # For classification, assume the dataset is a plain text file with "text\tlabel" per line
        texts, labels = [], []
        with open(args.dataset, "r", encoding="utf-8") as f:
            for line in f:
                parts = line.strip().split("\t")
                if len(parts) == 2:
                    texts.append(parts[0])
                    labels.append(int(parts[1]))
        dataset = DatasetDict({"train": Dataset.from_dict({"text": texts, "label": labels})})
    else:
        raise ValueError("Unsupported task type")

    def tokenize_function(examples):
        return tokenizer(examples["text"], truncation=True, max_length=args.sequence_length)

    # Drop the raw text column so the collators only receive tensorizable fields
    tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

    if args.task == "generation":
        # mlm=False -> causal LM objective: labels are a copy of input_ids
        data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    elif args.task == "classification":
        # Dynamic padding to the longest sequence in each batch
        data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Initialize model based on task (trained from scratch with a custom config)
    if args.task == "generation":
        config = GPT2Config(
            vocab_size=vocab_size,
            n_positions=args.sequence_length,
            n_embd=args.hidden_size,
            n_layer=args.num_layers,
            n_head=args.attention_heads,
            n_inner=4 * args.hidden_size,
            activation_function="gelu_new",  # GPT-2's GELU variant (GPT2Config has no hidden_act)
            use_cache=True,
        )
        model = GPT2LMHeadModel(config)
    elif args.task == "classification":
        config = BertConfig(
            vocab_size=vocab_size,
            max_position_embeddings=args.sequence_length,
            hidden_size=args.hidden_size,
            num_hidden_layers=args.num_layers,
            num_attention_heads=args.attention_heads,
            intermediate_size=4 * args.hidden_size,
            hidden_act="gelu",
            num_labels=2,  # Adjust based on your classification task
        )
        model = BertForSequenceClassification(config)
    else:
        raise ValueError("Unsupported task type")

    # Define training arguments
    if args.task == "generation":
        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=3,
            per_device_train_batch_size=8,
            save_steps=5000,
            save_total_limit=2,
            logging_steps=500,
            learning_rate=5e-4,
            remove_unused_columns=False,
        )
    elif args.task == "classification":
        # No eval_dataset is passed to the Trainer, so per-epoch evaluation is disabled here
        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=3,
            per_device_train_batch_size=16,
            save_steps=5000,
            save_total_limit=2,
            logging_steps=500,
            learning_rate=5e-5,
            remove_unused_columns=False,
        )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        data_collator=data_collator,
    )

    # Start training
    trainer.train()

    # Save the final model and tokenizer locally
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

    # Push to Hugging Face Hub (replace "your-username" with your Hub namespace)
    model_repo = f"your-username/{args.model_name}"
    api.create_repo(repo_id=model_repo, private=False, exist_ok=True, token=hf_token)
    model.push_to_hub(model_repo, token=hf_token)
    tokenizer.push_to_hub(model_repo, token=hf_token)

    print(f"Model '{args.model_name}' trained and pushed to Hugging Face Hub at '{model_repo}'.")


if __name__ == "__main__":
    main()
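
# --- Example usage (illustrative sketch; the dataset paths and model names below are
# placeholders, not part of this repo) ---
#
# Train a small GPT-2-style generator on a plain-text corpus, one example per line:
#   python training_space/train_model.py --task generation --model_name tiny-gpt2 \
#       --dataset data/corpus.txt --num_layers 4 --attention_heads 4 --hidden_size 256
#
# Train a small BERT-style binary classifier on a tab-separated "text<TAB>label" file:
#   python training_space/train_model.py --task classification --model_name tiny-bert \
#       --dataset data/train.tsv --num_layers 4 --attention_heads 4 --hidden_size 256
#
# Note: --hidden_size must be divisible by --attention_heads, and pushing to the Hub
# requires a stored token (e.g. from `huggingface-cli login`) plus replacing the
# "your-username" placeholder above with your own Hub namespace.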