# training_space/train_model.py (Training Script)
import argparse
import os

import torch
from datasets import load_dataset, Dataset, DatasetDict
from huggingface_hub import HfApi, HfFolder
from transformers import (
    GPT2Config, GPT2LMHeadModel,
    BertConfig, BertForSequenceClassification,
    Trainer, TrainingArguments, AutoTokenizer,
    DataCollatorForLanguageModeling, DataCollatorWithPadding
)

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--task", type=str, required=True, help="Task type: generation or classification")
    parser.add_argument("--model_name", type=str, required=True, help="Name of the model")
    parser.add_argument("--dataset", type=str, required=True,
                        help="Path to the training data: a plain-text file for generation, "
                             "or a tab-separated 'text<TAB>label' file for classification")
    parser.add_argument("--num_layers", type=int, default=12)
    parser.add_argument("--attention_heads", type=int, default=1)
    parser.add_argument("--hidden_size", type=int, default=64)
    parser.add_argument("--vocab_size", type=int, default=30000)
    parser.add_argument("--sequence_length", type=int, default=512)
    args = parser.parse_args()
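
    # Example invocations (illustrative only -- the model names and data paths
    # below are placeholders, not files shipped with this Space):
    #   python train_model.py --task generation --model_name tiny-gpt2 --dataset data/corpus.txt
    #   python train_model.py --task classification --model_name tiny-bert --dataset data/reviews.tsv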

    # Define output directory
    output_dir = f"./models/{args.model_name}"
    os.makedirs(output_dir, exist_ok=True)

    # Initialize Hugging Face API
    api = HfApi()
    hf_token = HfFolder.get_token()

    # Initialize tokenizer (adjust based on task)
    if args.task == "generation":
        tokenizer = AutoTokenizer.from_pretrained("gpt2")
        # GPT-2 has no pad token; reuse EOS so the LM data collator can pad batches
        tokenizer.pad_token = tokenizer.eos_token
    elif args.task == "classification":
        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    else:
        raise ValueError("Unsupported task type")

    # The embedding matrix must cover every id the pretrained tokenizer can emit
    # (50257 for gpt2, 30522 for bert-base-uncased), so grow vocab_size if needed.
    if args.vocab_size < len(tokenizer):
        args.vocab_size = len(tokenizer)

    # Load and prepare dataset
    if args.task == "generation":
        dataset = load_dataset('text', data_files={'train': args.dataset})

        def tokenize_function(examples):
            return tokenizer(examples['text'], truncation=True, max_length=args.sequence_length)
    elif args.task == "classification":
        # For classification, assume the dataset is a simple text file with "text\tlabel" per line
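        # A hypothetical input file might look like this (tab-separated, integer label):
        #   this film was a pleasant surprise	1
        #   the plot never comes together	0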
        with open(args.dataset, "r", encoding="utf-8") as f:
            lines = f.readlines()
        texts = []
        labels = []
        for line in lines:
            parts = line.strip().split("\t")
            if len(parts) == 2:
                texts.append(parts[0])
                labels.append(int(parts[1]))
        # Wrap in a DatasetDict so the 'train' split lookup below works for both tasks
        dataset = DatasetDict({"train": Dataset.from_dict({"text": texts, "label": labels})})

        def tokenize_function(examples):
            return tokenizer(examples['text'], truncation=True, max_length=args.sequence_length)
    else:
        raise ValueError("Unsupported task type")

    # Tokenize; drop the raw 'text' column so the collators only see tensor-friendly fields
    tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

    if args.task == "generation":
        # Causal LM: the collator builds labels from the (shifted) input_ids (mlm=False)
        data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    elif args.task == "classification":
        data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Initialize model based on task
    if args.task == "generation":
        # GPT-2 uses its own config field names (n_embd / n_layer / n_head / n_inner)
        config = GPT2Config(
            vocab_size=args.vocab_size,
            n_positions=args.sequence_length,
            n_embd=args.hidden_size,
            n_layer=args.num_layers,
            n_head=args.attention_heads,
            n_inner=4 * args.hidden_size,
            activation_function='gelu',
            use_cache=True
        )
        model = GPT2LMHeadModel(config)
    elif args.task == "classification":
        config = BertConfig(
            vocab_size=args.vocab_size,
            max_position_embeddings=args.sequence_length,
            hidden_size=args.hidden_size,
            num_hidden_layers=args.num_layers,
            num_attention_heads=args.attention_heads,
            intermediate_size=4 * args.hidden_size,
            hidden_act='gelu',
            num_labels=2  # Adjust based on your classification task
        )
        model = BertForSequenceClassification(config)
    else:
        raise ValueError("Unsupported task type")

    # Define training arguments
    if args.task == "generation":
        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=3,
            per_device_train_batch_size=8,
            save_steps=5000,
            save_total_limit=2,
            logging_steps=500,
            learning_rate=5e-4,
            remove_unused_columns=False
        )
    elif args.task == "classification":
        # No eval split is built above, so per-epoch evaluation is left disabled here;
        # pass an eval_dataset to the Trainer before re-enabling it.
        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=3,
            per_device_train_batch_size=16,
            save_steps=5000,
            save_total_limit=2,
            logging_steps=500,
            learning_rate=5e-5,
            remove_unused_columns=False
        )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets['train'],
        data_collator=data_collator,
    )

    # Start training
    trainer.train()

    # Save the final model
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

    # Push to Hugging Face Hub
    model_repo = f"your-username/{args.model_name}"  # Replace 'your-username' with your actual username
    try:
        api.create_repo(repo_id=model_repo, private=False, token=hf_token)
    except Exception as e:
        print(f"Repository might already exist: {e}")
    model.push_to_hub(model_repo, use_auth_token=hf_token)
    tokenizer.push_to_hub(model_repo, use_auth_token=hf_token)
    print(f"Model '{args.model_name}' trained and pushed to Hugging Face Hub at '{model_repo}'.")

if __name__ == "__main__":
    main()
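
# Once the push succeeds, the checkpoint can be loaded elsewhere from the Hub.
# A minimal sketch (repo id assumed to follow the 'your-username/<model_name>' pattern above):
#   from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification
#   tok = AutoTokenizer.from_pretrained("your-username/my-model")
#   lm = AutoModelForCausalLM.from_pretrained("your-username/my-model")                 # generation task
#   clf = AutoModelForSequenceClassification.from_pretrained("your-username/my-model")  # classification task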