# training_space/train_model.py (Training Script)
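# Illustrative invocations (paths and model names below are placeholders, not part of the script):
#   python train_model.py --task generation --model_name my-tiny-gpt2 \
#       --dataset data/corpus.txt --num_layers 6 --attention_heads 4 --hidden_size 256
#   python train_model.py --task classification --model_name my-tiny-bert \
#       --dataset data/labeled.tsv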
import argparse
from transformers import (
    GPT2Config, GPT2LMHeadModel,
    BertConfig, BertForSequenceClassification,
    Trainer, TrainingArguments, AutoTokenizer,
    DataCollatorForLanguageModeling, DataCollatorWithPadding
)
from datasets import load_dataset, Dataset, DatasetDict
import torch
import os
from huggingface_hub import HfApi, HfFolder
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--task", type=str, required=True, help="Task type: generation or classification")
    parser.add_argument("--model_name", type=str, required=True, help="Name of the model")
    parser.add_argument("--dataset", type=str, required=True, help="Path to the dataset")
    parser.add_argument("--num_layers", type=int, default=12)
    parser.add_argument("--attention_heads", type=int, default=1)
    parser.add_argument("--hidden_size", type=int, default=64)
    parser.add_argument("--vocab_size", type=int, default=30000)
    parser.add_argument("--sequence_length", type=int, default=512)
    args = parser.parse_args()
    # Define output directory
    output_dir = f"./models/{args.model_name}"
    os.makedirs(output_dir, exist_ok=True)
    # Initialize Hugging Face API
    api = HfApi()
    hf_token = HfFolder.get_token()
    # Initialize tokenizer (adjust based on task)
    if args.task == "generation":
        tokenizer = AutoTokenizer.from_pretrained("gpt2")
        # GPT-2 ships without a pad token; reuse EOS so the data collator can pad batches
        tokenizer.pad_token = tokenizer.eos_token
    elif args.task == "classification":
        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    else:
        raise ValueError("Unsupported task type")
    # The model's embedding table must cover every ID the tokenizer can emit
    args.vocab_size = max(args.vocab_size, len(tokenizer))
    # Load and prepare dataset
    if args.task == "generation":
        dataset = load_dataset('text', data_files={'train': args.dataset})
        def tokenize_function(examples):
            return tokenizer(examples['text'], truncation=True, max_length=args.sequence_length)
    elif args.task == "classification":
        # For classification, assume the dataset is a plain text file with one "text\tlabel" pair per line
        with open(args.dataset, "r", encoding="utf-8") as f:
            lines = f.readlines()
        texts = []
        labels = []
        for line in lines:
            parts = line.strip().split("\t")
            if len(parts) == 2:
                texts.append(parts[0])
                labels.append(int(parts[1]))
        # Wrap in a DatasetDict so both tasks expose a 'train' split to the Trainer
        dataset = DatasetDict({"train": Dataset.from_dict({"text": texts, "label": labels})})
        def tokenize_function(examples):
            return tokenizer(examples['text'], truncation=True, max_length=args.sequence_length)
    else:
        raise ValueError("Unsupported task type")
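    # For reference, the classification file above is expected to hold one tab-separated
    # "text<TAB>label" pair per line, e.g. (placeholder lines, not real data):
    #   this movie was a delight<TAB>1
    #   the plot never comes together<TAB>0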
    tokenized_datasets = dataset.map(tokenize_function, batched=True)
    if args.task == "generation":
        data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    elif args.task == "classification":
        data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    # Initialize model based on task
    if args.task == "generation":
        config = GPT2Config(
            vocab_size=args.vocab_size,
            n_positions=args.sequence_length,
            n_embd=args.hidden_size,
            n_layer=args.num_layers,
            n_head=args.attention_heads,
            n_inner=4 * args.hidden_size,
            activation_function='gelu',
            use_cache=True
        )
        model = GPT2LMHeadModel(config)
    elif args.task == "classification":
        config = BertConfig(
            vocab_size=args.vocab_size,
            max_position_embeddings=args.sequence_length,
            hidden_size=args.hidden_size,
            num_hidden_layers=args.num_layers,
            num_attention_heads=args.attention_heads,
            intermediate_size=4 * args.hidden_size,
            hidden_act='gelu',
            num_labels=2  # Adjust based on your classification task
        )
        model = BertForSequenceClassification(config)
    else:
        raise ValueError("Unsupported task type")
    # Define training arguments
    if args.task == "generation":
        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=3,
            per_device_train_batch_size=8,
            save_steps=5000,
            save_total_limit=2,
            logging_steps=500,
            learning_rate=5e-4,
            remove_unused_columns=False
        )
    elif args.task == "classification":
        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=3,
            per_device_train_batch_size=16,
            # evaluation_strategy="epoch",  # enable once an eval_dataset is passed to the Trainer
            save_steps=5000,
            save_total_limit=2,
            logging_steps=500,
            learning_rate=5e-5,
            remove_unused_columns=False
        )
    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets['train'],
        data_collator=data_collator,
    )
    # Start training
    trainer.train()
    # Save the final model
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)
    # Push to Hugging Face Hub
    model_repo = f"your-username/{args.model_name}"
    api.create_repo(repo_id=model_repo, private=False, token=hf_token, exist_ok=True)
    model.push_to_hub(model_repo, token=hf_token)
    tokenizer.push_to_hub(model_repo, token=hf_token)
    print(f"Model '{args.model_name}' trained and pushed to Hugging Face Hub at '{model_repo}'.")
if __name__ == "__main__":
    main()
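# After a successful push, the artifacts can be loaded back from the Hub, e.g.
# (the repo id below is the placeholder used above):
#   from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification
#   tok = AutoTokenizer.from_pretrained("your-username/<model_name>")
#   lm = AutoModelForCausalLM.from_pretrained("your-username/<model_name>")                 # generation task
#   clf = AutoModelForSequenceClassification.from_pretrained("your-username/<model_name>")  # classification task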