# open-human-feedback-chat/ml/dpo_pipeline.py
from transformers import Trainer, TrainingArguments
from datasets import Dataset
import torch


def train_dpo_model(model, dataset, learning_rate=5e-5, num_train_epochs=3,
                    per_device_train_batch_size=16):
    """Trains a model using Direct Preference Optimization (DPO).

    Args:
        model: The language model to be trained.
        dataset: The training data, as a Hugging Face DatasetDict (or dict of
            Datasets) with a "train" split and, optionally, a "validation" split.
        learning_rate: Learning rate for the optimizer.
        num_train_epochs: Number of epochs to train.
        per_device_train_batch_size: Batch size per device during training.

    Returns:
        The trained model.
    """
    model.train()

    # Only evaluate and track the best checkpoint when a validation split
    # exists; otherwise load_best_model_at_end would fail at Trainer init.
    eval_dataset = dataset.get("validation", None)
    has_eval = eval_dataset is not None

    training_args = TrainingArguments(
        output_dir="./dpo_model",
        evaluation_strategy="epoch" if has_eval else "no",
        save_strategy="epoch",
        learning_rate=learning_rate,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=100,
        save_total_limit=2,
        push_to_hub=False,
        load_best_model_at_end=has_eval,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],
        eval_dataset=eval_dataset,
    )
    trainer.train()
    return model
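

# ---------------------------------------------------------------------------
# Hedged sketch: supplying the DPO loss via trl's DPOTrainer.
#
# train_dpo_model() above drives the generic Hugging Face Trainer loop, so the
# preference (DPO) objective itself has to come from elsewhere; in practice it
# is usually provided by the `trl` library. The function below is a sketch of
# how this pipeline could be wired to trl, not part of the original code: the
# name `train_dpo_model_trl` is hypothetical, and it assumes a recent `trl`
# release where `DPOConfig` exists and DPOTrainer accepts `processing_class`
# (older releases take `tokenizer=` instead). The dataset splits are expected
# to carry "prompt", "chosen", and "rejected" columns.
# ---------------------------------------------------------------------------
def train_dpo_model_trl(model, tokenizer, dataset, learning_rate=5e-5,
                        num_train_epochs=3, per_device_train_batch_size=16):
    from trl import DPOConfig, DPOTrainer  # optional dependency, imported lazily

    config = DPOConfig(
        output_dir="./dpo_model",
        beta=0.1,  # controls how far the policy may drift from the reference model
        learning_rate=learning_rate,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=per_device_train_batch_size,
        logging_steps=100,
        save_total_limit=2,
    )
    trainer = DPOTrainer(
        model=model,                  # ref_model defaults to a frozen copy of `model`
        args=config,
        train_dataset=dataset["train"],
        eval_dataset=dataset.get("validation", None),
        processing_class=tokenizer,
    )
    trainer.train()
    return trainer.model

# Example call (assuming a tokenizer and a preference DatasetDict are in scope):
#     trained = train_dpo_model_trl(model, tokenizer, dataset)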