# Charm_15 / utilis.py
import os
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from datasets import load_dataset


def load_model_and_tokenizer(model_name):
    """
    Load a causal language model and its tokenizer from a Hugging Face Hub ID or local path.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    return model, tokenizer
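
# Example usage (the model ID is illustrative; any causal-LM checkpoint works):
#   model, tokenizer = load_model_and_tokenizer("gpt2")
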
def load_and_tokenize_dataset(dataset_name, tokenizer, max_length=512):
    """
    Load a dataset and tokenize its "text" column with fixed-length padding and truncation.
    """
    dataset = load_dataset(dataset_name)

    def tokenize_function(examples):
        # Pad/truncate every example to max_length so batches share a uniform shape.
        return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=max_length)

    tokenized_datasets = dataset.map(tokenize_function, batched=True)
    return tokenized_datasets
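
# Example usage (the dataset name is illustrative; it must expose a "text" column):
#   tokenized = load_and_tokenize_dataset("imdb", tokenizer)
# For causal-LM fine-tuning the tokenized splits still need labels; one common option
# is DataCollatorForLanguageModeling(tokenizer, mlm=False) at training time, as in
# the sketch at the bottom of this file.
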
def setup_training_args(output_dir="./results", per_device_train_batch_size=2, per_device_eval_batch_size=2,
                        gradient_accumulation_steps=8, num_train_epochs=3, learning_rate=5e-5, weight_decay=0.01,
                        warmup_steps=500, logging_steps=100, fp16=True):
    """
    Build a TrainingArguments config with defaults aimed at memory-constrained fine-tuning
    (small per-device batches with gradient accumulation, epoch-level evaluation and checkpoints).
    """
    training_args = TrainingArguments(
        output_dir=output_dir,
        evaluation_strategy="epoch",  # renamed to `eval_strategy` in recent transformers releases
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_eval_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        num_train_epochs=num_train_epochs,
        save_strategy="epoch",
        save_total_limit=2,
        logging_dir="./logs",
        logging_steps=logging_steps,
        report_to="none",
        fp16=fp16,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        warmup_steps=warmup_steps,
        dataloader_num_workers=4,
        push_to_hub=False
    )
    return training_args
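
# Example usage (values are illustrative; the defaults already target small-GPU setups):
#   training_args = setup_training_args(output_dir="./results", num_train_epochs=1,
#                                       fp16=False)  # disable fp16 when no GPU is available
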
def save_model_and_tokenizer(model, tokenizer, save_dir):
    """
    Save the model and tokenizer.
    """
    os.makedirs(save_dir, exist_ok=True)
    model.save_pretrained(save_dir)
    tokenizer.save_pretrained(save_dir)
    print(f"Model and tokenizer saved at {save_dir}")