|
import gc
import os

import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
|
|
|
|
|
# Hub identifier of the checkpoint to fine-tune, and the directory that
# receives checkpoints and the final saved model/tokenizer.
MODEL_NAME = "mistralai/Mixtral-8x7B-Instruct-v0.1"
OUTPUT_DIR = "./mixtral_finetuned"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Fixed-length padding is requested during tokenization, so a pad token must
# exist; fall back to the end-of-sequence token when none is configured.
if tokenizer.pad_token is None:
    tokenizer.pad_token, tokenizer.pad_token_id = (
        tokenizer.eos_token,
        tokenizer.eos_token_id,
    )
|
|
|
|
|
# Load the causal-LM weights in bfloat16 (matching bf16=True in the training
# arguments) and let the library decide device placement.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    low_cpu_mem_usage=True,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
|
|
|
|
|
# Plain-text corpora; the "text" builder exposes their content in a "text"
# column, which tokenize_function reads.
data_files = {"train": "train.txt", "validation": "val.txt"}

try:
    dataset = load_dataset("text", data_files=data_files)
except FileNotFoundError:
    # Missing input files are a usage error: report and stop with status 1.
    print("Error: train.txt or val.txt not found. Please provide valid files.")
    exit(1)
|
|
|
|
|
def mask_padding_labels(input_ids, attention_mask, ignore_index=-100):
    """Build language-modeling labels that skip padded positions.

    Args:
        input_ids: Batch of token-id sequences (list of lists of int).
        attention_mask: Batch of masks parallel to ``input_ids``; a falsy
            entry marks a padding position.
        ignore_index: Label written at padding positions. -100 is the value
            the cross-entropy loss ignores by default.

    Returns:
        A new list of lists shaped like ``input_ids`` where every padded
        position holds ``ignore_index`` and every real token keeps its id.
    """
    return [
        [token if keep else ignore_index for token, keep in zip(ids, mask)]
        for ids, mask in zip(input_ids, attention_mask)
    ]


def tokenize_function(examples):
    """Tokenize a batch of raw text rows for causal-LM fine-tuning.

    Args:
        examples: Batch mapping with a ``"text"`` entry holding the strings
            to tokenize (the shape ``Dataset.map(batched=True)`` passes in).

    Returns:
        The tokenizer output, padded/truncated to 512 tokens, with an added
        ``"labels"`` entry.
    """
    # Plain Python lists are enough here: the result is stored by
    # Dataset.map, so no framework tensors are requested.
    tokenized = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=512,
    )
    # Labels mirror the inputs, except that padding is excluded from the
    # loss. The mask comes from attention_mask rather than from comparing
    # against pad_token_id, because the pad token may be the EOS token and
    # genuine EOS tokens must still be learned.
    tokenized["labels"] = mask_padding_labels(
        tokenized["input_ids"], tokenized["attention_mask"]
    )
    return tokenized
|
|
|
# Tokenize every split in batches and drop the raw "text" column so only the
# tokenizer outputs remain.
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
)

train_dataset = tokenized_datasets["train"]
# None when no validation split was loaded; evaluation is then disabled.
eval_dataset = tokenized_datasets.get("validation")

# Compare against None and check lengths explicitly: an empty dataset is
# itself falsy, so a truthiness test on eval_dataset would skip the very case
# this guard is meant to catch.
train_is_empty = len(train_dataset) == 0
eval_is_empty = eval_dataset is not None and len(eval_dataset) == 0
if train_is_empty or eval_is_empty:
    print("Error: Empty training or validation dataset.")
    exit(1)
|
|
|
|
|
# Evaluation-dependent options are switched together: without a usable
# validation split there is nothing to evaluate and no "best" model to reload.
run_eval = bool(eval_dataset)

# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    # Optimisation schedule.
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    # One sample per device step, accumulated over 8 steps per update.
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,
    # Mixed precision: bfloat16 only.
    bf16=True,
    fp16=False,
    # Evaluation and checkpointing, both once per epoch.
    evaluation_strategy="epoch" if run_eval else "no",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=run_eval,
    metric_for_best_model="loss",
    # Logging.
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",
)
|
|
|
|
|
# Bundle the model, hyperparameters and datasets into the training loop.
# eval_dataset may be None, in which case no evaluation split is supplied.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset,
    "eval_dataset": eval_dataset,
}
trainer = Trainer(**trainer_kwargs)

try:
    trainer.train()
except RuntimeError as e:
    # Any RuntimeError is reported with an out-of-memory hint, and the script
    # stops with a non-zero status.
    failure_message = f"Training failed: {e} (Likely OOM—reduce batch size or max_length)"
    print(failure_message)
    exit(1)
|
|
|
|
|
# Persist the fine-tuned weights and the tokenizer into the same directory so
# the result can be reloaded from OUTPUT_DIR alone.
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

# The trainer was constructed with the model, so deleting only `model` would
# leave the weights reachable and the cache flush below would reclaim nothing.
# Drop both names, collect any reference cycles, then release cached GPU
# memory (skipped on CPU-only machines).
del trainer
del model
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

print(f"Model and tokenizer saved to {OUTPUT_DIR}")