import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from datasets import load_dataset
# Model and tokenizer setup
MODEL_NAME = "mistralai/Mixtral-8x7B-Instruct-v0.1"
OUTPUT_DIR = "./mixtral_finetuned"
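# NOTE: Mixtral-8x7B is a large mixture-of-experts model (tens of billions of
# parameters), so full fine-tuning as sketched here assumes several high-memory
# GPUs. On a single GPU, a parameter-efficient approach (e.g. LoRA/QLoRA via the
# peft library) is usually more practical.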
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Fallback if undefined
    tokenizer.pad_token_id = tokenizer.eos_token_id
# Load model with optimizations
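# device_map="auto" lets accelerate shard the weights across all visible GPUs
# (with CPU offload if they don't fit); bfloat16 halves memory vs. float32.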
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    low_cpu_mem_usage=True
)
# Load dataset (local text files)
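# The "text" loader treats each line of train.txt / val.txt as one example.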
try:
    dataset = load_dataset("text", data_files={"train": "train.txt", "validation": "val.txt"})
except FileNotFoundError:
    print("Error: train.txt or val.txt not found. Please provide valid files.")
    exit(1)
# Tokenize dataset
def tokenize_function(examples):
    tokenized = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=512,  # Adjust to 2048 or 4096 if needed
        return_tensors="pt"
    )
    tokenized["labels"] = tokenized["input_ids"].clone()
    return tokenized
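# NOTE: labels are a straight copy of input_ids; the model shifts them internally
# for next-token prediction. Because padding reuses the EOS token, loss is also
# computed on padded positions -- to exclude them, set those label ids to -100,
# which the loss function ignores.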
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"]
)
# Split dataset with validation check
train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["validation"] if "validation" in tokenized_datasets else None
if len(train_dataset) == 0 or (eval_dataset is not None and len(eval_dataset) == 0):
    print("Error: Empty training or validation dataset.")
    exit(1)
# Define training arguments
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    evaluation_strategy="epoch" if eval_dataset else "no",  # Skip eval if no validation set
    per_device_train_batch_size=1,  # Lowered for smaller GPUs; adjust up if possible
    per_device_eval_batch_size=1,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    gradient_accumulation_steps=8,  # Effective batch size = 1 x 8 = 8 per device
    bf16=True,
    fp16=False,
    save_strategy="epoch",
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=bool(eval_dataset),  # Only if eval exists
    metric_for_best_model="loss",
    report_to="none"
)
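# NOTE: bf16=True assumes hardware with bfloat16 support (e.g. Ampere-or-newer
# NVIDIA GPUs); on older cards, switch to fp16=True instead.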
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)
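# No data_collator is passed, so Trainer falls back to default_data_collator,
# which simply batches the already fixed-length, pre-padded examples.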
# Train the model
try:
    trainer.train()
except RuntimeError as e:
    print(f"Training failed: {e} (likely OOM: reduce batch size or max_length)")
    exit(1)
# Save locally
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
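# The fine-tuned weights and tokenizer can be reloaded later with
# AutoModelForCausalLM.from_pretrained(OUTPUT_DIR) and AutoTokenizer.from_pretrained(OUTPUT_DIR).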
# Clean up
del model
torch.cuda.empty_cache()
print(f"Model and tokenizer saved to {OUTPUT_DIR}") |