import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from datasets import load_dataset

# Model and tokenizer setup
MODEL_NAME = "mistralai/Mixtral-8x7B-Instruct-v0.1"
OUTPUT_DIR = "./mixtral_finetuned"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Fallback if undefined
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Load model with optimizations
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    low_cpu_mem_usage=True
)
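# Note: device_map="auto" shards the weights across all visible GPUs, and bfloat16
# generally requires Ampere-class (or newer) hardware. Full fine-tuning of
# Mixtral-8x7B still needs substantial multi-GPU memory.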

# Load dataset (local text files)
try:
    dataset = load_dataset("text", data_files={"train": "train.txt", "validation": "val.txt"})
except FileNotFoundError:
    print("Error: train.txt or val.txt not found. Please provide valid files.")
    exit(1)

# Tokenize dataset
def tokenize_function(examples):
    tokenized = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=512,  # Adjust to 2048 or 4096 if memory allows
    )
    # For causal LM fine-tuning, labels are a copy of the input IDs;
    # the model shifts them internally when computing the loss.
    tokenized["labels"] = [ids.copy() for ids in tokenized["input_ids"]]
    return tokenized

tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"]
)
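# After mapping, each example carries only input_ids, attention_mask, and labels;
# the raw "text" column has been dropped.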

# Split dataset with validation check
train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["validation"] if "validation" in tokenized_datasets else None
if len(train_dataset) == 0 or (eval_dataset is not None and len(eval_dataset) == 0):
    print("Error: Empty training or validation dataset.")
    exit(1)

# Define training arguments
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    evaluation_strategy="epoch" if eval_dataset else "no",  # Skip eval if no validation
    per_device_train_batch_size=1,     # Lowered for smaller GPUs; adjust up if possible
    per_device_eval_batch_size=1,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    gradient_accumulation_steps=8,     # Effective batch size = 1 x 8 = 8 per device
    bf16=True,
    fp16=False,
    save_strategy="epoch",
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=bool(eval_dataset),  # Only if eval exists
    metric_for_best_model="loss",
    report_to="none"
)

# Initialize Trainer
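# No data_collator is passed: every example is already padded to max_length and
# carries labels, so the Trainer's default collator can simply stack the features.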
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Train the model
try:
    trainer.train()
except RuntimeError as e:
    print(f"Training failed: {e} (Likely OOM—reduce batch size or max_length)")
    exit(1)

# Save locally
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

# Clean up
del model
torch.cuda.empty_cache()
print(f"Model and tokenizer saved to {OUTPUT_DIR}")