GeminiFan207 committed on
Commit 030ed20 · verified · 1 Parent(s): e9b78af

Create train.py

Files changed (1)
  1. train.py +88 -0
train.py ADDED
@@ -0,0 +1,88 @@
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
+ from datasets import load_dataset
+ import os
+
+ # Model and tokenizer setup
+ MODEL_NAME = "mistralai/Mixtral-8x7B-Instruct-v0.1"  # Real Mixtral model
+ OUTPUT_DIR = "./mixtral_finetuned"
+
+ # Load tokenizer
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+ tokenizer.pad_token = tokenizer.eos_token  # Set pad token if missing
+
+ # Load model with memory optimizations
+ model = AutoModelForCausalLM.from_pretrained(
+     MODEL_NAME,
+     torch_dtype=torch.bfloat16,   # Efficient precision
+     device_map="auto",            # Auto-distribute across GPU/CPU
+     low_cpu_mem_usage=True        # Minimize RAM usage
+ )
+
+ # Load dataset (local or predefined)
+ # Example: local text files; replace with your paths
+ dataset = load_dataset("text", data_files={"train": "train.txt", "validation": "val.txt"})
+ # Or use a Hugging Face dataset locally: dataset = load_dataset("wikitext", "wikitext-2-raw-v1")
+
+ # Tokenize dataset
+ def tokenize_function(examples):
+     tokenized = tokenizer(
+         examples["text"],
+         padding="max_length",
+         truncation=True,
+         max_length=512,           # Adjustable sequence length
+         return_tensors="pt"
+     )
+     tokenized["labels"] = tokenized["input_ids"].clone()  # Causal LM needs labels
+     return tokenized
+
+ tokenized_datasets = dataset.map(
+     tokenize_function,
+     batched=True,
+     remove_columns=["text"]       # Save memory
+ )
+
+ # Split dataset
+ train_dataset = tokenized_datasets["train"]
+ eval_dataset = tokenized_datasets["validation"]
+
+ # Define training arguments
+ training_args = TrainingArguments(
+     output_dir=OUTPUT_DIR,
+     evaluation_strategy="epoch",      # Eval each epoch
+     per_device_train_batch_size=2,    # Adjust for your GPU
+     per_device_eval_batch_size=2,
+     num_train_epochs=3,               # Default; tweak as needed
+     learning_rate=2e-5,               # Safe for fine-tuning
+     weight_decay=0.01,                # Regularization
+     gradient_accumulation_steps=4,    # Effective batch size = 8
+     bf16=True,                        # Matches bfloat16 dtype
+     fp16=False,                       # Avoid if using bf16
+     save_strategy="epoch",            # Save each epoch
+     save_total_limit=2,               # Keep 2 latest checkpoints
+     logging_dir="./logs",
+     logging_steps=10,
+     load_best_model_at_end=True,      # Load best based on eval loss
+     metric_for_best_model="loss",
+     report_to="none"                  # No external logging
+ )
+
+ # Initialize Trainer
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     train_dataset=train_dataset,
+     eval_dataset=eval_dataset,
+ )
+
+ # Train the model
+ trainer.train()
+
+ # Save locally
+ trainer.save_model(OUTPUT_DIR)
+ tokenizer.save_pretrained(OUTPUT_DIR)
+
+ # Clean up memory
+ del model
+ torch.cuda.empty_cache()
+ print(f"Model and tokenizer saved to {OUTPUT_DIR}")
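
Once the script finishes, the fine-tuned weights and tokenizer live in ./mixtral_finetuned. A minimal sketch of how that output directory could be reloaded for generation afterwards, assuming the same transformers API; this is not part of the commit, and the prompt text and generation settings below are illustrative assumptions:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

OUTPUT_DIR = "./mixtral_finetuned"  # directory written by trainer.save_model() in train.py

# Reload the saved checkpoint with the same precision/placement settings used for training
tokenizer = AutoTokenizer.from_pretrained(OUTPUT_DIR)
model = AutoModelForCausalLM.from_pretrained(
    OUTPUT_DIR,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# Illustrative prompt; replace with any instruction-style text
prompt = "Explain what fine-tuning does in one sentence."
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=64)

print(tokenizer.decode(output_ids[0], skip_special_tokens=True))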