GeminiFan207 committed
Commit 4a6099f · verified · 1 Parent(s): 030ed20

Update train.py

Files changed (1): train.py (+41 -30)
train.py CHANGED
@@ -4,25 +4,29 @@ from datasets import load_dataset
 import os
 
 # Model and tokenizer setup
-MODEL_NAME = "mistralai/Mixtral-8x7B-Instruct-v0.1" # Real Mixtral model
+MODEL_NAME = "mistralai/Mixtral-8x7B-Instruct-v0.1"
 OUTPUT_DIR = "./mixtral_finetuned"
 
 # Load tokenizer
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-tokenizer.pad_token = tokenizer.eos_token # Set pad token if missing
+if tokenizer.pad_token is None:
+    tokenizer.pad_token = tokenizer.eos_token # Fallback if undefined
+    tokenizer.pad_token_id = tokenizer.eos_token_id
 
-# Load model with memory optimizations
+# Load model with optimizations
 model = AutoModelForCausalLM.from_pretrained(
     MODEL_NAME,
-    torch_dtype=torch.bfloat16, # Efficient precision
-    device_map="auto", # Auto-distribute across GPU/CPU
-    low_cpu_mem_usage=True # Minimize RAM usage
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+    low_cpu_mem_usage=True
 )
 
-# Load dataset (local or predefined)
-# Example: local text files; replace with your paths
-dataset = load_dataset("text", data_files={"train": "train.txt", "validation": "val.txt"})
-# Or use a Hugging Face dataset locally: dataset = load_dataset("wikitext", "wikitext-2-raw-v1")
+# Load dataset (local text files)
+try:
+    dataset = load_dataset("text", data_files={"train": "train.txt", "validation": "val.txt"})
+except FileNotFoundError:
+    print("Error: train.txt or val.txt not found. Please provide valid files.")
+    exit(1)
 
 # Tokenize dataset
 def tokenize_function(examples):
@@ -30,41 +34,44 @@ def tokenize_function(examples):
         examples["text"],
         padding="max_length",
         truncation=True,
-        max_length=512, # Adjustable; matches earlier intent
+        max_length=512, # Adjust to 2048 or 4096 if needed
         return_tensors="pt"
     )
-    tokenized["labels"] = tokenized["input_ids"].clone() # Causal LM needs labels
+    tokenized["labels"] = tokenized["input_ids"].clone()
     return tokenized
 
 tokenized_datasets = dataset.map(
     tokenize_function,
     batched=True,
-    remove_columns=["text"] # Save memory
+    remove_columns=["text"]
 )
 
-# Split dataset
+# Split dataset with validation check
 train_dataset = tokenized_datasets["train"]
-eval_dataset = tokenized_datasets["validation"]
+eval_dataset = tokenized_datasets["validation"] if "validation" in tokenized_datasets else None
+if not train_dataset or (eval_dataset and len(eval_dataset) == 0):
+    print("Error: Empty training or validation dataset.")
+    exit(1)
 
 # Define training arguments
 training_args = TrainingArguments(
     output_dir=OUTPUT_DIR,
-    evaluation_strategy="epoch", # Eval each epoch
-    per_device_train_batch_size=2, # Adjust for your GPU
-    per_device_eval_batch_size=2,
-    num_train_epochs=3, # Default; tweak as needed
-    learning_rate=2e-5, # Safe for fine-tuning
-    weight_decay=0.01, # Regularization
-    gradient_accumulation_steps=4, # Effective batch size = 8
-    bf16=True, # Matches bfloat16 dtype
-    fp16=False, # Avoid if using bf16
-    save_strategy="epoch", # Save each epoch
-    save_total_limit=2, # Keep 2 latest checkpoints
+    evaluation_strategy="epoch" if eval_dataset else "no", # Skip eval if no validation
+    per_device_train_batch_size=1, # Lowered for smaller GPUs; adjust up if possible
+    per_device_eval_batch_size=1,
+    num_train_epochs=3,
+    learning_rate=2e-5,
+    weight_decay=0.01,
+    gradient_accumulation_steps=8, # Effective batch size = 8
+    bf16=True,
+    fp16=False,
+    save_strategy="epoch",
+    save_total_limit=2,
     logging_dir="./logs",
     logging_steps=10,
-    load_best_model_at_end=True, # Load best based on eval loss
+    load_best_model_at_end=bool(eval_dataset), # Only if eval exists
     metric_for_best_model="loss",
-    report_to="none" # No external logging
+    report_to="none"
 )
 
 # Initialize Trainer
@@ -76,13 +83,17 @@ trainer = Trainer(
 )
 
 # Train the model
-trainer.train()
+try:
+    trainer.train()
+except RuntimeError as e:
+    print(f"Training failed: {e} (Likely OOM—reduce batch size or max_length)")
+    exit(1)
 
 # Save locally
 trainer.save_model(OUTPUT_DIR)
 tokenizer.save_pretrained(OUTPUT_DIR)
 
-# Clean up memory
+# Clean up
 del model
 torch.cuda.empty_cache()
 print(f"Model and tokenizer saved to {OUTPUT_DIR}")