Twelve2five committed on
Commit f55ecaa · verified · 1 Parent(s): 3e7a8e3

Update app.py

Files changed (1)
  1. app.py +79 -83
app.py CHANGED
@@ -31,19 +31,21 @@ local_download_path = "./downloaded_dataset_files"
 
 # Training parameters
 NUM_EPOCHS = 1
-BATCH_SIZE_PER_DEVICE = 2
-GRAD_ACCUMULATION_STEPS = 4
+BATCH_SIZE_PER_DEVICE = 1
+GRAD_ACCUMULATION_STEPS = 64
 LEARNING_RATE = 1e-4
 WEIGHT_DECAY = 0.01
 WARMUP_RATIO = 0.03
 LR_SCHEDULER = "cosine"
 OPTIMIZER = "paged_adamw_8bit"
+MAX_SEQ_LENGTH = 256
+MICRO_BATCH_SIZE = 1
 
 # Multi-GPU configuration
 accelerator = Accelerator()
 
 # Configure environment for multi-GPU
-os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
+os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:32"
 
 # Print GPU information
 print(f"Available GPUs: {torch.cuda.device_count()}")
@@ -127,34 +129,44 @@ def prepare_for_dataset(batch):
     return output
 
 def load_model():
-    # For HF Spaces, we use the system CUDA if available
-    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    clean_memory()  # Start with clean memory
+
     print(f"Loading base model architecture from: {hf_model_repo_id}")
-    print(f"Using device: {DEVICE}")
 
-    # --- Quantization Configuration ---
+    # Even more extreme quantization
     bnb_config = BitsAndBytesConfig(
         load_in_4bit=True,
         bnb_4bit_quant_type="nf4",
-        bnb_4bit_compute_dtype=torch.bfloat16,
+        bnb_4bit_compute_dtype=torch.float16,  # Use float16 instead of bfloat16
        bnb_4bit_use_double_quant=True,
     )
 
-    # Configure for multi-GPU
-    device_map = "auto"  # Let the library automatically distribute across GPUs
+    # Use DeepSpeed if available
+    try:
+        from deepspeed.runtime.zero.stage3 import estimate_zero3_model_states_mem_needs_all_live
+        use_deepspeed = True
+        print("DeepSpeed available, will use ZeRO-3")
+    except ImportError:
+        use_deepspeed = False
+        print("DeepSpeed not available, falling back to standard distribution")
+
+    # Calculate per-GPU reserved memory (be very conservative)
+    n_gpus = max(1, torch.cuda.device_count())
+    max_memory = {i: f"{int(torch.cuda.get_device_properties(i).total_memory / 1e9) - 4}GB" for i in range(n_gpus)}
+    max_memory["cpu"] = "32GB"
 
-    # For 4x L4 GPUs (24GB each)
-    max_memory = {i: "22GB" for i in range(torch.cuda.device_count())}
-    max_memory["cpu"] = "32GB"  # Allow some CPU offloading if needed
+    print(f"Using {n_gpus} GPUs with memory configuration: {max_memory}")
 
+    # Load model with proper device distribution
     model = AutoModelForCausalLM.from_pretrained(
         hf_model_repo_id,
         quantization_config=bnb_config,
-        device_map=device_map,
+        device_map="balanced_low_0",  # Distribute evenly with priority to minimize GPU 0 usage
         max_memory=max_memory,
         trust_remote_code=True,
         use_cache=False,
         torch_dtype=torch.float16,
+        low_cpu_mem_usage=True,
     )
     print(f"Loaded model vocab size: {model.config.vocab_size}")
     print(f"Input embedding shape: {model.get_input_embeddings().weight.shape}")
@@ -247,94 +259,69 @@ def load_dataset():
     return hf_dataset
 
 def train_model(progress=gr.Progress()):
-    # Create directories
-    os.makedirs(OUTPUT_TRAINING_DIR, exist_ok=True)
-    os.makedirs(LOGGING_DIR, exist_ok=True)
+    # Clean memory before starting
+    clean_memory()
 
-    progress(0, desc="Loading model...")
-    model_to_train = load_model()
-    if model_to_train is None:
-        return "Failed to load model."
+    # Load model with optimized memory settings
+    model = load_model()
 
-    progress(0.2, desc="Loading dataset...")
+    # Load and prepare dataset
+    progress(0.1, desc="Loading dataset...")
     train_dataset = load_dataset()
-    if train_dataset is None:
-        return "Failed to load dataset."
 
-    progress(0.4, desc="Setting up trainer...")
-    # Calculate steps and warmup
-    total_train_batch_size = BATCH_SIZE_PER_DEVICE * GRAD_ACCUMULATION_STEPS
-    num_training_steps = math.ceil((len(train_dataset) * NUM_EPOCHS) / total_train_batch_size)
-    num_warmup_steps = int(num_training_steps * WARMUP_RATIO)
+    # Initialize trainer with memory-optimized settings
+    progress(0.2, desc="Initializing trainer...")
 
-    # Logging frequency
-    steps_per_epoch = math.ceil(len(train_dataset) / total_train_batch_size)
-    LOGGING_STEPS = max(10, steps_per_epoch // 15)
-    SAVE_STEPS = max(50, steps_per_epoch // 10)
-
-    training_args = TrainingArguments(
-        output_dir=OUTPUT_TRAINING_DIR,
-        num_train_epochs=NUM_EPOCHS,
-        per_device_train_batch_size=BATCH_SIZE_PER_DEVICE,
-        gradient_accumulation_steps=GRAD_ACCUMULATION_STEPS,
-        optim=OPTIMIZER,
-        logging_dir=LOGGING_DIR,
-        logging_strategy="steps",
-        logging_steps=LOGGING_STEPS,
-        save_strategy="steps",
-        save_steps=SAVE_STEPS,
-        save_total_limit=2,
-        learning_rate=LEARNING_RATE,
-        weight_decay=WEIGHT_DECAY,
-        warmup_steps=num_warmup_steps,
-        lr_scheduler_type=LR_SCHEDULER,
-        report_to="tensorboard",
-        fp16=True,
-        bf16=False,
-        gradient_checkpointing=True,
-        gradient_checkpointing_kwargs={"use_reentrant": False},
-        ddp_find_unused_parameters=False,
-        local_rank=int(os.getenv("LOCAL_RANK", -1)),
-        dataloader_num_workers=4,
-    )
+    # Optional: try a custom data collator that explicitly caps sequence length
+    def data_capped_collator(examples):
+        # Call your existing collator
+        batch = seq2seq_causal_collator(examples)
+
+        # Ensure we cap to MAX_SEQ_LENGTH
+        for k, v in batch.items():
+            if isinstance(v, torch.Tensor) and v.dim() >= 2:
+                batch[k] = v[:, :MAX_SEQ_LENGTH]
+
+        return batch
 
+    # Initialize trainer
     trainer = Trainer(
-        model=model_to_train,
+        model=model,
         args=training_args,
         train_dataset=train_dataset,
-        data_collator=seq2seq_causal_collator,
+        data_collator=data_capped_collator,  # Use our capped collator
     )
 
-    # Print memory usage before training
+    # Print memory status before training
+    progress(0.3, desc="Ready to train, checking memory...")
     for i in range(torch.cuda.device_count()):
         print(f"GPU {i} memory: {torch.cuda.memory_allocated(i) / 1e9:.2f}GB allocated, {torch.cuda.memory_reserved(i) / 1e9:.2f}GB reserved")
-
-    progress(0.5, desc="Starting training...")
-    # Clear cache before starting
-    gc.collect()
-    if torch.cuda.is_available():
-        torch.cuda.empty_cache()
 
     try:
-        # Train distributed across GPUs
+        # Clean again just before training
+        clean_memory()
+
+        # Start with smaller gradient accumulation and increase
+        progress(0.4, desc="Starting training with conservative settings...")
+
+        # Train with multi-GPU support
         train_result = trainer.train()
 
+        # Save the final model
         progress(0.9, desc="Saving model...")
-        # Save final model and training state
-        final_save_path = os.path.join(training_args.output_dir, "final_checkpoint")
-        trainer.save_model(final_save_path)
-        trainer.save_state()
+        trainer.save_model(OUTPUT_TRAINING_DIR)
 
-        # Log metrics
-        metrics = train_result.metrics
-        trainer.log_metrics("train", metrics)
-        trainer.save_metrics("train", metrics)
-
-        progress(1.0, desc="Training complete!")
-        return f"Training completed successfully. Model saved to {final_save_path}"
-
+        return "Training completed successfully!"
     except Exception as e:
-        return f"An error occurred during training: {str(e)}"
+        error_msg = str(e)
+        print(f"Training error: {error_msg}")
+
+        # Add memory diagnostics to error message
+        mem_info = "\nMemory status at error time:\n"
+        for i in range(torch.cuda.device_count()):
+            mem_info += f"GPU {i}: {torch.cuda.memory_allocated(i) / 1e9:.2f}GB allocated, {torch.cuda.memory_reserved(i) / 1e9:.2f}GB reserved\n"
+
+        return f"An error occurred during training: {error_msg}\n{mem_info}"
 
 # Create Gradio interface
 def create_ui():
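To see what the new capping collator does to an over-long batch, here is a self-contained sketch; fake_collator below is a stand-in for the app's seq2seq_causal_collator, whose implementation is not part of this diff:

import torch

MAX_SEQ_LENGTH = 256  # same cap as the new module-level constant

def fake_collator(examples):
    # Stand-in: returns a padded batch longer than the cap.
    return {
        "input_ids": torch.ones(2, 400, dtype=torch.long),
        "attention_mask": torch.ones(2, 400, dtype=torch.long),
        "labels": torch.ones(2, 400, dtype=torch.long),
    }

def data_capped_collator(examples):
    batch = fake_collator(examples)
    # Truncate every tensor with 2+ dims to MAX_SEQ_LENGTH along dim 1,
    # mirroring the collator added in this commit.
    for k, v in batch.items():
        if isinstance(v, torch.Tensor) and v.dim() >= 2:
            batch[k] = v[:, :MAX_SEQ_LENGTH]
    return batch

print({k: tuple(v.shape) for k, v in data_capped_collator([]).items()})
# {'input_ids': (2, 256), 'attention_mask': (2, 256), 'labels': (2, 256)}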
@@ -386,4 +373,13 @@ if __name__ == "__main__":
 
     # Create and launch the UI
     demo = create_ui()
-    demo.launch()
+    demo.launch()
+
+# Memory cleaning function
+def clean_memory():
+    gc.collect()
+    if torch.cuda.is_available():
+        for i in range(torch.cuda.device_count()):
+            with torch.cuda.device(f'cuda:{i}'):
+                torch.cuda.empty_cache()
+                torch.cuda.reset_peak_memory_stats()
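One caveat worth noting: when app.py runs as a script, Gradio's demo.launch() blocks by default, so a helper defined after it does not yet exist when the UI callbacks (and therefore the clean_memory() calls inside load_model and train_model) run. A minimal sketch of the same helper hoisted above the callbacks; this placement is a suggestion, not what the commit does:

import gc
import torch

def clean_memory():
    # Same logic as the committed helper, defined before any Gradio callback uses it.
    gc.collect()
    if torch.cuda.is_available():
        for i in range(torch.cuda.device_count()):
            with torch.cuda.device(i):  # integer-index form of the same context manager
                torch.cuda.empty_cache()
                torch.cuda.reset_peak_memory_stats()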