Twelve2five committed on
Commit b2842e8 · verified · 1 Parent(s): 2784605

Update app.py

Files changed (1)
  1. app.py +54 -75
app.py CHANGED
@@ -280,94 +280,56 @@ def train_model(progress=gr.Progress()):
     progress(0.1, desc="Loading dataset...")
     train_dataset = load_dataset()

-    # Initialize trainer with memory-optimized settings
+    # Add verbose logging
+    import logging
+    logging.basicConfig(level=logging.INFO)
+
+    # Initialize trainer with debug flags
     progress(0.2, desc="Initializing trainer...")

-    # Setup DeepSpeed config if available
-    try:
-        from deepspeed.runtime.zero.stage3 import estimate_zero3_model_states_mem_needs_all_live
-        use_deepspeed = True
-        print("DeepSpeed available, will use ZeRO-3")
-
-        ds_config = {
-            "zero_optimization": {
-                "stage": 3,
-                "offload_optimizer": {
-                    "device": "cpu",
-                    "pin_memory": True
-                },
-                "offload_param": {
-                    "device": "cpu",
-                    "pin_memory": True
-                },
-                "overlap_comm": True,
-                "contiguous_gradients": True,
-                "reduce_bucket_size": 5e7,
-                "stage3_prefetch_bucket_size": 5e7,
-                "stage3_param_persistence_threshold": 1e5
-            },
-            "train_micro_batch_size_per_gpu": MICRO_BATCH_SIZE,
-            "gradient_accumulation_steps": GRAD_ACCUMULATION_STEPS,
-            "fp16": {"enabled": True},
-            "zero_allow_untested_optimizer": True,
-            "aio": {"block_size": 1048576, "queue_depth": 8, "thread_count": 1}
-        }
-    except ImportError:
-        use_deepspeed = False
-        print("DeepSpeed not available, falling back to standard distribution")
-        ds_config = None
-
-    # Define training arguments inside the function
+    from transformers import TrainingArguments
+
+    # Ensure we're using the simplest training setup for first success
     training_args = TrainingArguments(
         output_dir=OUTPUT_TRAINING_DIR,
         logging_dir=LOGGING_DIR,
-        num_train_epochs=NUM_EPOCHS,
-        per_device_train_batch_size=BATCH_SIZE_PER_DEVICE,
-        gradient_accumulation_steps=GRAD_ACCUMULATION_STEPS,
+        num_train_epochs=1,
+        per_device_train_batch_size=1,
+        gradient_accumulation_steps=16,  # Reduced for faster iterations
         learning_rate=LEARNING_RATE,
         weight_decay=WEIGHT_DECAY,
         warmup_ratio=WARMUP_RATIO,
         lr_scheduler_type=LR_SCHEDULER,
         report_to="tensorboard",
         fp16=True,
-        bf16=False,

-        # Memory optimization
-        optim="adamw_torch_fused",
-        gradient_checkpointing=True,
-        gradient_checkpointing_kwargs={"use_reentrant": False},
-
-        # Explicit model distribution
+        # Simplified training - disable fancy features
+        local_rank=-1,  # Disable distributed training for debugging
         ddp_find_unused_parameters=False,
-        deepspeed=ds_config if use_deepspeed else None,

-        # Other memory-saving settings
-        save_strategy="steps",
-        save_steps=50,
-        logging_steps=10,
-        dataloader_num_workers=0,  # Avoid extra memory usage with workers
-        group_by_length=True,  # Group samples of similar length
-        max_grad_norm=0.5,
+        deepspeed=None,
+
+        # More frequent logging to see progress
+        logging_steps=1,  # Log every step
+        save_strategy="no",  # Don't save during initial test
+
+        # Other settings
+        optim="adamw_torch",  # Use simpler optimizer
+        gradient_checkpointing=True,
+        gradient_checkpointing_kwargs={"use_reentrant": False},
+        dataloader_num_workers=0,
+        group_by_length=False,  # Disable grouping for debugging
+        max_grad_norm=1.0,
     )

-    # Optional: try a custom data collator that explicitly caps sequence length
-    def data_capped_collator(examples):
-        # Call your existing collator
-        batch = seq2seq_causal_collator(examples)
-
-        # Ensure we cap to MAX_SEQ_LENGTH
-        for k, v in batch.items():
-            if isinstance(v, torch.Tensor) and v.dim() >= 2:
-                batch[k] = v[:, :MAX_SEQ_LENGTH]
-
-        return batch
+    # Use a simpler data collator for testing
+    from transformers import default_data_collator

-    # Initialize trainer
+    # Initialize trainer with simplified settings
     trainer = Trainer(
         model=model,
         args=training_args,
         train_dataset=train_dataset,
-        data_collator=data_capped_collator,  # Use our capped collator
+        data_collator=default_data_collator,  # Use default collator for testing
     )

     # Print memory status before training
@@ -376,20 +338,37 @@ def train_model(progress=gr.Progress()):
         print(f"GPU {i} memory: {torch.cuda.memory_allocated(i) / 1e9:.2f}GB allocated, {torch.cuda.memory_reserved(i) / 1e9:.2f}GB reserved")

     try:
+        # Add a timeout mechanism
+        import signal
+
+        class TimeoutException(Exception):
+            pass
+
+        def timeout_handler(signum, frame):
+            raise TimeoutException("Training step is taking too long")
+
+        # Set 30-minute timeout for training (adjust as needed)
+        signal.signal(signal.SIGALRM, timeout_handler)
+        signal.alarm(1800)  # 30 minutes in seconds
+
         # Clean again just before training
         clean_memory()

-        # Start with smaller gradient accumulation and increase
-        progress(0.4, desc="Starting training with conservative settings...")
+        print("Starting training with verbose logging...")
+        progress(0.4, desc="Starting training (this may take a while for the first step)...")

-        # Train with multi-GPU support
-        train_result = trainer.train()
+        # Try training with only a few steps first to test
+        trainer.train(max_steps=3)  # Just try 3 steps to see if it works

-        # Save the final model
-        progress(0.9, desc="Saving model...")
-        trainer.save_model(OUTPUT_TRAINING_DIR)
+        # Cancel the alarm if training succeeds
+        signal.alarm(0)

-        return "Training completed successfully!"
+        progress(0.9, desc="Initial training successful! You can now run full training.")
+        return "Initial training completed successfully! The system is working. You can now adjust parameters for a full training run."
+
+    except TimeoutException as e:
+        return f"Training timed out: {str(e)}. Try reducing model parameters further or switching to a smaller model like LLaMA 3 3B."
+
     except Exception as e:
         error_msg = str(e)
         print(f"Training error: {error_msg}")