Twelve2five committed on
Commit b2842e8 · verified · 1 Parent(s): 2784605

Update app.py

Files changed (1)
  1. app.py +54 -75
app.py CHANGED
@@ -280,94 +280,56 @@ def train_model(progress=gr.Progress()):
     progress(0.1, desc="Loading dataset...")
     train_dataset = load_dataset()

-    # Initialize trainer with memory-optimized settings
+    # Add verbose logging
+    import logging
+    logging.basicConfig(level=logging.INFO)
+
+    # Initialize trainer with debug flags
     progress(0.2, desc="Initializing trainer...")

-    # Setup DeepSpeed config if available
-    try:
-        from deepspeed.runtime.zero.stage3 import estimate_zero3_model_states_mem_needs_all_live
-        use_deepspeed = True
-        print("DeepSpeed available, will use ZeRO-3")
-
-        ds_config = {
-            "zero_optimization": {
-                "stage": 3,
-                "offload_optimizer": {
-                    "device": "cpu",
-                    "pin_memory": True
-                },
-                "offload_param": {
-                    "device": "cpu",
-                    "pin_memory": True
-                },
-                "overlap_comm": True,
-                "contiguous_gradients": True,
-                "reduce_bucket_size": 5e7,
-                "stage3_prefetch_bucket_size": 5e7,
-                "stage3_param_persistence_threshold": 1e5
-            },
-            "train_micro_batch_size_per_gpu": MICRO_BATCH_SIZE,
-            "gradient_accumulation_steps": GRAD_ACCUMULATION_STEPS,
-            "fp16": {"enabled": True},
-            "zero_allow_untested_optimizer": True,
-            "aio": {"block_size": 1048576, "queue_depth": 8, "thread_count": 1}
-        }
-    except ImportError:
-        use_deepspeed = False
-        print("DeepSpeed not available, falling back to standard distribution")
-        ds_config = None
-
-    # Define training arguments inside the function
+    from transformers import TrainingArguments
+
+    # Ensure we're using the simplest training setup for first success
     training_args = TrainingArguments(
         output_dir=OUTPUT_TRAINING_DIR,
         logging_dir=LOGGING_DIR,
-        num_train_epochs=NUM_EPOCHS,
-        per_device_train_batch_size=BATCH_SIZE_PER_DEVICE,
-        gradient_accumulation_steps=GRAD_ACCUMULATION_STEPS,
+        num_train_epochs=1,
+        per_device_train_batch_size=1,
+        gradient_accumulation_steps=16,  # Reduced for faster iterations
         learning_rate=LEARNING_RATE,
         weight_decay=WEIGHT_DECAY,
         warmup_ratio=WARMUP_RATIO,
         lr_scheduler_type=LR_SCHEDULER,
         report_to="tensorboard",
         fp16=True,
-        bf16=False,

-        # Memory optimization
-        optim="adamw_torch_fused",
-        gradient_checkpointing=True,
-        gradient_checkpointing_kwargs={"use_reentrant": False},
-
-        # Explicit model distribution
+        # Simplified training - disable fancy features
+        local_rank=-1,  # Disable distributed training for debugging
         ddp_find_unused_parameters=False,
-        deepspeed=ds_config if use_deepspeed else None,

-        # Other memory-saving settings
-        save_strategy="steps",
-        save_steps=50,
-        logging_steps=10,
-        dataloader_num_workers=0,  # Avoid extra memory usage with workers
-        group_by_length=True,  # Group samples of similar length
-        max_grad_norm=0.5,
+        deepspeed=None,
+
+        # More frequent logging to see progress
+        logging_steps=1,  # Log every step
+        save_strategy="no",  # Don't save during initial test
+
+        # Other settings
+        optim="adamw_torch",  # Use simpler optimizer
+        gradient_checkpointing=True,
+        gradient_checkpointing_kwargs={"use_reentrant": False},
+        dataloader_num_workers=0,
+        group_by_length=False,  # Disable grouping for debugging
+        max_grad_norm=1.0,
     )

-    # Optional: try a custom data collator that explicitly caps sequence length
-    def data_capped_collator(examples):
-        # Call your existing collator
-        batch = seq2seq_causal_collator(examples)
-
-        # Ensure we cap to MAX_SEQ_LENGTH
-        for k, v in batch.items():
-            if isinstance(v, torch.Tensor) and v.dim() >= 2:
-                batch[k] = v[:, :MAX_SEQ_LENGTH]
-
-        return batch
+    # Use a simpler data collator for testing
+    from transformers import default_data_collator

-    # Initialize trainer
+    # Initialize trainer with simplified settings
     trainer = Trainer(
         model=model,
         args=training_args,
         train_dataset=train_dataset,
-        data_collator=data_capped_collator,  # Use our capped collator
+        data_collator=default_data_collator,  # Use default collator for testing
     )

     # Print memory status before training
@@ -376,20 +338,37 @@ def train_model(progress=gr.Progress()):
         print(f"GPU {i} memory: {torch.cuda.memory_allocated(i) / 1e9:.2f}GB allocated, {torch.cuda.memory_reserved(i) / 1e9:.2f}GB reserved")

     try:
+        # Add a timeout mechanism
+        import signal
+
+        class TimeoutException(Exception):
+            pass
+
+        def timeout_handler(signum, frame):
+            raise TimeoutException("Training step is taking too long")
+
+        # Set 30-minute timeout for training (adjust as needed)
+        signal.signal(signal.SIGALRM, timeout_handler)
+        signal.alarm(1800)  # 30 minutes in seconds
+
         # Clean again just before training
         clean_memory()

-        # Start with smaller gradient accumulation and increase
-        progress(0.4, desc="Starting training with conservative settings...")
+        print("Starting training with verbose logging...")
+        progress(0.4, desc="Starting training (this may take a while for the first step)...")

-        # Train with multi-GPU support
-        train_result = trainer.train()
+        # Try training with only a few steps first to test
+        trainer.train(max_steps=3)  # Just try 3 steps to see if it works

-        # Save the final model
-        progress(0.9, desc="Saving model...")
-        trainer.save_model(OUTPUT_TRAINING_DIR)
+        # Cancel the alarm if training succeeds
+        signal.alarm(0)

-        return "Training completed successfully!"
+        progress(0.9, desc="Initial training successful! You can now run full training.")
+        return "Initial training completed successfully! The system is working. You can now adjust parameters for a full training run."
+
+    except TimeoutException as e:
+        return f"Training timed out: {str(e)}. Try reducing model parameters further or switching to a smaller model like LLaMA 3 3B."
+
     except Exception as e:
         error_msg = str(e)
         print(f"Training error: {error_msg}")