Twelve2five committed
Commit c93ea92 · verified · 1 Parent(s): b2842e8

Update app.py

Files changed (1):
  app.py +28 -79
app.py CHANGED
@@ -280,95 +280,44 @@ def train_model(progress=gr.Progress()):
     progress(0.1, desc="Loading dataset...")
     train_dataset = load_dataset()
 
-    # Add verbose logging
-    import logging
-    logging.basicConfig(level=logging.INFO)
-
     # Initialize trainer with debug flags
     progress(0.2, desc="Initializing trainer...")
 
-    from transformers import TrainingArguments
-
-    # Ensure we're using the simplest training setup for first success
-    training_args = TrainingArguments(
-        output_dir=OUTPUT_TRAINING_DIR,
-        logging_dir=LOGGING_DIR,
-        num_train_epochs=1,
-        per_device_train_batch_size=1,
-        gradient_accumulation_steps=16,  # Reduced for faster iterations
-        learning_rate=LEARNING_RATE,
-        weight_decay=WEIGHT_DECAY,
-        warmup_ratio=WARMUP_RATIO,
-        lr_scheduler_type=LR_SCHEDULER,
-        report_to="tensorboard",
-        fp16=True,
-
-        # Simplified training - disable fancy features
-        local_rank=-1,  # Disable distributed training for debugging
-        ddp_find_unused_parameters=False,
-        deepspeed=None,
-
-        # More frequent logging to see progress
-        logging_steps=1,  # Log every step
-        save_strategy="no",  # Don't save during initial test
-
-        # Other settings
-        optim="adamw_torch",  # Use simpler optimizer
-        gradient_checkpointing=True,
-        gradient_checkpointing_kwargs={"use_reentrant": False},
-        dataloader_num_workers=0,
-        group_by_length=False,  # Disable grouping for debugging
-        max_grad_norm=1.0,
-    )
-
-    # Use a simpler data collator for testing
-    from transformers import default_data_collator
-
-    # Initialize trainer with simplified settings
-    trainer = Trainer(
-        model=model,
-        args=training_args,
-        train_dataset=train_dataset,
-        data_collator=default_data_collator,  # Use default collator for testing
-    )
-
-    # Print memory status before training
-    progress(0.3, desc="Ready to train, checking memory...")
-    for i in range(torch.cuda.device_count()):
-        print(f"GPU {i} memory: {torch.cuda.memory_allocated(i) / 1e9:.2f}GB allocated, {torch.cuda.memory_reserved(i) / 1e9:.2f}GB reserved")
-
     try:
-        # Add a timeout mechanism
-        import signal
-
-        class TimeoutException(Exception):
-            pass
-
-        def timeout_handler(signum, frame):
-            raise TimeoutException("Training step is taking too long")
-
-        # Set 30-minute timeout for training (adjust as needed)
-        signal.signal(signal.SIGALRM, timeout_handler)
-        signal.alarm(1800)  # 30 minutes in seconds
-
-        # Clean again just before training
-        clean_memory()
-
-        print("Starting training with verbose logging...")
-        progress(0.4, desc="Starting training (this may take a while for the first step)...")
+        # Set up training args with simplified settings
+        training_args = TrainingArguments(
+            output_dir="./results",
+            num_train_epochs=1,  # Just 1 epoch for testing
+            per_device_train_batch_size=1,  # Minimal batch size
+            gradient_accumulation_steps=4,  # Reduce memory pressure
+            warmup_steps=2,
+            logging_steps=1,  # Log every step
+            save_steps=10000,  # Don't save checkpoints during test
+            learning_rate=2e-4,
+            fp16=False,  # Disable mixed precision for stability
+            optim="adamw_torch",
+            report_to="none",  # Disable wandb/tensorboard reporting
+            max_steps=3,  # Just try 3 steps to see if it works
+            logging_first_step=True,  # Force log on first step
+        )
 
-        # Try training with only a few steps first to test
-        trainer.train(max_steps=3)  # Just try 3 steps to see if it works
+        # Create a simple trainer
+        trainer = Trainer(
+            model=model,
+            args=training_args,
+            train_dataset=train_dataset,
+            data_collator=transformers.DataCollatorForLanguageModeling(
+                tokenizer=None, mlm=False
+            )
+        )
 
-        # Cancel the alarm if training succeeds
-        signal.alarm(0)
+        # Run training for just 3 steps
+        progress(0.3, desc="Starting training (this may take 5-15 minutes for first step)...")
+        trainer.train()
 
         progress(0.9, desc="Initial training successful! You can now run full training.")
         return "Initial training completed successfully! The system is working. You can now adjust parameters for a full training run."
 
-    except TimeoutException as e:
-        return f"Training timed out: {str(e)}. Try reducing model parameters further or switching to a smaller model like LLaMA 3 3B."
-
     except Exception as e:
        error_msg = str(e)
        print(f"Training error: {error_msg}")
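For reference, a minimal standalone sketch of the smoke-test training path the added lines set up, assembled outside the diff. It assumes model, tokenizer, and train_dataset have already been prepared earlier in app.py; the explicit tokenizer argument is an assumption (the committed code passes None), since DataCollatorForLanguageModeling in transformers uses the tokenizer for padding:

from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Smoke-test configuration mirroring the committed settings:
# 3 optimizer steps, batch size 1, no checkpoints, no external reporting.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    warmup_steps=2,
    logging_steps=1,
    logging_first_step=True,
    save_steps=10000,
    learning_rate=2e-4,
    fp16=False,
    optim="adamw_torch",
    report_to="none",
    max_steps=3,
)

# Causal-LM collator (mlm=False); tokenizer is assumed to be the model's
# tokenizer loaded elsewhere in app.py, since the collator needs it for padding.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,                  # assumed: model loaded earlier in app.py
    args=training_args,
    train_dataset=train_dataset,  # assumed: output of load_dataset()
    data_collator=data_collator,
)

trainer.train()  # stops after 3 optimizer steps because max_steps=3

Note that max_steps is a TrainingArguments field rather than a parameter of trainer.train(), so the three-step cap belongs in the arguments object as shown.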