Update app.py
app.py CHANGED
@@ -31,19 +31,21 @@ local_download_path = "./downloaded_dataset_files"
 
 # Training parameters
 NUM_EPOCHS = 1
-BATCH_SIZE_PER_DEVICE =
-GRAD_ACCUMULATION_STEPS =
+BATCH_SIZE_PER_DEVICE = 1
+GRAD_ACCUMULATION_STEPS = 64
 LEARNING_RATE = 1e-4
 WEIGHT_DECAY = 0.01
 WARMUP_RATIO = 0.03
 LR_SCHEDULER = "cosine"
 OPTIMIZER = "paged_adamw_8bit"
+MAX_SEQ_LENGTH = 256
+MICRO_BATCH_SIZE = 1
 
 # Multi-GPU configuration
 accelerator = Accelerator()
 
 # Configure environment for multi-GPU
-os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:
+os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:32"
 
 # Print GPU information
 print(f"Available GPUs: {torch.cuda.device_count()}")
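For reference, the `max_split_size_mb:32` setting asks PyTorch's caching allocator not to split cached blocks larger than 32 MB, a common knob for fragmentation-related OOMs. With BATCH_SIZE_PER_DEVICE at 1, the effective optimizer batch still depends on gradient accumulation and the number of GPUs. A minimal sketch, assuming only the constants introduced above and using torch.cuda.device_count() as a stand-in for the number of DDP processes Accelerate actually launches:

# Sketch only: how the new constants combine into an effective batch size.
# torch.cuda.device_count() stands in for the DDP world size here.
import torch

BATCH_SIZE_PER_DEVICE = 1
GRAD_ACCUMULATION_STEPS = 64
MAX_SEQ_LENGTH = 256

def effective_batch_size(num_devices: int) -> int:
    # Examples consumed per optimizer step across all devices.
    return BATCH_SIZE_PER_DEVICE * GRAD_ACCUMULATION_STEPS * num_devices

if __name__ == "__main__":
    n = max(1, torch.cuda.device_count())
    print(f"{n} device(s): effective batch size = {effective_batch_size(n)}")
    # With 2 GPUs this is 1 * 64 * 2 = 128 examples per optimizer step, while
    # peak activation memory is driven by the micro-batch of 1 and the
    # MAX_SEQ_LENGTH = 256 token cap.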
@@ -127,34 +129,44 @@ def prepare_for_dataset(batch):
     return output
 
 def load_model():
-    #
-
+    clean_memory()  # Start with clean memory
+
     print(f"Loading base model architecture from: {hf_model_repo_id}")
-    print(f"Using device: {DEVICE}")
 
-    #
+    # Even more extreme quantization
     bnb_config = BitsAndBytesConfig(
         load_in_4bit=True,
         bnb_4bit_quant_type="nf4",
-        bnb_4bit_compute_dtype=torch.bfloat16
+        bnb_4bit_compute_dtype=torch.float16,  # Use float16 instead of bfloat16
         bnb_4bit_use_double_quant=True,
     )
 
-    #
-
+    # Use DeepSpeed if available
+    try:
+        from deepspeed.runtime.zero.stage3 import estimate_zero3_model_states_mem_needs_all_live
+        use_deepspeed = True
+        print("DeepSpeed available, will use ZeRO-3")
+    except ImportError:
+        use_deepspeed = False
+        print("DeepSpeed not available, falling back to standard distribution")
+
+    # Calculate per-GPU reserved memory (be very conservative)
+    n_gpus = max(1, torch.cuda.device_count())
+    max_memory = {i: f"{int(torch.cuda.get_device_properties(i).total_memory / 1e9) - 4}GB" for i in range(n_gpus)}
+    max_memory["cpu"] = "32GB"
 
-
-    max_memory = {i: "22GB" for i in range(torch.cuda.device_count())}
-    max_memory["cpu"] = "32GB"  # Allow some CPU offloading if needed
+    print(f"Using {n_gpus} GPUs with memory configuration: {max_memory}")
 
+    # Load model with proper device distribution
     model = AutoModelForCausalLM.from_pretrained(
         hf_model_repo_id,
         quantization_config=bnb_config,
-        device_map=
+        device_map="balanced_low_0",  # Distribute evenly with priority to minimize GPU 0 usage
         max_memory=max_memory,
         trust_remote_code=True,
         use_cache=False,
         torch_dtype=torch.float16,
+        low_cpu_mem_usage=True,
     )
     print(f"Loaded model vocab size: {model.config.vocab_size}")
     print(f"Input embedding shape: {model.get_input_embeddings().weight.shape}")
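Two things are worth noting in this hunk: the use_deepspeed flag is set but, at least in the lines shown, nothing consumes it (with Trainer, DeepSpeed is normally wired in through TrainingArguments(deepspeed=...)); and the per-GPU budget leaves roughly 4 GB of headroom per card. Below is a standalone sketch of that budget-plus-4-bit-load pattern, not the app's code: MODEL_ID is a placeholder for hf_model_repo_id, and it assumes transformers, accelerate and bitsandbytes are installed.

# Sketch, not the app's code: rebuilds the max_memory budget and 4-bit load
# from the hunk above. MODEL_ID is a placeholder for hf_model_repo_id.
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

MODEL_ID = "org/base-model"  # placeholder

def build_max_memory(reserve_gb: int = 4) -> dict:
    # Leave reserve_gb of headroom per GPU for activations and allocator overhead,
    # and allow spill-over to 32GB of host RAM.
    budget = {
        i: f"{int(torch.cuda.get_device_properties(i).total_memory / 1e9) - reserve_gb}GB"
        for i in range(torch.cuda.device_count())
    }
    budget["cpu"] = "32GB"
    return budget

def load_4bit_model():
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )
    return AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        quantization_config=bnb_config,
        device_map="balanced_low_0",  # spread weights but keep GPU 0 the least loaded
        max_memory=build_max_memory(),
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
    )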
@@ -247,94 +259,69 @@ def load_dataset():
     return hf_dataset
 
 def train_model(progress=gr.Progress()):
-    #
-
-    os.makedirs(LOGGING_DIR, exist_ok=True)
+    # Clean memory before starting
+    clean_memory()
 
-
-
-    if model_to_train is None:
-        return "Failed to load model."
+    # Load model with optimized memory settings
+    model = load_model()
 
-
+    # Load and prepare dataset
+    progress(0.1, desc="Loading dataset...")
     train_dataset = load_dataset()
-    if train_dataset is None:
-        return "Failed to load dataset."
 
-
-
-    total_train_batch_size = BATCH_SIZE_PER_DEVICE * GRAD_ACCUMULATION_STEPS
-    num_training_steps = math.ceil((len(train_dataset) * NUM_EPOCHS) / total_train_batch_size)
-    num_warmup_steps = int(num_training_steps * WARMUP_RATIO)
+    # Initialize trainer with memory-optimized settings
+    progress(0.2, desc="Initializing trainer...")
 
-    #
-
-
-
-
-
-
-
-
-
-
-        logging_dir=LOGGING_DIR,
-        logging_strategy="steps",
-        logging_steps=LOGGING_STEPS,
-        save_strategy="steps",
-        save_steps=SAVE_STEPS,
-        save_total_limit=2,
-        learning_rate=LEARNING_RATE,
-        weight_decay=WEIGHT_DECAY,
-        warmup_steps=num_warmup_steps,
-        lr_scheduler_type=LR_SCHEDULER,
-        report_to="tensorboard",
-        fp16=True,
-        bf16=False,
-        gradient_checkpointing=True,
-        gradient_checkpointing_kwargs={"use_reentrant": False},
-        ddp_find_unused_parameters=False,
-        local_rank=int(os.getenv("LOCAL_RANK", -1)),
-        dataloader_num_workers=4,
-    )
+    # Optional: try a custom data collator that explicitly caps sequence length
+    def data_capped_collator(examples):
+        # Call your existing collator
+        batch = seq2seq_causal_collator(examples)
+
+        # Ensure we cap to MAX_SEQ_LENGTH
+        for k, v in batch.items():
+            if isinstance(v, torch.Tensor) and v.dim() >= 2:
+                batch[k] = v[:, :MAX_SEQ_LENGTH]
+
+        return batch
 
+    # Initialize trainer
     trainer = Trainer(
-        model=
+        model=model,
         args=training_args,
         train_dataset=train_dataset,
-        data_collator=
+        data_collator=data_capped_collator,  # Use our capped collator
     )
 
-    # Print memory
+    # Print memory status before training
+    progress(0.3, desc="Ready to train, checking memory...")
     for i in range(torch.cuda.device_count()):
         print(f"GPU {i} memory: {torch.cuda.memory_allocated(i) / 1e9:.2f}GB allocated, {torch.cuda.memory_reserved(i) / 1e9:.2f}GB reserved")
-
-    progress(0.5, desc="Starting training...")
-    # Clear cache before starting
-    gc.collect()
-    if torch.cuda.is_available():
-        torch.cuda.empty_cache()
 
     try:
-        #
+        # Clean again just before training
+        clean_memory()
+
+        # Start with smaller gradient accumulation and increase
+        progress(0.4, desc="Starting training with conservative settings...")
+
+        # Train with multi-GPU support
         train_result = trainer.train()
 
+        # Save the final model
         progress(0.9, desc="Saving model...")
-
-        final_save_path = os.path.join(training_args.output_dir, "final_checkpoint")
-        trainer.save_model(final_save_path)
-        trainer.save_state()
+        trainer.save_model(OUTPUT_TRAINING_DIR)
 
-
-        metrics = train_result.metrics
-        trainer.log_metrics("train", metrics)
-        trainer.save_metrics("train", metrics)
-
-        progress(1.0, desc="Training complete!")
-        return f"Training completed successfully. Model saved to {final_save_path}"
-
+        return "Training completed successfully!"
     except Exception as e:
-
+        error_msg = str(e)
+        print(f"Training error: {error_msg}")
+
+        # Add memory diagnostics to error message
+        mem_info = "\nMemory status at error time:\n"
+        for i in range(torch.cuda.device_count()):
+            mem_info += f"GPU {i}: {torch.cuda.memory_allocated(i) / 1e9:.2f}GB allocated, {torch.cuda.memory_reserved(i) / 1e9:.2f}GB reserved\n"
+
+        return f"An error occurred during training: {error_msg}\n{mem_info}"
 
 # Create Gradio interface
 def create_ui():
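One gap to flag in this hunk: the rewritten train_model still passes args=training_args to Trainer, but what appears to be the old TrainingArguments(...) construction is removed (its opening lines were truncated in the capture) and no replacement is visible here, so the new construction is presumably defined elsewhere in app.py. Purely as an illustration of a memory-lean configuration consistent with the new constants, not the commit's actual code, the *_DIR and *_STEPS values below being placeholders for names app.py defines elsewhere:

# Illustrative only; not the commit's code. Placeholder values are marked.
from transformers import TrainingArguments

NUM_EPOCHS = 1
BATCH_SIZE_PER_DEVICE = 1
GRAD_ACCUMULATION_STEPS = 64
LEARNING_RATE = 1e-4
WEIGHT_DECAY = 0.01
WARMUP_RATIO = 0.03
LR_SCHEDULER = "cosine"
OPTIMIZER = "paged_adamw_8bit"
OUTPUT_TRAINING_DIR = "./training_output"  # placeholder
LOGGING_DIR = "./training_logs"            # placeholder
LOGGING_STEPS = 10                         # placeholder
SAVE_STEPS = 200                           # placeholder

training_args = TrainingArguments(
    output_dir=OUTPUT_TRAINING_DIR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE_PER_DEVICE,
    gradient_accumulation_steps=GRAD_ACCUMULATION_STEPS,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    warmup_ratio=WARMUP_RATIO,
    lr_scheduler_type=LR_SCHEDULER,
    optim=OPTIMIZER,
    logging_dir=LOGGING_DIR,
    logging_steps=LOGGING_STEPS,
    save_strategy="steps",
    save_steps=SAVE_STEPS,
    save_total_limit=2,
    fp16=True,  # matches bnb_4bit_compute_dtype=torch.float16 in load_model
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},  # needs a recent transformers
    ddp_find_unused_parameters=False,
    dataloader_num_workers=4,
    report_to="tensorboard",
)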
@@ -386,4 +373,13 @@ if __name__ == "__main__":
 
     # Create and launch the UI
     demo = create_ui()
-    demo.launch()
+    demo.launch()
+
+# Memory cleaning function
+def clean_memory():
+    gc.collect()
+    if torch.cuda.is_available():
+        for i in range(torch.cuda.device_count()):
+            with torch.cuda.device(f'cuda:{i}'):
+                torch.cuda.empty_cache()
+                torch.cuda.reset_peak_memory_stats()
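A caution on this last hunk: clean_memory() is called from load_model() and train_model(), but its definition is appended after demo.launch(). Since launch() blocks by default, that definition will not have executed when a Gradio callback first calls clean_memory(), which would raise a NameError unless launch() is made non-blocking. A minimal sketch of the safer ordering, with a stub interface standing in for the app's create_ui():

# Sketch: define the memory helper before the UI launches. The Interface below
# is a stub standing in for the app's create_ui(); only the ordering is the point.
import gc
import torch
import gradio as gr

def clean_memory():
    # Free cached CUDA blocks and reset peak stats on every visible GPU.
    gc.collect()
    if torch.cuda.is_available():
        for i in range(torch.cuda.device_count()):
            with torch.cuda.device(f"cuda:{i}"):
                torch.cuda.empty_cache()
                torch.cuda.reset_peak_memory_stats()

def train_stub():
    clean_memory()  # already defined above, so the callback can always resolve it
    return "ok"

if __name__ == "__main__":
    demo = gr.Interface(fn=train_stub, inputs=None, outputs="text")
    demo.launch()  # blocks; code placed after this line runs only after shutdown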
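Finally, the sequence cap inside data_capped_collator from the train_model hunk can be exercised on its own. The dummy collator below stands in for the app's seq2seq_causal_collator, whose implementation is not shown in this diff; the slicing is the same [:, :MAX_SEQ_LENGTH] used above.

# Standalone check of the sequence-capping idea from data_capped_collator.
# dummy_collator is a stand-in for the app's seq2seq_causal_collator.
import torch

MAX_SEQ_LENGTH = 256  # same cap as the constant added at the top of app.py

def dummy_collator(examples):
    # Pad to the longest sequence in the batch, like a typical causal-LM collator.
    longest = max(len(e["input_ids"]) for e in examples)
    input_ids = torch.zeros(len(examples), longest, dtype=torch.long)
    for row, e in enumerate(examples):
        ids = torch.tensor(e["input_ids"], dtype=torch.long)
        input_ids[row, : len(ids)] = ids
    return {"input_ids": input_ids, "labels": input_ids.clone()}

def capped_collator(examples):
    batch = dummy_collator(examples)
    # Same slicing as the hunk: truncate every 2-D tensor to MAX_SEQ_LENGTH columns.
    for k, v in batch.items():
        if isinstance(v, torch.Tensor) and v.dim() >= 2:
            batch[k] = v[:, :MAX_SEQ_LENGTH]
    return batch

if __name__ == "__main__":
    examples = [{"input_ids": list(range(300))}, {"input_ids": list(range(40))}]
    batch = capped_collator(examples)
    print(batch["input_ids"].shape)  # torch.Size([2, 256]) -- capped, not 300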