Update app.py
app.py CHANGED
@@ -141,57 +141,59 @@ def load_model():
         bnb_4bit_use_double_quant=True,
     )
 
-    #
-
-
-
-
-
-
-        print("DeepSpeed not available, falling back to standard distribution")
+    # For 4-bit training, we need to load on a single device
+    # Choose GPU with most available memory
+    free_memory = []
+    for i in range(torch.cuda.device_count()):
+        total_memory = torch.cuda.get_device_properties(i).total_memory
+        reserved_memory = torch.cuda.memory_reserved(i)
+        free_memory.append((total_memory - reserved_memory) / 1e9)  # Convert to GB
 
-    #
-
-
-    max_memory["cpu"] = "32GB"
+    # Choose the GPU with the most free memory
+    target_gpu = free_memory.index(max(free_memory))
+    print(f"Loading model on GPU {target_gpu} with {free_memory[target_gpu]:.2f}GB free memory")
 
-
+    # Use target GPU for model loading (crucial for 4-bit training)
+    device_map = {'': target_gpu}
 
-    # Load model
+    # Load model on the single target GPU
     model = AutoModelForCausalLM.from_pretrained(
         hf_model_repo_id,
         quantization_config=bnb_config,
-        device_map=
-        max_memory=max_memory,
+        device_map=device_map,  # Place entire model on one GPU
         trust_remote_code=True,
         use_cache=False,
         torch_dtype=torch.float16,
         low_cpu_mem_usage=True,
     )
-    print(f"Loaded model vocab size: {model.config.vocab_size}")
-    print(f"Input embedding shape: {model.get_input_embeddings().weight.shape}")
 
-    #
-
+    # Add print statement to check which device the model is on
+    print(f"Model loaded on device: {next(model.parameters()).device}")
 
+    # Continue with the LoRA config as before
+    print(f"Loaded model vocab size: {model.get_input_embeddings().weight.shape[0]}")
+    print(f"Input embedding shape: {model.get_input_embeddings().weight.shape}")
+
+    # --- Configure PEFT/LoRA ---
     lora_config = LoraConfig(
-
-        r=16,
+        r=16,  # rank
         lora_alpha=32,
         lora_dropout=0.05,
         bias="none",
-
+        task_type=TaskType.CAUSAL_LM,
+        target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
     )
 
-
-
+    # Prepare model for k-bit training
+    model = prepare_model_for_kbit_training(model)
 
-    #
-
-
-
+    # Add LoRA adapters
+    model = get_peft_model(model, lora_config)
+
+    # Log number of trainable parameters
+    model.print_trainable_parameters()
 
-    return
+    return model
 
 def load_dataset():
     # --- Download the dataset repository files ---