Update app.py

app.py CHANGED
@@ -14,6 +14,7 @@ from huggingface_hub import snapshot_download
 from tqdm import tqdm
 import gradio as gr
 import math
+from accelerate import Accelerator
 
 # --- Configuration ---
 YOUR_HF_USERNAME = "Twelve2five"
@@ -38,6 +39,17 @@ WARMUP_RATIO = 0.03
 LR_SCHEDULER = "cosine"
 OPTIMIZER = "paged_adamw_8bit"
 
+# Multi-GPU configuration
+accelerator = Accelerator()
+
+# Configure environment for multi-GPU
+os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
+
+# Print GPU information
+print(f"Available GPUs: {torch.cuda.device_count()}")
+for i in range(torch.cuda.device_count()):
+    print(f"GPU {i}: {torch.cuda.get_device_name(i)} with {torch.cuda.get_device_properties(i).total_memory / 1e9:.2f} GB")
+
 def seq2seq_causal_collator(features):
     """
     Collator that concatenates context (input_ids) and target (labels)
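Note: an Accelerator() created at import time only gives data-parallel training if the script is started by a multi-process launcher such as `accelerate launch`; a plain `python app.py` (the way a Space starts) stays single-process. A minimal sketch, assuming one process per GPU under such a launcher, that guards the GPU report so it prints once rather than once per rank:

    from accelerate import Accelerator
    import torch

    accelerator = Accelerator()

    # Only the main process reports, so the log is not duplicated per rank.
    if accelerator.is_main_process:
        print(f"Available GPUs: {torch.cuda.device_count()}")
        for i in range(torch.cuda.device_count()):
            props = torch.cuda.get_device_properties(i)
            print(f"GPU {i}: {props.name} with {props.total_memory / 1e9:.2f} GB")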
@@ -128,19 +140,24 @@ def load_model():
         bnb_4bit_use_double_quant=True,
     )
 
-    #
-
-
-
-
-
-
-
-
-
-
-
-
+    # Configure for multi-GPU
+    device_map = "auto"  # Let the library automatically distribute across GPUs
+
+    # For 4x L4 GPUs (24GB each)
+    max_memory = {i: "22GB" for i in range(torch.cuda.device_count())}
+    max_memory["cpu"] = "32GB"  # Allow some CPU offloading if needed
+
+    model = AutoModelForCausalLM.from_pretrained(
+        hf_model_repo_id,
+        quantization_config=bnb_config,
+        device_map=device_map,
+        max_memory=max_memory,
+        trust_remote_code=True,
+        use_cache=False,
+        torch_dtype=torch.float16,
+    )
+    print(f"Loaded model vocab size: {model.config.vocab_size}")
+    print(f"Input embedding shape: {model.get_input_embeddings().weight.shape}")
 
     # --- Prepare for K-bit Training & Apply LoRA ---
     model = prepare_model_for_kbit_training(model)
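With device_map="auto" plus per-device max_memory caps, Accelerate shards the quantized model across the visible GPUs (layer placement, not data parallelism) and offloads to CPU only when the caps are hit. A small sketch, assuming the model was loaded as above, for verifying where the layers actually landed:

    from collections import Counter

    # hf_device_map is populated by Accelerate whenever device_map is used.
    placement = Counter(str(d) for d in model.hf_device_map.values())
    for device, count in sorted(placement.items()):
        print(f"{device}: {count} modules")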
@@ -272,10 +289,13 @@ def train_model(progress=gr.Progress()):
         warmup_steps=num_warmup_steps,
         lr_scheduler_type=LR_SCHEDULER,
         report_to="tensorboard",
-        fp16=
-        bf16=
+        fp16=True,
+        bf16=False,
         gradient_checkpointing=True,
-        gradient_checkpointing_kwargs={
+        gradient_checkpointing_kwargs={"use_reentrant": False},
+        ddp_find_unused_parameters=False,
+        local_rank=int(os.getenv("LOCAL_RANK", -1)),
+        dataloader_num_workers=4,
     )
 
     trainer = Trainer(
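Hardcoding fp16=True works, but the L4 is an Ada-generation card that also supports bf16, which sidesteps fp16 loss scaling. A sketch of deriving the flags from hardware support instead (the dict name is illustrative); if bf16 is picked, the torch_dtype passed to from_pretrained above should match:

    import torch

    use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

    # Exactly one mixed-precision flag ends up enabled.
    precision_kwargs = {"bf16": use_bf16, "fp16": not use_bf16}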
@@ -285,6 +305,10 @@ def train_model(progress=gr.Progress()):
         data_collator=seq2seq_causal_collator,
     )
 
+    # Print memory usage before training
+    for i in range(torch.cuda.device_count()):
+        print(f"GPU {i} memory: {torch.cuda.memory_allocated(i) / 1e9:.2f}GB allocated, {torch.cuda.memory_reserved(i) / 1e9:.2f}GB reserved")
+
     progress(0.5, desc="Starting training...")
     # Clear cache before starting
     gc.collect()
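The same per-GPU memory report is just as useful after training, so it could be factored into a helper; a refactoring sketch, with an illustrative function name:

    import torch

    def report_gpu_memory(tag):
        # Allocated vs. reserved memory for every visible GPU.
        for i in range(torch.cuda.device_count()):
            allocated = torch.cuda.memory_allocated(i) / 1e9
            reserved = torch.cuda.memory_reserved(i) / 1e9
            print(f"[{tag}] GPU {i}: {allocated:.2f}GB allocated, {reserved:.2f}GB reserved")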
@@ -292,6 +316,7 @@ def train_model(progress=gr.Progress()):
         torch.cuda.empty_cache()
 
     try:
+        # Train distributed across GPUs
        train_result = trainer.train()
 
         progress(0.9, desc="Saving model...")
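On 24 GB cards the most likely failure inside this try block is an out-of-memory error; a sketch of catching it explicitly so the Gradio UI can surface a readable message instead of a crash (needs a PyTorch recent enough to expose torch.cuda.OutOfMemoryError; the message text is illustrative):

    import gc
    import torch

    try:
        train_result = trainer.train()
    except torch.cuda.OutOfMemoryError as oom:
        gc.collect()
        torch.cuda.empty_cache()
        raise RuntimeError(
            "CUDA out of memory: reduce the batch size or sequence length."
        ) from oom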
@@ -359,6 +384,6 @@ if __name__ == "__main__":
     # Install dependencies first if needed
     # !pip install -q -U transformers accelerate bitsandbytes peft torch datasets huggingface_hub gradio
 
-    # Create and launch the
+    # Create and launch the UI
     demo = create_ui()
-    demo.launch()
+    demo.launch()
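Not part of this change, but worth noting: trainer.train() runs far longer than a normal HTTP request, and Gradio's queue is the usual way to keep long jobs from timing out. A sketch (queue settings are illustrative):

    demo = create_ui()
    # Route long-running calls through Gradio's worker queue instead of a
    # plain request, so training does not hit connection timeouts.
    demo.queue().launch()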