Update app.py
app.py
CHANGED
@@ -281,6 +281,73 @@ def train_model(progress=gr.Progress()):
     # Initialize trainer with memory-optimized settings
     progress(0.2, desc="Initializing trainer...")

+    # Setup DeepSpeed config if available
+    try:
+        from deepspeed.runtime.zero.stage3 import estimate_zero3_model_states_mem_needs_all_live
+        use_deepspeed = True
+        print("DeepSpeed available, will use ZeRO-3")
+
+        ds_config = {
+            "zero_optimization": {
+                "stage": 3,
+                "offload_optimizer": {
+                    "device": "cpu",
+                    "pin_memory": True
+                },
+                "offload_param": {
+                    "device": "cpu",
+                    "pin_memory": True
+                },
+                "overlap_comm": True,
+                "contiguous_gradients": True,
+                "reduce_bucket_size": 5e7,
+                "stage3_prefetch_bucket_size": 5e7,
+                "stage3_param_persistence_threshold": 1e5
+            },
+            "train_micro_batch_size_per_gpu": MICRO_BATCH_SIZE,
+            "gradient_accumulation_steps": GRAD_ACCUMULATION_STEPS,
+            "fp16": {"enabled": True},
+            "zero_allow_untested_optimizer": True,
+            "aio": {"block_size": 1048576, "queue_depth": 8, "thread_count": 1}
+        }
+    except ImportError:
+        use_deepspeed = False
+        print("DeepSpeed not available, falling back to standard distribution")
+        ds_config = None
+
+    # Define training arguments inside the function
+    training_args = TrainingArguments(
+        output_dir=OUTPUT_TRAINING_DIR,
+        logging_dir=LOGGING_DIR,
+        num_train_epochs=NUM_EPOCHS,
+        per_device_train_batch_size=BATCH_SIZE_PER_DEVICE,
+        gradient_accumulation_steps=GRAD_ACCUMULATION_STEPS,
+        learning_rate=LEARNING_RATE,
+        weight_decay=WEIGHT_DECAY,
+        warmup_ratio=WARMUP_RATIO,
+        lr_scheduler_type=LR_SCHEDULER,
+        report_to="tensorboard",
+        fp16=True,
+        bf16=False,
+
+        # Memory optimization
+        optim="adamw_torch_fused",
+        gradient_checkpointing=True,
+        gradient_checkpointing_kwargs={"use_reentrant": False},
+
+        # Explicit model distribution
+        ddp_find_unused_parameters=False,
+        deepspeed=ds_config if use_deepspeed else None,
+
+        # Other memory-saving settings
+        save_strategy="steps",
+        save_steps=50,
+        logging_steps=10,
+        dataloader_num_workers=0,  # Avoid extra memory usage with workers
+        group_by_length=True,      # Group samples of similar length
+        max_grad_norm=0.5,
+    )
+
     # Optional: try a custom data collator that explicitly caps sequence length
     def data_capped_collator(examples):
         # Call your existing collator
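Note: the DeepSpeed import above pulls in estimate_zero3_model_states_mem_needs_all_live purely as an availability probe and never calls it. If you also want a quick read on whether ZeRO-3 with CPU offload fits the Space's memory, the helper can be called on the loaded model before training. A minimal sketch, assuming `model` is the same object later handed to Trainer and a single GPU:

    # Sketch: print ZeRO-3 memory-needs estimates for the already-loaded model.
    # Assumes `model` is the transformers model used by Trainer below.
    try:
        from deepspeed.runtime.zero.stage3 import (
            estimate_zero3_model_states_mem_needs_all_live,
        )
        estimate_zero3_model_states_mem_needs_all_live(
            model, num_gpus_per_node=1, num_nodes=1
        )
    except ImportError:
        pass  # DeepSpeed not installed; the try/except in the diff already covers this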
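The hunk ends just as data_capped_collator begins, so its body is not part of this diff. For reference, a length-capping collator along the lines the comment describes might look like the sketch below, where base_collator and MAX_SEQ_LENGTH are hypothetical names standing in for the Space's existing collator and its sequence cap:

    def data_capped_collator(examples):
        # Call the existing collator (hypothetical base_collator), then truncate
        # every 2-D tensor along the sequence dimension so no batch exceeds
        # MAX_SEQ_LENGTH tokens.
        batch = base_collator(examples)
        for key in ("input_ids", "attention_mask", "labels"):
            if key in batch and batch[key].dim() >= 2:
                batch[key] = batch[key][:, :MAX_SEQ_LENGTH]
        return batch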