Twelve2five committed
Commit 26c97a9 · verified · 1 Parent(s): 9295d60

Update app.py

Files changed (1)
  1. app.py +277 -320
app.py CHANGED
@@ -1,23 +1,23 @@
- import os
  import torch
- import glob
- import gc
- from transformers import (
-     AutoModelForCausalLM,
-     AutoTokenizer,
-     BitsAndBytesConfig,
-     TrainingArguments,
-     Trainer,
-     DataCollatorForLanguageModeling,
-     AutoTokenizer
- )
  from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training
  from datasets import Dataset
  from huggingface_hub import snapshot_download
  from tqdm import tqdm
- import gradio as gr
- import math
- from accelerate import Accelerator

  # --- Configuration ---
  YOUR_HF_USERNAME = "Twelve2five"
@@ -27,75 +27,155 @@ DATASET_REPO_NAME = "podcast-dialogue-rvq-pairs-3items"
  hf_model_repo_id = f"{YOUR_HF_USERNAME}/{MODEL_REPO_NAME}"
  hf_dataset_repo_id = f"{YOUR_HF_USERNAME}/{DATASET_REPO_NAME}"

- # Output directories
- OUTPUT_TRAINING_DIR = "./llama3-8b-rvq-qlora-finetuned-run"
- LOGGING_DIR = "./llama3-8b-rvq-qlora-logs-run"
  local_download_path = "./downloaded_dataset_files"

- # Training parameters
- NUM_EPOCHS = 1
- BATCH_SIZE_PER_DEVICE = 1
- GRAD_ACCUMULATION_STEPS = 64
- LEARNING_RATE = 1e-4
- WEIGHT_DECAY = 0.01
- WARMUP_RATIO = 0.03
- LR_SCHEDULER = "cosine"
- OPTIMIZER = "paged_adamw_8bit"
- MAX_SEQ_LENGTH = 256
- MICRO_BATCH_SIZE = 1

- # Multi-GPU configuration
- accelerator = Accelerator()

- # Configure environment for multi-GPU
- os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:32"

- # Print GPU information
- print(f"Available GPUs: {torch.cuda.device_count()}")
- for i in range(torch.cuda.device_count()):
-     print(f"GPU {i}: {torch.cuda.get_device_name(i)} with {torch.cuda.get_device_properties(i).total_memory / 1e9:.2f} GB")

  def seq2seq_causal_collator(features):
-     """
-     Collator that concatenates context (input_ids) and target (labels)
-     for Causal LM sequence-to-sequence training.
-     Masks the loss for the context part of the sequence.
-     Pads sequences to the maximum length in the batch.
-     """
      batch = {}
      concatenated_input_ids = []
      concatenated_labels = []
      max_len = 0

-     # --- First pass: Concatenate, create masked labels, find max length ---
      for feature in features:
-         # Dataset transform should provide tensors here
          input_ids = feature['input_ids']
          labels = feature['labels']

-         # Ensure tensors are 1D (handle potential extra dims if any)
          if input_ids.dim() > 1: input_ids = input_ids.squeeze()
          if labels.dim() > 1: labels = labels.squeeze()

          context_len = input_ids.shape[0]
          target_len = labels.shape[0]

-         # Concatenate context and target for input
          combined_ids = torch.cat([input_ids, labels], dim=0)
          concatenated_input_ids.append(combined_ids)

-         # Create labels: -100 for context, actual labels for target
          masked_labels = torch.cat([
              torch.full((context_len,), -100, dtype=torch.long, device=input_ids.device),
              labels
          ], dim=0)
          concatenated_labels.append(masked_labels)

-         # Track max length for padding
          if combined_ids.shape[0] > max_len:
              max_len = combined_ids.shape[0]

-     # --- Second pass: Pad to max length ---
      padded_input_ids = []
      padded_labels = []
      input_pad_token_id = 0
@@ -107,7 +187,6 @@ def seq2seq_causal_collator(features):

          padding_len = max_len - ids.shape[0]

-         # Pad on the right side
          padded_input_ids.append(torch.nn.functional.pad(
              ids, (0, padding_len), value=input_pad_token_id
          ))
@@ -115,281 +194,159 @@ def seq2seq_causal_collator(features):
              lbls, (0, padding_len), value=label_pad_token_id
          ))

-     # --- Stack and create final batch ---
      batch['input_ids'] = torch.stack(padded_input_ids)
      batch['labels'] = torch.stack(padded_labels)
-
-     # Create attention mask (1 for real tokens, 0 for padding)
      batch['attention_mask'] = batch['input_ids'].ne(input_pad_token_id).long()

      return batch

- def prepare_for_dataset(batch):
-     output = {'input_ids': [], 'labels': []}
-     for item in batch:
-         output['input_ids'].append(item['input_ids'].cpu().tolist())
-         output['labels'].append(item['labels'].cpu().tolist())
-     return output

- def load_model():
-     print(f"Loading base model architecture from: {hf_model_repo_id}")
-
-     # Get information about GPU with most free memory
-     gpu_id = 0 # Default to first GPU
-     max_free_memory = 0
-
-     for i in range(torch.cuda.device_count()):
-         free_memory = torch.cuda.get_device_properties(i).total_memory - torch.cuda.memory_allocated(i)
-         if free_memory > max_free_memory:
-             max_free_memory = free_memory
-             gpu_id = i
-
-     print(f"Loading model on GPU {gpu_id} with {max_free_memory / 1e9:.2f}GB free memory")
-
-     # Configure quantization
-     bnb_config = BitsAndBytesConfig(
-         load_in_4bit=True,
-         bnb_4bit_use_double_quant=True,
-         bnb_4bit_quant_type="nf4",
-         bnb_4bit_compute_dtype=torch.bfloat16
-     )
-
-     # Load the model
-     model = AutoModelForCausalLM.from_pretrained(
-         hf_model_repo_id,
-         quantization_config=bnb_config,
-         device_map={"": gpu_id},
-         torch_dtype=torch.bfloat16,
-     )
-
-     print(f"Model loaded on device: cuda:{gpu_id}")
-
-     # Load the official Meta tokenizer for LLaMA 3
-     tokenizer = AutoTokenizer.from_pretrained(
-         "meta-llama/Llama-3-8B", # Use the official Meta tokenizer
-         use_auth_token=os.environ.get("HF_TOKEN", None) # In case it's needed
-     )
-
-     if tokenizer is None:
-         # Fallback to another common foundation model tokenizer
-         print("Falling back to another tokenizer as Meta tokenizer requires auth token")
-         tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
-
-     print(f"Loaded tokenizer vocabulary size: {len(tokenizer)}")
-
-     # Print information about input embeddings
-     print(f"Input embedding shape: {model.get_input_embeddings().weight.shape}")
-
-     # Prepare model for k-bit training
-     model = prepare_model_for_kbit_training(model)
-
-     # Define LoRA configuration
-     lora_config = LoraConfig(
-         r=16,
-         lora_alpha=32,
-         target_modules=[
-             "q_proj",
-             "k_proj",
-             "v_proj",
-             "o_proj",
-             "gate_proj",
-             "up_proj",
-             "down_proj",
-         ],
-         lora_dropout=0.05,
-         bias="none",
-         task_type=TaskType.CAUSAL_LM
-     )
-
-     # Apply LoRA to model
-     model = get_peft_model(model, lora_config)
-     model.print_trainable_parameters()
-
-     return model, tokenizer # Return both model and tokenizer

- def load_dataset():
-     # --- Download the dataset repository files ---
-     try:
-         os.makedirs(local_download_path, exist_ok=True)
-         downloaded_repo_root = snapshot_download(
-             repo_id=hf_dataset_repo_id,
-             repo_type="dataset",
-             local_dir=local_download_path,
-             local_dir_use_symlinks=False
-         )
-         print(f"Dataset repository content downloaded to: {downloaded_repo_root}")
-     except Exception as e:
-         print(f"Error downloading dataset: {e}")
-         return None
-
-     # --- Load .pt files into a Hugging Face Dataset object ---
-     pairs_dir = os.path.join(downloaded_repo_root, "final_rvq_pairs")
-     all_pair_files = glob.glob(os.path.join(pairs_dir, "*_rvq_pairs.pt"))
-
-     if not all_pair_files:
-         all_pair_files = glob.glob(os.path.join(downloaded_repo_root, "*_rvq_pairs.pt"))
-         if not all_pair_files:
-             print("No RVQ pair files found!")
-             return None
-
-     print(f"Found {len(all_pair_files)} RVQ pair files.")
-
-     # Load data from .pt files into memory
-     all_data_pairs = []
-     for file_path in tqdm(all_pair_files, desc="Loading pair files"):
-         try:
-             episode_pairs = torch.load(file_path, map_location='cpu')
-             all_data_pairs.extend(episode_pairs)
-         except Exception as e:
-             print(f"Warning: Could not load file {file_path}: {e}")
-
-     if not all_data_pairs:
-         return None
-
-     print(f"Loaded {len(all_data_pairs)} training pairs.")
-
-     # Convert to Hugging Face Dataset
-     chunk_size = 1000
-     processed_data = {'input_ids': [], 'labels': []}
-     for i in tqdm(range(0, len(all_data_pairs), chunk_size), desc="Preparing data"):
-         batch = all_data_pairs[i:i + chunk_size]
-         prepared_batch = prepare_for_dataset(batch)
-         processed_data['input_ids'].extend(prepared_batch['input_ids'])
-         processed_data['labels'].extend(prepared_batch['labels'])
-
-     hf_dataset = Dataset.from_dict(processed_data)
-
-     # Transform to get tensors back
-     hf_dataset.set_transform(lambda batch: {
-         'input_ids': [torch.tensor(ids, dtype=torch.long) for ids in batch['input_ids']],
-         'labels': [torch.tensor(lbls, dtype=torch.long) for lbls in batch['labels']]
-     })
-
-     # Cleanup
-     del all_data_pairs
-     del processed_data
-     gc.collect()
-
-     return hf_dataset
-
- # Memory cleaning function
- def clean_memory():
-     gc.collect()
-     if torch.cuda.is_available():
-         for i in range(torch.cuda.device_count()):
-             with torch.cuda.device(f'cuda:{i}'):
-                 torch.cuda.empty_cache()
-                 torch.cuda.reset_peak_memory_stats()
-
- def train_model(progress=gr.Progress()):
-     # Clean memory before starting
-     clean_memory()
-
-     # Load model with optimized memory settings
-     model, tokenizer = load_model()
-
-     # Load and prepare dataset
-     progress(0.1, desc="Loading dataset...")
-     train_dataset = load_dataset()
-
-     # Initialize trainer with debug flags
-     progress(0.2, desc="Initializing trainer...")
-
-     try:
-         # Set up training args with simplified settings
-         training_args = TrainingArguments(
-             output_dir="./results",
-             num_train_epochs=1, # Just 1 epoch for testing
-             per_device_train_batch_size=1, # Minimal batch size
-             gradient_accumulation_steps=4, # Reduce memory pressure
-             warmup_steps=2,
-             logging_steps=1, # Log every step
-             save_steps=10000, # Don't save checkpoints during test
-             learning_rate=2e-4,
-             fp16=False, # Disable mixed precision for stability
-             optim="adamw_torch",
-             report_to="none", # Disable wandb/tensorboard reporting
-             max_steps=3, # Just try 3 steps to see if it works
-             logging_first_step=True, # Force log on first step
-         )
-
-         # Create a simple trainer with the tokenizer
-         trainer = Trainer(
-             model=model,
-             args=training_args,
-             train_dataset=train_dataset,
-             data_collator=DataCollatorForLanguageModeling(
-                 tokenizer=tokenizer,
-                 mlm=False
-             )
-         )
-
-         # Run training for just 3 steps
-         progress(0.3, desc="Starting training (this may take 5-15 minutes for first step)...")
-         trainer.train()
-
-         progress(0.9, desc="Initial training successful! You can now run full training.")
-         return "Initial training completed successfully! The system is working. You can now adjust parameters for a full training run."
-
-     except Exception as e:
-         error_msg = str(e)
-         print(f"Training error: {error_msg}")
-
-         # Add memory diagnostics to error message
-         mem_info = "\nMemory status at error time:\n"
-         for i in range(torch.cuda.device_count()):
-             mem_info += f"GPU {i}: {torch.cuda.memory_allocated(i) / 1e9:.2f}GB allocated, {torch.cuda.memory_reserved(i) / 1e9:.2f}GB reserved\n"
-
-         return f"An error occurred during training: {error_msg}\n{mem_info}"
-
- # Create Gradio interface
- def create_ui():
-     with gr.Blocks() as demo:
-         gr.Markdown("# Fine-tune LLaMA 3 8B with QLoRA")
-
-         with gr.Tab("Training"):
-             train_button = gr.Button("Start Fine-tuning")
-             result_text = gr.Textbox(label="Training Results", interactive=False)
-
-             train_button.click(train_model, outputs=result_text)
-
-         with gr.Tab("About"):
-             gr.Markdown("""
-             ## Information
-             This is a Hugging Face Space version of the original Google Colab notebook.
-
-             It fine-tunes a quantized LLaMA 3 8B model using QLoRA on podcast dialogue data.
-
-             ### Model
-             - Base Model: {YOUR_HF_USERNAME}/{MODEL_REPO_NAME}
-             - Using 4-bit quantization with LoRA adapters
-
-             ### Dataset
-             - Custom dataset: {YOUR_HF_USERNAME}/{DATASET_REPO_NAME}
-             - Contains podcast dialogue pairs processed for training
-
-             ### Training Setup
-             - QLoRA fine-tuning
-             - Epochs: {NUM_EPOCHS}
-             - Batch size: {BATCH_SIZE_PER_DEVICE} with {GRAD_ACCUMULATION_STEPS} gradient accumulation steps
-             - Learning rate: {LEARNING_RATE}
-             """.format(
-                 YOUR_HF_USERNAME=YOUR_HF_USERNAME,
-                 MODEL_REPO_NAME=MODEL_REPO_NAME,
-                 DATASET_REPO_NAME=DATASET_REPO_NAME,
-                 NUM_EPOCHS=NUM_EPOCHS,
-                 BATCH_SIZE_PER_DEVICE=BATCH_SIZE_PER_DEVICE,
-                 GRAD_ACCUMULATION_STEPS=GRAD_ACCUMULATION_STEPS,
-                 LEARNING_RATE=LEARNING_RATE
-             ))
-
-     return demo

- # Main entry point
- if __name__ == "__main__":
-     # Install dependencies first if needed
-     # !pip install -q -U transformers accelerate bitsandbytes peft torch datasets huggingface_hub gradio
-
-     # Create and launch the UI
-     demo = create_ui()
-     demo.launch()
+ # -*- coding: utf-8 -*-
+ """
+ Script for fine-tuning Llama-3-8B with RVQ tokens on multiple GPUs
+ """
+
+ # Basic setup and installations
+ !pip install -q -U transformers accelerate bitsandbytes peft torch datasets huggingface_hub deepspeed
+
+ # No need for notebook_login on Hugging Face platform
+ # Authentication is handled automatically
+
  import torch
+ from transformers import AutoModelForCausalLM, BitsAndBytesConfig
  from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training
+ import gc
+ import os
  from datasets import Dataset
  from huggingface_hub import snapshot_download
+ import glob
  from tqdm import tqdm

  # --- Configuration ---
  YOUR_HF_USERNAME = "Twelve2five"
  hf_model_repo_id = f"{YOUR_HF_USERNAME}/{MODEL_REPO_NAME}"
  hf_dataset_repo_id = f"{YOUR_HF_USERNAME}/{DATASET_REPO_NAME}"

+ # Check if running on multiple GPUs
+ n_gpus = torch.cuda.device_count()
+ print(f"Number of GPUs available: {n_gpus}")
+
+ # --- Quantization Configuration ---
+ bnb_config = BitsAndBytesConfig(
+     load_in_4bit=True,
+     bnb_4bit_quant_type="nf4",
+     bnb_4bit_compute_dtype=torch.bfloat16,
+     bnb_4bit_use_double_quant=True,
+ )
+
+ # --- Load Base Model (with quantization) ---
+ try:
+     # For multi-GPU QLoRA, we'll use device_map="auto" and let DeepSpeed handle distribution later
+     model = AutoModelForCausalLM.from_pretrained(
+         hf_model_repo_id,
+         quantization_config=bnb_config,
+         device_map="auto", # Will be overridden by DeepSpeed
+         trust_remote_code=True
+     )
+     print(f"Loaded model vocab size: {model.config.vocab_size}")
+     print(f"Input embedding shape: {model.get_input_embeddings().weight.shape}")
+ except Exception as e:
+     print(f"Error loading model from Hub: {e}")
+     raise SystemExit("Model loading failed.")
+
+ # --- Prepare for K-bit Training & Apply LoRA ---
+ model = prepare_model_for_kbit_training(model)
+
+ lora_config = LoraConfig(
+     task_type=TaskType.CAUSAL_LM,
+     r=16,
+     lora_alpha=32,
+     lora_dropout=0.05,
+     bias="none",
+     target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
+ )
+ peft_model = get_peft_model(model, lora_config)
+ peft_model.print_trainable_parameters()
+ model_to_train = peft_model
+
+ # Cleanup
+ gc.collect()
+ if torch.cuda.is_available():
+     torch.cuda.empty_cache()
+
+ # --- Load Dataset from Hub ---
  local_download_path = "./downloaded_dataset_files"

+ try:
+     downloaded_repo_root = snapshot_download(
+         repo_id=hf_dataset_repo_id,
+         repo_type="dataset",
+         local_dir=local_download_path,
+         local_dir_use_symlinks=False
+     )
+     print(f"Dataset repository content downloaded to: {downloaded_repo_root}")
+ except Exception as e:
+     print(f"Error downloading dataset repository from Hub: {e}")
+     raise SystemExit("Dataset download failed.")
+
+ # --- Find and load the .pt files ---
+ pairs_dir = os.path.join(downloaded_repo_root, "final_rvq_pairs")
+ all_pair_files = glob.glob(os.path.join(pairs_dir, "*_rvq_pairs.pt"))
+
+ if not all_pair_files:
+     all_pair_files = glob.glob(os.path.join(downloaded_repo_root, "*_rvq_pairs.pt"))
+     if not all_pair_files:
+         raise FileNotFoundError(f"No RVQ pair files found in expected directories")
+
+ print(f"Found {len(all_pair_files)} RVQ pair files.")
+
+ # --- Load data from .pt files ---
+ all_data_pairs = []
+ for file_path in tqdm(all_pair_files, desc="Loading pair files"):
+     try:
+         episode_pairs = torch.load(file_path, map_location='cpu')
+         all_data_pairs.extend(episode_pairs)
+     except Exception as e:
+         print(f"Warning: Could not load file {file_path}: {e}")
+
+ if not all_data_pairs:
+     raise ValueError("No valid data pairs were loaded")
+
+ print(f"Loaded a total of {len(all_data_pairs)} training pairs into memory.")
+
+ # --- Convert to HF Dataset ---
+ def prepare_for_dataset(batch):
+     output = {'input_ids': [], 'labels': []}
+     for item in batch:
+         output['input_ids'].append(item['input_ids'].cpu().tolist())
+         output['labels'].append(item['labels'].cpu().tolist())
+     return output

+ chunk_size = 1000
+ processed_data = {'input_ids': [], 'labels': []}
+ for i in tqdm(range(0, len(all_data_pairs), chunk_size), desc="Preparing data for Dataset"):
+     batch = all_data_pairs[i:i + chunk_size]
+     prepared_batch = prepare_for_dataset(batch)
+     processed_data['input_ids'].extend(prepared_batch['input_ids'])
+     processed_data['labels'].extend(prepared_batch['labels'])

+ hf_dataset = Dataset.from_dict(processed_data)

+ # Transform to get tensors back
+ hf_dataset.set_transform(lambda batch: {
+     'input_ids': [torch.tensor(ids, dtype=torch.long) for ids in batch['input_ids']],
+     'labels': [torch.tensor(lbls, dtype=torch.long) for lbls in batch['labels']]
+ })

+ train_dataset = hf_dataset
+
+ # Cleanup
+ del all_data_pairs
+ del processed_data
+ gc.collect()
+
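The set_transform call above converts the stored lists back into tensors on access, so indexing train_dataset yields torch.Tensor columns, which is what the collator below expects. A minimal sanity check (a sketch, not part of this commit, assuming the script has run up to this point):

    sample = train_dataset[0]
    print(type(sample['input_ids']))   # <class 'torch.Tensor'>
    print(sample['input_ids'].dtype)   # torch.int64 (RVQ token ids)
    print(sample['labels'].shape)      # 1-D tensor of target token ids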
+ # --- Define Data Collator ---
  def seq2seq_causal_collator(features):
      batch = {}
      concatenated_input_ids = []
      concatenated_labels = []
      max_len = 0

+     # First pass: Concatenate, create masked labels, find max length
      for feature in features:
          input_ids = feature['input_ids']
          labels = feature['labels']

          if input_ids.dim() > 1: input_ids = input_ids.squeeze()
          if labels.dim() > 1: labels = labels.squeeze()

          context_len = input_ids.shape[0]
          target_len = labels.shape[0]

          combined_ids = torch.cat([input_ids, labels], dim=0)
          concatenated_input_ids.append(combined_ids)

          masked_labels = torch.cat([
              torch.full((context_len,), -100, dtype=torch.long, device=input_ids.device),
              labels
          ], dim=0)
          concatenated_labels.append(masked_labels)

          if combined_ids.shape[0] > max_len:
              max_len = combined_ids.shape[0]

+     # Second pass: Pad to max length
      padded_input_ids = []
      padded_labels = []
      input_pad_token_id = 0

          padding_len = max_len - ids.shape[0]

          padded_input_ids.append(torch.nn.functional.pad(
              ids, (0, padding_len), value=input_pad_token_id
          ))

              lbls, (0, padding_len), value=label_pad_token_id
          ))

+     # Stack and create final batch
      batch['input_ids'] = torch.stack(padded_input_ids)
      batch['labels'] = torch.stack(padded_labels)
      batch['attention_mask'] = batch['input_ids'].ne(input_pad_token_id).long()

      return batch

+ data_collator = seq2seq_causal_collator
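The collator above concatenates each context/target pair into a single sequence, masks the context positions with -100 so only the target tokens contribute to the loss, and right-pads the batch with 0 on the input side. A minimal usage sketch (not part of this commit; it assumes seq2seq_causal_collator from this file is in scope and that label_pad_token_id is -100 in the lines collapsed from this view):

    import torch

    # Two hypothetical RVQ-token pairs of different lengths
    features = [
        {'input_ids': torch.tensor([10, 11, 12]), 'labels': torch.tensor([20, 21])},
        {'input_ids': torch.tensor([10, 11]), 'labels': torch.tensor([20, 21, 22, 23])},
    ]

    batch = seq2seq_causal_collator(features)
    print(batch['input_ids'].shape)    # torch.Size([2, 6]) - padded to the longest pair
    print(batch['labels'][0])          # tensor([-100, -100, -100, 20, 21, -100]) - context and padding masked
    print(batch['attention_mask'][0])  # tensor([1, 1, 1, 1, 1, 0]) - the padded position is zeroed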

+ # --- Define Training Arguments and Initialize Trainer ---
+ from transformers import TrainingArguments, Trainer
+ import math

+ # Output directories
+ OUTPUT_TRAINING_DIR = "./llama3-8b-rvq-qlora-finetuned-run"
+ LOGGING_DIR = "./llama3-8b-rvq-qlora-logs-run"

+ # Training parameters - adjusted for 4x T4 GPUs
+ NUM_EPOCHS = 1
+ # Scale down per-device batch size since we have multiple GPUs now
+ BATCH_SIZE_PER_DEVICE = 1 # Smaller per-device batch size to avoid OOM
+ GRAD_ACCUMULATION_STEPS = 4
+ LEARNING_RATE = 1e-4
+ WEIGHT_DECAY = 0.01
+ WARMUP_RATIO = 0.03
+ LR_SCHEDULER = "cosine"
+ OPTIMIZER = "paged_adamw_8bit"
+
+ # Calculate total steps and warmup steps
+ # Total batch size is now batch_size × num_gpus × grad_accum_steps
+ total_train_batch_size = BATCH_SIZE_PER_DEVICE * n_gpus * GRAD_ACCUMULATION_STEPS
+ num_training_steps = math.ceil((len(train_dataset) * NUM_EPOCHS) / total_train_batch_size)
+ num_warmup_steps = int(num_training_steps * WARMUP_RATIO)
+
+ # Logging/Saving frequency
+ steps_per_epoch = math.ceil(len(train_dataset) / total_train_batch_size)
+ LOGGING_STEPS = max(10, steps_per_epoch // 15)
+ SAVE_STEPS = max(50, steps_per_epoch // 10)
+
+ print(f"Dataset size: {len(train_dataset)}")
+ print(f"Number of GPUs: {n_gpus}")
+ print(f"Batch size per device: {BATCH_SIZE_PER_DEVICE}")
+ print(f"Gradient Accumulation steps: {GRAD_ACCUMULATION_STEPS}")
+ print(f"Total train batch size (effective): {total_train_batch_size}")
+ print(f"Total optimization steps: {num_training_steps}")
+ print(f"Warmup steps: {num_warmup_steps}")
+
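To make the step arithmetic above concrete, a small worked example (a sketch; the dataset size here is a made-up placeholder, the real value is printed at runtime):

    import math

    dataset_len = 10_000                # hypothetical number of RVQ pairs
    n_gpus = 4                          # the comments above target a 4x T4 setup
    per_device_bs, grad_accum = 1, 4    # BATCH_SIZE_PER_DEVICE, GRAD_ACCUMULATION_STEPS

    effective_bs = per_device_bs * n_gpus * grad_accum   # 1 * 4 * 4 = 16
    steps = math.ceil(dataset_len / effective_bs)        # ceil(10000 / 16) = 625 steps for one epoch
    warmup = int(steps * 0.03)                           # 18 warmup steps
    print(effective_bs, steps, warmup)                   # 16 625 18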
+ # Configure for multi-GPU training using DeepSpeed
+ training_args = TrainingArguments(
+     output_dir=OUTPUT_TRAINING_DIR,
+     num_train_epochs=NUM_EPOCHS,
+     per_device_train_batch_size=BATCH_SIZE_PER_DEVICE,
+     gradient_accumulation_steps=GRAD_ACCUMULATION_STEPS,
+     optim=OPTIMIZER,
+     logging_dir=LOGGING_DIR,
+     logging_strategy="steps",
+     logging_steps=LOGGING_STEPS,
+     save_strategy="steps",
+     save_steps=SAVE_STEPS,
+     save_total_limit=2,
+     learning_rate=LEARNING_RATE,
+     weight_decay=WEIGHT_DECAY,
+     warmup_steps=num_warmup_steps,
+     lr_scheduler_type=LR_SCHEDULER,
+     report_to="tensorboard",
+     bf16=True if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else False,
+     gradient_checkpointing=True,
+     gradient_checkpointing_kwargs={'use_reentrant': False},
+
+     # Multi-GPU specific settings
+     deepspeed="ds_config.json", # We'll create this file below
+     ddp_find_unused_parameters=False,
+ )
+
+ # --- Create DeepSpeed configuration file ---
+ import json
+
+ # DeepSpeed ZeRO-3 config optimized for T4 GPUs
+ ds_config = {
+     "fp16": {
+         "enabled": "auto",
+         "loss_scale": 0,
+         "loss_scale_window": 1000,
+         "initial_scale_power": 16,
+         "hysteresis": 2,
+         "min_loss_scale": 1
+     },
+     "bf16": {
+         "enabled": "auto"
+     },
+     "zero_optimization": {
+         "stage": 3,
+         "offload_optimizer": {
+             "device": "cpu",
+             "pin_memory": True
+         },
+         "offload_param": {
+             "device": "cpu",
+             "pin_memory": True
+         },
+         "overlap_comm": True,
+         "contiguous_gradients": True,
+         "reduce_bucket_size": "auto",
+         "stage3_prefetch_bucket_size": "auto",
+         "stage3_param_persistence_threshold": "auto",
+         "gather_16bit_weights_on_model_save": True,
+         "stage3_max_live_parameters": 1e9,
+         "stage3_max_reuse_distance": 1e9
+     },
+     "gradient_accumulation_steps": GRAD_ACCUMULATION_STEPS,
+     "gradient_clipping": "auto",
+     "steps_per_print": 10,
+     "train_batch_size": "auto",
+     "train_micro_batch_size_per_gpu": "auto",
+     "wall_clock_breakdown": False
+ }
+
+ with open("ds_config.json", "w") as f:
+     json.dump(ds_config, f, indent=4)
+
+ # --- Initialize Trainer ---
+ trainer = Trainer(
+     model=model_to_train,
+     args=training_args,
+     train_dataset=train_dataset,
+     data_collator=data_collator,
+ )
+
+ print("Trainer initialized with DeepSpeed for multi-GPU training.")
+
+ # --- Start Training ---
+ # Clear cache before starting
+ gc.collect()
+ if torch.cuda.is_available():
+     torch.cuda.empty_cache()
+
+ try:
+     print("Starting distributed training on multiple GPUs...")
+     train_result = trainer.train()
+
+     # Save final model (adapter weights) and training state
+     final_save_path = os.path.join(training_args.output_dir, "final_checkpoint")
+     print(f"Saving final model checkpoint to {final_save_path}...")
+     trainer.save_model(final_save_path)
+     trainer.save_state()
+
+     # Log metrics
+     metrics = train_result.metrics
+     trainer.log_metrics("train", metrics)
+     trainer.save_metrics("train", metrics)
+
+ except Exception as e:
+     print(f"An error occurred during training: {e}")
+     raise e
+
+ print("Multi-GPU training process complete.")
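Because TrainingArguments receives deepspeed="ds_config.json", the script is expected to be started under a distributed launcher (for example deepspeed --num_gpus=4 app.py, or an equivalent torchrun invocation) so that each GPU gets its own process; the exact launch wiring depends on how the Space runs the file. After a successful run, the LoRA adapter saved under final_checkpoint can be reattached to the base model roughly as follows (a sketch, not part of the commit; the base repo id is a placeholder for the hf_model_repo_id built in the configuration section):

    import torch
    from transformers import AutoModelForCausalLM
    from peft import PeftModel

    BASE_REPO = "Twelve2five/<base-model-repo>"  # placeholder for hf_model_repo_id
    ADAPTER_DIR = "./llama3-8b-rvq-qlora-finetuned-run/final_checkpoint"

    base = AutoModelForCausalLM.from_pretrained(BASE_REPO, torch_dtype=torch.bfloat16, device_map="auto")
    model = PeftModel.from_pretrained(base, ADAPTER_DIR)  # attach the trained LoRA adapter
    model.eval()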