Twelve2five committed
Commit 1c688b1 · verified · 1 Parent(s): fd09ea6

Update app.py

Files changed (1):
  1. app.py +130 -233
app.py CHANGED
@@ -27,7 +27,7 @@ import shutil
 
  # --- Configuration ---
  YOUR_HF_USERNAME = "Twelve2five"
- MODEL_REPO_NAME = "llama-3-8b-rvq-resized"
  DATASET_REPO_NAME = "podcast-dialogue-rvq-pairs-3items"
 
  hf_model_repo_id = f"{YOUR_HF_USERNAME}/{MODEL_REPO_NAME}"
@@ -329,21 +329,14 @@ def train_model(
  model_repo_name,
  dataset_repo_name,
  epochs=1,
- batch_size=1,
- grad_accum_steps=16, # Increased from 8 to 16
- learning_rate=1e-4,
  progress=gr.Progress()
  ):
  progress(0, desc="Setting up environment...")
  log = []
 
- # Aggressive memory cleanup
- gc.collect()
- if torch.cuda.is_available():
- torch.cuda.empty_cache()
- # Reset peak memory stats
- torch.cuda.reset_peak_memory_stats()
-
  # Clean up any existing model files to save space
  if os.path.exists("./model_files"):
  try:
@@ -371,8 +364,8 @@ def train_model(
  from huggingface_hub import snapshot_download
  import torch
  import transformers
- from transformers import AutoModelForCausalLM, LlamaConfig, LlamaForCausalLM
- from transformers import BitsAndBytesConfig, TrainingArguments, Trainer, AutoTokenizer
  from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training
 
  log.append(f"Transformers version: {transformers.__version__}")
@@ -393,40 +386,65 @@ def train_model(
  n_gpus = torch.cuda.device_count()
  log.append(f"Number of GPUs available: {n_gpus}")
 
- # --- Load Base Model (with extreme quantization) ---
- progress(0.1, desc="Loading base model...")
- local_model_path = "./model_files"
  try:
- # Download the model files
  snapshot_download(
  repo_id=hf_model_repo_id,
  local_dir=local_model_path,
- local_dir_use_symlinks=False
  )
  log.append(f"Model files downloaded to {local_model_path}")
 
- # Ensure model_type is set correctly in the config
  config_path = os.path.join(local_model_path, "config.json")
  with open(config_path, "r") as f:
  config_data = json.load(f)
 
- model_type = config_data.get("model_type", "")
  log.append(f"Model architecture type: {model_type}")
 
- # Force model_type to be "llama" if it's not already
- if model_type != "llama":
- config_data["model_type"] = "llama"
- # Also ensure architectures is set correctly
- config_data["architectures"] = ["LlamaForCausalLM"]
- with open(config_path, "w") as f:
- json.dump(config_data, f, indent=2)
- log.append("Updated config.json to use llama model_type")
-
- # Load the config first
- config = LlamaConfig.from_pretrained(local_model_path)
- log.append(f"Successfully loaded config: {config.model_type}")
-
- # Use 4-bit quantization for extreme memory savings
  bnb_config = BitsAndBytesConfig(
  load_in_4bit=True,
  bnb_4bit_use_double_quant=True,
@@ -434,25 +452,28 @@ def train_model(
  bnb_4bit_compute_dtype=torch.bfloat16
  )
 
- # Load tokenizer first (needed for dataset preparation)
- tokenizer = AutoTokenizer.from_pretrained(local_model_path)
-
- # Explicit device map to enable CPU offloading
- max_memory = {0: "40GB", "cpu": "64GB"}
-
- # Load the model with extreme memory optimization
- model = LlamaForCausalLM.from_pretrained(
  local_model_path,
- config=config,
  quantization_config=bnb_config,
  device_map="auto",
- max_memory=max_memory,
- torch_dtype=torch.bfloat16,
- low_cpu_mem_usage=True
  )
 
- log.append(f"Loaded model vocab size: {model.config.vocab_size}")
- log.append(f"Input embedding shape: {model.get_input_embeddings().weight.shape}")
  except Exception as e:
  error_msg = f"Error loading model: {str(e)}"
  log.append(error_msg)
@@ -464,138 +485,77 @@ def train_model(
  model = prepare_model_for_kbit_training(model)
  log.append("Model prepared for k-bit training")
 
- # Use minimal LoRA configuration with fewer parameters
  lora_config = LoraConfig(
  task_type=TaskType.CAUSAL_LM,
- r=8, # Reduced from 16 to 8
- lora_alpha=16, # Reduced from 32 to 16
  lora_dropout=0.05,
  bias="none",
- # Target only key modules to reduce memory usage
- target_modules=["q_proj", "v_proj"] # Reduced target modules
  )
-
- # Apply LoRA
  peft_model = get_peft_model(model, lora_config)
  model_to_train = peft_model
- log.append("LoRA applied to model")
-
- # Free memory
- gc.collect()
- if torch.cuda.is_available():
- torch.cuda.empty_cache()
  except Exception as e:
  error_msg = f"Error preparing model for training: {str(e)}"
  log.append(error_msg)
  return "\n".join(log)
 
- # --- Download and Process Dataset ---
- progress(0.2, desc="Loading dataset...")
  try:
- # Download the dataset files
- dataset_dir = os.path.join(os.getcwd(), "downloaded_dataset_files")
  snapshot_download(
  repo_id=hf_dataset_repo_id,
- local_dir=dataset_dir,
- local_dir_use_symlinks=False
  )
- log.append(f"Dataset repository content downloaded to: {dataset_dir}")
 
- # Find all RVQ pair files
- rvq_pair_files = glob.glob(os.path.join(dataset_dir, "*_rvq_pairs.pt"))
  log.append(f"Found {len(rvq_pair_files)} RVQ pair files.")
 
- # Load training pairs from the dataset
- training_pairs = []
-
- # For memory conservation, use only half the dataset for now
- max_file_count = min(12, len(rvq_pair_files))
 
- for i, pair_file in enumerate(rvq_pair_files[:max_file_count]):
- try:
- pairs = torch.load(pair_file)
- training_pairs.extend(pairs)
- except Exception as e:
- log.append(f"Warning: Could not load {pair_file}: {e}")
 
- log.append(f"Loaded a total of {len(training_pairs)} training pairs into memory.")
 
- # Prepare dataset
- dataset = Dataset.from_dict({
- "input_ids": [pair[0].tolist() for pair in training_pairs],
- "labels": [pair[1].tolist() for pair in training_pairs]
- })
 
- # Clear the training_pairs to free memory
- training_pairs = None
- gc.collect()
- torch.cuda.empty_cache()
 
- # Use a smaller max_length to reduce memory pressure
- max_length = 512 # Reduced max sequence length
 
- # Create data collator that handles padding
- def data_collator(examples):
- # Convert lists back to tensors
- for i in range(len(examples)):
- examples[i]["input_ids"] = torch.tensor(examples[i]["input_ids"], dtype=torch.long)
- examples[i]["labels"] = torch.tensor(examples[i]["labels"], dtype=torch.long)
-
- # Get max length in this batch
- batch_max_length = min(
- max(len(example["input_ids"]) for example in examples),
- max_length
- )
-
- batch = {
- "input_ids": [],
- "attention_mask": [],
- "labels": []
- }
-
- # Prepare sequences
- for example in examples:
- input_ids = example["input_ids"][:batch_max_length]
- labels = example["labels"][:batch_max_length]
-
- # Pad sequences
- padding_length = batch_max_length - len(input_ids)
- attention_mask = torch.ones_like(input_ids)
-
- if padding_length > 0:
- padding = torch.ones(padding_length, dtype=input_ids.dtype) * tokenizer.pad_token_id
- input_ids = torch.cat([input_ids, padding])
- labels = torch.cat([labels, padding * -100]) # -100 to ignore in loss computation
- attention_mask = torch.cat([attention_mask, torch.zeros(padding_length, dtype=attention_mask.dtype)])
-
- batch["input_ids"].append(input_ids)
- batch["attention_mask"].append(attention_mask)
- batch["labels"].append(labels)
-
- # Convert lists to tensors
- for key in batch:
- batch[key] = torch.stack(batch[key])
-
- return batch
 
- # Convert to training dataset
- train_dataset = dataset
 
- # Free memory
- del dataset
- gc.collect()
- torch.cuda.empty_cache()
  except Exception as e:
  error_msg = f"Error loading dataset: {str(e)}"
  log.append(error_msg)
  return "\n".join(log)
 
  # --- Training Arguments ---
- progress(0.3, desc="Setting up training arguments...")
  output_dir = f"./results_{model_repo_name}"
  os.makedirs(output_dir, exist_ok=True)
 
- # Super-aggressive memory conservation
  training_args = TrainingArguments(
  output_dir=output_dir,
  num_train_epochs=float(epochs),
@@ -604,90 +564,43 @@ def train_model(
  learning_rate=learning_rate,
  weight_decay=0.01,
  logging_dir=f"{output_dir}/logs",
- logging_steps=1, # Log frequently to see progress
- save_steps=25, # Save checkpoints more frequently
- save_total_limit=1, # Keep only one checkpoint to save space
  remove_unused_columns=False,
  push_to_hub=False,
  disable_tqdm=False,
  warmup_ratio=0.03,
  lr_scheduler_type="cosine",
  report_to="tensorboard",
- bf16=True,
- fp16=False,
-
- # Memory optimization
  gradient_checkpointing=True,
  gradient_checkpointing_kwargs={'use_reentrant': False},
- max_grad_norm=0.3, # Reduced from default 1.0
- dataloader_pin_memory=False, # Reduce memory pressure
-
- # Optimizer settings for memory efficiency
- optim="adamw_torch",
- adam_beta1=0.9,
- adam_beta2=0.999,
- adam_epsilon=1e-8,
-
- # Evaluation settings
- do_eval=False,
- evaluation_strategy="no",
-
- # Set this for smaller chunks of data processing
- dataloader_num_workers=1,
-
- # For memory efficiency when loading datasets
- dataloader_drop_last=True,
  )
 
  # --- Initialize Trainer ---
- progress(0.4, desc="Initializing trainer...")
-
- # Use optimizer that requires less memory
- class MemoryEfficientTrainer(Trainer):
- def create_optimizer(self):
- # Create optimizer with reduced memory footprint
- optimizer = super().create_optimizer()
- # Force optimizer to use CPU offloading for states
- for param_group in optimizer.param_groups:
- for param in param_group['params']:
- if param.requires_grad:
- param.data = param.data.to("cpu")
- if param.grad is not None:
- param.grad.data = param.grad.data.to("cpu")
- return optimizer
-
- def training_step(self, *args, **kwargs):
- # Memory cleanup before each training step
- gc.collect()
- torch.cuda.empty_cache()
- return super().training_step(*args, **kwargs)
-
- trainer = MemoryEfficientTrainer(
  model=model_to_train,
  args=training_args,
  train_dataset=train_dataset,
  data_collator=data_collator,
  )
 
- log.append("Trainer initialized with memory-efficient settings")
 
  # --- Start Training ---
  try:
- # Final memory cleanup before training
- gc.collect()
- if torch.cuda.is_available():
- torch.cuda.empty_cache()
-
- progress(0.5, desc="Starting training...")
- log.append("Starting training with extreme memory optimization...")
-
- # Train in smaller chunks to manage memory better
- total_steps = len(train_dataset) // (batch_size * grad_accum_steps)
- log.append(f"Total training steps: {total_steps}")
-
- # Train the model
  train_result = trainer.train()
-
  progress(0.95, desc="Saving model...")
 
  # Save final model (adapter weights) and training state
@@ -703,49 +616,33 @@ def train_model(
 
  for key, value in metrics.items():
  log.append(f"{key}: {value}")
-
- # Print peak memory usage
- if torch.cuda.is_available():
- peak_memory = torch.cuda.max_memory_allocated() / (1024**3)
- log.append(f"Peak GPU memory usage: {peak_memory:.2f} GB")
 
  except Exception as e:
- error_msg = f"An error occurred during training: {str(e)}"
  log.append(error_msg)
-
- # Try to save checkpoint even if training failed
- try:
- # Save whatever we have
- log.append("Attempting to save partial checkpoint...")
- emergency_save_path = os.path.join(training_args.output_dir, "emergency_checkpoint")
- trainer.save_model(emergency_save_path)
- log.append(f"Saved emergency checkpoint to {emergency_save_path}")
- except Exception as save_error:
- log.append(f"Could not save emergency checkpoint: {save_error}")
-
  return "\n".join(log)
 
  progress(1.0, desc="Training complete!")
- log.append("Training process complete successfully.")
  return "\n".join(log)
 
  # Define the Gradio interface
  def create_interface():
- with gr.Blocks(title="Llama 3 8B RVQ Fine-tuning") as demo:
- gr.Markdown("# Llama 3 8B RVQ LoRA Fine-tuning")
- gr.Markdown("Fine-tune a Llama 3 8B model with RVQ token embeddings using LoRA with extreme memory optimization")
 
  with gr.Row():
  with gr.Column():
  hf_username = gr.Textbox(label="HuggingFace Username", value="Twelve2five")
- model_repo = gr.Textbox(label="Model Repository Name", value="llama-3-8b-rvq-resized")
  dataset_repo = gr.Textbox(label="Dataset Repository Name", value="podcast-dialogue-rvq-pairs-3items")
 
  with gr.Column():
- epochs = gr.Number(label="Number of Epochs", value=1, minimum=1, maximum=10)
- batch_size = gr.Number(label="Batch Size per Device", value=1, minimum=1, maximum=8)
- grad_accum = gr.Number(label="Gradient Accumulation Steps", value=16, minimum=8, maximum=32)
- lr = gr.Number(label="Learning Rate", value=1e-4)
 
  start_btn = gr.Button("Start Training")
  output = gr.Textbox(label="Training Log", lines=20)
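The removed dataset-loading code above treats each RVQ pair as a tuple (pair[0], pair[1]), while the updated side of the diff, which follows below, switches to dict-style pairs (pair["source"], pair["target"]). A small, hypothetical inspection snippet, assuming only the download directory and file pattern that appear in the diff, can confirm which layout a given shard actually contains before launching training:

# Hypothetical helper, not part of the commit: inspect one downloaded RVQ pair
# shard and report whether its items are tuples or dicts, and their shapes.
import glob
import torch

shard_paths = sorted(glob.glob("./downloaded_dataset_files/*_rvq_pairs.pt"))
if shard_paths:
    pairs = torch.load(shard_paths[0])
    print(f"{shard_paths[0]}: {len(pairs)} pairs, first element type: {type(pairs[0])}")
    first = pairs[0]
    if isinstance(first, dict):
        print({k: getattr(v, "shape", v) for k, v in first.items()})
    else:
        print([getattr(x, "shape", x) for x in first])
else:
    print("No *_rvq_pairs.pt shards found; run the snapshot_download step first.")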
 
 
  # --- Configuration ---
  YOUR_HF_USERNAME = "Twelve2five"
+ MODEL_REPO_NAME = "llama-3.2-1b-rvq"
  DATASET_REPO_NAME = "podcast-dialogue-rvq-pairs-3items"
 
  hf_model_repo_id = f"{YOUR_HF_USERNAME}/{MODEL_REPO_NAME}"
 
  model_repo_name,
  dataset_repo_name,
  epochs=1,
+ batch_size=2, # Increased batch size since model is smaller
+ grad_accum_steps=4,
+ learning_rate=2e-4, # Slightly higher learning rate for smaller model
  progress=gr.Progress()
  ):
  progress(0, desc="Setting up environment...")
  log = []
 
  # Clean up any existing model files to save space
  if os.path.exists("./model_files"):
  try:
 
  from huggingface_hub import snapshot_download
  import torch
  import transformers
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ from transformers import BitsAndBytesConfig, TrainingArguments, Trainer
  from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training
 
  log.append(f"Transformers version: {transformers.__version__}")
 
  n_gpus = torch.cuda.device_count()
  log.append(f"Number of GPUs available: {n_gpus}")
 
+ # --- DeepSpeed Configuration ---
+ # Create DeepSpeed config file
+ progress(0.1, desc="Setting up DeepSpeed configuration...")
+
+ # Create a conservative DeepSpeed config for the smaller model
+ ds_config = {
+ "fp16": {
+ "enabled": "auto",
+ "loss_scale": 0,
+ "loss_scale_window": 1000,
+ "initial_scale_power": 16,
+ "hysteresis": 2,
+ "min_loss_scale": 1
+ },
+ "bf16": {
+ "enabled": "auto"
+ },
+ "zero_optimization": {
+ "stage": 2, # Lower stage for smaller model
+ "offload_optimizer": {
+ "device": "cpu",
+ "pin_memory": True
+ },
+ "contiguous_gradients": True,
+ "overlap_comm": True
+ },
+ "gradient_accumulation_steps": grad_accum_steps,
+ "gradient_clipping": 1.0,
+ "train_batch_size": batch_size * grad_accum_steps * max(1, n_gpus)
+ }
+
+ ds_config_path = "ds_config.json"
+ with open(ds_config_path, "w") as f:
+ json.dump(ds_config, f, indent=4)
+
+ log.append("DeepSpeed configuration created successfully")
+
+ # --- Download and Load Model ---
+ progress(0.15, desc="Downloading model...")
+
  try:
+ # Download model files
+ local_model_path = "./model_files"
  snapshot_download(
  repo_id=hf_model_repo_id,
  local_dir=local_model_path,
  )
  log.append(f"Model files downloaded to {local_model_path}")
 
+ # First, load the config
  config_path = os.path.join(local_model_path, "config.json")
  with open(config_path, "r") as f:
  config_data = json.load(f)
 
+ # Check model architecture
+ model_type = config_data.get("model_type", "").lower()
  log.append(f"Model architecture type: {model_type}")
 
+ # Set 4-bit quantization config
  bnb_config = BitsAndBytesConfig(
  load_in_4bit=True,
  bnb_4bit_use_double_quant=True,
 
  bnb_4bit_compute_dtype=torch.bfloat16
  )
 
+ # Load the model with 4-bit quantization
+ model = AutoModelForCausalLM.from_pretrained(
  local_model_path,
  quantization_config=bnb_config,
  device_map="auto",
+ trust_remote_code=False,
+ use_cache=False # No KV caching during training
  )
 
+ # Load tokenizer
+ tokenizer = AutoTokenizer.from_pretrained(local_model_path)
+ if tokenizer.pad_token is None:
+ tokenizer.pad_token = tokenizer.eos_token
+
+ # Log model info
+ if hasattr(model, "config"):
+ log.append(f"Loaded model vocab size: {model.config.vocab_size}")
+ if hasattr(model, "get_input_embeddings"):
+ embedding = model.get_input_embeddings()
+ if hasattr(embedding, "weight"):
+ log.append(f"Input embedding shape: {embedding.weight.shape}")
+
  except Exception as e:
  error_msg = f"Error loading model: {str(e)}"
  log.append(error_msg)
 
  model = prepare_model_for_kbit_training(model)
  log.append("Model prepared for k-bit training")
 
+ # For Llama 3.2 1B the target modules might be slightly different
  lora_config = LoraConfig(
  task_type=TaskType.CAUSAL_LM,
+ r=8, # Reduced from 16 due to smaller model
+ lora_alpha=16, # Reduced from 32
  lora_dropout=0.05,
  bias="none",
+ target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
  )
  peft_model = get_peft_model(model, lora_config)
+ trainable_params = peft_model.print_trainable_parameters()
+ log.append(f"LoRA applied to model")
  model_to_train = peft_model
  except Exception as e:
  error_msg = f"Error preparing model for training: {str(e)}"
  log.append(error_msg)
  return "\n".join(log)
 
+ # --- Load and Prepare Dataset ---
+ progress(0.3, desc="Loading and preparing dataset...")
  try:
+ # Download dataset
+ dataset_path = "./downloaded_dataset_files"
  snapshot_download(
  repo_id=hf_dataset_repo_id,
+ local_dir=dataset_path,
  )
+ log.append(f"Dataset repository content downloaded to: {dataset_path}")
 
+ # Find all RVQ pairs files
+ rvq_pair_files = glob.glob(os.path.join(dataset_path, "*_rvq_pairs.pt"))
  log.append(f"Found {len(rvq_pair_files)} RVQ pair files.")
 
+ # Load the pytorch files
+ all_pairs = []
 
+ for file_path in rvq_pair_files:
+ pairs = torch.load(file_path)
+ all_pairs.extend(pairs)
 
+ # You may want to limit the data size for quicker testing
+ # all_pairs = all_pairs[:1000] # Uncomment to limit data size
 
+ log.append(f"Loaded a total of {len(all_pairs)} training pairs into memory.")
 
+ # Convert to HF dataset format
+ dataset_dict = {
+ "input_ids": [pair["source"] for pair in all_pairs],
+ "labels": [pair["target"] for pair in all_pairs]
+ }
 
+ train_dataset = Dataset.from_dict(dataset_dict)
 
+ # Create data collator for padding
+ from transformers import DataCollatorForLanguageModeling
 
+ data_collator = DataCollatorForLanguageModeling(
+ tokenizer=tokenizer,
+ mlm=False
+ )
 
  except Exception as e:
  error_msg = f"Error loading dataset: {str(e)}"
  log.append(error_msg)
  return "\n".join(log)
 
  # --- Training Arguments ---
+ progress(0.75, desc="Setting up training arguments...")
  output_dir = f"./results_{model_repo_name}"
  os.makedirs(output_dir, exist_ok=True)
 
  training_args = TrainingArguments(
  output_dir=output_dir,
  num_train_epochs=float(epochs),
 
  learning_rate=learning_rate,
  weight_decay=0.01,
  logging_dir=f"{output_dir}/logs",
+ logging_steps=10,
+ save_steps=100,
+ save_total_limit=3,
  remove_unused_columns=False,
  push_to_hub=False,
  disable_tqdm=False,
  warmup_ratio=0.03,
  lr_scheduler_type="cosine",
  report_to="tensorboard",
+ bf16=True if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else False,
  gradient_checkpointing=True,
  gradient_checkpointing_kwargs={'use_reentrant': False},
+ ddp_find_unused_parameters=False,
+ deepspeed=ds_config_path if n_gpus > 1 else None,
  )
 
  # --- Initialize Trainer ---
+ progress(0.8, desc="Initializing trainer...")
+ trainer = Trainer(
  model=model_to_train,
  args=training_args,
  train_dataset=train_dataset,
  data_collator=data_collator,
  )
 
+ log.append("Trainer initialized for training.")
 
  # --- Start Training ---
+ # Clear cache before starting
+ gc.collect()
+ if torch.cuda.is_available():
+ torch.cuda.empty_cache()
+
  try:
+ progress(0.85, desc="Starting training...")
+ log.append("Starting training...")
  train_result = trainer.train()
  progress(0.95, desc="Saving model...")
 
  # Save final model (adapter weights) and training state
 
 
  for key, value in metrics.items():
  log.append(f"{key}: {value}")
 
  except Exception as e:
+ error_msg = f"An error occurred during training: {e}"
  log.append(error_msg)
  return "\n".join(log)
 
  progress(1.0, desc="Training complete!")
+ log.append("Training process complete.")
  return "\n".join(log)
 
  # Define the Gradio interface
  def create_interface():
+ with gr.Blocks(title="Llama 3.2 1B RVQ Fine-tuning") as demo:
+ gr.Markdown("# Llama 3.2 1B RVQ LoRA Fine-tuning")
+ gr.Markdown("Fine-tune a Llama 3.2 1B model with RVQ token embeddings using LoRA")
 
  with gr.Row():
  with gr.Column():
  hf_username = gr.Textbox(label="HuggingFace Username", value="Twelve2five")
+ model_repo = gr.Textbox(label="Model Repository Name", value="llama-3.2-1b-rvq")
  dataset_repo = gr.Textbox(label="Dataset Repository Name", value="podcast-dialogue-rvq-pairs-3items")
 
  with gr.Column():
+ epochs = gr.Number(label="Number of Epochs", value=2, minimum=1, maximum=10)
+ batch_size = gr.Number(label="Batch Size per Device", value=2, minimum=1, maximum=8)
+ grad_accum = gr.Number(label="Gradient Accumulation Steps", value=4, minimum=1, maximum=16)
+ lr = gr.Number(label="Learning Rate", value=2e-4)
 
  start_btn = gr.Button("Start Training")
  output = gr.Textbox(label="Training Log", lines=20)
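
The commit also replaces the hand-written padding collator with DataCollatorForLanguageModeling(mlm=False). A minimal, hypothetical check of how that collator pads ragged sequences, using a stand-in tokenizer rather than the tokenizer the app downloads from the Hub:

# Hypothetical sanity check, not part of the commit: confirm that
# DataCollatorForLanguageModeling(mlm=False) pads ragged input_ids and
# marks the padded positions with -100 in the labels it derives.
from transformers import AutoTokenizer, DataCollatorForLanguageModeling

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # stand-in tokenizer for illustration only
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # mirrors the pad-token fallback added in this commit

collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
features = [
    {"input_ids": [11, 12, 13]},
    {"input_ids": [21, 22, 23, 24, 25]},
]
batch = collator(features)
print(batch["input_ids"].shape)             # torch.Size([2, 5]) after padding to the longest sequence
print(batch["labels"][0].tolist())          # [11, 12, 13, -100, -100]; padded positions are ignored by the loss
print(batch["attention_mask"][0].tolist())  # [1, 1, 1, 0, 0]

Since this collator derives labels by cloning input_ids, while the dataset built above already carries a separate labels column taken from pair["target"], it is worth verifying in practice which of the two actually drives the loss.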