Update app.py
app.py
CHANGED
@@ -212,7 +212,7 @@ def load_model():
     try:
         log.append("Loading a compatible tokenizer...")
         # Use the tokenizer from Meta's official Llama models - should be compatible with Llama 3.2
-        tokenizer_id = "meta-llama/Llama-3-
+        tokenizer_id = "meta-llama/Llama-3-1B"  # This is a reliable source for a Llama tokenizer

         # Try with specified tokenizer first
         try:
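For reference, here is a minimal usage sketch (not part of this commit) of how a tokenizer id like the one set above is typically resolved, with a fallback in case the repo is gated or unavailable; the fallback repo is the same one this change uses later as its final fallback:

# Hypothetical sketch; tokenizer_id is taken from the diff above and is assumed
# to resolve on the Hugging Face Hub (gated Meta repos require granted access).
from transformers import AutoTokenizer

tokenizer_id = "meta-llama/Llama-3-1B"
try:
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_id, use_fast=True)
except Exception:
    # Small, public Llama tokenizer used only as a stand-in
    tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
print(f"Tokenizer vocab size: {len(tokenizer)}")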
@@ -504,38 +504,77 @@ def train_model(
             quantization_config=bnb_config,
             device_map="auto",
             use_cache=False,  # Needed for gradient checkpointing
+            trust_remote_code=True,  # Following reference code
             torch_dtype=torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16,
         )

-        # Load
-            padding_side="right",
-            use_fast=True,
-        )
-        tokenizer.pad_token = tokenizer.eos_token
+        # --- Load Tokenizer (from a compatible model) ---
+        # Following the pattern from reference code
+        progress(0.3, desc="Loading tokenizer...")
+
+        # Try to load a compatible tokenizer
+        try:
+            # First try loading from standard Llama 3 model
+            tokenizer = AutoTokenizer.from_pretrained(
+                "meta-llama/Llama-3-8B",  # Using standard Llama 3 tokenizer
+                padding_side="right",
+                use_fast=True,
+                trust_remote_code=True
+            )
+            log.append("Loaded tokenizer from meta-llama/Llama-3-8B")
+        except Exception as e1:
+            log.append(f"Couldn't load Llama-3 tokenizer: {e1}")
+            try:
+                # Fallback to Llama 2
+                tokenizer = AutoTokenizer.from_pretrained(
+                    "meta-llama/Llama-2-7b-hf",
+                    padding_side="right",
+                    use_fast=True
+                )
+                log.append("Loaded Llama-2 tokenizer as fallback")
+            except Exception as e2:
+                log.append(f"Couldn't load Llama-2 tokenizer: {e2}")
+                # Final fallback
+                from transformers import LlamaTokenizer
+                tokenizer = LlamaTokenizer.from_pretrained(
+                    "hf-internal-testing/llama-tokenizer",
+                    padding_side="right"
+                )
+                log.append("Loaded testing Llama tokenizer as final fallback")
+
+        # Set pad token and ensure it's usable
+        if tokenizer.pad_token is None:
+            tokenizer.pad_token = tokenizer.eos_token

+        log.append(f"Loaded model vocab size: {model.config.vocab_size}")
+        log.append(f"Input embedding shape: {model.get_input_embeddings().weight.shape}")
+
+        # --- QLoRA Preparation ---
+        progress(0.35, desc="Preparing model for k-bit training...")
         model = prepare_model_for_kbit_training(model)
         log.append("Model prepared for k-bit training")

+        # Define LoRA configuration
+        # Based on your reference code
         lora_config = LoraConfig(
             task_type=TaskType.CAUSAL_LM,
-            r=16,  #
-            lora_alpha=32,
-            lora_dropout=0.05,
-            bias="none",
-            target_modules=["q_proj", "k_proj", "v_proj", "o_proj"
+            r=16,  # Rank
+            lora_alpha=32,  # Alpha parameter
+            lora_dropout=0.05,  # Dropout probability
+            bias="none",  # Bias type
+            target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
         )
-        model_to_train =
+
+        # Apply LoRA to model
+        progress(0.4, desc="Applying LoRA to model...")
+        model_to_train = get_peft_model(model, lora_config)
+        log.append("LoRA applied to model")
+
+        # Cleanup to free up memory
+        gc.collect()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+
     except Exception as e:
         error_msg = f"Error preparing model for training: {str(e)}"
         log.append(error_msg)
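The added block follows the common QLoRA preparation flow: load a 4-bit quantized base model, prepare it for k-bit training, then wrap it with LoRA adapters. Below is a self-contained sketch of the same steps, using a placeholder model id and assumed bitsandbytes settings rather than the app's actual configuration:

# Minimal QLoRA preparation sketch; model_id is a placeholder for illustration only.
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training

model_id = "meta-llama/Llama-2-7b-hf"  # placeholder, not the app's value

# Assumed 4-bit quantization settings (the app's bnb_config is defined elsewhere)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    use_cache=False,  # required when gradient checkpointing is enabled
)

# Cast norms / enable gradient checkpointing hooks for k-bit training
model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
)
model_to_train = get_peft_model(model, lora_config)
model_to_train.print_trainable_parameters()

Extending target_modules beyond the attention projections to gate_proj, up_proj and down_proj, as this change does, applies LoRA to the MLP blocks as well, which raises the trainable-parameter count in exchange for more adaptation capacity.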
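The two new log lines for vocab size and embedding shape point at the main risk of borrowing a tokenizer from a different checkpoint: its vocabulary may not line up with the model's embedding table. A hedged sketch of the kind of check that could follow those log calls (not part of this commit), reusing the model, tokenizer and log variables from the function above:

# Hypothetical compatibility check between a borrowed tokenizer and the loaded model.
embedding_rows = model.get_input_embeddings().weight.shape[0]
if len(tokenizer) > embedding_rows:
    # More tokens than embedding rows would index out of range during training,
    # so grow the embedding table to cover the tokenizer's vocabulary.
    model.resize_token_embeddings(len(tokenizer))
elif len(tokenizer) < embedding_rows:
    # Fewer tokens than rows is usually harmless, but worth logging.
    log.append(f"Tokenizer vocab ({len(tokenizer)}) smaller than embedding table ({embedding_rows})")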