Update app.py
app.py
CHANGED
@@ -207,90 +207,95 @@ def load_model():
         log.append(f"Alternative loading also failed: {e2}")
         return "\n".join(log)

+    # --- Load Tokenizer (prioritizing Llama 3.2 1B) ---
     progress(0.3, desc="Loading tokenizer...")
+
+    # Set up token for authentication
+    token_param = {"token": hf_token} if hf_token and hf_token.strip() else {}
+    if token_param:
+        log.append("Using provided Hugging Face token for authentication")
+    else:
+        log.append("No token provided, using Space's default authentication")
+
+    # Try to load a compatible tokenizer
     try:
+        # First try the actual Llama 3.2 1B tokenizer
+        tokenizer_repo = "meta-llama/Llama-3.2-1B"  # The official 1B model
+        log.append(f"Attempting to load tokenizer from {tokenizer_repo}...")
+
+        tokenizer = AutoTokenizer.from_pretrained(
+            tokenizer_repo,
+            padding_side="right",
+            use_fast=True,
+            **token_param  # Pass token if provided
+        )
+        log.append(f"Successfully loaded tokenizer from {tokenizer_repo}")
+    except Exception as e1:
+        log.append(f"Couldn't load {tokenizer_repo} tokenizer: {e1}")

+        # Try the model repo directly (in case it has a tokenizer)
         try:
             tokenizer = AutoTokenizer.from_pretrained(
+                hf_model_repo_id,  # The RVQ model repo
                 padding_side="right",
+                use_fast=True,
+                **token_param  # Pass token if provided
             )
+            log.append(f"Loaded tokenizer from the model repo: {hf_model_repo_id}")
+        except Exception as e2:
+            log.append(f"Couldn't load model repo tokenizer: {e2}")
+
+            # Continue with our fallbacks (public models don't need token)
             try:
+                # Try TinyLlama (public)
                 tokenizer = AutoTokenizer.from_pretrained(
+                    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+                    padding_side="right",
+                    use_fast=True
                 )
+                log.append("Loaded TinyLlama tokenizer as fallback")
+            except Exception as e3:
+                log.append(f"Couldn't load TinyLlama tokenizer: {e3}")
+                # Last resort - other public models
+                try:
+                    tokenizer = AutoTokenizer.from_pretrained(
+                        "microsoft/phi-2",  # Public model
+                        padding_side="right"
+                    )
+                    log.append("Loaded Phi-2 tokenizer as last resort")
+                except Exception as e4:
+                    error_msg = f"Failed to load any compatible tokenizer after multiple attempts: {e4}"
+                    log.append(error_msg)
+                    return "\n".join(log)
+
+    # Set pad token if not already set
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token if tokenizer.eos_token is not None else "<pad>"
+        log.append("Set pad_token to eos_token or <pad>")
+
+    log.append(f"Tokenizer loaded with vocab size: {len(tokenizer)}")
+    log.append(f"Model vocab size: {model.config.vocab_size}")
+    log.append(f"Input embedding shape: {model.get_input_embeddings().weight.shape}")

     # Prepare model for k-bit training
     model = prepare_model_for_kbit_training(model)

+    # Define LoRA configuration - adjusted for 1B model
     lora_config = LoraConfig(
+        task_type=TaskType.CAUSAL_LM,
+        r=8,  # Smaller rank for 1B model (vs 16 for larger models)
+        lora_alpha=16,  # Adjusted alpha (vs 32 for larger models)
         lora_dropout=0.05,
         bias="none",
+        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
     )

     # Apply LoRA to model
+    progress(0.4, desc="Applying LoRA to model...")
+    model_to_train = get_peft_model(model, lora_config)
+    log.append("LoRA applied to model")
+    log.append(f"LoRA rank: 8, alpha: 16 (optimized for 1B model)")
+    model_to_train.print_trainable_parameters()

     return model, tokenizer  # Return both model and tokenizer

@@ -373,9 +378,10 @@ def train_model(
     model_repo_name,
     dataset_repo_name,
     epochs=1,
+    batch_size=8,
+    grad_accum_steps=1,
     learning_rate=2e-4,
+    hf_token=None,  # New parameter for token
     progress=gr.Progress()
 ):
     progress(0, desc="Setting up environment...")

@@ -522,58 +528,66 @@ def train_model(
         torch_dtype=torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16,
     )

+    # --- Load Tokenizer (prioritizing Llama 3.2 1B) ---
     progress(0.3, desc="Loading tokenizer...")

+    # Set up token for authentication
+    token_param = {"token": hf_token} if hf_token and hf_token.strip() else {}
+    if token_param:
+        log.append("Using provided Hugging Face token for authentication")
+    else:
+        log.append("No token provided, using Space's default authentication")
+
     # Try to load a compatible tokenizer
     try:
+        # First try the actual Llama 3.2 1B tokenizer
+        tokenizer_repo = "meta-llama/Llama-3.2-1B"  # The official 1B model
+        log.append(f"Attempting to load tokenizer from {tokenizer_repo}...")
+
         tokenizer = AutoTokenizer.from_pretrained(
+            tokenizer_repo,
             padding_side="right",
             use_fast=True,
+            **token_param  # Pass token if provided
         )
+        log.append(f"Successfully loaded tokenizer from {tokenizer_repo}")
     except Exception as e1:
+        log.append(f"Couldn't load {tokenizer_repo} tokenizer: {e1}")
+
+        # Try the model repo directly (in case it has a tokenizer)
         try:
             tokenizer = AutoTokenizer.from_pretrained(
+                hf_model_repo_id,  # The RVQ model repo
                 padding_side="right",
+                use_fast=True,
+                **token_param  # Pass token if provided
             )
+            log.append(f"Loaded tokenizer from the model repo: {hf_model_repo_id}")
         except Exception as e2:
+            log.append(f"Couldn't load model repo tokenizer: {e2}")
+
+            # Continue with our fallbacks (public models don't need token)
             try:
+                # Try TinyLlama (public)
                 tokenizer = AutoTokenizer.from_pretrained(
+                    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+                    padding_side="right",
+                    use_fast=True
                 )
+                log.append("Loaded TinyLlama tokenizer as fallback")
             except Exception as e3:
+                log.append(f"Couldn't load TinyLlama tokenizer: {e3}")
+                # Last resort - other public models
+                try:
+                    tokenizer = AutoTokenizer.from_pretrained(
+                        "microsoft/phi-2",  # Public model
+                        padding_side="right"
+                    )
+                    log.append("Loaded Phi-2 tokenizer as last resort")
+                except Exception as e4:
+                    error_msg = f"Failed to load any compatible tokenizer after multiple attempts: {e4}"
+                    log.append(error_msg)
+                    return "\n".join(log)

     # Set pad token if not already set
     if tokenizer.pad_token is None:

@@ -589,14 +603,13 @@ def train_model(
     model = prepare_model_for_kbit_training(model)
     log.append("Model prepared for k-bit training")

+    # Define LoRA configuration - adjusted for 1B model
     lora_config = LoraConfig(
         task_type=TaskType.CAUSAL_LM,
+        r=8,  # Smaller rank for 1B model (vs 16 for larger models)
+        lora_alpha=16,  # Adjusted alpha (vs 32 for larger models)
+        lora_dropout=0.05,
+        bias="none",
         target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
     )

@@ -604,6 +617,8 @@ def train_model(
     progress(0.4, desc="Applying LoRA to model...")
     model_to_train = get_peft_model(model, lora_config)
     log.append("LoRA applied to model")
+    log.append(f"LoRA rank: 8, alpha: 16 (optimized for 1B model)")
+    model_to_train.print_trainable_parameters()

     # Cleanup to free up memory
     gc.collect()

@@ -709,17 +724,17 @@ def train_model(
     output_dir = f"./results_{model_repo_name}"
     os.makedirs(output_dir, exist_ok=True)

+    # For 1B model on A100, we can increase batch size and reduce gradient accumulation
     training_args = TrainingArguments(
         output_dir=output_dir,
         num_train_epochs=float(epochs),
+        per_device_train_batch_size=8,  # Larger batch size for 1B model
+        gradient_accumulation_steps=1,  # Reduced for 1B model
         learning_rate=learning_rate,
         weight_decay=0.01,
         logging_dir=f"{output_dir}/logs",
         logging_steps=10,
+        save_steps=50,
         save_total_limit=3,
         remove_unused_columns=False,
         push_to_hub=False,

@@ -727,7 +742,8 @@ def train_model(
         warmup_ratio=0.03,
         lr_scheduler_type="cosine",
         report_to="tensorboard",
+        bf16=True if torch.cuda.is_bf16_supported() else False,
+        fp16=False,  # Using BF16 instead
         gradient_checkpointing=True,  # Still useful for efficiency
         gradient_checkpointing_kwargs={'use_reentrant': False},
         ddp_find_unused_parameters=False,

@@ -791,11 +807,16 @@ def create_interface():
                 hf_username = gr.Textbox(label="HuggingFace Username", value="Twelve2five")
                 model_repo = gr.Textbox(label="Model Repository Name", value="llama-3.2-1b-rvq")
                 dataset_repo = gr.Textbox(label="Dataset Repository Name", value="podcast-dialogue-rvq-pairs-3items")
+                hf_token = gr.Textbox(
+                    label="Hugging Face Token (Optional)",
+                    placeholder="Enter your HF token to access gated models",
+                    type="password"
+                )

             with gr.Column():
                 epochs = gr.Number(label="Number of Epochs", value=3, minimum=1, maximum=10)
+                batch_size = gr.Number(label="Batch Size per Device", value=8, minimum=1, maximum=16)
+                grad_accum = gr.Number(label="Gradient Accumulation Steps", value=1, minimum=1, maximum=16)
                 lr = gr.Number(label="Learning Rate", value=2e-4)

         start_btn = gr.Button("Start Training")

@@ -803,7 +824,7 @@ def create_interface():

         start_btn.click(
             fn=train_model,
+            inputs=[hf_username, model_repo, dataset_repo, epochs, batch_size, grad_accum, lr, hf_token],
             outputs=output
         )
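
The optional-token pattern these hunks introduce can be exercised on its own; the sketch below is illustrative only, assuming a recent transformers release whose from_pretrained accepts the token keyword, and it uses the public TinyLlama fallback repo named in the diff rather than the gated meta-llama/Llama-3.2-1B.

from transformers import AutoTokenizer

hf_token = None  # set to an "hf_..." token when a gated repo such as meta-llama/Llama-3.2-1B is needed
token_param = {"token": hf_token} if hf_token and hf_token.strip() else {}

tokenizer = AutoTokenizer.from_pretrained(
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",  # public fallback used in the diff
    padding_side="right",
    use_fast=True,
    **token_param,  # forwarded only when a token was supplied
)
print(f"Loaded tokenizer with vocab size: {len(tokenizer)}")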