Twelve2five committed (verified)
Commit fdebc65 · Parent: 1c688b1

Update app.py

Files changed (1): app.py (+94, -82)
app.py CHANGED
@@ -27,7 +27,7 @@ import shutil
 
 # --- Configuration ---
 YOUR_HF_USERNAME = "Twelve2five"
-MODEL_REPO_NAME = "llama-3.2-1b-rvq"
+MODEL_REPO_NAME = "llama-3-8b-rvq-resized"
 DATASET_REPO_NAME = "podcast-dialogue-rvq-pairs-3items"
 
 hf_model_repo_id = f"{YOUR_HF_USERNAME}/{MODEL_REPO_NAME}"
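For orientation, the two repository IDs the rest of app.py depends on now resolve as below; a minimal standalone sketch, in which the `hf_dataset_repo_id` line and the `repo_exists` check are assumptions added for illustration rather than lines from the diff:

```python
from huggingface_hub import HfApi

YOUR_HF_USERNAME = "Twelve2five"
MODEL_REPO_NAME = "llama-3-8b-rvq-resized"
DATASET_REPO_NAME = "podcast-dialogue-rvq-pairs-3items"

# f-string composition used by app.py; the dataset variant is assumed to mirror it
hf_model_repo_id = f"{YOUR_HF_USERNAME}/{MODEL_REPO_NAME}"      # "Twelve2five/llama-3-8b-rvq-resized"
hf_dataset_repo_id = f"{YOUR_HF_USERNAME}/{DATASET_REPO_NAME}"  # "Twelve2five/podcast-dialogue-rvq-pairs-3items"

# Optional sanity check before starting a long training run (illustrative only)
api = HfApi()
print(api.repo_exists(hf_model_repo_id, repo_type="model"))
print(api.repo_exists(hf_dataset_repo_id, repo_type="dataset"))
```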
@@ -329,9 +329,9 @@ def train_model(
     model_repo_name,
     dataset_repo_name,
     epochs=1,
-    batch_size=2,  # Increased batch size since model is smaller
+    batch_size=4,  # Increased for A100
     grad_accum_steps=4,
-    learning_rate=2e-4,  # Slightly higher learning rate for smaller model
+    learning_rate=2e-4,
     progress=gr.Progress()
 ):
     progress(0, desc="Setting up environment...")
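The new defaults change the effective batch size seen by the optimizer, which is the per-device batch multiplied by the gradient-accumulation steps and the GPU count; a quick sketch of that arithmetic (single GPU assumed):

```python
import torch

# New defaults from train_model(); n_gpus is whatever the runtime exposes
per_device_batch_size = 4
grad_accum_steps = 4
n_gpus = max(torch.cuda.device_count(), 1)

effective_batch_size = per_device_batch_size * grad_accum_steps * n_gpus
print(f"Effective batch size: {effective_batch_size}")  # 16 on a single A100
```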
@@ -350,7 +350,7 @@ def train_model(
     except Exception as e:
         log.append(f"Warning: Could not remove existing dataset files: {e}")
 
-    # Print GPU info
+    # Print GPU info - using imported torch, not a local variable
     if torch.cuda.is_available():
         log.append(f"Available GPUs: {torch.cuda.device_count()}")
         for i in range(torch.cuda.device_count()):
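The loop body is not shown in this hunk; a minimal standalone sketch of the kind of per-GPU logging it presumably performs, using only standard torch.cuda calls:

```python
import torch

log = []
if torch.cuda.is_available():
    log.append(f"Available GPUs: {torch.cuda.device_count()}")
    for i in range(torch.cuda.device_count()):
        props = torch.cuda.get_device_properties(i)
        log.append(f"GPU {i}: {props.name}, {props.total_memory / 1024**3:.1f} GB")
else:
    log.append("No CUDA device available.")
print("\n".join(log))
```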
@@ -362,7 +362,7 @@ def train_model(
     try:
         from datasets import Dataset
         from huggingface_hub import snapshot_download
-        import torch
+        # Don't import torch again, since it's already imported
         import transformers
         from transformers import AutoModelForCausalLM, AutoTokenizer
         from transformers import BitsAndBytesConfig, TrainingArguments, Trainer
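Dropping the function-local `import torch` matters because Python treats a name imported anywhere in a function body as local to that whole function, so the earlier `torch.cuda.is_available()` call in train_model would raise UnboundLocalError; a minimal reproduction of the pitfall:

```python
import torch  # module-level import, as at the top of app.py


def broken():
    # UnboundLocalError: the import below makes `torch` local to the
    # whole function, so this reference runs before any assignment.
    print(torch.cuda.is_available())
    import torch  # noqa: F811  (shadows the module-level name)


def fixed():
    # With the redundant local import removed, the module-level torch is used.
    print(torch.cuda.is_available())


fixed()
# broken()  # uncomment to observe the UnboundLocalError
```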
@@ -390,25 +390,13 @@ def train_model(
         # Create DeepSpeed config file
         progress(0.1, desc="Setting up DeepSpeed configuration...")
 
-        # Create a conservative DeepSpeed config for the smaller model
+        # Create a simpler config since we have plenty of memory on A100
         ds_config = {
-            "fp16": {
-                "enabled": "auto",
-                "loss_scale": 0,
-                "loss_scale_window": 1000,
-                "initial_scale_power": 16,
-                "hysteresis": 2,
-                "min_loss_scale": 1
-            },
             "bf16": {
                 "enabled": "auto"
             },
             "zero_optimization": {
-                "stage": 2,  # Lower stage for smaller model
-                "offload_optimizer": {
-                    "device": "cpu",
-                    "pin_memory": True
-                },
+                "stage": 1,  # Lower stage is fine for A100-80GB
                 "contiguous_gradients": True,
                 "overlap_comm": True
             },
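The hunk only shows the top of ds_config; a minimal sketch of how a ZeRO stage-1 config like this is typically written to disk and handed to the Trainer. The file name and the trailing "auto" keys are assumptions for illustration; the later hunks only reference the variable ds_config_path:

```python
import json

ds_config = {
    "bf16": {"enabled": "auto"},
    "zero_optimization": {
        "stage": 1,
        "contiguous_gradients": True,
        "overlap_comm": True,
    },
    # Remaining keys assumed; commonly left to "auto" so the HF Trainer fills them in
    "train_micro_batch_size_per_gpu": "auto",
    "gradient_accumulation_steps": "auto",
}

ds_config_path = "ds_config.json"  # assumed file name
with open(ds_config_path, "w") as f:
    json.dump(ds_config, f, indent=2)

# Later in train_model(): TrainingArguments(..., deepspeed=ds_config_path if n_gpus > 1 else None)
```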
@@ -432,113 +420,136 @@ def train_model(
         snapshot_download(
             repo_id=hf_model_repo_id,
             local_dir=local_model_path,
+            use_auth_token=False,
+            resume_download=True
         )
         log.append(f"Model files downloaded to {local_model_path}")
 
-        # First, load the config
-        config_path = os.path.join(local_model_path, "config.json")
-        with open(config_path, "r") as f:
-            config_data = json.load(f)
-
-        # Check model architecture
-        model_type = config_data.get("model_type", "").lower()
-        log.append(f"Model architecture type: {model_type}")
-
-        # Set 4-bit quantization config
+        # Create a bnb configuration for loading the model in 4-bit
+        # Not strictly necessary for A100 but keeps memory usage lower
+        progress(0.25, desc="Loading model...")
         bnb_config = BitsAndBytesConfig(
             load_in_4bit=True,
-            bnb_4bit_use_double_quant=True,
             bnb_4bit_quant_type="nf4",
-            bnb_4bit_compute_dtype=torch.bfloat16
+            bnb_4bit_compute_dtype=torch.bfloat16,
+            bnb_4bit_use_double_quant=False
         )
 
-        # Load the model with 4-bit quantization
+        # Load model and tokenizer
         model = AutoModelForCausalLM.from_pretrained(
             local_model_path,
             quantization_config=bnb_config,
             device_map="auto",
-            trust_remote_code=False,
-            use_cache=False  # No KV caching during training
+            torch_dtype=torch.bfloat16,
         )
-
-        # Load tokenizer
         tokenizer = AutoTokenizer.from_pretrained(local_model_path)
+
+        # Handle tokenizer settings
         if tokenizer.pad_token is None:
             tokenizer.pad_token = tokenizer.eos_token
-
-        # Log model info
-        if hasattr(model, "config"):
-            log.append(f"Loaded model vocab size: {model.config.vocab_size}")
-        if hasattr(model, "get_input_embeddings"):
-            embedding = model.get_input_embeddings()
-            if hasattr(embedding, "weight"):
-                log.append(f"Input embedding shape: {embedding.weight.shape}")
-
-    except Exception as e:
-        error_msg = f"Error loading model: {str(e)}"
-        log.append(error_msg)
-        return "\n".join(log)
-
-    # --- Prepare for K-bit Training & Apply LoRA ---
-    progress(0.15, desc="Preparing model for fine-tuning...")
-    try:
+
+        log.append(f"Loaded model vocab size: {tokenizer.vocab_size}")
+        log.append(f"Input embedding shape: {model.get_input_embeddings().weight.shape}")
+
+        # PEFT Configuration (Smaller LoRA for faster iteration)
         model = prepare_model_for_kbit_training(model)
         log.append("Model prepared for k-bit training")
 
-        # For Llama 3.2 1B the target modules might be slightly different
         lora_config = LoraConfig(
             task_type=TaskType.CAUSAL_LM,
-            r=8,  # Reduced from 16 due to smaller model
-            lora_alpha=16,  # Reduced from 32
+            r=16,  # Keeping higher rank for A100
+            lora_alpha=32,
             lora_dropout=0.05,
             bias="none",
-            target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
+            target_modules=["q_proj", "k_proj", "v_proj", "o_proj"]  # Fewer modules for faster training
         )
         peft_model = get_peft_model(model, lora_config)
         trainable_params = peft_model.print_trainable_parameters()
         log.append(f"LoRA applied to model")
         model_to_train = peft_model
+
     except Exception as e:
         error_msg = f"Error preparing model for training: {str(e)}"
         log.append(error_msg)
         return "\n".join(log)
 
-    # --- Load and Prepare Dataset ---
-    progress(0.3, desc="Loading and preparing dataset...")
+    # --- Download and Process Dataset ---
+    progress(0.4, desc="Downloading dataset...")
+
     try:
-        # Download dataset
         dataset_path = "./downloaded_dataset_files"
         snapshot_download(
             repo_id=hf_dataset_repo_id,
             local_dir=dataset_path,
+            use_auth_token=False,
+            resume_download=True
         )
         log.append(f"Dataset repository content downloaded to: {dataset_path}")
 
-        # Find all RVQ pairs files
-        rvq_pair_files = glob.glob(os.path.join(dataset_path, "*_rvq_pairs.pt"))
-        log.append(f"Found {len(rvq_pair_files)} RVQ pair files.")
+        # Load dataset from PT files
+        progress(0.5, desc="Processing dataset...")
 
-        # Load the pytorch files
-        all_pairs = []
+        # Load RVQ pairs
+        pair_files = glob.glob(f"{dataset_path}/*_rvq_pairs.pt")
+        log.append(f"Found {len(pair_files)} RVQ pair files.")
 
-        for file_path in rvq_pair_files:
-            pairs = torch.load(file_path)
+        all_pairs = []
+        for file in pair_files:
+            pairs = torch.load(file)
             all_pairs.extend(pairs)
 
-        # You may want to limit the data size for quicker testing
-        # all_pairs = all_pairs[:1000]  # Uncomment to limit data size
-
         log.append(f"Loaded a total of {len(all_pairs)} training pairs into memory.")
 
-        # Convert to HF dataset format
-        dataset_dict = {
-            "input_ids": [pair["source"] for pair in all_pairs],
-            "labels": [pair["target"] for pair in all_pairs]
-        }
+        # Process pairs into a format suitable for training
+        all_texts = []
+        for pair in all_pairs:
+            # Create instruction format
+            if isinstance(pair, dict):
+                instruction = pair.get("instruction", "")
+                input_text = pair.get("input", "")
+                output = pair.get("output", "")
+
+                # ALPACA format
+                if instruction and input_text:
+                    text = f"### Instruction: {instruction}\n### Input: {input_text}\n### Response: {output}"
+                elif instruction:
+                    text = f"### Instruction: {instruction}\n### Response: {output}"
+                else:
+                    text = output
+            else:
+                # Simple prompt-completion format
+                if isinstance(pair, tuple) and len(pair) == 2:
+                    prompt, completion = pair
+                    text = f"{prompt}{completion}"
+                else:
+                    text = str(pair)
+
+            all_texts.append({"text": text})
+
+        # Create HF dataset
+        train_dataset = Dataset.from_list(all_texts)
+
+        # Function to tokenize the dataset
+        def tokenize_function(examples):
+            return tokenizer(
+                examples["text"],
+                padding=False,
+                truncation=True,
+                max_length=2048,
+                return_tensors=None,
+            )
+
+        # Tokenize the dataset
+        tokenized_dataset = train_dataset.map(
+            tokenize_function,
+            batched=True,
+            remove_columns=["text"],
+            desc="Tokenizing dataset",
+        )
 
-        train_dataset = Dataset.from_dict(dataset_dict)
+        train_dataset = tokenized_dataset
 
-        # Create data collator for padding
+        # Data collator
         from transformers import DataCollatorForLanguageModeling
 
         data_collator = DataCollatorForLanguageModeling(
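Pulled out of the diff context, the new model-loading path boils down to 4-bit NF4 quantization with bf16 compute plus an attention-only LoRA; a self-contained sketch of that combination, in which the model path is a placeholder rather than the value app.py actually uses:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training

model_path = "./downloaded_model_files"  # placeholder for app.py's local_model_path

# 4-bit NF4 with bf16 compute, mirroring the new bnb_config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False,
)

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained(model_path)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# k-bit preparation, then LoRA on the attention projections only
model = prepare_model_for_kbit_training(model)
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)
peft_model = get_peft_model(model, lora_config)
peft_model.print_trainable_parameters()
```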
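The new dataset path converts each loaded RVQ pair into a single "text" field (Alpaca-style when the pair is a dict, plain prompt plus completion when it is a tuple) before tokenizing; a runnable sketch with toy pairs and a small placeholder tokenizer standing in for the real one:

```python
from datasets import Dataset
from transformers import AutoTokenizer, DataCollatorForLanguageModeling

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # placeholder tokenizer, not the repo's
tokenizer.pad_token = tokenizer.eos_token

# Toy stand-ins for the pairs loaded from the *_rvq_pairs.pt files
all_pairs = [
    {"instruction": "Continue the dialogue.", "input": "Host: Welcome back!", "output": "Guest: Thanks for having me."},
    ("Host: So, ", "tell me about the project."),
]

all_texts = []
for pair in all_pairs:
    if isinstance(pair, dict):
        instruction = pair.get("instruction", "")
        input_text = pair.get("input", "")
        output = pair.get("output", "")
        if instruction and input_text:
            text = f"### Instruction: {instruction}\n### Input: {input_text}\n### Response: {output}"
        elif instruction:
            text = f"### Instruction: {instruction}\n### Response: {output}"
        else:
            text = output
    elif isinstance(pair, tuple) and len(pair) == 2:
        prompt, completion = pair
        text = f"{prompt}{completion}"
    else:
        text = str(pair)
    all_texts.append({"text": text})

train_dataset = Dataset.from_list(all_texts)

tokenized_dataset = train_dataset.map(
    lambda examples: tokenizer(examples["text"], truncation=True, max_length=2048),
    batched=True,
    remove_columns=["text"],
)

# mlm=False gives causal-LM labels (labels mirror input_ids, padding masked out)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
print(tokenized_dataset[0]["input_ids"][:10])
```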
@@ -556,6 +567,7 @@ def train_model(
         output_dir = f"./results_{model_repo_name}"
         os.makedirs(output_dir, exist_ok=True)
 
+        # Optimize settings for A100
         training_args = TrainingArguments(
             output_dir=output_dir,
             num_train_epochs=float(epochs),
@@ -574,10 +586,10 @@ def train_model(
             lr_scheduler_type="cosine",
             report_to="tensorboard",
             bf16=True if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else False,
-            gradient_checkpointing=True,
+            gradient_checkpointing=True,  # Still useful for efficiency
             gradient_checkpointing_kwargs={'use_reentrant': False},
             ddp_find_unused_parameters=False,
-            deepspeed=ds_config_path if n_gpus > 1 else None,
+            deepspeed=ds_config_path if n_gpus > 1 else None,  # Only use DeepSpeed for multi-GPU
         )
 
         # --- Initialize Trainer ---
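Taken together with the earlier hunk, the training arguments end up roughly as below; a sketch with the interface defaults filled in. The arguments between output_dir and lr_scheduler_type are not all visible in the diff, so only the visible ones are reproduced, and the DeepSpeed config path is assumed:

```python
import torch
from transformers import TrainingArguments

n_gpus = torch.cuda.device_count()
ds_config_path = "ds_config.json"  # assumed name; see the DeepSpeed sketch earlier

training_args = TrainingArguments(
    output_dir="./results_llama-3-8b-rvq-resized",
    num_train_epochs=3.0,
    per_device_train_batch_size=4,   # UI default
    gradient_accumulation_steps=2,   # UI default
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    report_to="tensorboard",
    bf16=torch.cuda.is_available() and torch.cuda.is_bf16_supported(),
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    ddp_find_unused_parameters=False,
    deepspeed=ds_config_path if n_gpus > 1 else None,  # DeepSpeed only for multi-GPU runs
)
print(training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps)
```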
@@ -639,9 +651,9 @@ def create_interface():
             dataset_repo = gr.Textbox(label="Dataset Repository Name", value="podcast-dialogue-rvq-pairs-3items")
 
         with gr.Column():
-            epochs = gr.Number(label="Number of Epochs", value=2, minimum=1, maximum=10)
-            batch_size = gr.Number(label="Batch Size per Device", value=2, minimum=1, maximum=8)
-            grad_accum = gr.Number(label="Gradient Accumulation Steps", value=4, minimum=1, maximum=16)
+            epochs = gr.Number(label="Number of Epochs", value=3, minimum=1, maximum=10)
+            batch_size = gr.Number(label="Batch Size per Device", value=4, minimum=1, maximum=16)
+            grad_accum = gr.Number(label="Gradient Accumulation Steps", value=2, minimum=1, maximum=16)
             lr = gr.Number(label="Learning Rate", value=2e-4)
 
             start_btn = gr.Button("Start Training")
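For context on how these controls are consumed, a trimmed-down sketch of the Gradio wiring around them; the stand-in train_model, the model-repo textbox default, and the output textbox are illustrative, since create_interface() is only partially visible in the diff:

```python
import gradio as gr


def train_model(model_repo_name, dataset_repo_name, epochs, batch_size, grad_accum, lr,
                progress=gr.Progress()):
    # Stand-in for the real training function in app.py
    return f"Would train {model_repo_name} for {epochs} epoch(s), bs={batch_size}, accum={grad_accum}, lr={lr}"


with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            model_repo = gr.Textbox(label="Model Repository Name", value="llama-3-8b-rvq-resized")
            dataset_repo = gr.Textbox(label="Dataset Repository Name", value="podcast-dialogue-rvq-pairs-3items")
        with gr.Column():
            epochs = gr.Number(label="Number of Epochs", value=3, minimum=1, maximum=10)
            batch_size = gr.Number(label="Batch Size per Device", value=4, minimum=1, maximum=16)
            grad_accum = gr.Number(label="Gradient Accumulation Steps", value=2, minimum=1, maximum=16)
            lr = gr.Number(label="Learning Rate", value=2e-4)

    start_btn = gr.Button("Start Training")
    output = gr.Textbox(label="Training Log", lines=10)
    start_btn.click(
        fn=train_model,
        inputs=[model_repo, dataset_repo, epochs, batch_size, grad_accum, lr],
        outputs=output,
    )

if __name__ == "__main__":
    demo.launch()
```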