import os
import glob
import gc
import math

import torch
from transformers import (
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
)
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training
from datasets import Dataset
from huggingface_hub import snapshot_download
from tqdm import tqdm
import gradio as gr
from accelerate import Accelerator

# --- Configuration ---
YOUR_HF_USERNAME = "Twelve2five"
MODEL_REPO_NAME = "llama-3-8b-rvq-resized"
DATASET_REPO_NAME = "podcast-dialogue-rvq-pairs-3items"

hf_model_repo_id = f"{YOUR_HF_USERNAME}/{MODEL_REPO_NAME}"
hf_dataset_repo_id = f"{YOUR_HF_USERNAME}/{DATASET_REPO_NAME}"

# Output directories
OUTPUT_TRAINING_DIR = "./llama3-8b-rvq-qlora-finetuned-run"
LOGGING_DIR = "./llama3-8b-rvq-qlora-logs-run"
local_download_path = "./downloaded_dataset_files"

# Training parameters
NUM_EPOCHS = 1
BATCH_SIZE_PER_DEVICE = 1
GRAD_ACCUMULATION_STEPS = 64
LEARNING_RATE = 1e-4
WEIGHT_DECAY = 0.01
WARMUP_RATIO = 0.03
LR_SCHEDULER = "cosine"
OPTIMIZER = "paged_adamw_8bit"
MAX_SEQ_LENGTH = 256
MICRO_BATCH_SIZE = 1

# Multi-GPU configuration
accelerator = Accelerator()

# Configure environment for multi-GPU
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:32"

# Print GPU information
print(f"Available GPUs: {torch.cuda.device_count()}")
for i in range(torch.cuda.device_count()):
    print(
        f"GPU {i}: {torch.cuda.get_device_name(i)} with "
        f"{torch.cuda.get_device_properties(i).total_memory / 1e9:.2f} GB"
    )


def seq2seq_causal_collator(features):
    """
    Collator that concatenates context (input_ids) and target (labels) for
    causal-LM sequence-to-sequence training.
    Masks the loss for the context part of the sequence.
    Pads sequences to the maximum length in the batch.
    """
    batch = {}
    concatenated_input_ids = []
    concatenated_labels = []
    max_len = 0

    # --- First pass: concatenate, create masked labels, find max length ---
    for feature in features:
        # Dataset transform should provide tensors here
        input_ids = feature['input_ids']
        labels = feature['labels']

        # Ensure tensors are 1D (handle potential extra dims if any)
        if input_ids.dim() > 1:
            input_ids = input_ids.squeeze()
        if labels.dim() > 1:
            labels = labels.squeeze()

        context_len = input_ids.shape[0]
        target_len = labels.shape[0]

        # Concatenate context and target for input
        combined_ids = torch.cat([input_ids, labels], dim=0)
        concatenated_input_ids.append(combined_ids)

        # Create labels: -100 for context, actual labels for target
        masked_labels = torch.cat([
            torch.full((context_len,), -100, dtype=torch.long, device=input_ids.device),
            labels
        ], dim=0)
        concatenated_labels.append(masked_labels)

        # Track max length for padding
        if combined_ids.shape[0] > max_len:
            max_len = combined_ids.shape[0]

    # --- Second pass: pad to max length ---
    padded_input_ids = []
    padded_labels = []
    input_pad_token_id = 0
    label_pad_token_id = -100

    for i in range(len(features)):
        ids = concatenated_input_ids[i]
        lbls = concatenated_labels[i]
        padding_len = max_len - ids.shape[0]

        # Pad on the right side
        padded_input_ids.append(torch.nn.functional.pad(
            ids, (0, padding_len), value=input_pad_token_id
        ))
        padded_labels.append(torch.nn.functional.pad(
            lbls, (0, padding_len), value=label_pad_token_id
        ))

    # --- Stack and create final batch ---
    batch['input_ids'] = torch.stack(padded_input_ids)
    batch['labels'] = torch.stack(padded_labels)

    # Create attention mask (1 for real tokens, 0 for padding)
    batch['attention_mask'] = batch['input_ids'].ne(input_pad_token_id).long()

    return batch
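
# Illustration only: a tiny, self-contained sanity check of the collator's
# padding and masking behaviour on two toy examples. The token values and
# shapes below are assumptions for demonstration; the function is not called
# anywhere in the training flow.
def _collator_sanity_check():
    toy_features = [
        {'input_ids': torch.tensor([5, 6, 7]), 'labels': torch.tensor([8, 9])},
        {'input_ids': torch.tensor([5, 6]), 'labels': torch.tensor([8, 9, 10, 11])},
    ]
    batch = seq2seq_causal_collator(toy_features)
    # Both rows are right-padded to the longest combined length (6 here).
    assert batch['input_ids'].shape == (2, 6)
    # Context positions are masked with -100 so they do not contribute to the loss.
    assert (batch['labels'][0, :3] == -100).all()
    # Padding positions receive attention_mask == 0.
    assert batch['attention_mask'][0].tolist() == [1, 1, 1, 1, 1, 0]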

def prepare_for_dataset(batch):
    output = {'input_ids': [], 'labels': []}
    for item in batch:
        output['input_ids'].append(item['input_ids'].cpu().tolist())
        output['labels'].append(item['labels'].cpu().tolist())
    return output


def load_model():
    clean_memory()  # Start with clean memory

    print(f"Loading base model architecture from: {hf_model_repo_id}")

    # Aggressive 4-bit quantization (NF4 + double quantization) to minimize VRAM
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,  # Use float16 instead of bfloat16
        bnb_4bit_use_double_quant=True,
    )

    # Use DeepSpeed if available
    try:
        from deepspeed.runtime.zero.stage3 import estimate_zero3_model_states_mem_needs_all_live
        use_deepspeed = True
        print("DeepSpeed available, will use ZeRO-3")
    except ImportError:
        use_deepspeed = False
        print("DeepSpeed not available, falling back to standard distribution")

    # Calculate per-GPU reserved memory (be very conservative)
    n_gpus = max(1, torch.cuda.device_count())
    max_memory = {
        i: f"{int(torch.cuda.get_device_properties(i).total_memory / 1e9) - 4}GB"
        for i in range(n_gpus)
    }
    max_memory["cpu"] = "32GB"

    print(f"Using {n_gpus} GPUs with memory configuration: {max_memory}")

    # Load model with proper device distribution
    model = AutoModelForCausalLM.from_pretrained(
        hf_model_repo_id,
        quantization_config=bnb_config,
        device_map="balanced_low_0",  # Distribute evenly, minimizing GPU 0 usage
        max_memory=max_memory,
        trust_remote_code=True,
        use_cache=False,
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
    )

    print(f"Loaded model vocab size: {model.config.vocab_size}")
    print(f"Input embedding shape: {model.get_input_embeddings().weight.shape}")

    # --- Prepare for k-bit training & apply LoRA ---
    model = prepare_model_for_kbit_training(model)

    lora_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        r=16,
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                        "gate_proj", "up_proj", "down_proj"]
    )

    peft_model = get_peft_model(model, lora_config)
    peft_model.print_trainable_parameters()

    # Cleanup
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    return peft_model
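
# Sketch (assumption, not used by the training flow): how the LoRA adapter saved
# by trainer.save_model(OUTPUT_TRAINING_DIR) could be reloaded on top of the
# quantized base model for inference.
def load_finetuned_for_inference():
    from peft import PeftModel

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )
    base_model = AutoModelForCausalLM.from_pretrained(
        hf_model_repo_id,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,
    )
    # Attach the adapter weights from the training output directory.
    return PeftModel.from_pretrained(base_model, OUTPUT_TRAINING_DIR)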

def load_dataset():
    # --- Download the dataset repository files ---
    try:
        os.makedirs(local_download_path, exist_ok=True)
        downloaded_repo_root = snapshot_download(
            repo_id=hf_dataset_repo_id,
            repo_type="dataset",
            local_dir=local_download_path,
            local_dir_use_symlinks=False
        )
        print(f"Dataset repository content downloaded to: {downloaded_repo_root}")
    except Exception as e:
        print(f"Error downloading dataset: {e}")
        return None

    # --- Load .pt files into a Hugging Face Dataset object ---
    pairs_dir = os.path.join(downloaded_repo_root, "final_rvq_pairs")
    all_pair_files = glob.glob(os.path.join(pairs_dir, "*_rvq_pairs.pt"))
    if not all_pair_files:
        all_pair_files = glob.glob(os.path.join(downloaded_repo_root, "*_rvq_pairs.pt"))
    if not all_pair_files:
        print("No RVQ pair files found!")
        return None

    print(f"Found {len(all_pair_files)} RVQ pair files.")

    # Load data from .pt files into memory
    all_data_pairs = []
    for file_path in tqdm(all_pair_files, desc="Loading pair files"):
        try:
            episode_pairs = torch.load(file_path, map_location='cpu')
            all_data_pairs.extend(episode_pairs)
        except Exception as e:
            print(f"Warning: Could not load file {file_path}: {e}")

    if not all_data_pairs:
        return None

    print(f"Loaded {len(all_data_pairs)} training pairs.")

    # Convert to a Hugging Face Dataset in chunks to limit peak memory
    chunk_size = 1000
    processed_data = {'input_ids': [], 'labels': []}
    for i in tqdm(range(0, len(all_data_pairs), chunk_size), desc="Preparing data"):
        batch = all_data_pairs[i:i + chunk_size]
        prepared_batch = prepare_for_dataset(batch)
        processed_data['input_ids'].extend(prepared_batch['input_ids'])
        processed_data['labels'].extend(prepared_batch['labels'])

    hf_dataset = Dataset.from_dict(processed_data)

    # Transform to get tensors back
    hf_dataset.set_transform(lambda batch: {
        'input_ids': [torch.tensor(ids, dtype=torch.long) for ids in batch['input_ids']],
        'labels': [torch.tensor(lbls, dtype=torch.long) for lbls in batch['labels']]
    })

    # Cleanup
    del all_data_pairs
    del processed_data
    gc.collect()

    return hf_dataset


# Memory cleaning function
def clean_memory():
    gc.collect()
    if torch.cuda.is_available():
        for i in range(torch.cuda.device_count()):
            with torch.cuda.device(f'cuda:{i}'):
                torch.cuda.empty_cache()
                torch.cuda.reset_peak_memory_stats()


def train_model(progress=gr.Progress()):
    # Clean memory before starting
    clean_memory()

    # Load model with optimized memory settings
    model = load_model()

    # Load and prepare dataset
    progress(0.1, desc="Loading dataset...")
    train_dataset = load_dataset()

    # Initialize trainer with memory-optimized settings
    progress(0.2, desc="Initializing trainer...")

    # Training arguments built from the hyperparameters configured at the top of
    # the file. fp16 and report_to are assumptions chosen to match the float16
    # setup above and to avoid external logger prompts; adjust as needed.
    training_args = TrainingArguments(
        output_dir=OUTPUT_TRAINING_DIR,
        logging_dir=LOGGING_DIR,
        num_train_epochs=NUM_EPOCHS,
        per_device_train_batch_size=BATCH_SIZE_PER_DEVICE,
        gradient_accumulation_steps=GRAD_ACCUMULATION_STEPS,
        learning_rate=LEARNING_RATE,
        weight_decay=WEIGHT_DECAY,
        warmup_ratio=WARMUP_RATIO,
        lr_scheduler_type=LR_SCHEDULER,
        optim=OPTIMIZER,
        fp16=True,
        report_to="none",
    )

    # Custom data collator that explicitly caps sequence length
    def data_capped_collator(examples):
        # Call the seq2seq collator defined above
        batch = seq2seq_causal_collator(examples)
        # Ensure we cap to MAX_SEQ_LENGTH
        for k, v in batch.items():
            if isinstance(v, torch.Tensor) and v.dim() >= 2:
                batch[k] = v[:, :MAX_SEQ_LENGTH]
        return batch

    # Initialize trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=data_capped_collator,  # Use the capped collator
    )

    # Print memory status before training
    progress(0.3, desc="Ready to train, checking memory...")
    for i in range(torch.cuda.device_count()):
        print(
            f"GPU {i} memory: {torch.cuda.memory_allocated(i) / 1e9:.2f}GB allocated, "
            f"{torch.cuda.memory_reserved(i) / 1e9:.2f}GB reserved"
        )

    try:
        # Clean again just before training
        clean_memory()

        # Start training with conservative settings
        progress(0.4, desc="Starting training with conservative settings...")

        # Train with multi-GPU support
        train_result = trainer.train()

        # Save the final model
        progress(0.9, desc="Saving model...")
        trainer.save_model(OUTPUT_TRAINING_DIR)

        return "Training completed successfully!"
    except Exception as e:
        error_msg = str(e)
        print(f"Training error: {error_msg}")

        # Add memory diagnostics to the error message
        mem_info = "\nMemory status at error time:\n"
        for i in range(torch.cuda.device_count()):
            mem_info += (
                f"GPU {i}: {torch.cuda.memory_allocated(i) / 1e9:.2f}GB allocated, "
                f"{torch.cuda.memory_reserved(i) / 1e9:.2f}GB reserved\n"
            )

        return f"An error occurred during training: {error_msg}\n{mem_info}"
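
# Optional sketch (assumption): a TrainerCallback that logs per-GPU memory at
# every logging step. It is not wired into the Trainer above; pass
# callbacks=[MemoryLogger()] to Trainer(...) to enable it.
from transformers import TrainerCallback


class MemoryLogger(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        for i in range(torch.cuda.device_count()):
            allocated = torch.cuda.memory_allocated(i) / 1e9
            reserved = torch.cuda.memory_reserved(i) / 1e9
            print(
                f"[step {state.global_step}] GPU {i}: "
                f"{allocated:.2f}GB allocated, {reserved:.2f}GB reserved"
            )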

# Create the Gradio interface
def create_ui():
    with gr.Blocks() as demo:
        gr.Markdown("# Fine-tune LLaMA 3 8B with QLoRA")

        with gr.Tab("Training"):
            train_button = gr.Button("Start Fine-tuning")
            result_text = gr.Textbox(label="Training Results", interactive=False)
            train_button.click(train_model, outputs=result_text)

        with gr.Tab("About"):
            gr.Markdown("""
            ## Information
            This is a Hugging Face Space version of the original Google Colab notebook.
            It fine-tunes a quantized LLaMA 3 8B model using QLoRA on podcast dialogue data.

            ### Model
            - Base Model: {YOUR_HF_USERNAME}/{MODEL_REPO_NAME}
            - Uses 4-bit quantization with LoRA adapters

            ### Dataset
            - Custom dataset: {YOUR_HF_USERNAME}/{DATASET_REPO_NAME}
            - Contains podcast dialogue pairs processed for training

            ### Training Setup
            - QLoRA fine-tuning
            - Epochs: {NUM_EPOCHS}
            - Batch size: {BATCH_SIZE_PER_DEVICE} with {GRAD_ACCUMULATION_STEPS} gradient accumulation steps
            - Learning rate: {LEARNING_RATE}
            """.format(
                YOUR_HF_USERNAME=YOUR_HF_USERNAME,
                MODEL_REPO_NAME=MODEL_REPO_NAME,
                DATASET_REPO_NAME=DATASET_REPO_NAME,
                NUM_EPOCHS=NUM_EPOCHS,
                BATCH_SIZE_PER_DEVICE=BATCH_SIZE_PER_DEVICE,
                GRAD_ACCUMULATION_STEPS=GRAD_ACCUMULATION_STEPS,
                LEARNING_RATE=LEARNING_RATE
            ))

    return demo


# Main entry point
if __name__ == "__main__":
    # Install dependencies first if needed:
    # !pip install -q -U transformers accelerate bitsandbytes peft torch datasets huggingface_hub gradio

    # Create and launch the UI
    demo = create_ui()
    demo.launch()