import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import gradio as gr
import os
from accelerate import Accelerator

# Initialize the Accelerator to manage device placement and offloading
accelerator = Accelerator()

# Use GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"

# Base model and adapter paths
base_model_name = "microsoft/phi-2"  # Pull from HF Hub directly
adapter_path = "Shriti09/Microsoft-Phi-QLora"  # Update with your Hugging Face repo path

# Create an offload directory to store the model parts
offload_dir = "./offload_dir"  # Set offload directory here

# Ensure the offload directory exists
os.makedirs(offload_dir, exist_ok=True)

print("🔧 Loading base model...")
# Load the base model and let accelerate dispatch it to the available devices,
# spilling any weights that don't fit onto the offload directory
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto",  # Automatically map the model to available devices
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
    offload_folder=offload_dir,  # Offload weights that don't fit in memory
)

# Ensure the model is offloaded when necessary
base_model = accelerator.prepare(base_model)

print("🔧 Loading LoRA adapter...")
adapter_model = PeftModel.from_pretrained(base_model, adapter_path)

print("🔗 Merging adapter into base model...")
merged_model = adapter_model.merge_and_unload()
merged_model.eval()

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

print("✅ Model ready for inference!")


# Chat function with history
def chat_fn(message, history):
    # history arrives in the gr.Chatbot messages format: a list of
    # {"role": ..., "content": ...} dictionaries
    full_prompt = ""
    for turn in history:
        speaker = "User" if turn["role"] == "user" else "AI"
        full_prompt += f"{speaker}: {turn['content']}\n"
    full_prompt += f"User: {message}\nAI:"

    # Tokenize inputs
    inputs = tokenizer(full_prompt, return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = merged_model.generate(
            **inputs,
            max_new_tokens=150,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode and return only the AI's latest response
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    response = response.split("AI:")[-1].strip()

    # Append the new turn to history in the same messages format for gr.Chatbot
    history.append({"role": "user", "content": message})
    history.append({"role": "assistant", "content": response})
    return history, history


# Gradio UI
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("