"""Gradio FAQ chatbot: BlenderBot-400M base + a fine-tuned LoRA adapter.

Loads the base seq2seq model, applies the LoRA adapter from the Hub,
wraps both in a text2text-generation pipeline, and serves a simple
Blocks UI where each submitted question is answered by the model.
"""

import gradio as gr
from peft import PeftModel
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

# 1) Load the original base model & tokenizer.
BASE_MODEL = "facebook/blenderbot-400M-distill"
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
base_model = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL)

# 2) Load the fine-tuned LoRA adapter on top of the base weights.
ADAPTER_REPO = "abinashnp/bayedger-chatbot"
model = PeftModel.from_pretrained(base_model, ADAPTER_REPO)

# 3) Wrap the adapted model in a text2text pipeline.
#    device_map="auto" lets accelerate place the model; do not also pass
#    a `device` argument when device_map is used.
chatbot = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    device_map="auto",
)


def respond(query: str) -> str:
    """Generate an answer for *query* using the fine-tuned pipeline.

    The prompt uses the same "question: ... answer:" template the
    adapter was trained with.
    """
    out = chatbot(
        f"question: {query} answer:",
        max_new_tokens=150,
        # do_sample=True is required for temperature/top_p to take effect;
        # without it generation is greedy and these knobs are ignored
        # (transformers emits a warning for each unused sampling flag).
        do_sample=True,
        temperature=1.0,
        top_p=0.9,
        repetition_penalty=1.1,
        num_beams=1,
    )[0]["generated_text"]
    return out


with gr.Blocks() as demo:
    gr.Markdown("# 🤖 Bayedger FAQ Chatbot")
    txt = gr.Textbox(label="Ask me anything")
    out = gr.Textbox(label="Answer")
    txt.submit(respond, txt, out)

if __name__ == "__main__":
    # Guarded so importing this module (e.g. by a hosting runtime that
    # launches `demo` itself) does not start a second server.
    demo.launch()