import gradio as gr
from optimum.intel import OVModelForCausalLM
from transformers import AutoTokenizer, pipeline

# Load the 8-bit OpenVINO export of Qwen2.5-1.5B-Instruct on CPU, then wrap
# the model and tokenizer in a standard transformers text-generation pipeline.
model_id = "hsuwill000/Qwen2.5-1.5B-Instruct-openvino-8bit"
model = OVModelForCausalLM.from_pretrained(model_id, device="CPU")
tokenizer = AutoTokenizer.from_pretrained(model_id)

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
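
# Optional sanity check: the pipeline can be exercised directly before wiring
# up the UI (uncomment to try; max_new_tokens caps only the generated text).
# print(pipe("Hello!", max_new_tokens=32)[0]["generated_text"])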

def respond(message, history):
    try:
        # gr.ChatInterface passes `history` as a list of (user, bot) pairs;
        # fold it into a plain-text prompt for the pipeline.
        input_text = message
        if history:
            input_text = (
                "\n".join(f"User: {user}\nBot: {bot}" for user, bot in history)
                + f"\nUser: {message}"
            )

        response = pipe(
            input_text,
            max_new_tokens=256,  # cap the reply; max_length would count the prompt too
            truncation=True,
            num_return_sequences=1,
            do_sample=True,  # required for temperature/top_p to take effect
            temperature=0.7,
            top_p=0.9,
        )
        # The pipeline echoes the prompt before the completion; keep only the
        # newly generated text and return it as a string. ChatInterface
        # appends the reply to the history itself.
        reply = response[0]["generated_text"][len(input_text):].strip()
        return reply

    except Exception as e:
        print(f"Error: {e}")
        return "Sorry, something went wrong. Please try again."
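
# A closer match to how the model was trained: Qwen2.5-Instruct ships a chat
# template, so the prompt could instead be built with
# tokenizer.apply_chat_template (a sketch, assuming the bundled template;
# the plain "User:/Bot:" prompt above also works):
#
#   messages = [{"role": "user", "content": message}]
#   prompt = tokenizer.apply_chat_template(
#       messages, tokenize=False, add_generation_prompt=True
#   )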

demo = gr.ChatInterface(
    fn=respond,
    title="Qwen2.5-1.5B-Instruct-openvino",
    description="Chat with the Qwen2.5-1.5B-Instruct-openvino model.",
    examples=["Hello!", "Tell me a joke.", "Explain quantum computing."],
    # The button arguments below are Gradio 4.x options (removed in Gradio 5).
    retry_btn=None,
    undo_btn=None,
    clear_btn="Clear History",
)

if __name__ == "__main__":
    demo.launch()
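
# Tip (assumption: running locally rather than on a hosted Space):
# demo.launch(share=True) creates a temporary public URL for quick sharing.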