import gradio as gr
# Unsloth recommends importing it before transformers so its patches apply
from unsloth import FastLanguageModel
from transformers import TextStreamer  # optional: for token-by-token streaming
# Model configuration
max_seq_length = 2048
dtype = None  # None lets Unsloth auto-detect (bfloat16 on Ampere+, else float16)
model_name_or_path = "michailroussos/model_llama_8d"

# Load the model and tokenizer (4-bit quantization keeps GPU memory usage low)
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name_or_path,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=True,
)

# Enable Unsloth's optimized inference mode
FastLanguageModel.for_inference(model)
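
# Optional: TextStreamer (imported above) can print tokens to stdout as they
# are generated instead of waiting for the full output tensor. A minimal
# sketch, assuming `inputs` holds a tokenized prompt as prepared below:
#   streamer = TextStreamer(tokenizer, skip_prompt=True)
#   model.generate(**inputs, streamer=streamer, max_new_tokens=128)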
# Generate a response for one user turn; gr.ChatInterface supplies the history
# and expects the function to return the assistant's reply as a string
def chat_with_model(user_message, chat_history):
    try:
        # Rebuild the conversation from Gradio's (user, assistant) pairs
        messages = []
        for user_turn, assistant_turn in chat_history or []:
            messages.append({"role": "user", "content": user_turn})
            messages.append({"role": "assistant", "content": assistant_turn})
        messages.append({"role": "user", "content": user_message})

        # Tokenize with the model's chat template; return_dict=True yields a
        # mapping that contains both input_ids and attention_mask
        inputs = tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt",
        ).to("cuda")

        # Generate; do_sample=True is required for temperature/min_p to apply
        output_ids = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=128,
            use_cache=True,
            do_sample=True,
            temperature=1.5,
            min_p=0.1,
        )

        # Decode only the newly generated tokens, not the echoed prompt
        new_tokens = output_ids[0][inputs["input_ids"].shape[1]:]
        return tokenizer.decode(new_tokens, skip_special_tokens=True)
    except Exception as e:
        return f"Error: {e}"
# Create the chat interface
demo = gr.ChatInterface(
    fn=chat_with_model,
    chatbot=gr.Chatbot(label="Chat with Hugging Face Model"),
    title="Hugging Face Chat Model",
    description="Chat with a Hugging Face model using FastLanguageModel.",
)
# Launch the app
if __name__ == "__main__":
    demo.launch()
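    # On a headless machine, demo.launch(share=True) would also print a
    # temporary public URL (a standard Gradio option, not model-specific)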