import gradio as gr
from unsloth import FastLanguageModel
# Define constants
max_seq_length = 2048
dtype = None  # None lets Unsloth auto-detect (float16 or bfloat16)
model_name_or_path = "michailroussos/model_llama_8d"

# Load the model and tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name_or_path,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=True,  # 4-bit quantization to fit on smaller GPUs
)

# Optimize model for inference
FastLanguageModel.for_inference(model)
# Function to generate a response
def chat_with_model(user_message, chat_history):
    try:
        # Rebuild the conversation so the model sees earlier turns,
        # then append the new user message
        messages = []
        for user_turn, assistant_turn in chat_history or []:
            messages.append({"role": "user", "content": user_turn})
            messages.append({"role": "assistant", "content": assistant_turn})
        messages.append({"role": "user", "content": user_message})
        # Tokenize with the model's chat template; return_dict=True gives both
        # input_ids and attention_mask (a bare tensor has no ["input_ids"] key)
        inputs = tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
            return_dict=True,
        ).to("cuda")
        # Generate response
        output_ids = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=128,
            use_cache=True,
            temperature=1.5,
            min_p=0.1,
        )
        # Decode only the newly generated tokens, not the echoed prompt
        new_tokens = output_ids[0][inputs["input_ids"].shape[-1]:]
        response = tokenizer.decode(new_tokens, skip_special_tokens=True)
        # gr.ChatInterface expects the reply string and manages history itself
        return response
    except Exception as e:
        return f"Error: {str(e)}"
# Create the chat interface
demo = gr.ChatInterface(
    fn=chat_with_model,
    chatbot=gr.Chatbot(label="Chat with Hugging Face Model"),
    title="Hugging Face Chat Model",
    description="Chat with a Hugging Face model using FastLanguageModel.",
)
# Launch the app
if __name__ == "__main__":
    demo.launch()