import gradio as gr
from unsloth import FastLanguageModel
# Define constants
max_seq_length = 2048
dtype = None  # None lets Unsloth auto-detect (float16 or bfloat16)
model_name_or_path = "michailroussos/model_llama_8d"

# Load the model and tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name_or_path,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=True,  # 4-bit quantization to fit on smaller GPUs
)

# Optimize model for inference
FastLanguageModel.for_inference(model)
# Function to generate a response
def chat_with_model(user_message, chat_history):
    try:
        # Rebuild the conversation so the model sees earlier turns,
        # then append the new user message
        messages = []
        for user_turn, assistant_turn in chat_history or []:
            messages.append({"role": "user", "content": user_turn})
            messages.append({"role": "assistant", "content": assistant_turn})
        messages.append({"role": "user", "content": user_message})
        # Tokenize with the model's chat template; return_dict=True gives both
        # input_ids and attention_mask (a bare tensor has no ["input_ids"] key)
        inputs = tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
            return_dict=True,
        ).to("cuda")
        # Generate response
        output_ids = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=128,
            use_cache=True,
            temperature=1.5,
            min_p=0.1,
        )
        # Decode only the newly generated tokens, not the echoed prompt
        new_tokens = output_ids[0][inputs["input_ids"].shape[-1]:]
        response = tokenizer.decode(new_tokens, skip_special_tokens=True)
        # gr.ChatInterface expects the reply string and manages history itself
        return response
    except Exception as e:
        return f"Error: {str(e)}"
# Create the chat interface
demo = gr.ChatInterface(
    fn=chat_with_model,
    chatbot=gr.Chatbot(label="Chat with Hugging Face Model"),
    title="Hugging Face Chat Model",
    description="Chat with a Hugging Face model using FastLanguageModel.",
)
# Launch the app
if __name__ == "__main__":
    demo.launch()