import gradio as gr
import os
import torch
from unsloth import FastLanguageModel
import spaces  # Hugging Face Spaces helper package (provides the spaces.GPU decorator for ZeroGPU)

# Get the Hugging Face token from environment variables
HF_TOKEN = os.environ.get("HF_TOKEN")

# Check whether we are running in a Hugging Face Space with GPU constraints
IS_SPACES_ZERO = os.environ.get("SPACES_ZERO_GPU", "0") == "1"
IS_SPACE = os.environ.get("SPACE_ID", None) is not None

# Determine the device (use a GPU if one is available)
device = "cuda" if torch.cuda.is_available() else "cpu"
LOW_MEMORY = os.getenv("LOW_MEMORY", "0") == "1"
print(f"Using device: {device}")
print(f"Low memory mode: {LOW_MEMORY}")

# Model configuration
max_seq_length = 2048  # Max sequence length for RoPE scaling
dtype = torch.float16 if device == "cuda" else torch.float32
load_in_4bit = True  # Enable 4-bit quantization if memory is limited

# Load the model and tokenizer with device mapping
model_name = "nafisneehal/chandler_bot"
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    device_map="auto" if device == "cuda" else None,  # Automatic GPU mapping
)
FastLanguageModel.for_inference(model)  # Optimize the model for faster inference

# Define the prompt structure (update if necessary for your model)
alpaca_prompt = "{instruction} {input} {output}"
instruction_text = (
    "Learn how to talk like Chandler - a popular character from FRIENDS TV Show. "
    "Input is someone saying something, Output is what Chandler saying in response."
)
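
# Illustrative example (not in the original code): with the names defined above,
#   alpaca_prompt.format(instruction=instruction_text, input="Hi, how are you?", output="")
# produces the single prompt string fed to the model, leaving the output slot
# empty for the model to complete.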

# Use the GPU provided by Hugging Face Spaces if available
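# Note (assumption, not in the original code): on a ZeroGPU Space, the function
# that touches the GPU is typically wrapped with the spaces.GPU decorator, e.g.
#   @spaces.GPU
#   def generate_response(user_input, chat_history): ...
# On regular GPU or CPU hardware the decorator is unnecessary.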
def generate_response(user_input, chat_history):
    instruction = instruction_text  # Fixed task description defined above
    input_text = user_input         # The user's message fills the input slot

    # Prepare inputs for model inference on the correct device
    inputs = tokenizer(
        [alpaca_prompt.format(instruction=instruction, input=input_text, output="")],
        return_tensors="pt",
    ).to(device)  # Ensure tensors are on the correct device

    # Generate the response on GPU or CPU as appropriate
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=100)

    # Decode only the newly generated tokens so the prompt is not echoed back
    bot_reply = tokenizer.decode(
        outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
    )

    # Append the (user message, bot reply) pair expected by gr.Chatbot
    chat_history.append((user_input, bot_reply))
    return chat_history, ""  # Return the updated history and clear the input box

# Set up the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Llama-Based Chatbot on GPU")
    chat_history = gr.Chatbot(label="Chat History")
    user_input = gr.Textbox(
        placeholder="Type your message here...", label="Your Message"
    )

    # Connect the submit action and the Send button to the response function
    user_input.submit(
        generate_response, [user_input, chat_history], [chat_history, user_input]
    )
    submit_btn = gr.Button("Send")
    submit_btn.click(
        generate_response, [user_input, chat_history], [chat_history, user_input]
    )

demo.launch()
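
# Optional (assumption, not in the original code): if the Space receives
# concurrent requests, enabling Gradio's request queue before launching is a
# common pattern:
#   demo.queue().launch()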