Spaces: Running on Zero
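The example below is a Gradio chat app for the Athena models that runs on ZeroGPU: the model is loaded and generation happens inside a function decorated with @spaces.GPU, so a GPU is attached only while that function executes, and the app reports per-turn load and generation times.

The core pattern is small. A minimal sketch, assuming a ZeroGPU Space (the function name and duration value here are illustrative, not part of the app):

import spaces
import torch

@spaces.GPU(duration=60)  # GPU is attached only while this decorated function runs; duration is illustrative
def gpu_demo():
    # All CUDA work must happen inside the decorated function
    return str(torch.zeros(1).to("cuda").device)

The full demo app: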
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import time
import spaces
# Model configurations: dropdown label -> Hugging Face Hub repo ID
MODELS = {
    "Athena-R3X 8B": "Spestly/Athena-R3X-8B",
    "Athena-R3X 4B": "Spestly/Athena-R3X-4B",
    "Athena-R3 7B": "Spestly/Athena-R3-7B",
    "Athena-3 3B": "Spestly/Athena-3-3B",
    "Athena-3 7B": "Spestly/Athena-3-7B",
    "Athena-3 14B": "Spestly/Athena-3-14B",
    "Athena-2 1.5B": "Spestly/Athena-2-1.5B",
    "Athena-1 3B": "Spestly/Athena-1-3B",
    "Athena-1 7B": "Spestly/Athena-1-7B"
}
# ZeroGPU: the @spaces.GPU decorator requests a GPU only for the duration of this call
@spaces.GPU
def generate_response(model_id, conversation, user_message, max_length=512, temperature=0.7):
    """Generate a response using ZeroGPU - all CUDA operations happen here"""
    # Load model and tokenizer inside the GPU function
    print(f"Loading {model_id}...")
    start_time = time.time()
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True
    )
    load_time = time.time() - start_time
    print(f"Model loaded in {load_time:.2f}s")
    # Build conversation history
    conversation_history = []
    for user_msg, assistant_msg in conversation:
        if user_msg:
            conversation_history.append(f"User: {user_msg}")
        if assistant_msg:
            conversation_history.append(f"Athena: {assistant_msg}")
    # Add current user message
    conversation_history.append(f"User: {user_message}")
    conversation_history.append("Athena:")
    # Create prompt
    prompt = "\n".join(conversation_history)
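    # Alternative (sketch): checkpoints that ship a chat template generally expect
    # tokenizer.apply_chat_template rather than a hand-built "User:/Athena:" prompt, e.g.:
    #   messages = [{"role": "user", "content": user_message}]
    #   prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)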
    # Tokenize and move to GPU
    inputs = tokenizer(prompt, return_tensors="pt")
    # Move inputs to the same device as the model
    device = next(model.parameters()).device
    inputs = {k: v.to(device) for k, v in inputs.items()}
    generation_start = time.time()
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_length,
            temperature=temperature,
            do_sample=True,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id
        )
    generation_time = time.time() - generation_start
    # Decode only the newly generated tokens (everything after the prompt)
    response = tokenizer.decode(
        outputs[0][inputs['input_ids'].shape[-1]:],
        skip_special_tokens=True
    ).strip()
    return response, load_time, generation_time
def chatbot(conversation, user_message, model_name, max_length=512, temperature=0.7):
    """Handle one chat turn; `conversation` is a list of [user, assistant] pairs (Gradio Chatbot format)"""
    if not user_message.strip():
        return conversation, "", "Please enter a message"
    if conversation is None:
        conversation = []
    # Get model ID
    model_id = MODELS.get(model_name, MODELS["Athena-R3X 8B"])
    try:
        # Add user message to conversation
        conversation.append([user_message, ""])
        # Generate response using ZeroGPU
        response, load_time, generation_time = generate_response(
            model_id, conversation[:-1], user_message, max_length, temperature
        )
        # Update the conversation with the response
        conversation[-1][1] = response
        stats = f"Load: {load_time:.1f}s | Gen: {generation_time:.1f}s | Model: {model_name}"
        return conversation, "", stats
    except Exception as e:
        error_msg = f"Error: {str(e)}"
        if conversation:
            conversation[-1][1] = error_msg
        else:
            conversation = [[user_message, error_msg]]
        return conversation, "", f"Error: {str(e)}"

def clear_chat():
    return [], "", ""
# CSS for better styling
css = """
#chatbot {
    height: 600px;
}
.message {
    padding: 10px;
    margin: 5px;
    border-radius: 10px;
}
"""
# Create Gradio interface
with gr.Blocks(title="Athena Playground Chat", css=css) as demo:
    gr.Markdown("# Athena Playground Chat")
    gr.Markdown("*Powered by HuggingFace ZeroGPU*")
    with gr.Row():
        with gr.Column(scale=1):
            model_choice = gr.Dropdown(
                label="Model",
                choices=list(MODELS.keys()),
                value="Athena-R3X 8B",
                info="Select which Athena model to use"
            )
            max_length = gr.Slider(
                32, 2048, value=512,
                label="Max Tokens",
                info="Maximum number of tokens to generate"
            )
            temperature = gr.Slider(
                0.1, 2.0, value=0.7,
                label="Creativity",
                info="Higher values = more creative responses"
            )
            clear_btn = gr.Button("Clear Chat", variant="secondary")
        with gr.Column(scale=3):
            chat_history = gr.Chatbot(
                elem_id="chatbot",
                show_label=False
            )
            user_input = gr.Textbox(
                placeholder="Ask Athena anything...",
                label="Your message",
                lines=2,
                max_lines=10
            )
            with gr.Row():
                submit_btn = gr.Button("Send", variant="primary")
                stats_output = gr.Textbox(
                    label="Stats",
                    interactive=False,
                    show_label=False,
                    placeholder="Stats will appear here..."
                )
    # Event handlers
    submit_btn.click(
        chatbot,
        inputs=[chat_history, user_input, model_choice, max_length, temperature],
        outputs=[chat_history, user_input, stats_output]
    )
    user_input.submit(
        chatbot,
        inputs=[chat_history, user_input, model_choice, max_length, temperature],
        outputs=[chat_history, user_input, stats_output]
    )
    clear_btn.click(
        clear_chat,
        inputs=[],
        outputs=[chat_history, user_input, stats_output]
    )

if __name__ == "__main__":
    demo.launch()
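To run this as a Space, select ZeroGPU hardware in the Space settings and declare the dependencies in requirements.txt. A plausible minimal set is sketched below (versions omitted; accelerate is assumed because of device_map="auto", and some of these packages may already be preinstalled on Gradio/ZeroGPU Spaces):

gradio
transformers
torch
accelerate
spaces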