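"""Athena Playground Chat (app.py, from the GPT-OSS Space).

Gradio app serving the Spestly Athena model family. All model loading and
CUDA work happens inside the @spaces.GPU-decorated generate_response so it
runs on HuggingFace ZeroGPU hardware.
"""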
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import time
import spaces

# Model configurations
MODELS = {
    "Athena-R3X 8B": "Spestly/Athena-R3X-8B",
    "Athena-R3X 4B": "Spestly/Athena-R3X-4B",
    "Athena-R3 7B": "Spestly/Athena-R3-7B",
    "Athena-3 3B": "Spestly/Athena-3-3B",
    "Athena-3 7B": "Spestly/Athena-3-7B",
    "Athena-3 14B": "Spestly/Athena-3-14B",
    "Athena-2 1.5B": "Spestly/Athena-2-1.5B",
    "Athena-1 3B": "Spestly/Athena-1-3B",
    "Athena-1 7B": "Spestly/Athena-1-7B",
}
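
# Note: @spaces.GPU targets HuggingFace ZeroGPU Spaces. Outside a Space the
# decorator is expected to be a no-op, so the app should still run on
# ordinary (local) hardware, assuming a GPU or enough memory for the model.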
@spaces.GPU
def generate_response(model_id, conversation, user_message, max_length=512, temperature=0.7):
    """Generate a response using ZeroGPU - all CUDA operations happen here"""
    print(f"πŸš€ Loading {model_id}...")
    start_time = time.time()
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True
    )
    load_time = time.time() - start_time
    print(f"βœ… Model loaded in {load_time:.2f}s")

    # Build messages in proper chat format (OpenAI-style messages)
    messages = []
    system_prompt = (
        "You are Athena, a helpful, harmless, and honest AI assistant. "
        "You provide clear, accurate, and concise responses to user questions. "
        "You are knowledgeable across many domains and always aim to be respectful and helpful. "
        "You are finetuned by Aayan Mishra"
    )
    messages.append({"role": "system", "content": system_prompt})

    # Add conversation history (OpenAI-style)
    for msg in conversation:
        if msg["role"] in ("user", "assistant"):
            messages.append({"role": msg["role"], "content": msg["content"]})

    # Add the current user message
    messages.append({"role": "user", "content": user_message})

    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    inputs = tokenizer(prompt, return_tensors="pt")
    device = next(model.parameters()).device
    inputs = {k: v.to(device) for k, v in inputs.items()}

    generation_start = time.time()
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_length,
            temperature=temperature,
            do_sample=True,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id
        )
    generation_time = time.time() - generation_start

    # Decode only the newly generated tokens, skipping the prompt
    response = tokenizer.decode(
        outputs[0][inputs['input_ids'].shape[-1]:],
        skip_special_tokens=True
    ).strip()
    return response, load_time, generation_time

def respond(history, message, model_name, max_length, temperature):
    """Main function for the custom Chatbot interface"""
    if not message.strip():
        return history + [[message, "Please enter a message"]], ""
    model_id = MODELS.get(model_name, MODELS["Athena-R3X 8B"])
    try:
        # Convert Gradio's [user, assistant] pairs into OpenAI-style messages
        formatted_history = []
        for user_msg, assistant_msg in history:
            if user_msg:
                formatted_history.append({"role": "user", "content": user_msg})
            if assistant_msg:
                formatted_history.append({"role": "assistant", "content": assistant_msg})
        response, load_time, generation_time = generate_response(
            model_id, formatted_history, message, max_length, temperature
        )
        return history + [[message, response]], ""
    except Exception as e:
        return history + [[message, f"Error: {str(e)}"]], ""
css = """
.message {
padding: 10px;
margin: 5px;
border-radius: 10px;
}
"""

theme = gr.themes.Monochrome()

with gr.Blocks(title="Athena Playground Chat", css=css, theme=theme) as demo:
    gr.Markdown("# πŸš€ Athena Playground Chat")
    gr.Markdown("*Powered by HuggingFace ZeroGPU*")

    # gr.Chatbot has no `avatar` parameter; pass avatar_images=(user, bot)
    # with image paths/URLs instead if avatars are wanted.
    chatbot = gr.Chatbot(height=500, label="Athena")

    with gr.Row():
        user_input = gr.Textbox(label="Your message", scale=8, autofocus=True)
        send_btn = gr.Button(value="Send", scale=1)

    # --- Configuration controls at the bottom ---
    gr.Markdown("### βš™οΈ Model & Generation Settings")
    with gr.Row():
        model_choice = gr.Dropdown(
            label="πŸ“± Model",
            choices=list(MODELS.keys()),
            value="Athena-R3X 4B",
            info="Select which Athena model to use"
        )
        max_length = gr.Slider(
            32, 2048, value=512,
            label="πŸ“ Max Tokens",
            info="Maximum number of tokens to generate"
        )
        temperature = gr.Slider(
            0.1, 2.0, value=0.7,
            label="🎨 Creativity",
            info="Higher values = more creative responses"
        )

    # The chatbot component itself carries the history, so it is wired as both
    # input and output; a separate gr.State copy would never be updated.
    send_btn.click(
        respond,
        inputs=[chatbot, user_input, model_choice, max_length, temperature],
        outputs=[chatbot, user_input]
    )
    # Pressing Enter in the textbox submits as well
    user_input.submit(
        respond,
        inputs=[chatbot, user_input, model_choice, max_length, temperature],
        outputs=[chatbot, user_input]
    )

if __name__ == "__main__":
    demo.launch()