import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import time
import spaces
# Model configurations
MODELS = {
"Athena-R3X 8B": "Spestly/Athena-R3X-8B",
"Athena-R3X 4B": "Spestly/Athena-R3X-4B",
"Athena-R3 7B": "Spestly/Athena-R3-7B",
"Athena-3 3B": "Spestly/Athena-3-3B",
"Athena-3 7B": "Spestly/Athena-3-7B",
"Athena-3 14B": "Spestly/Athena-3-14B",
"Athena-2 1.5B": "Spestly/Athena-2-1.5B",
"Athena-1 3B": "Spestly/Athena-1-3B",
"Athena-1 7B": "Spestly/Athena-1-7B"
}
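# @spaces.GPU requests a ZeroGPU slice only for the duration of the decorated call;
# everything outside the decorated function runs on CPU.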
@spaces.GPU
def generate_response(model_id, conversation, user_message, max_length=512, temperature=0.7):
"""Generate response using ZeroGPU - all CUDA operations happen here"""
# Load model and tokenizer inside the GPU function
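    # (on ZeroGPU the GPU exists only while this decorated function runs, so the model
    # cannot be moved to CUDA at import time; note this also means the model is
    # re-loaded from cache on every request)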
print(f"π Loading {model_id}...")
start_time = time.time()
tokenizer = AutoTokenizer.from_pretrained(model_id)
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
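    # float16 halves memory use vs. float32; device_map="auto" lets accelerate place the
    # weights on whatever GPU ZeroGPU has allocated for this call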
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.float16,
device_map="auto",
trust_remote_code=True
)
load_time = time.time() - start_time
print(f"β
Model loaded in {load_time:.2f}s")
# Build conversation history
conversation_history = []
for user_msg, assistant_msg in conversation:
if user_msg:
conversation_history.append(f"User: {user_msg}")
if assistant_msg:
conversation_history.append(f"Athena: {assistant_msg}")
# Add current user message
conversation_history.append(f"User: {user_message}")
conversation_history.append("Athena:")
# Create prompt
prompt = "\n".join(conversation_history)
# Tokenize and generate
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)  # keep inputs on the same device as the model
generation_start = time.time()
with torch.no_grad():
outputs = model.generate(
**inputs,
max_new_tokens=max_length,
temperature=temperature,
do_sample=True,
top_p=0.9,
pad_token_id=tokenizer.eos_token_id,
eos_token_id=tokenizer.eos_token_id
)
generation_time = time.time() - generation_start
# Decode response
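    # outputs[0] holds prompt + completion tokens; slicing from the prompt length keeps only the newly generated text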
response = tokenizer.decode(
outputs[0][inputs['input_ids'].shape[-1]:],
skip_special_tokens=True
).strip()
return response, load_time, generation_time
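# Hypothetical direct call (outside the Gradio UI), handy for quick local testing:
#   reply, t_load, t_gen = generate_response(MODELS["Athena-2 1.5B"], [], "Hello!", max_length=64)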
def chatbot(conversation, user_message, model_name, max_length=512, temperature=0.7):
    if conversation is None:
        conversation = []
    if not user_message.strip():
        return conversation, "", "Please enter a message"
# Get model ID
model_id = MODELS.get(model_name, MODELS["Athena-R3X 8B"])
try:
# Add user message to conversation
conversation.append([user_message, ""])
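        # gr.Chatbot history is a list of [user, assistant] pairs; the empty string is a
        # placeholder that is filled in below once generation finishes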
# Generate response using ZeroGPU
response, load_time, generation_time = generate_response(
model_id, conversation[:-1], user_message, max_length, temperature
)
# Update the conversation with the response
conversation[-1][1] = response
stats = f"β‘ Load: {load_time:.1f}s | Gen: {generation_time:.1f}s | Model: {model_name}"
return conversation, "", stats
except Exception as e:
error_msg = f"Error: {str(e)}"
if conversation:
conversation[-1][1] = error_msg
else:
conversation = [[user_message, error_msg]]
return conversation, "", f"β Error: {str(e)}"
def clear_chat():
return [], "", ""
# CSS for better styling
css = """
#chatbot {
height: 600px;
}
.message {
padding: 10px;
margin: 5px;
border-radius: 10px;
}
"""
# Create Gradio interface
with gr.Blocks(title="Athena Playground Chat", css=css) as demo:
gr.Markdown("# π Athena Playground Chat")
gr.Markdown("*Powered by HuggingFace ZeroGPU*")
with gr.Row():
with gr.Column(scale=1):
model_choice = gr.Dropdown(
label="π± Model",
choices=list(MODELS.keys()),
value="Athena-R3X 8B",
info="Select which Athena model to use"
)
max_length = gr.Slider(
32, 2048, value=512,
label="π Max Tokens",
info="Maximum number of tokens to generate"
)
temperature = gr.Slider(
0.1, 2.0, value=0.7,
label="π¨ Creativity",
info="Higher values = more creative responses"
)
            clear_btn = gr.Button("Clear Chat", variant="secondary")
with gr.Column(scale=3):
chat_history = gr.Chatbot(
elem_id="chatbot",
show_label=False,
)
user_input = gr.Textbox(
placeholder="Ask Athena anything...",
label="Your message",
lines=2,
max_lines=10
)
with gr.Row():
                submit_btn = gr.Button("Send", variant="primary")
stats_output = gr.Textbox(
label="Stats",
interactive=False,
show_label=False,
placeholder="Stats will appear here..."
)
# Event handlers
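    # Both the Send button and pressing Enter in the textbox run the same chatbot() handler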
submit_btn.click(
chatbot,
inputs=[chat_history, user_input, model_choice, max_length, temperature],
outputs=[chat_history, user_input, stats_output]
)
user_input.submit(
chatbot,
inputs=[chat_history, user_input, model_choice, max_length, temperature],
outputs=[chat_history, user_input, stats_output]
)
clear_btn.click(
clear_chat,
inputs=[],
outputs=[chat_history, user_input, stats_output]
)
if __name__ == "__main__":
    demo.launch()