Spestly committed on
Commit
9c405c2
·
verified ·
1 Parent(s): 50b71d4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +9 -43
app.py CHANGED
@@ -20,56 +20,37 @@ MODELS = {
20
  @spaces.GPU
21
  def generate_response(model_id, conversation, user_message, max_length=512, temperature=0.7):
22
  """Generate response using ZeroGPU - all CUDA operations happen here"""
23
-
24
- # Load model and tokenizer inside the GPU function
25
  print(f"πŸš€ Loading {model_id}...")
26
  start_time = time.time()
27
-
28
  tokenizer = AutoTokenizer.from_pretrained(model_id)
29
  if tokenizer.pad_token is None:
30
  tokenizer.pad_token = tokenizer.eos_token
31
-
32
  model = AutoModelForCausalLM.from_pretrained(
33
  model_id,
34
  torch_dtype=torch.float16,
35
  device_map="auto",
36
  trust_remote_code=True
37
  )
38
-
39
  load_time = time.time() - start_time
40
  print(f"βœ… Model loaded in {load_time:.2f}s")
41
-
42
  # Build messages in proper chat format
43
  messages = []
44
-
45
- # Add system prompt first
46
  system_prompt = "You are Athena, a helpful, harmless, and honest AI assistant. You provide clear, accurate, and concise responses to user questions. You are knowledgeable across many domains and always aim to be respectful and helpful. You are finetuned by Aayan Mishra"
47
  messages.append({"role": "system", "content": system_prompt})
48
-
49
- # Add conversation history
50
  for user_msg, assistant_msg in conversation:
51
  if user_msg:
52
  messages.append({"role": "user", "content": user_msg})
53
  if assistant_msg:
54
  messages.append({"role": "assistant", "content": assistant_msg})
55
-
56
- # Add current user message
57
  messages.append({"role": "user", "content": user_message})
58
-
59
- # Apply chat template
60
  prompt = tokenizer.apply_chat_template(
61
  messages,
62
  tokenize=False,
63
  add_generation_prompt=True
64
  )
65
-
66
- # Tokenize and move to GPU
67
  inputs = tokenizer(prompt, return_tensors="pt")
68
-
69
- # Move inputs to the same device as the model
70
  device = next(model.parameters()).device
71
  inputs = {k: v.to(device) for k, v in inputs.items()}
72
-
73
  generation_start = time.time()
74
  with torch.no_grad():
75
  outputs = model.generate(
@@ -81,38 +62,26 @@ def generate_response(model_id, conversation, user_message, max_length=512, temp
81
  pad_token_id=tokenizer.eos_token_id,
82
  eos_token_id=tokenizer.eos_token_id
83
  )
84
-
85
  generation_time = time.time() - generation_start
86
-
87
- # Decode response
88
  response = tokenizer.decode(
89
  outputs[0][inputs['input_ids'].shape[-1]:],
90
  skip_special_tokens=True
91
  ).strip()
92
-
93
  return response, load_time, generation_time
94
 
95
  def respond(message, history, model_name, max_length, temperature):
96
  """Main function for ChatInterface - simplified signature"""
97
  if not message.strip():
98
  return "Please enter a message"
99
-
100
- # Get model ID
101
  model_id = MODELS.get(model_name, MODELS["Athena-R3X 8B"])
102
-
103
  try:
104
- # Generate response using ZeroGPU
105
  response, load_time, generation_time = generate_response(
106
  model_id, history, message, max_length, temperature
107
  )
108
-
109
- # ChatInterface expects a generator for streaming or just return the response
110
  return response
111
-
112
  except Exception as e:
113
  return f"Error: {str(e)}"
114
 
115
- # CSS for better styling
116
  css = """
117
  .message {
118
  padding: 10px;
@@ -123,12 +92,9 @@ css = """
123
 
124
  theme = gr.themes.Monochrome()
125
 
126
- # Create ChatInterface
127
  with gr.Blocks(title="Athena Playground Chat", css=css, theme=theme) as demo:
128
  gr.Markdown("# πŸš€ Athena Playground Chat")
129
  gr.Markdown("*Powered by HuggingFace ZeroGPU*")
130
-
131
- # Additional inputs for configuration
132
  with gr.Row():
133
  with gr.Column(scale=1):
134
  model_choice = gr.Dropdown(
@@ -149,8 +115,6 @@ with gr.Blocks(title="Athena Playground Chat", css=css, theme=theme) as demo:
149
  label="🎨 Creativity",
150
  info="Higher values = more creative responses"
151
  )
152
-
153
- # Create the ChatInterface
154
  chat_interface = gr.ChatInterface(
155
  fn=respond,
156
  additional_inputs=[model_choice, max_length, temperature],
@@ -158,18 +122,20 @@ with gr.Blocks(title="Athena Playground Chat", css=css, theme=theme) as demo:
158
  description="Ask Athena anything!",
159
  theme="soft",
160
  examples=[
161
- "Hello! How are you?",
162
- "What can you help me with?",
163
- "Tell me about artificial intelligence",
164
- "Write a short poem about space"
165
  ],
166
  cache_examples=False,
167
  chatbot=gr.Chatbot(
168
  height=500,
169
  placeholder="Start chatting with Athena...",
170
- show_share_button=False
171
- )
 
 
172
  )
173
 
174
  if __name__ == "__main__":
175
- demo.launch()
 
20
  @spaces.GPU
21
  def generate_response(model_id, conversation, user_message, max_length=512, temperature=0.7):
22
  """Generate response using ZeroGPU - all CUDA operations happen here"""
 
 
23
  print(f"πŸš€ Loading {model_id}...")
24
  start_time = time.time()
 
25
  tokenizer = AutoTokenizer.from_pretrained(model_id)
26
  if tokenizer.pad_token is None:
27
  tokenizer.pad_token = tokenizer.eos_token
 
28
  model = AutoModelForCausalLM.from_pretrained(
29
  model_id,
30
  torch_dtype=torch.float16,
31
  device_map="auto",
32
  trust_remote_code=True
33
  )
 
34
  load_time = time.time() - start_time
35
  print(f"βœ… Model loaded in {load_time:.2f}s")
 
36
  # Build messages in proper chat format
37
  messages = []
 
 
38
  system_prompt = "You are Athena, a helpful, harmless, and honest AI assistant. You provide clear, accurate, and concise responses to user questions. You are knowledgeable across many domains and always aim to be respectful and helpful. You are finetuned by Aayan Mishra"
39
  messages.append({"role": "system", "content": system_prompt})
 
 
40
  for user_msg, assistant_msg in conversation:
41
  if user_msg:
42
  messages.append({"role": "user", "content": user_msg})
43
  if assistant_msg:
44
  messages.append({"role": "assistant", "content": assistant_msg})
 
 
45
  messages.append({"role": "user", "content": user_message})
 
 
46
  prompt = tokenizer.apply_chat_template(
47
  messages,
48
  tokenize=False,
49
  add_generation_prompt=True
50
  )
 
 
51
  inputs = tokenizer(prompt, return_tensors="pt")
 
 
52
  device = next(model.parameters()).device
53
  inputs = {k: v.to(device) for k, v in inputs.items()}
 
54
  generation_start = time.time()
55
  with torch.no_grad():
56
  outputs = model.generate(
 
62
  pad_token_id=tokenizer.eos_token_id,
63
  eos_token_id=tokenizer.eos_token_id
64
  )
 
65
  generation_time = time.time() - generation_start
 
 
66
  response = tokenizer.decode(
67
  outputs[0][inputs['input_ids'].shape[-1]:],
68
  skip_special_tokens=True
69
  ).strip()
 
70
  return response, load_time, generation_time
71
 
72
def respond(message, history, model_name, max_length, temperature):
    """Chat callback used by gr.ChatInterface.

    Resolves the dropdown selection to a model id, delegates generation
    to ``generate_response``, and returns only the reply text.  Failures
    are converted into an error string so the chat UI never crashes.

    Args:
        message: The user's latest input text.
        history: Prior (user, assistant) turns supplied by the interface.
        model_name: Display name chosen in the model dropdown.
        max_length: Generation length budget, forwarded unchanged.
        temperature: Sampling temperature, forwarded unchanged.

    Returns:
        The assistant's reply, a prompt-for-input notice, or an
        ``Error: ...`` string describing the exception.
    """
    # Guard clause: whitespace-only input gets a gentle nudge instead of a GPU call.
    if message.strip() == "":
        return "Please enter a message"

    # Unknown dropdown values fall back to the default model.
    resolved_id = MODELS.get(model_name, MODELS["Athena-R3X 8B"])

    try:
        # Timings are returned by the generator but unused here.
        reply, _load_seconds, _gen_seconds = generate_response(
            resolved_id, history, message, max_length, temperature
        )
    except Exception as e:  # surface any failure to the chat window
        return f"Error: {str(e)}"
    return reply
84
 
 
85
  css = """
86
  .message {
87
  padding: 10px;
 
92
 
93
  theme = gr.themes.Monochrome()
94
 
 
95
  with gr.Blocks(title="Athena Playground Chat", css=css, theme=theme) as demo:
96
  gr.Markdown("# πŸš€ Athena Playground Chat")
97
  gr.Markdown("*Powered by HuggingFace ZeroGPU*")
 
 
98
  with gr.Row():
99
  with gr.Column(scale=1):
100
  model_choice = gr.Dropdown(
 
115
  label="🎨 Creativity",
116
  info="Higher values = more creative responses"
117
  )
 
 
118
  chat_interface = gr.ChatInterface(
119
  fn=respond,
120
  additional_inputs=[model_choice, max_length, temperature],
 
122
  description="Ask Athena anything!",
123
  theme="soft",
124
  examples=[
125
+ ["Hello! How are you?", "Athena-R3X 8B", 512, 0.7],
126
+ ["What can you help me with?", "Athena-R3X 8B", 512, 0.7],
127
+ ["Tell me about artificial intelligence", "Athena-R3X 8B", 512, 0.7],
128
+ ["Write a short poem about space", "Athena-R3X 8B", 512, 0.7]
129
  ],
130
  cache_examples=False,
131
  chatbot=gr.Chatbot(
132
  height=500,
133
  placeholder="Start chatting with Athena...",
134
+ show_share_button=False,
135
+ type="messages"
136
+ ),
137
+ type="messages"
138
  )
139
 
140
  if __name__ == "__main__":
141
+ demo.launch()