Spestly committed on
Commit 77246c4 · verified · 1 Parent(s): 609287a

Update app.py

Files changed (1)
  1. app.py +83 -69
app.py CHANGED
@@ -1,10 +1,10 @@
 import gradio as gr
 from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
-import spaces
 import time
 
-# Full precision models for H200 70GB
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
 MODELS = {
     "Athena-R3X 8B": "Spestly/Athena-R3X-8B",
     "Athena-R3X 4B": "Spestly/Athena-R3X-4B",
@@ -17,65 +17,75 @@ MODELS = {
     "Athena-1 7B": "Spestly/Athena-1-7B"
 }
 
-DEFAULT_MODEL = "Spestly/Athena-R3X-8B"
+loaded_models = {}
+loaded_tokenizers = {}
 
-# GPU-accelerated function
-@spaces.GPU
 def load_model(model_name):
-    model_id = MODELS.get(model_name, DEFAULT_MODEL)
-
-    print(f"🚀 Loading {model_id} on H200 GPU...")
+    if model_name in loaded_models:
+        return loaded_models[model_name], loaded_tokenizers[model_name]
+
+    model_id = MODELS.get(model_name, MODELS["Athena-R3X 8B"])
+    print(f"🚀 Loading {model_id} on {device}...")
     start_time = time.time()
-
+
    tokenizer = AutoTokenizer.from_pretrained(model_id)
-
     model = AutoModelForCausalLM.from_pretrained(
         model_id,
         torch_dtype=torch.bfloat16,
-        device_map="auto",
-        low_cpu_mem_usage=True
+        device_map=None
     )
-
+    model.to(device)
+    model.eval()
+
     load_time = time.time() - start_time
-    print(f"✅ Model loaded in {load_time:.2f} seconds")
-    print(f"GPU Memory Allocated: {torch.cuda.memory_allocated()/1e9:.2f}GB")
-
+    print(f"✅ Model loaded in {load_time:.2f}s, GPU mem: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
+
+    loaded_models[model_name] = model
+    loaded_tokenizers[model_name] = tokenizer
     return model, tokenizer
 
-@spaces.GPU
-def generate_text(prompt, model_name, max_length=512, temperature=0.7):
-    try:
-        model, tokenizer = load_model(model_name)
-
-        inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
-
-        start_time = time.time()
-        with torch.no_grad():
-            outputs = model.generate(
-                **inputs,
-                max_new_tokens=max_length,
-                temperature=temperature,
-                do_sample=True,
-                top_p=0.9
-            )
-        generation_time = time.time() - start_time
-
-        output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
-
-        stats = f"""
-        ⚡ Generation completed in {generation_time:.2f}s
-        💾 GPU Memory: {torch.cuda.memory_allocated()/1e9:.2f}GB allocated
-        🌡️ Temperature: {temperature}
-        """
-
-        return output_text, stats
-
-    except Exception as e:
-        return f"❌ Error: {str(e)}", ""
-
-with gr.Blocks(title="Athena Playground") as demo:
-    gr.Markdown("""# 🚀 Athena Playground""")
-
+def chatbot(conversation, user_message, model_name, max_length=512, temperature=0.7):
+    if conversation is None:
+        conversation = []
+    model, tokenizer = load_model(model_name)
+
+    # Append user message to conversation
+    conversation.append(("User", user_message))
+
+    # Build prompt from conversation history (simple concatenation)
+    prompt = ""
+    for speaker, text in conversation:
+        if speaker == "User":
+            prompt += f"User: {text}\n"
+        else:
+            prompt += f"Athena: {text}\n"
+    prompt += "Athena:"
+
+    inputs = tokenizer(prompt, return_tensors="pt").to(device)
+
+    start_time = time.time()
+    with torch.no_grad():
+        outputs = model.generate(
+            **inputs,
+            max_new_tokens=max_length,
+            temperature=temperature,
+            do_sample=True,
+            top_p=0.9,
+            pad_token_id=tokenizer.eos_token_id
+        )
+    generation_time = time.time() - start_time
+
+    output_text = tokenizer.decode(outputs[0][inputs['input_ids'].shape[-1]:], skip_special_tokens=True).strip()
+
+    conversation.append(("Athena", output_text))
+
+    stats = f"⚡ Generated in {generation_time:.2f}s | GPU mem: {torch.cuda.memory_allocated()/1e9:.2f} GB | Temp: {temperature}"
+
+    return conversation, "", stats
+
+with gr.Blocks(title="Athena Playground Chat") as demo:
+    gr.Markdown("# 🚀 Athena Playground Chat")
+
     with gr.Row():
         with gr.Column(scale=1):
             model_choice = gr.Dropdown(
@@ -85,28 +95,32 @@ with gr.Blocks(title="Athena Playground") as demo:
             )
             max_length = gr.Slider(32, 4096, value=512, label="Max Tokens")
             temperature = gr.Slider(0.1, 2.0, value=0.7, label="Creativity")
-            gr.Markdown("**Note:** First load may take 1-2 minutes")
-            submit_btn = gr.Button("Generate", variant="primary")
-
+            clear_btn = gr.Button("Clear Chat")
+
         with gr.Column(scale=3):
-            prompt = gr.Textbox(label="Your Prompt", lines=8, placeholder="Type your prompt here...")
-            output = gr.Textbox(label="Model Output", lines=12)
-            stats = gr.Textbox(label="Performance Stats", lines=3)
-
+            chat_history = gr.Chatbot(elem_id="chatbot").style(height=600)
+            user_input = gr.Textbox(
+                placeholder="Ask Athena anything...",
+                label="Your message",
+                lines=2
+            )
+            submit_btn = gr.Button("Send")
+
+    def clear_chat():
+        return [], "", ""
+
     submit_btn.click(
-        generate_text,
-        inputs=[prompt, model_choice, max_length, temperature],
-        outputs=[output, stats]
+        chatbot,
+        inputs=[chat_history, user_input, model_choice, max_length, temperature],
+        outputs=[chat_history, user_input, gr.Textbox(label="Stats")],
+        queue=True
     )
-
-    gr.Examples(
-        examples=[
-            ["Explain the transformer architecture like I'm five"],
-            ["Write a poem about AI in the style of Shakespeare"],
-            ["Generate Python code for a convolutional neural network"]
-        ],
-        inputs=prompt
+
+    clear_btn.click(
+        clear_chat,
+        inputs=[],
+        outputs=[chat_history, user_input, gr.Textbox(label="Stats")]
    )
 
 if __name__ == "__main__":
-    demo.launch()
+    demo.launch()
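
A minimal usage sketch of the new conversation-based flow, exercised outside the Gradio UI. This assumes the updated app.py is importable as a module named `app` and that a suitable GPU or CPU environment is available; the module name and prompts below are illustrative, not part of the commit.

import app  # the updated app.py; importing builds the Blocks UI but does not launch it

history = []  # conversation state: a list of (speaker, text) tuples, as chatbot() expects

# First turn: load_model() downloads and caches the model, so later turns reuse it
history, _, stats = app.chatbot(history, "Explain the transformer architecture", "Athena-R3X 8B")
print(stats)        # e.g. "⚡ Generated in ...s | GPU mem: ... GB | Temp: 0.7"
print(history[-1])  # ("Athena", "<model reply>")

# Second turn reuses the cached model and the accumulated history
history, _, stats = app.chatbot(history, "Now summarise that in one sentence", "Athena-R3X 8B")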