Spestly committed on
Commit 82452fa · verified · 1 Parent(s): 89a6548

Create app.py

Files changed (1)
app.py +112 -0
app.py ADDED
@@ -0,0 +1,112 @@
+ import gradio as gr
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ import torch
+ import spaces
+ import time
+
+ # Models served in bfloat16 on an H200 (70 GB)
+ MODELS = {
+     "Athena-R3X 8B": "Spestly/Athena-R3X-8B",
+     "Athena-R3X 4B": "Spestly/Athena-R3X-4B",
+     "Athena-R3 7B": "Spestly/Athena-R3-7B",
+     "Athena-3 3B": "Spestly/Athena-3-3B",
+     "Athena-3 7B": "Spestly/Athena-3-7B",
+     "Athena-3 14B": "Spestly/Athena-3-14B",
+     "Athena-2 1.5B": "Spestly/Athena-2-1.5B",
+     "Athena-1 3B": "Spestly/Athena-1-3B",
+     "Athena-1 7B": "Spestly/Athena-1-7B"
+ }
+
+ DEFAULT_MODEL = "Spestly/Athena-R3X-8B"
+
+ # GPU-accelerated function
+ @spaces.GPU
+ def load_model(model_name):
+     model_id = MODELS.get(model_name, DEFAULT_MODEL)
+
+     print(f"🚀 Loading {model_id} on H200 GPU...")
+     start_time = time.time()
+
+     tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+     model = AutoModelForCausalLM.from_pretrained(
+         model_id,
+         torch_dtype=torch.bfloat16,
+         device_map="auto",
+         low_cpu_mem_usage=True
+     )
+
+     load_time = time.time() - start_time
+     print(f"✅ Model loaded in {load_time:.2f} seconds")
+     print(f"GPU Memory Allocated: {torch.cuda.memory_allocated()/1e9:.2f}GB")
+
+     return model, tokenizer
+
+ @spaces.GPU
+ def generate_text(prompt, model_name, max_length=512, temperature=0.7):
+     try:
+         # Note: the model is (re)loaded on every request
+         model, tokenizer = load_model(model_name)
+
+         inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
+
+         start_time = time.time()
+         with torch.no_grad():
+             outputs = model.generate(
+                 **inputs,
+                 max_new_tokens=max_length,
+                 temperature=temperature,
+                 do_sample=True,
+                 top_p=0.9
+             )
+         generation_time = time.time() - start_time
+
+         output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+         stats = f"""
+ ⚡ Generation completed in {generation_time:.2f}s
+ 💾 GPU Memory: {torch.cuda.memory_allocated()/1e9:.2f}GB allocated
+ 🌡️ Temperature: {temperature}
+ """
+
+         return output_text, stats
+
+     except Exception as e:
+         return f"❌ Error: {str(e)}", ""
+
+ with gr.Blocks(title="Athena Playground") as demo:
+     gr.Markdown("""# 🚀 Athena Playground""")
+
+     with gr.Row():
+         with gr.Column(scale=1):
+             model_choice = gr.Dropdown(
+                 label="Model",
+                 choices=list(MODELS.keys()),
+                 value="Athena-R3X 8B"
+             )
+             max_length = gr.Slider(32, 4096, value=512, label="Max Tokens")
+             temperature = gr.Slider(0.1, 2.0, value=0.7, label="Creativity")
+             gr.Markdown("**Note:** First load may take 1-2 minutes")
+             submit_btn = gr.Button("Generate", variant="primary")
+
+         with gr.Column(scale=3):
+             prompt = gr.Textbox(label="Your Prompt", lines=8, placeholder="Type your prompt here...")
+             output = gr.Textbox(label="Model Output", lines=12)
+             stats = gr.Textbox(label="Performance Stats", lines=3)
+
+     submit_btn.click(
+         generate_text,
+         inputs=[prompt, model_choice, max_length, temperature],
+         outputs=[output, stats]
+     )
+
+     gr.Examples(
+         examples=[
+             ["Explain the transformer architecture like I'm five"],
+             ["Write a poem about AI in the style of Shakespeare"],
+             ["Generate Python code for a convolutional neural network"]
+         ],
+         inputs=prompt
+     )
+
+ if __name__ == "__main__":
+     demo.launch()
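
One practical note on generate_text above: it calls load_model on every request, so each click pays the full checkpoint-load cost again. A minimal caching sketch (not part of this commit; it assumes the Space process stays warm between requests, and _model_cache / load_model_cached are illustrative names) could reuse already-loaded models:

# Sketch only: cache loaded models by checkpoint ID so repeat requests for
# the same model skip the reload. The single-slot eviction is naive and
# exists only to keep one ~bfloat16 checkpoint resident on a 70 GB card.
_model_cache = {}

def load_model_cached(model_name):
    model_id = MODELS.get(model_name, DEFAULT_MODEL)
    if model_id not in _model_cache:
        if _model_cache:
            # Drop the previously cached model before loading a new one
            _model_cache.clear()
            torch.cuda.empty_cache()
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            low_cpu_mem_usage=True,
        )
        _model_cache[model_id] = (model, tokenizer)
    return _model_cache[model_id]

Running app.py outside the Space would additionally need gradio, transformers, torch, and accelerate (required for device_map="auto") installed; the spaces import and its GPU decorator are only meaningful on Hugging Face Spaces hardware.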