Ais
commited on
Update app/main.py
Browse files- app/main.py +257 -150
app/main.py
CHANGED
@@ -8,7 +8,7 @@ from starlette.middleware.cors import CORSMiddleware
|
|
8 |
import re
|
9 |
|
10 |
# === Setup FastAPI ===
|
11 |
-
app = FastAPI(title="Apollo AI Backend", version="2.
|
12 |
|
13 |
# === CORS ===
|
14 |
app.add_middleware(
|
@@ -25,10 +25,12 @@ BASE_MODEL = "Qwen/Qwen2-0.5B-Instruct"
|
|
25 |
ADAPTER_PATH = "adapter"
|
26 |
|
27 |
# === Load Model ===
|
28 |
-
print("🔧 Loading tokenizer...")
|
29 |
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
|
|
|
|
|
30 |
|
31 |
-
print("🧠 Loading base model...")
|
32 |
base_model = AutoModelForCausalLM.from_pretrained(
|
33 |
BASE_MODEL,
|
34 |
trust_remote_code=True,
|
@@ -36,48 +38,140 @@ base_model = AutoModelForCausalLM.from_pretrained(
|
|
36 |
device_map="cpu"
|
37 |
)
|
38 |
|
39 |
-
print("🔗 Applying LoRA adapter...")
|
40 |
model = PeftModel.from_pretrained(base_model, ADAPTER_PATH)
|
41 |
model.eval()
|
42 |
|
43 |
-
print("✅
|
44 |
|
45 |
-
def
|
46 |
"""
|
47 |
-
|
|
|
48 |
"""
|
49 |
if is_force_mode:
|
50 |
-
return """You are Apollo AI
|
51 |
-
|
52 |
-
|
53 |
-
-
|
54 |
-
-
|
55 |
-
-
|
56 |
-
-
|
57 |
|
58 |
Example:
|
59 |
-
User: "
|
60 |
-
You: "Use
|
61 |
"""
|
62 |
else:
|
63 |
-
return """You are Apollo AI
|
64 |
-
|
65 |
-
|
66 |
-
-
|
67 |
-
-
|
68 |
-
-
|
69 |
-
- Make students
|
70 |
-
- Never give full working code - always leave something for them to figure out
|
71 |
|
72 |
Example:
|
73 |
-
User: "
|
74 |
-
You: "
|
75 |
"""
|
76 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
77 |
def extract_clean_answer(full_response: str, formatted_prompt: str, user_message: str, is_force_mode: bool) -> str:
|
78 |
"""
|
79 |
-
|
80 |
-
|
81 |
"""
|
82 |
if not full_response or len(full_response.strip()) < 5:
|
83 |
return "I apologize, but I couldn't generate a response. Please try again."
|
@@ -85,126 +179,135 @@ def extract_clean_answer(full_response: str, formatted_prompt: str, user_message
|
|
85 |
print(f"🔍 Raw response length: {len(full_response)}")
|
86 |
print(f"🔍 Mode: {'FORCE' if is_force_mode else 'MENTOR'}")
|
87 |
|
88 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
89 |
generated_text = full_response
|
90 |
if formatted_prompt in full_response:
|
91 |
parts = full_response.split(formatted_prompt)
|
92 |
if len(parts) > 1:
|
93 |
generated_text = parts[-1]
|
94 |
|
95 |
-
# Step 2: Extract assistant content
|
96 |
assistant_content = generated_text
|
97 |
|
98 |
-
#
|
99 |
if "<|im_start|>assistant" in generated_text:
|
100 |
assistant_parts = generated_text.split("<|im_start|>assistant")
|
101 |
if len(assistant_parts) > 1:
|
102 |
assistant_content = assistant_parts[-1]
|
103 |
-
# Remove end marker if present
|
104 |
if "<|im_end|>" in assistant_content:
|
105 |
assistant_content = assistant_content.split("<|im_end|>")[0]
|
106 |
|
107 |
-
#
|
108 |
-
elif "assistant:" in generated_text.lower():
|
109 |
-
parts = generated_text.lower().split("assistant:")
|
110 |
-
if len(parts) > 1:
|
111 |
-
# Get the content after the last "assistant:" occurrence
|
112 |
-
assistant_content = generated_text[generated_text.lower().rfind("assistant:") + 10:]
|
113 |
-
|
114 |
-
# Step 3: POWERFUL CLEANING - Remove all template artifacts
|
115 |
clean_text = assistant_content.strip()
|
116 |
|
117 |
-
# Remove
|
118 |
clean_text = re.sub(r'<\|im_start\|>', '', clean_text)
|
119 |
clean_text = re.sub(r'<\|im_end\|>', '', clean_text)
|
120 |
clean_text = re.sub(r'<\|endoftext\|>', '', clean_text)
|
121 |
|
122 |
-
# Remove role prefixes
|
123 |
clean_text = re.sub(r'^(system|user|assistant):\s*', '', clean_text, flags=re.MULTILINE)
|
124 |
clean_text = re.sub(r'\n(system|user|assistant):\s*', '\n', clean_text, flags=re.MULTILINE)
|
125 |
|
126 |
-
#
|
127 |
-
clean_text = re.sub(r'
|
128 |
-
clean_text = re.sub(r'Guidelines?:.*?\n', '', clean_text, flags=re.IGNORECASE)
|
129 |
-
clean_text = re.sub(r'Example:.*?\n', '', clean_text, flags=re.IGNORECASE)
|
130 |
-
|
131 |
-
# Clean up excessive whitespace but preserve formatting
|
132 |
-
clean_text = re.sub(r'\n{4,}', '\n\n\n', clean_text)
|
133 |
clean_text = clean_text.strip()
|
134 |
|
135 |
-
# Step 4:
|
136 |
if not clean_text or len(clean_text) < 10:
|
137 |
-
# Special handling for simple math questions
|
138 |
-
if user_message and any(term in user_message.lower() for term in ['2+2', '2 + 2', 'calculate', 'what is']):
|
139 |
-
if '2+2' in user_message.lower() or '2 + 2' in user_message.lower():
|
140 |
-
return "4" if is_force_mode else "What do you think 2 + 2 equals? Try calculating it step by step."
|
141 |
-
|
142 |
-
# Generic fallback based on mode
|
143 |
if is_force_mode:
|
144 |
-
return "
|
145 |
else:
|
146 |
-
return "
|
147 |
|
148 |
-
# Step 5:
|
149 |
-
if
|
150 |
-
|
151 |
-
if len(
|
152 |
-
|
153 |
-
if len(sentences) > 3:
|
154 |
-
clean_text = '. '.join(sentences[:3]) + '.'
|
155 |
-
else:
|
156 |
-
# For mentor mode, ensure it's not giving away complete solutions
|
157 |
-
# Check if response contains complete code without guidance
|
158 |
-
code_block_pattern = r'```[\w]*\n(.*?)\n```'
|
159 |
-
code_blocks = re.findall(code_block_pattern, clean_text, re.DOTALL)
|
160 |
-
|
161 |
-
for code in code_blocks:
|
162 |
-
# If code looks complete and there's no guidance, add mentor touch
|
163 |
-
if len(code.strip()) > 50 and 'try' not in clean_text.lower() and '?' not in clean_text:
|
164 |
-
clean_text += "\n\nTry implementing this step by step. What do you think each part does?"
|
165 |
|
166 |
print(f"🧹 Final cleaned answer length: {len(clean_text)}")
|
167 |
-
print(f"🧹 Preview: {clean_text[:150]}...")
|
168 |
|
169 |
return clean_text
|
170 |
|
171 |
-
def generate_response(messages: list, is_force_mode: bool = False, max_tokens: int =
|
172 |
"""
|
173 |
-
|
174 |
"""
|
175 |
try:
|
176 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
177 |
clean_messages = []
|
178 |
|
179 |
-
# Add
|
180 |
-
system_prompt =
|
181 |
clean_messages.append({
|
182 |
"role": "system",
|
183 |
"content": system_prompt
|
184 |
})
|
185 |
|
186 |
-
# Add
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
|
|
194 |
|
195 |
-
print(f"🔍 Processing {len(clean_messages)} messages in {'FORCE' if is_force_mode else 'MENTOR'} mode")
|
196 |
|
197 |
-
#
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
|
|
|
|
|
|
|
|
|
|
203 |
|
204 |
-
# Tokenize with
|
205 |
-
inputs = tokenizer(
|
|
|
|
|
|
|
|
|
|
|
206 |
|
207 |
-
#
|
208 |
generation_params = {
|
209 |
"input_ids": inputs.input_ids,
|
210 |
"attention_mask": inputs.attention_mask,
|
@@ -214,29 +317,29 @@ def generate_response(messages: list, is_force_mode: bool = False, max_tokens: i
|
|
214 |
}
|
215 |
|
216 |
if is_force_mode:
|
217 |
-
# Force mode:
|
218 |
generation_params.update({
|
219 |
-
"max_new_tokens": min(max_tokens,
|
220 |
-
"temperature": 0.
|
221 |
-
"top_p": 0.
|
222 |
-
"
|
223 |
-
"
|
224 |
})
|
225 |
else:
|
226 |
-
# Mentor mode:
|
227 |
generation_params.update({
|
228 |
-
"max_new_tokens": min(max_tokens,
|
229 |
-
"temperature": 0.
|
230 |
-
"top_p": 0.
|
231 |
-
"
|
232 |
-
"
|
233 |
})
|
234 |
|
235 |
-
# Generate
|
236 |
with torch.no_grad():
|
237 |
outputs = model.generate(**generation_params)
|
238 |
|
239 |
-
# Decode
|
240 |
full_response = tokenizer.decode(outputs[0], skip_special_tokens=False)
|
241 |
|
242 |
# Extract user message for context
|
@@ -246,33 +349,39 @@ def generate_response(messages: list, is_force_mode: bool = False, max_tokens: i
|
|
246 |
user_message = msg.get("content", "")
|
247 |
break
|
248 |
|
249 |
-
# Clean and
|
250 |
clean_answer = extract_clean_answer(full_response, formatted_prompt, user_message, is_force_mode)
|
251 |
|
252 |
return clean_answer
|
253 |
|
254 |
except Exception as e:
|
255 |
-
print(f"❌ Generation error: {e}")
|
256 |
-
mode_text = "direct answer" if is_force_mode else "guided learning
|
257 |
-
return f"I encountered an error
|
258 |
|
259 |
# === Routes ===
|
260 |
@app.get("/")
|
261 |
def root():
|
262 |
return {
|
263 |
-
"message": "🤖 Apollo AI Backend v2.
|
264 |
-
"model": "Qwen2-0.5B-Instruct with LoRA",
|
265 |
"status": "ready",
|
266 |
-
"
|
|
|
267 |
"modes": {
|
268 |
-
"mentor": "Guides learning with questions
|
269 |
-
"force": "Provides direct answers
|
270 |
}
|
271 |
}
|
272 |
|
273 |
@app.get("/health")
|
274 |
def health():
|
275 |
-
return {
|
|
|
|
|
|
|
|
|
|
|
276 |
|
277 |
@app.post("/v1/chat/completions")
|
278 |
async def chat_completions(request: Request):
|
@@ -295,11 +404,11 @@ async def chat_completions(request: Request):
|
|
295 |
try:
|
296 |
body = await request.json()
|
297 |
messages = body.get("messages", [])
|
298 |
-
max_tokens = body.get("max_tokens",
|
299 |
-
temperature = body.get("temperature", 0.
|
300 |
|
301 |
-
#
|
302 |
-
is_force_mode = body.get("force_mode", False)
|
303 |
|
304 |
if not messages or not isinstance(messages, list):
|
305 |
raise ValueError("Messages field is required and must be a list")
|
@@ -310,7 +419,7 @@ async def chat_completions(request: Request):
|
|
310 |
content={"error": f"Invalid request body: {str(e)}"}
|
311 |
)
|
312 |
|
313 |
-
# Validate messages
|
314 |
for i, msg in enumerate(messages):
|
315 |
if not isinstance(msg, dict) or "role" not in msg or "content" not in msg:
|
316 |
return JSONResponse(
|
@@ -319,20 +428,19 @@ async def chat_completions(request: Request):
|
|
319 |
)
|
320 |
|
321 |
try:
|
322 |
-
|
323 |
-
print(f"📥 Processing {len(messages)} messages in {'FORCE' if is_force_mode else 'MENTOR'} mode")
|
324 |
print(f"📊 Settings: max_tokens={max_tokens}, temperature={temperature}")
|
325 |
|
326 |
response_content = generate_response(
|
327 |
messages=messages,
|
328 |
is_force_mode=is_force_mode,
|
329 |
-
max_tokens=
|
330 |
-
temperature=
|
331 |
)
|
332 |
|
333 |
# Return OpenAI-compatible response
|
334 |
return {
|
335 |
-
"id": f"chatcmpl-apollo-{hash(str(messages)) % 10000}",
|
336 |
"object": "chat.completion",
|
337 |
"created": int(torch.tensor(0).item()),
|
338 |
"model": f"qwen2-0.5b-{'force' if is_force_mode else 'mentor'}-mode",
|
@@ -351,7 +459,8 @@ async def chat_completions(request: Request):
|
|
351 |
"completion_tokens": len(response_content),
|
352 |
"total_tokens": len(str(messages)) + len(response_content)
|
353 |
},
|
354 |
-
"apollo_mode": "force" if is_force_mode else "mentor"
|
|
|
355 |
}
|
356 |
|
357 |
except Exception as e:
|
@@ -361,23 +470,21 @@ async def chat_completions(request: Request):
|
|
361 |
content={"error": f"Internal server error: {str(e)}"}
|
362 |
)
|
363 |
|
364 |
-
# === Test endpoint for
|
365 |
@app.post("/test")
|
366 |
async def test_generation(request: Request):
|
367 |
-
"""Test endpoint for debugging both modes"""
|
368 |
try:
|
369 |
body = await request.json()
|
370 |
prompt = body.get("prompt", "How do I print hello world in Python?")
|
371 |
-
max_tokens = body.get("max_tokens", 300)
|
372 |
test_both_modes = body.get("test_both_modes", True)
|
373 |
|
374 |
results = {}
|
375 |
|
376 |
# Test mentor mode
|
377 |
-
messages_mentor = [
|
378 |
-
|
379 |
-
]
|
380 |
-
mentor_response = generate_response(messages_mentor, is_force_mode=False, max_tokens=max_tokens, temperature=0.7)
|
381 |
results["mentor_mode"] = {
|
382 |
"response": mentor_response,
|
383 |
"length": len(mentor_response),
|
@@ -386,10 +493,8 @@ async def test_generation(request: Request):
|
|
386 |
|
387 |
if test_both_modes:
|
388 |
# Test force mode
|
389 |
-
messages_force = [
|
390 |
-
|
391 |
-
]
|
392 |
-
force_response = generate_response(messages_force, is_force_mode=True, max_tokens=max_tokens, temperature=0.3)
|
393 |
results["force_mode"] = {
|
394 |
"response": force_response,
|
395 |
"length": len(force_response),
|
@@ -399,8 +504,9 @@ async def test_generation(request: Request):
|
|
399 |
return {
|
400 |
"prompt": prompt,
|
401 |
"results": results,
|
402 |
-
"
|
403 |
-
"
|
|
|
404 |
}
|
405 |
|
406 |
except Exception as e:
|
@@ -411,7 +517,8 @@ async def test_generation(request: Request):
|
|
411 |
|
412 |
if __name__ == "__main__":
|
413 |
import uvicorn
|
414 |
-
print("🚀 Starting Apollo AI Backend v2.0...")
|
415 |
-
print("
|
416 |
-
print("
|
|
|
417 |
uvicorn.run(app, host="0.0.0.0", port=7860)
|
|
|
8 |
import re
|
9 |
|
10 |
# === Setup FastAPI ===
|
11 |
+
app = FastAPI(title="Apollo AI Backend - Qwen2-0.5B Optimized", version="2.1.0")
|
12 |
|
13 |
# === CORS ===
|
14 |
app.add_middleware(
|
|
|
25 |
ADAPTER_PATH = "adapter"
|
26 |
|
27 |
# === Load Model ===
|
28 |
+
print("🔧 Loading tokenizer for Qwen2-0.5B...")
|
29 |
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
|
30 |
+
if tokenizer.pad_token is None:
|
31 |
+
tokenizer.pad_token = tokenizer.eos_token
|
32 |
|
33 |
+
print("🧠 Loading Qwen2-0.5B base model...")
|
34 |
base_model = AutoModelForCausalLM.from_pretrained(
|
35 |
BASE_MODEL,
|
36 |
trust_remote_code=True,
|
|
|
38 |
device_map="cpu"
|
39 |
)
|
40 |
|
41 |
+
print("🔗 Applying LoRA adapter to Qwen2-0.5B...")
|
42 |
model = PeftModel.from_pretrained(base_model, ADAPTER_PATH)
|
43 |
model.eval()
|
44 |
|
45 |
+
print("✅ Qwen2-0.5B model ready with optimized settings!")
|
46 |
|
47 |
+
def get_simple_system_prompt(is_force_mode: bool) -> str:
|
48 |
"""
|
49 |
+
SIMPLIFIED system prompts optimized for Qwen2-0.5B's 500M parameters.
|
50 |
+
Shorter, clearer instructions that small models can follow better.
|
51 |
"""
|
52 |
if is_force_mode:
|
53 |
+
return """You are Apollo AI. Give direct, complete answers.
|
54 |
+
|
55 |
+
Rules:
|
56 |
+
- Provide full working code
|
57 |
+
- Be concise, max 3 sentences explanation
|
58 |
+
- Never ask questions back
|
59 |
+
- Give complete solutions immediately
|
60 |
|
61 |
Example:
|
62 |
+
User: "print hello world python"
|
63 |
+
You: "Use print('Hello World'). This outputs text to console."
|
64 |
"""
|
65 |
else:
|
66 |
+
return """You are Apollo AI tutor. Guide learning with questions.
|
67 |
+
|
68 |
+
Rules:
|
69 |
+
- Ask guiding questions instead of giving answers
|
70 |
+
- Never give complete working code
|
71 |
+
- Use hints and partial examples only
|
72 |
+
- Make students think and discover
|
|
|
73 |
|
74 |
Example:
|
75 |
+
User: "print hello world python"
|
76 |
+
You: "What function displays text in Python? Try looking up output functions."
|
77 |
"""
|
78 |
|
79 |
+
def create_simple_force_responses(user_message: str) -> str:
|
80 |
+
"""
|
81 |
+
Pre-defined responses for common questions in force mode.
|
82 |
+
This helps the 0.5B model give consistent direct answers.
|
83 |
+
"""
|
84 |
+
user_lower = user_message.lower()
|
85 |
+
|
86 |
+
# Python print
|
87 |
+
if 'print' in user_lower and ('hello' in user_lower or 'world' in user_lower):
|
88 |
+
return 'Use `print("Hello World")`. This function outputs text to the console.'
|
89 |
+
|
90 |
+
# Basic math
|
91 |
+
if '2+2' in user_lower or '2 + 2' in user_lower:
|
92 |
+
return '2 + 2 = 4. Addition combines two numbers to get their sum.'
|
93 |
+
|
94 |
+
# Python variable
|
95 |
+
if 'variable' in user_lower and ('python' in user_lower or 'create' in user_lower):
|
96 |
+
return 'Use `name = "value"`. Variables store data: `x = 5` or `text = "hello"`.'
|
97 |
+
|
98 |
+
# Python list
|
99 |
+
if 'list' in user_lower and 'python' in user_lower and 'create' in user_lower:
|
100 |
+
return 'Use square brackets: `my_list = [1, 2, 3]`. Lists store multiple items.'
|
101 |
+
|
102 |
+
# Python function
|
103 |
+
if 'function' in user_lower and 'python' in user_lower and ('create' in user_lower or 'define' in user_lower):
|
104 |
+
return '''Use def keyword:
|
105 |
+
```python
|
106 |
+
def my_function():
|
107 |
+
return "Hello"
|
108 |
+
```
|
109 |
+
Functions are reusable code blocks.'''
|
110 |
+
|
111 |
+
# Calculator
|
112 |
+
if 'calculator' in user_lower and ('create' in user_lower or 'make' in user_lower or 'build' in user_lower):
|
113 |
+
return '''Here's a simple calculator:
|
114 |
+
```python
|
115 |
+
a = float(input("First number: "))
|
116 |
+
b = float(input("Second number: "))
|
117 |
+
op = input("Operator (+,-,*,/): ")
|
118 |
+
if op == '+': print(a + b)
|
119 |
+
elif op == '-': print(a - b)
|
120 |
+
elif op == '*': print(a * b)
|
121 |
+
elif op == '/': print(a / b)
|
122 |
+
```
|
123 |
+
This performs basic math operations.'''
|
124 |
+
|
125 |
+
return None
|
126 |
+
|
127 |
+
def create_simple_mentor_responses(user_message: str) -> str:
|
128 |
+
"""
|
129 |
+
Pre-defined mentor responses for common questions.
|
130 |
+
This helps the 0.5B model give consistent guided learning.
|
131 |
+
"""
|
132 |
+
user_lower = user_message.lower()
|
133 |
+
|
134 |
+
# Python print
|
135 |
+
if 'print' in user_lower and ('hello' in user_lower or 'world' in user_lower):
|
136 |
+
return 'What function do you think displays text in Python? Think about showing output. What would it be called?'
|
137 |
+
|
138 |
+
# Basic math
|
139 |
+
if '2+2' in user_lower or '2 + 2' in user_lower:
|
140 |
+
return 'What do you think 2 + 2 equals? Try calculating it step by step.'
|
141 |
+
|
142 |
+
# Python variable
|
143 |
+
if 'variable' in user_lower and ('python' in user_lower or 'create' in user_lower):
|
144 |
+
return 'How do you think Python stores data? What symbol might assign a value to a name? Try: name = value'
|
145 |
+
|
146 |
+
# Python list
|
147 |
+
if 'list' in user_lower and 'python' in user_lower and 'create' in user_lower:
|
148 |
+
return 'What brackets do you think hold multiple items? Try making a list with [item1, item2]. What goes inside?'
|
149 |
+
|
150 |
+
# Python function
|
151 |
+
if 'function' in user_lower and 'python' in user_lower and ('create' in user_lower or 'define' in user_lower):
|
152 |
+
return '''What keyword defines a function in Python? Try this structure:
|
153 |
+
```python
|
154 |
+
___ function_name():
|
155 |
+
# your code here
|
156 |
+
```
|
157 |
+
What goes in the blank? How would you call it?'''
|
158 |
+
|
159 |
+
# Calculator
|
160 |
+
if 'calculator' in user_lower and ('create' in user_lower or 'make' in user_lower or 'build' in user_lower):
|
161 |
+
return '''What steps would a calculator need?
|
162 |
+
1. Get two numbers from user - what function gets input?
|
163 |
+
2. Get operation (+,-,*,/) - how to choose?
|
164 |
+
3. Calculate result - what structure handles choices?
|
165 |
+
4. Show result - what displays output?
|
166 |
+
|
167 |
+
Try building step 1 first. What function gets user input?'''
|
168 |
+
|
169 |
+
return None
|
170 |
+
|
171 |
def extract_clean_answer(full_response: str, formatted_prompt: str, user_message: str, is_force_mode: bool) -> str:
|
172 |
"""
|
173 |
+
Optimized cleaning for Qwen2-0.5B responses.
|
174 |
+
Simpler extraction since 0.5B models produce cleaner output.
|
175 |
"""
|
176 |
if not full_response or len(full_response.strip()) < 5:
|
177 |
return "I apologize, but I couldn't generate a response. Please try again."
|
|
|
179 |
print(f"🔍 Raw response length: {len(full_response)}")
|
180 |
print(f"🔍 Mode: {'FORCE' if is_force_mode else 'MENTOR'}")
|
181 |
|
182 |
+
# Check for pre-defined responses first
|
183 |
+
if is_force_mode:
|
184 |
+
predefined = create_simple_force_responses(user_message)
|
185 |
+
if predefined:
|
186 |
+
print("✅ Using predefined force response")
|
187 |
+
return predefined
|
188 |
+
else:
|
189 |
+
predefined = create_simple_mentor_responses(user_message)
|
190 |
+
if predefined:
|
191 |
+
print("✅ Using predefined mentor response")
|
192 |
+
return predefined
|
193 |
+
|
194 |
+
# Step 1: Remove the input prompt
|
195 |
generated_text = full_response
|
196 |
if formatted_prompt in full_response:
|
197 |
parts = full_response.split(formatted_prompt)
|
198 |
if len(parts) > 1:
|
199 |
generated_text = parts[-1]
|
200 |
|
201 |
+
# Step 2: Extract assistant content - simplified for 0.5B
|
202 |
assistant_content = generated_text
|
203 |
|
204 |
+
# Look for assistant markers
|
205 |
if "<|im_start|>assistant" in generated_text:
|
206 |
assistant_parts = generated_text.split("<|im_start|>assistant")
|
207 |
if len(assistant_parts) > 1:
|
208 |
assistant_content = assistant_parts[-1]
|
|
|
209 |
if "<|im_end|>" in assistant_content:
|
210 |
assistant_content = assistant_content.split("<|im_end|>")[0]
|
211 |
|
212 |
+
# Step 3: Basic cleaning - gentler for 0.5B
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
213 |
clean_text = assistant_content.strip()
|
214 |
|
215 |
+
# Remove template tokens
|
216 |
clean_text = re.sub(r'<\|im_start\|>', '', clean_text)
|
217 |
clean_text = re.sub(r'<\|im_end\|>', '', clean_text)
|
218 |
clean_text = re.sub(r'<\|endoftext\|>', '', clean_text)
|
219 |
|
220 |
+
# Remove role prefixes
|
221 |
clean_text = re.sub(r'^(system|user|assistant):\s*', '', clean_text, flags=re.MULTILINE)
|
222 |
clean_text = re.sub(r'\n(system|user|assistant):\s*', '\n', clean_text, flags=re.MULTILINE)
|
223 |
|
224 |
+
# Clean whitespace
|
225 |
+
clean_text = re.sub(r'\n{3,}', '\n\n', clean_text)
|
|
|
|
|
|
|
|
|
|
|
226 |
clean_text = clean_text.strip()
|
227 |
|
228 |
+
# Step 4: Fallback handling for 0.5B
|
229 |
if not clean_text or len(clean_text) < 10:
|
|
|
|
|
|
|
|
|
|
|
|
|
230 |
if is_force_mode:
|
231 |
+
return "Could you please be more specific about what you need?"
|
232 |
else:
|
233 |
+
return "What specific aspect would you like to explore? What's your approach?"
|
234 |
|
235 |
+
# Step 5: Length control for 0.5B
|
236 |
+
if len(clean_text) > 500: # Keep responses shorter for 0.5B
|
237 |
+
sentences = clean_text.split('. ')
|
238 |
+
if len(sentences) > 3:
|
239 |
+
clean_text = '. '.join(sentences[:3]) + '.'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
240 |
|
241 |
print(f"🧹 Final cleaned answer length: {len(clean_text)}")
|
|
|
242 |
|
243 |
return clean_text
|
244 |
|
245 |
+
def generate_response(messages: list, is_force_mode: bool = False, max_tokens: int = 200, temperature: float = 0.7) -> str:
|
246 |
"""
|
247 |
+
Optimized generation for Qwen2-0.5B with shorter contexts and conservative settings.
|
248 |
"""
|
249 |
try:
|
250 |
+
# Check for simple predefined responses first
|
251 |
+
if messages and len(messages) > 0:
|
252 |
+
last_user_msg = ""
|
253 |
+
for msg in reversed(messages):
|
254 |
+
if msg.get("role") == "user":
|
255 |
+
last_user_msg = msg.get("content", "")
|
256 |
+
break
|
257 |
+
|
258 |
+
if last_user_msg:
|
259 |
+
if is_force_mode:
|
260 |
+
predefined = create_simple_force_responses(last_user_msg)
|
261 |
+
if predefined:
|
262 |
+
return predefined
|
263 |
+
else:
|
264 |
+
predefined = create_simple_mentor_responses(last_user_msg)
|
265 |
+
if predefined:
|
266 |
+
return predefined
|
267 |
+
|
268 |
+
# Build simple conversation for 0.5B model
|
269 |
clean_messages = []
|
270 |
|
271 |
+
# Add simple system prompt
|
272 |
+
system_prompt = get_simple_system_prompt(is_force_mode)
|
273 |
clean_messages.append({
|
274 |
"role": "system",
|
275 |
"content": system_prompt
|
276 |
})
|
277 |
|
278 |
+
# Add only the last user message to keep context short for 0.5B
|
279 |
+
if messages and len(messages) > 0:
|
280 |
+
for msg in reversed(messages):
|
281 |
+
if msg.get("role") == "user":
|
282 |
+
clean_messages.append({
|
283 |
+
"role": "user",
|
284 |
+
"content": msg.get("content", "")
|
285 |
+
})
|
286 |
+
break
|
287 |
|
288 |
+
print(f"🔍 Processing {len(clean_messages)} messages for Qwen2-0.5B in {'FORCE' if is_force_mode else 'MENTOR'} mode")
|
289 |
|
290 |
+
# Apply chat template
|
291 |
+
try:
|
292 |
+
formatted_prompt = tokenizer.apply_chat_template(
|
293 |
+
clean_messages,
|
294 |
+
tokenize=False,
|
295 |
+
add_generation_prompt=True
|
296 |
+
)
|
297 |
+
except Exception as e:
|
298 |
+
print(f"⚠️ Chat template failed, using simple format: {e}")
|
299 |
+
# Fallback to simple format
|
300 |
+
formatted_prompt = f"System: {clean_messages[0]['content']}\nUser: {clean_messages[1]['content']}\nAssistant:"
|
301 |
|
302 |
+
# Tokenize with conservative limits for 0.5B
|
303 |
+
inputs = tokenizer(
|
304 |
+
formatted_prompt,
|
305 |
+
return_tensors="pt",
|
306 |
+
truncation=True,
|
307 |
+
max_length=800 # Shorter context for 0.5B
|
308 |
+
)
|
309 |
|
310 |
+
# Conservative generation settings for 0.5B model
|
311 |
generation_params = {
|
312 |
"input_ids": inputs.input_ids,
|
313 |
"attention_mask": inputs.attention_mask,
|
|
|
317 |
}
|
318 |
|
319 |
if is_force_mode:
|
320 |
+
# Force mode: Very conservative for 0.5B
|
321 |
generation_params.update({
|
322 |
+
"max_new_tokens": min(max_tokens, 150), # Very short
|
323 |
+
"temperature": 0.1, # Very focused
|
324 |
+
"top_p": 0.7,
|
325 |
+
"top_k": 20,
|
326 |
+
"repetition_penalty": 1.05,
|
327 |
})
|
328 |
else:
|
329 |
+
# Mentor mode: Still conservative but allows more creativity
|
330 |
generation_params.update({
|
331 |
+
"max_new_tokens": min(max_tokens, 200),
|
332 |
+
"temperature": 0.3, # Lower than original
|
333 |
+
"top_p": 0.8,
|
334 |
+
"top_k": 30,
|
335 |
+
"repetition_penalty": 1.02,
|
336 |
})
|
337 |
|
338 |
+
# Generate with timeout for 0.5B
|
339 |
with torch.no_grad():
|
340 |
outputs = model.generate(**generation_params)
|
341 |
|
342 |
+
# Decode response
|
343 |
full_response = tokenizer.decode(outputs[0], skip_special_tokens=False)
|
344 |
|
345 |
# Extract user message for context
|
|
|
349 |
user_message = msg.get("content", "")
|
350 |
break
|
351 |
|
352 |
+
# Clean and return
|
353 |
clean_answer = extract_clean_answer(full_response, formatted_prompt, user_message, is_force_mode)
|
354 |
|
355 |
return clean_answer
|
356 |
|
357 |
except Exception as e:
|
358 |
+
print(f"❌ Generation error with Qwen2-0.5B: {e}")
|
359 |
+
mode_text = "direct answer" if is_force_mode else "guided learning"
|
360 |
+
return f"I encountered an error generating a {mode_text}. Please try a simpler question."
|
361 |
|
362 |
# === Routes ===
|
363 |
@app.get("/")
|
364 |
def root():
|
365 |
return {
|
366 |
+
"message": "🤖 Apollo AI Backend v2.1 - Qwen2-0.5B Optimized",
|
367 |
+
"model": "Qwen/Qwen2-0.5B-Instruct with LoRA",
|
368 |
"status": "ready",
|
369 |
+
"optimizations": ["short_contexts", "conservative_generation", "predefined_responses"],
|
370 |
+
"features": ["mentor_mode", "force_mode", "0.5B_optimized"],
|
371 |
"modes": {
|
372 |
+
"mentor": "Guides learning with simple questions",
|
373 |
+
"force": "Provides direct answers quickly"
|
374 |
}
|
375 |
}
|
376 |
|
377 |
@app.get("/health")
|
378 |
def health():
|
379 |
+
return {
|
380 |
+
"status": "healthy",
|
381 |
+
"model_loaded": True,
|
382 |
+
"model_size": "0.5B",
|
383 |
+
"optimizations": "qwen2_0.5B_specific"
|
384 |
+
}
|
385 |
|
386 |
@app.post("/v1/chat/completions")
|
387 |
async def chat_completions(request: Request):
|
|
|
404 |
try:
|
405 |
body = await request.json()
|
406 |
messages = body.get("messages", [])
|
407 |
+
max_tokens = min(body.get("max_tokens", 200), 300) # Cap at 300 for 0.5B
|
408 |
+
temperature = max(0.1, min(body.get("temperature", 0.5), 0.8)) # Conservative range
|
409 |
|
410 |
+
# Get mode information
|
411 |
+
is_force_mode = body.get("force_mode", False)
|
412 |
|
413 |
if not messages or not isinstance(messages, list):
|
414 |
raise ValueError("Messages field is required and must be a list")
|
|
|
419 |
content={"error": f"Invalid request body: {str(e)}"}
|
420 |
)
|
421 |
|
422 |
+
# Validate messages
|
423 |
for i, msg in enumerate(messages):
|
424 |
if not isinstance(msg, dict) or "role" not in msg or "content" not in msg:
|
425 |
return JSONResponse(
|
|
|
428 |
)
|
429 |
|
430 |
try:
|
431 |
+
print(f"📥 Processing request for Qwen2-0.5B in {'FORCE' if is_force_mode else 'MENTOR'} mode")
|
|
|
432 |
print(f"📊 Settings: max_tokens={max_tokens}, temperature={temperature}")
|
433 |
|
434 |
response_content = generate_response(
|
435 |
messages=messages,
|
436 |
is_force_mode=is_force_mode,
|
437 |
+
max_tokens=max_tokens,
|
438 |
+
temperature=temperature
|
439 |
)
|
440 |
|
441 |
# Return OpenAI-compatible response
|
442 |
return {
|
443 |
+
"id": f"chatcmpl-apollo-qwen05b-{hash(str(messages)) % 10000}",
|
444 |
"object": "chat.completion",
|
445 |
"created": int(torch.tensor(0).item()),
|
446 |
"model": f"qwen2-0.5b-{'force' if is_force_mode else 'mentor'}-mode",
|
|
|
459 |
"completion_tokens": len(response_content),
|
460 |
"total_tokens": len(str(messages)) + len(response_content)
|
461 |
},
|
462 |
+
"apollo_mode": "force" if is_force_mode else "mentor",
|
463 |
+
"model_optimizations": "qwen2_0.5B_specific"
|
464 |
}
|
465 |
|
466 |
except Exception as e:
|
|
|
470 |
content={"error": f"Internal server error: {str(e)}"}
|
471 |
)
|
472 |
|
473 |
+
# === Test endpoint optimized for 0.5B ===
|
474 |
@app.post("/test")
|
475 |
async def test_generation(request: Request):
|
476 |
+
"""Test endpoint for debugging both modes with 0.5B optimizations"""
|
477 |
try:
|
478 |
body = await request.json()
|
479 |
prompt = body.get("prompt", "How do I print hello world in Python?")
|
480 |
+
max_tokens = min(body.get("max_tokens", 200), 300)
|
481 |
test_both_modes = body.get("test_both_modes", True)
|
482 |
|
483 |
results = {}
|
484 |
|
485 |
# Test mentor mode
|
486 |
+
messages_mentor = [{"role": "user", "content": prompt}]
|
487 |
+
mentor_response = generate_response(messages_mentor, is_force_mode=False, max_tokens=max_tokens, temperature=0.3)
|
|
|
|
|
488 |
results["mentor_mode"] = {
|
489 |
"response": mentor_response,
|
490 |
"length": len(mentor_response),
|
|
|
493 |
|
494 |
if test_both_modes:
|
495 |
# Test force mode
|
496 |
+
messages_force = [{"role": "user", "content": prompt}]
|
497 |
+
force_response = generate_response(messages_force, is_force_mode=True, max_tokens=max_tokens, temperature=0.1)
|
|
|
|
|
498 |
results["force_mode"] = {
|
499 |
"response": force_response,
|
500 |
"length": len(force_response),
|
|
|
504 |
return {
|
505 |
"prompt": prompt,
|
506 |
"results": results,
|
507 |
+
"model": "Qwen2-0.5B-Instruct",
|
508 |
+
"optimizations": "0.5B_specific",
|
509 |
+
"status": "success"
|
510 |
}
|
511 |
|
512 |
except Exception as e:
|
|
|
517 |
|
518 |
if __name__ == "__main__":
|
519 |
import uvicorn
|
520 |
+
print("🚀 Starting Apollo AI Backend v2.1 - Qwen2-0.5B Optimized...")
|
521 |
+
print("🧠 Model: Qwen/Qwen2-0.5B-Instruct (500M parameters)")
|
522 |
+
print("⚡ Optimizations: Short contexts, conservative generation, predefined responses")
|
523 |
+
print("🎯 Modes: Mentor (simple questions) vs Force (direct answers)")
|
524 |
uvicorn.run(app, host="0.0.0.0", port=7860)
|