Ais
committed
Update app/main.py

app/main.py
CHANGED (+69 -146)

Before the change (removed lines are prefixed with "-"):

@@ -44,207 +44,126 @@ print("✅ Model ready!")

def extract_clean_answer(full_response: str, formatted_prompt: str, user_message: str) -> str:
    """
-
    """
    if not full_response or len(full_response.strip()) < 5:
        return "I apologize, but I couldn't generate a response. Please try again."

    print(f"🔍 Raw response length: {len(full_response)}")
-    print(f"🔍 Raw response preview: {full_response[:

-    # Step 1: Remove the
    generated_text = full_response
    if formatted_prompt in full_response:
        parts = full_response.split(formatted_prompt)
        if len(parts) > 1:
-            generated_text = parts[-1]

-

-    #
-    assistant_content = ""
-
-    # Look for the last assistant response
    if "<|im_start|>assistant" in generated_text:
-        # Split by assistant markers and take the last one
        assistant_parts = generated_text.split("<|im_start|>assistant")
        if len(assistant_parts) > 1:
            assistant_content = assistant_parts[-1]
-
-            # Remove everything after <|im_end|> if it exists
            if "<|im_end|>" in assistant_content:
                assistant_content = assistant_content.split("<|im_end|>")[0]
-    else:
-        # Fallback: use the generated text as-is
-        assistant_content = generated_text

-

-    #
-    clean_text =

-    # Remove
-
-        r'<\|im_start\|>.*?<\|im_end\|>',
-        r'<\|im_start\|>.*',
-        r'<\|im_end\|>.*',
-        r'^(system|user|assistant):\s*',
-        r'\n(system|user|assistant):\s*',
-        r'^\s*(system|user|assistant)\s*\n',
-    ]

-
-        clean_text = re.sub(pattern, '', clean_text, flags=re.MULTILINE | re.IGNORECASE)

-    # Step 4:
-
-    final_lines = []

-    #
-
-
-        '
-
-
-        '- provide clear',
-        '- use markdown',
-        '- always include',
-        '- be encouraging',
-        '- if asked about',
-        '- for project',
-        '- focus on',
-        '- keep responses',
-        '- use emojis',
-        '- use bold',
-        '- use bullet points',
-        '- never include api',
-        'vs code context:',
-        '[vs code context',
-        'what is 2+2', # Remove question echo
-        'current request:',
-        'previous conversation:',
-    ]

-
-
-
-    for line in lines:
-        line_clean = line.strip()
-        line_lower = line_clean.lower()
-
-        # Skip empty lines at the start
-        if not line_clean and not found_real_content:
-            continue
-
-        # Check if this line contains system prompt artifacts
-        is_system_line = any(indicator in line_lower for indicator in system_indicators)
-
-        if is_system_line:
-            # Start skipping mode when we hit system prompts
-            skip_mode = True
-            continue
-
-        # If we hit actual content after system prompts, stop skipping
-        if skip_mode and line_clean and not is_system_line:
-            # Check if this looks like real content (not more system stuff)
-            if (len(line_clean) > 3 and
-                not line_lower.startswith(('what is', 'calculate', 'result =', '```')) and
-                not re.match(r'^[#*\-\d\.\s]+$', line_clean)):
-                skip_mode = False
-                found_real_content = True
-
-        # Add line if we're not in skip mode
-        if not skip_mode:
-            final_lines.append(line)
-            found_real_content = True
-
-    # Step 5: Reconstruct the clean response
-    final_answer = '\n'.join(final_lines).strip()
-
-    # Step 6: Handle special cases and final cleanup
-    if not final_answer or len(final_answer) < 10:
-        # Try to extract just a simple answer if available
-        if user_message and ('2+2' in user_message or '2 + 2' in user_message):
-            return "4\n\nThe answer to 2 + 2 is 4."
-        return "I understand your question. Could you please be more specific about what you'd like to know?"
-
-    # Remove any remaining artifacts that might have slipped through
-    final_answer = re.sub(r'```\s*$', '', final_answer) # Remove trailing code block
-    final_answer = re.sub(r'^\s*```.*?\n', '', final_answer) # Remove leading code block start
-    final_answer = final_answer.strip()
-
-    print(f"🧹 Final cleaned answer: {final_answer}")
-    return final_answer

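The filter removed above is the bug this commit targets: any answer line that merely contained one of the `system_indicators` substrings switched `skip_mode` on and the line was dropped. A minimal sketch of that failure mode, simplified from the removed loop (the sample answer is invented; the three indicator strings are entries from the removed list):

# Sketch only - reproduces the removed skip logic on a made-up model answer.
system_indicators = ['- use bullet points', '- use markdown', 'current request:']
answer = ("Here is how to structure your notes:\n"
          "- Use bullet points for each idea\n"
          "- Use Markdown headers for sections")

final_lines = []
skip_mode = False
for line in answer.split('\n'):
    line_lower = line.strip().lower()
    if any(indicator in line_lower for indicator in system_indicators):
        skip_mode = True      # legitimate advice lines trip the filter
        continue
    if not skip_mode:
        final_lines.append(line)

print('\n'.join(final_lines))  # only "Here is how to structure your notes:" survives

The rewritten version in the listing further down drops this loop entirely and strips only the ChatML template tokens.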
-def generate_response(messages: list, max_tokens: int =
    """
-    Generate response
    """
    try:
-        # Create
        clean_messages = []

        # Add minimal system message
        clean_messages.append({
            "role": "system",
-            "content": "You are a helpful assistant."
        })

-        # Add
-
-
-
-
-        clean_messages.append(latest_user)

-        print(f"🔍

-        # Build
        formatted_prompt = tokenizer.apply_chat_template(
            clean_messages,
            tokenize=False,
            add_generation_prompt=True
        )

-
-
-        # Tokenize with truncation to prevent overlong prompts
-        inputs = tokenizer(formatted_prompt, return_tensors="pt", truncation=True, max_length=1024)

-        # Generate with
        with torch.no_grad():
            outputs = model.generate(
                inputs.input_ids,
                attention_mask=inputs.attention_mask,
-                max_new_tokens=min(max_tokens, 150),
-                temperature=max(0.3, min(temperature, 0.
-                top_p=0.
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id,
-                repetition_penalty=1.
-                length_penalty=0
-                early_stopping=
-                no_repeat_ngram_size=
            )

        # Decode the full response
        full_response = tokenizer.decode(outputs[0], skip_special_tokens=False)

-        # Extract user message for
        user_message = ""
-        for msg in clean_messages:
            if msg.get("role") == "user":
                user_message = msg.get("content", "")

-        # Clean and extract the answer
        clean_answer = extract_clean_answer(full_response, formatted_prompt, user_message)

        return clean_answer

    except Exception as e:
        print(f"❌ Generation error: {e}")
-        return f"I encountered an error while processing your request. Please try again."

# === Routes ===
@app.get("/")

@@ -252,7 +171,8 @@ def root():
    return {
        "message": "🤖 Apollo AI Backend is running!",
        "model": "Qwen2-0.5B-Instruct with LoRA",
-        "status": "ready"
    }

@app.get("/health")

@@ -280,7 +200,7 @@ async def chat_completions(request: Request):
    try:
        body = await request.json()
        messages = body.get("messages", [])
-        max_tokens = body.get("max_tokens",
        temperature = body.get("temperature", 0.7)

        if not messages or not isinstance(messages, list):

@@ -301,19 +221,19 @@ async def chat_completions(request: Request):
        )

    try:
-        # Generate response
-        print(f"📥 Processing {len(messages)} messages")
        response_content = generate_response(
            messages=messages,
-            max_tokens=min(max_tokens,
-            temperature=max(0.1, min(temperature, 1.0))
        )

        # Return OpenAI-compatible response
        return {
            "id": f"chatcmpl-apollo-{hash(str(messages)) % 10000}",
            "object": "chat.completion",
-            "created": int(torch.tensor(0).item()),
            "model": "qwen2-0.5b-instruct-lora",
            "choices": [
                {

@@ -326,8 +246,8 @@ async def chat_completions(request: Request):
                }
            ],
            "usage": {
-                "prompt_tokens": len(str(messages)),
-                "completion_tokens": len(response_content),
                "total_tokens": len(str(messages)) + len(response_content)
            }
        }

@@ -346,17 +266,19 @@ async def test_generation(request: Request):
    try:
        body = await request.json()
        prompt = body.get("prompt", "Hello, how are you?")

        messages = [
-            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ]

-        response = generate_response(messages, max_tokens=

        return {
            "prompt": prompt,
            "response": response,
            "status": "success"
        }


@@ -368,5 +290,6 @@ async def test_generation(request: Request):

if __name__ == "__main__":
    import uvicorn
-    print("🚀 Starting Apollo AI Backend...")
    uvicorn.run(app, host="0.0.0.0", port=7860)
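Both the removed filter above and the rewritten cleaner below key off Qwen2's ChatML markers, so it helps to see roughly what `tokenizer.apply_chat_template(..., tokenize=False, add_generation_prompt=True)` emits. A sketch only; the exact whitespace depends on the tokenizer's chat template version, and the user question is invented:

# Roughly what formatted_prompt looks like for Qwen2-Instruct's ChatML template.
formatted_prompt = (
    "<|im_start|>system\nYou are Apollo AI, a helpful coding assistant.<|im_end|>\n"
    "<|im_start|>user\nWhat is 2 + 2?<|im_end|>\n"
    "<|im_start|>assistant\n"
)
# model.generate() continues the text after the trailing "<|im_start|>assistant",
# which is why both versions split on "<|im_start|>assistant" and cut at "<|im_end|>".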
After the change (added lines are prefixed with "+"):

@@ -44,207 +44,126 @@ print("✅ Model ready!")

def extract_clean_answer(full_response: str, formatted_prompt: str, user_message: str) -> str:
    """
+    FIXED VERSION - Much gentler cleaning that preserves complete responses.
    """
    if not full_response or len(full_response.strip()) < 5:
        return "I apologize, but I couldn't generate a response. Please try again."

    print(f"🔍 Raw response length: {len(full_response)}")
+    print(f"🔍 Raw response preview: {full_response[:200]}...")

+    # Step 1: Remove the input prompt to get only generated content
    generated_text = full_response
    if formatted_prompt in full_response:
        parts = full_response.split(formatted_prompt)
        if len(parts) > 1:
+            generated_text = parts[-1]

+    # Step 2: Extract assistant content - SIMPLIFIED approach
+    assistant_content = generated_text

+    # Look for assistant tags and extract content
    if "<|im_start|>assistant" in generated_text:
        assistant_parts = generated_text.split("<|im_start|>assistant")
        if len(assistant_parts) > 1:
            assistant_content = assistant_parts[-1]
+            # Remove end marker if present
            if "<|im_end|>" in assistant_content:
                assistant_content = assistant_content.split("<|im_end|>")[0]

+    # Step 3: GENTLE cleaning - only remove obvious template artifacts
+    clean_text = assistant_content.strip()

+    # Remove template tokens only
+    clean_text = re.sub(r'<\|im_start\|>', '', clean_text)
+    clean_text = re.sub(r'<\|im_end\|>', '', clean_text)

+    # Remove role prefixes only at start of lines
+    clean_text = re.sub(r'^(system|user|assistant):\s*', '', clean_text, flags=re.MULTILINE)

+    # REMOVED: Aggressive line-by-line filtering that was truncating responses

+    # Step 4: Final cleanup - preserve content structure
+    clean_text = clean_text.strip()

+    # Only apply minimal fixes
+    if not clean_text or len(clean_text) < 10:
+        # Fallback for very short responses
+        if user_message and any(math_term in user_message.lower() for math_term in ['2+2', '2 + 2', 'calculate', 'math']):
+            return "4\n\nThe answer is 4."
+        return "I understand your question. Could you please provide more details?"

+    print(f"🧹 Final cleaned answer length: {len(clean_text)}")
+    print(f"🧹 Final answer preview: {clean_text[:150]}...")
+    return clean_text

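A quick way to exercise the gentler cleaning in isolation (a sketch, not part of the commit; it assumes app/ is importable as a package, and note that importing app.main also runs the module-level model loading):

# Hypothetical smoke test for the new extract_clean_answer.
from app.main import extract_clean_answer

prompt = "<|im_start|>user\nWhat is 2 + 2?<|im_end|>\n<|im_start|>assistant\n"
raw = prompt + "2 + 2 equals 4.\n\nIn Python, `2 + 2` evaluates to `4`.<|im_end|>"

cleaned = extract_clean_answer(raw, prompt, "What is 2 + 2?")
assert "<|im_start|>" not in cleaned and "<|im_end|>" not in cleaned
print(cleaned)  # expected: the full two-line answer, with no ChatML markers left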
+def generate_response(messages: list, max_tokens: int = 400, temperature: float = 0.7) -> str:
    """
+    FIXED: Generate response with higher token limits and better settings.
    """
    try:
+        # Create clean conversation
        clean_messages = []

        # Add minimal system message
        clean_messages.append({
            "role": "system",
+            "content": "You are Apollo AI, a helpful coding assistant. Provide clear, complete explanations with proper code formatting."
        })

+        # Add recent conversation context (last 2-3 messages)
+        recent_messages = messages[-3:] if len(messages) > 3 else messages
+        for msg in recent_messages:
+            if msg.get("role") in ["user", "assistant"]:
+                clean_messages.append(msg)

+        print(f"🔍 Processing {len(clean_messages)} messages")

+        # Build conversation using tokenizer's chat template
        formatted_prompt = tokenizer.apply_chat_template(
            clean_messages,
            tokenize=False,
            add_generation_prompt=True
        )

+        # Tokenize with proper length limits
+        inputs = tokenizer(formatted_prompt, return_tensors="pt", truncation=True, max_length=1500)

+        # FIXED: Generate with much higher token limits
        with torch.no_grad():
            outputs = model.generate(
                inputs.input_ids,
                attention_mask=inputs.attention_mask,
+                max_new_tokens=min(max_tokens, 500), # INCREASED from 150 to 500
+                temperature=max(0.3, min(temperature, 0.9)),
+                top_p=0.9,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id,
+                repetition_penalty=1.05, # Reduced to allow natural repetition
+                length_penalty=1.0, # Neutral length penalty
+                early_stopping=False, # Don't stop early
+                no_repeat_ngram_size=2, # Reduced to allow more natural flow
            )

        # Decode the full response
        full_response = tokenizer.decode(outputs[0], skip_special_tokens=False)

+        # Extract user message for context
        user_message = ""
+        for msg in reversed(clean_messages):
            if msg.get("role") == "user":
                user_message = msg.get("content", "")
+                break

+        # Clean and extract the answer with gentle approach
        clean_answer = extract_clean_answer(full_response, formatted_prompt, user_message)

        return clean_answer

    except Exception as e:
        print(f"❌ Generation error: {e}")
+        return f"I encountered an error while processing your request. Please try again with a simpler question."

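With the caps raised, a direct call is the quickest way to confirm that answers no longer cut off (a sketch; it assumes the module-level `tokenizer` and `model` from app/main.py are already loaded, for example in a REPL inside the same process):

# Hypothetical local check, not part of the commit.
messages = [
    {"role": "user", "content": "Explain Python list comprehensions with one short example."}
]
answer = generate_response(messages, max_tokens=400, temperature=0.7)
print(len(answer))
print(answer[:200])
# With max_new_tokens now capped at 500 instead of 150, the reply should no longer
# stop mid-sentence the way the old settings tended to.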
# === Routes ===
@app.get("/")

@@ -252,7 +171,8 @@ def root():
    return {
        "message": "🤖 Apollo AI Backend is running!",
        "model": "Qwen2-0.5B-Instruct with LoRA",
+        "status": "ready",
+        "max_tokens": "500 (increased)"
    }

@app.get("/health")

@@ -280,7 +200,7 @@ async def chat_completions(request: Request):
    try:
        body = await request.json()
        messages = body.get("messages", [])
+        max_tokens = body.get("max_tokens", 400) # INCREASED default
        temperature = body.get("temperature", 0.7)

        if not messages or not isinstance(messages, list):

@@ -301,19 +221,19 @@ async def chat_completions(request: Request):
        )

    try:
+        # Generate response with higher limits
+        print(f"📥 Processing {len(messages)} messages with max_tokens: {max_tokens}")
        response_content = generate_response(
            messages=messages,
+            max_tokens=min(max_tokens, 600), # INCREASED cap to 600
+            temperature=max(0.1, min(temperature, 1.0))
        )

        # Return OpenAI-compatible response
        return {
            "id": f"chatcmpl-apollo-{hash(str(messages)) % 10000}",
            "object": "chat.completion",
+            "created": int(torch.tensor(0).item()),
            "model": "qwen2-0.5b-instruct-lora",
            "choices": [
                {

@@ -326,8 +246,8 @@ async def chat_completions(request: Request):
                }
            ],
            "usage": {
+                "prompt_tokens": len(str(messages)),
+                "completion_tokens": len(response_content),
                "total_tokens": len(str(messages)) + len(response_content)
            }
        }
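From a client, the handler behaves like a simplified OpenAI chat-completions endpoint. A sketch of a request against it; the route decorator sits outside this hunk, so the "/v1/chat/completions" path below is an assumption:

# Hypothetical client call, not part of the commit.
import requests

resp = requests.post(
    "http://localhost:7860/v1/chat/completions",  # assumed path; check the route decorator in the full file
    json={
        "messages": [{"role": "user", "content": "What is 2 + 2?"}],
        "max_tokens": 400,
        "temperature": 0.7,
    },
    timeout=120,
)
data = resp.json()
print(data["model"], data["usage"])
# Note: "prompt_tokens" and "completion_tokens" here are character counts
# (len(str(messages)) and len(response_content)), not real token counts.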
@@ -346,17 +266,19 @@ async def test_generation(request: Request):
    try:
        body = await request.json()
        prompt = body.get("prompt", "Hello, how are you?")
+        max_tokens = body.get("max_tokens", 300)

        messages = [
+            {"role": "system", "content": "You are Apollo AI, a helpful assistant."},
            {"role": "user", "content": prompt}
        ]

+        response = generate_response(messages, max_tokens=max_tokens, temperature=0.7)

        return {
            "prompt": prompt,
            "response": response,
+            "response_length": len(response),
            "status": "success"
        }


@@ -368,5 +290,6 @@ async def test_generation(request: Request):

if __name__ == "__main__":
    import uvicorn
+    print("🚀 Starting Apollo AI Backend with FIXED response limits...")
+    print("📊 Max tokens increased to 500+ for complete responses")
    uvicorn.run(app, host="0.0.0.0", port=7860)
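Once the server is started this way, the root route is a quick liveness check that the new limits are deployed (a sketch; host and port match the uvicorn.run call above):

# Hypothetical check against the root route.
import requests

info = requests.get("http://localhost:7860/", timeout=10).json()
print(info["status"], info["max_tokens"])  # expected: ready 500 (increased)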