Ais committed on
Commit 70df3dc · verified · 1 Parent(s): 7cd3cee

Update app/main.py

Files changed (1)
  1. app/main.py +69 -146
app/main.py CHANGED
@@ -44,207 +44,126 @@ print("✅ Model ready!")
 
 def extract_clean_answer(full_response: str, formatted_prompt: str, user_message: str) -> str:
     """
-    COMPLETELY REWRITTEN - Extract only the AI's actual response, removing ALL artifacts.
+    FIXED VERSION - Much gentler cleaning that preserves complete responses.
     """
     if not full_response or len(full_response.strip()) < 5:
         return "I apologize, but I couldn't generate a response. Please try again."
 
     print(f"🔍 Raw response length: {len(full_response)}")
-    print(f"🔍 Raw response preview: {full_response[:400]}...")
+    print(f"🔍 Raw response preview: {full_response[:200]}...")
 
-    # Step 1: Remove the entire input prompt to get only generated content
+    # Step 1: Remove the input prompt to get only generated content
    generated_text = full_response
    if formatted_prompt in full_response:
        parts = full_response.split(formatted_prompt)
        if len(parts) > 1:
-            generated_text = parts[-1]  # Take everything after the prompt
+            generated_text = parts[-1]
 
-    print(f"🔍 After prompt removal: {generated_text[:200]}...")
-
-    # Step 2: Extract content between assistant tags (most reliable method)
-    assistant_content = ""
-
-    # Look for the last assistant response
+    # Step 2: Extract assistant content - SIMPLIFIED approach
+    assistant_content = generated_text
+
+    # Look for assistant tags and extract content
     if "<|im_start|>assistant" in generated_text:
-        # Split by assistant markers and take the last one
         assistant_parts = generated_text.split("<|im_start|>assistant")
         if len(assistant_parts) > 1:
             assistant_content = assistant_parts[-1]
-
-            # Remove everything after <|im_end|> if it exists
+            # Remove end marker if present
             if "<|im_end|>" in assistant_content:
                 assistant_content = assistant_content.split("<|im_end|>")[0]
-    else:
-        # Fallback: use the generated text as-is
-        assistant_content = generated_text
 
-    print(f"🔍 After assistant extraction: {assistant_content[:200]}...")
-
-    # Step 3: Remove ALL template artifacts aggressively
-    clean_text = assistant_content
-
-    # Remove any remaining template tokens
-    template_artifacts = [
-        r'<\|im_start\|>.*?<\|im_end\|>',
-        r'<\|im_start\|>.*',
-        r'<\|im_end\|>.*',
-        r'^(system|user|assistant):\s*',
-        r'\n(system|user|assistant):\s*',
-        r'^\s*(system|user|assistant)\s*\n',
-    ]
-
-    for pattern in template_artifacts:
-        clean_text = re.sub(pattern, '', clean_text, flags=re.MULTILINE | re.IGNORECASE)
-
-    # Step 4: NUCLEAR OPTION - Remove all system prompt leaks line by line
-    lines = clean_text.split('\n')
-    final_lines = []
-
-    # System prompt indicators to completely remove
-    system_indicators = [
-        'you are apollo ai',
-        'you are a helpful',
-        'guidelines:',
-        'response format:',
-        '- provide clear',
-        '- use markdown',
-        '- always include',
-        '- be encouraging',
-        '- if asked about',
-        '- for project',
-        '- focus on',
-        '- keep responses',
-        '- use emojis',
-        '- use bold',
-        '- use bullet points',
-        '- never include api',
-        'vs code context:',
-        '[vs code context',
-        'what is 2+2',  # Remove question echo
-        'current request:',
-        'previous conversation:',
-    ]
-
-    skip_mode = False
-    found_real_content = False
-
-    for line in lines:
-        line_clean = line.strip()
-        line_lower = line_clean.lower()
-
-        # Skip empty lines at the start
-        if not line_clean and not found_real_content:
-            continue
-
-        # Check if this line contains system prompt artifacts
-        is_system_line = any(indicator in line_lower for indicator in system_indicators)
-
-        if is_system_line:
-            # Start skipping mode when we hit system prompts
-            skip_mode = True
-            continue
-
-        # If we hit actual content after system prompts, stop skipping
-        if skip_mode and line_clean and not is_system_line:
-            # Check if this looks like real content (not more system stuff)
-            if (len(line_clean) > 3 and
-                not line_lower.startswith(('what is', 'calculate', 'result =', '```')) and
-                not re.match(r'^[#*\-\d\.\s]+$', line_clean)):
-                skip_mode = False
-                found_real_content = True
-
-        # Add line if we're not in skip mode
-        if not skip_mode:
-            final_lines.append(line)
-            found_real_content = True
-
-    # Step 5: Reconstruct the clean response
-    final_answer = '\n'.join(final_lines).strip()
-
-    # Step 6: Handle special cases and final cleanup
-    if not final_answer or len(final_answer) < 10:
-        # Try to extract just a simple answer if available
-        if user_message and ('2+2' in user_message or '2 + 2' in user_message):
-            return "4\n\nThe answer to 2 + 2 is 4."
-        return "I understand your question. Could you please be more specific about what you'd like to know?"
-
-    # Remove any remaining artifacts that might have slipped through
-    final_answer = re.sub(r'```\s*$', '', final_answer)  # Remove trailing code block
-    final_answer = re.sub(r'^\s*```.*?\n', '', final_answer)  # Remove leading code block start
-    final_answer = final_answer.strip()
-
-    print(f"🧹 Final cleaned answer: {final_answer}")
-    return final_answer
+    # Step 3: GENTLE cleaning - only remove obvious template artifacts
+    clean_text = assistant_content.strip()
+
+    # Remove template tokens only
+    clean_text = re.sub(r'<\|im_start\|>', '', clean_text)
+    clean_text = re.sub(r'<\|im_end\|>', '', clean_text)
+
+    # Remove role prefixes only at start of lines
+    clean_text = re.sub(r'^(system|user|assistant):\s*', '', clean_text, flags=re.MULTILINE)
+
+    # REMOVED: Aggressive line-by-line filtering that was truncating responses
+
+    # Step 4: Final cleanup - preserve content structure
+    clean_text = clean_text.strip()
+
+    # Only apply minimal fixes
+    if not clean_text or len(clean_text) < 10:
+        # Fallback for very short responses
+        if user_message and any(math_term in user_message.lower() for math_term in ['2+2', '2 + 2', 'calculate', 'math']):
+            return "4\n\nThe answer is 4."
+        return "I understand your question. Could you please provide more details?"
+
+    print(f"🧹 Final cleaned answer length: {len(clean_text)}")
+    print(f"🧹 Final answer preview: {clean_text[:150]}...")
+    return clean_text
 
-def generate_response(messages: list, max_tokens: int = 200, temperature: float = 0.7) -> str:
+def generate_response(messages: list, max_tokens: int = 400, temperature: float = 0.7) -> str:
     """
-    Generate response using the model with MINIMAL system prompts.
+    FIXED: Generate response with higher token limits and better settings.
     """
     try:
-        # Create MINIMAL conversation - just the essentials
+        # Create clean conversation
         clean_messages = []
 
         # Add minimal system message
         clean_messages.append({
             "role": "system",
-            "content": "You are a helpful assistant."
+            "content": "You are Apollo AI, a helpful coding assistant. Provide clear, complete explanations with proper code formatting."
         })
 
-        # Add only the last few user/assistant exchanges to avoid context pollution
-        user_messages = [msg for msg in messages if msg.get("role") == "user"]
-        if user_messages:
-            # Take only the latest user message to avoid confusion
-            latest_user = user_messages[-1]
-            clean_messages.append(latest_user)
+        # Add recent conversation context (last 2-3 messages)
+        recent_messages = messages[-3:] if len(messages) > 3 else messages
+        for msg in recent_messages:
+            if msg.get("role") in ["user", "assistant"]:
+                clean_messages.append(msg)
 
-        print(f"🔍 Clean messages: {clean_messages}")
+        print(f"🔍 Processing {len(clean_messages)} messages")
 
-        # Build the conversation using tokenizer's chat template
+        # Build conversation using tokenizer's chat template
         formatted_prompt = tokenizer.apply_chat_template(
             clean_messages,
             tokenize=False,
             add_generation_prompt=True
         )
 
-        print(f"🔍 Formatted prompt: {formatted_prompt}")
-
-        # Tokenize with truncation to prevent overlong prompts
-        inputs = tokenizer(formatted_prompt, return_tensors="pt", truncation=True, max_length=1024)
+        # Tokenize with proper length limits
+        inputs = tokenizer(formatted_prompt, return_tensors="pt", truncation=True, max_length=1500)
 
-        # Generate with conservative settings
+        # FIXED: Generate with much higher token limits
         with torch.no_grad():
             outputs = model.generate(
                 inputs.input_ids,
                 attention_mask=inputs.attention_mask,
-                max_new_tokens=min(max_tokens, 150),  # Keep responses short
-                temperature=max(0.3, min(temperature, 0.8)),  # Controlled temperature
-                top_p=0.85,
+                max_new_tokens=min(max_tokens, 500),  # INCREASED from 150 to 500
+                temperature=max(0.3, min(temperature, 0.9)),
+                top_p=0.9,
                 do_sample=True,
                 pad_token_id=tokenizer.eos_token_id,
                 eos_token_id=tokenizer.eos_token_id,
-                repetition_penalty=1.1,
-                length_penalty=0.8,
-                early_stopping=True,
-                no_repeat_ngram_size=3,
+                repetition_penalty=1.05,  # Reduced to allow natural repetition
+                length_penalty=1.0,  # Neutral length penalty
+                early_stopping=False,  # Don't stop early
+                no_repeat_ngram_size=2,  # Reduced to allow more natural flow
             )
 
         # Decode the full response
         full_response = tokenizer.decode(outputs[0], skip_special_tokens=False)
 
-        # Extract user message for cleaning context
+        # Extract user message for context
         user_message = ""
-        for msg in clean_messages:
+        for msg in reversed(clean_messages):
             if msg.get("role") == "user":
                 user_message = msg.get("content", "")
+                break
 
-        # Clean and extract the answer
+        # Clean and extract the answer with gentle approach
         clean_answer = extract_clean_answer(full_response, formatted_prompt, user_message)
 
         return clean_answer
 
     except Exception as e:
         print(f"❌ Generation error: {e}")
-        return f"I encountered an error while processing your request. Please try again."
+        return f"I encountered an error while processing your request. Please try again with a simpler question."
 
 # === Routes ===
 @app.get("/")
@@ -252,7 +171,8 @@ def root():
     return {
         "message": "🤖 Apollo AI Backend is running!",
         "model": "Qwen2-0.5B-Instruct with LoRA",
-        "status": "ready"
+        "status": "ready",
+        "max_tokens": "500 (increased)"
     }
 
 @app.get("/health")
@@ -280,7 +200,7 @@ async def chat_completions(request: Request):
     try:
         body = await request.json()
         messages = body.get("messages", [])
-        max_tokens = body.get("max_tokens", 200)
+        max_tokens = body.get("max_tokens", 400)  # INCREASED default
         temperature = body.get("temperature", 0.7)
 
         if not messages or not isinstance(messages, list):
@@ -301,19 +221,19 @@ async def chat_completions(request: Request):
         )
 
     try:
-        # Generate response
-        print(f"📥 Processing {len(messages)} messages")
+        # Generate response with higher limits
+        print(f"📥 Processing {len(messages)} messages with max_tokens: {max_tokens}")
         response_content = generate_response(
             messages=messages,
-            max_tokens=min(max_tokens, 300),  # Cap max tokens
-            temperature=max(0.1, min(temperature, 1.0))  # Clamp temperature
+            max_tokens=min(max_tokens, 600),  # INCREASED cap to 600
+            temperature=max(0.1, min(temperature, 1.0))
        )
 
        # Return OpenAI-compatible response
        return {
            "id": f"chatcmpl-apollo-{hash(str(messages)) % 10000}",
            "object": "chat.completion",
-            "created": int(torch.tensor(0).item()),  # Simple timestamp
+            "created": int(torch.tensor(0).item()),
            "model": "qwen2-0.5b-instruct-lora",
            "choices": [
                {
@@ -326,8 +246,8 @@ async def chat_completions(request: Request):
                }
            ],
            "usage": {
-                "prompt_tokens": len(str(messages)),  # Approximate
-                "completion_tokens": len(response_content),  # Approximate
+                "prompt_tokens": len(str(messages)),
+                "completion_tokens": len(response_content),
                "total_tokens": len(str(messages)) + len(response_content)
            }
        }
@@ -346,17 +266,19 @@ async def test_generation(request: Request):
     try:
         body = await request.json()
         prompt = body.get("prompt", "Hello, how are you?")
+        max_tokens = body.get("max_tokens", 300)
 
         messages = [
-            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "system", "content": "You are Apollo AI, a helpful assistant."},
             {"role": "user", "content": prompt}
         ]
 
-        response = generate_response(messages, max_tokens=150, temperature=0.7)
+        response = generate_response(messages, max_tokens=max_tokens, temperature=0.7)
 
         return {
             "prompt": prompt,
             "response": response,
+            "response_length": len(response),
             "status": "success"
         }
 
@@ -368,5 +290,6 @@ async def test_generation(request: Request):
 
 if __name__ == "__main__":
     import uvicorn
-    print("🚀 Starting Apollo AI Backend...")
+    print("🚀 Starting Apollo AI Backend with FIXED response limits...")
+    print("📊 Max tokens increased to 500+ for complete responses")
     uvicorn.run(app, host="0.0.0.0", port=7860)
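
A quick way to exercise the raised limits after deploying this commit is to post a chat request with a larger max_tokens value and confirm the reply is no longer cut off near the old 150-token ceiling. The sketch below is a hypothetical client, not part of the commit: it assumes the handler is mounted at /v1/chat/completions on localhost:7860 (the route decorator sits outside this diff), that the unchanged part of the handler fills "choices" in the usual OpenAI shape, and that the requests package is installed.

# Hypothetical smoke test for the increased token limits (not part of this commit).
import requests

payload = {
    "messages": [
        {"role": "user", "content": "Explain Python list comprehensions with an example."}
    ],
    "max_tokens": 500,   # server-side cap is now 600 instead of 300
    "temperature": 0.7,
}

# Assumed route path; the port matches uvicorn.run(..., port=7860) above.
resp = requests.post("http://localhost:7860/v1/chat/completions", json=payload, timeout=120)
resp.raise_for_status()

# Assumes the standard OpenAI response shape returned by the handler.
answer = resp.json()["choices"][0]["message"]["content"]
print(f"{len(answer)} characters")
print(answer)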
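For reference, the gentler cleaning introduced here boils down to two regex steps instead of the old line-by-line filter. The snippet below re-implements only those steps on a made-up decoded string (both the raw text and the example answer are hypothetical) to show that the assistant's full reply survives the cleanup.

# Standalone illustration of the new Step 2/3 cleaning on a hypothetical decode.
import re

raw = (
    "<|im_start|>system\nYou are Apollo AI, a helpful coding assistant.<|im_end|>\n"
    "<|im_start|>user\nWhat is a Python decorator?<|im_end|>\n"
    "<|im_start|>assistant\nA decorator wraps a function to extend its behaviour.<|im_end|>"
)

# Step 2: keep only the last assistant segment, trimmed at the end marker.
assistant_content = raw.split("<|im_start|>assistant")[-1].split("<|im_end|>")[0]

# Step 3: strip template tokens and leading role prefixes, nothing else.
clean = re.sub(r"<\|im_start\|>|<\|im_end\|>", "", assistant_content).strip()
clean = re.sub(r"^(system|user|assistant):\s*", "", clean, flags=re.MULTILINE)

print(clean)  # -> A decorator wraps a function to extend its behaviour.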