Ais committed on
Commit fc679ee · verified · 1 Parent(s): 70df3dc

Update app/main.py

Files changed (1)
  1. app/main.py +184 -62
app/main.py CHANGED
@@ -8,7 +8,7 @@ from starlette.middleware.cors import CORSMiddleware
  import re

  # === Setup FastAPI ===
- app = FastAPI(title="Apollo AI Backend", version="1.0.0")
+ app = FastAPI(title="Apollo AI Backend", version="2.0.0")

  # === CORS ===
  app.add_middleware(
@@ -42,15 +42,48 @@ model.eval()

  print("✅ Model ready!")

- def extract_clean_answer(full_response: str, formatted_prompt: str, user_message: str) -> str:
+ def get_system_prompt(is_force_mode: bool) -> str:
  """
- FIXED VERSION - Much gentler cleaning that preserves complete responses.
+ Returns mode-specific system prompts for proper AI behavior.
+ """
+ if is_force_mode:
+ return """You are Apollo AI in DIRECT ANSWER mode. Provide:
+ - Clear, concise, direct answers
+ - Complete working code when requested
+ - Brief explanations (2-3 sentences max)
+ - Immediate solutions without teaching moments
+ - No lengthy tutorials or step-by-step guides
+ - Get straight to the point
+
+ Example:
+ User: "How do I print hello world in Python?"
+ You: "Use `print("Hello World")`. This function outputs text to the console."
+ """
+ else:
+ return """You are Apollo AI in MENTOR mode. Your role is to guide learning:
+ - Ask leading questions instead of giving direct answers
+ - Provide hints and concepts, never complete solutions
+ - Encourage thinking: "What do you think would happen if...?"
+ - Give partial code with blanks: "Try filling in the _____ part"
+ - Guide discovery: "Have you considered looking into...?"
+ - Make students work for understanding
+ - Never give full working code - always leave something for them to figure out
+
+ Example:
+ User: "How do I print hello world in Python?"
+ You: "Great question! What function do you think might be used to display text on screen? Think about what action you want to perform. Try looking up Python's built-in functions for output."
+ """
+
+ def extract_clean_answer(full_response: str, formatted_prompt: str, user_message: str, is_force_mode: bool) -> str:
+ """
+ SINGLE POWERFUL CLEANING FUNCTION - The only place where response cleaning happens.
+ All frontend cleaning is removed, this is the source of truth.
  """
  if not full_response or len(full_response.strip()) < 5:
  return "I apologize, but I couldn't generate a response. Please try again."

  print(f"🔍 Raw response length: {len(full_response)}")
- print(f"🔍 Raw response preview: {full_response[:200]}...")
+ print(f"🔍 Mode: {'FORCE' if is_force_mode else 'MENTOR'}")

  # Step 1: Remove the input prompt to get only generated content
  generated_text = full_response
@@ -59,10 +92,10 @@ def extract_clean_answer(full_response: str, formatted_prompt: str, user_message
  if len(parts) > 1:
  generated_text = parts[-1]

- # Step 2: Extract assistant content - SIMPLIFIED approach
+ # Step 2: Extract assistant content using multiple strategies
  assistant_content = generated_text

- # Look for assistant tags and extract content
+ # Strategy A: Look for assistant tags
  if "<|im_start|>assistant" in generated_text:
  assistant_parts = generated_text.split("<|im_start|>assistant")
  if len(assistant_parts) > 1:
@@ -71,53 +104,95 @@ def extract_clean_answer(full_response: str, formatted_prompt: str, user_message
  if "<|im_end|>" in assistant_content:
  assistant_content = assistant_content.split("<|im_end|>")[0]

- # Step 3: GENTLE cleaning - only remove obvious template artifacts
+ # Strategy B: Look for role-based prefixes
+ elif "assistant:" in generated_text.lower():
+ parts = generated_text.lower().split("assistant:")
+ if len(parts) > 1:
+ # Get the content after the last "assistant:" occurrence
+ assistant_content = generated_text[generated_text.lower().rfind("assistant:") + 10:]
+
+ # Step 3: POWERFUL CLEANING - Remove all template artifacts
  clean_text = assistant_content.strip()

- # Remove template tokens only
+ # Remove all chat template tokens
  clean_text = re.sub(r'<\|im_start\|>', '', clean_text)
  clean_text = re.sub(r'<\|im_end\|>', '', clean_text)
+ clean_text = re.sub(r'<\|endoftext\|>', '', clean_text)

- # Remove role prefixes only at start of lines
+ # Remove role prefixes from anywhere in text
  clean_text = re.sub(r'^(system|user|assistant):\s*', '', clean_text, flags=re.MULTILINE)
+ clean_text = re.sub(r'\n(system|user|assistant):\s*', '\n', clean_text, flags=re.MULTILINE)

- # REMOVED: Aggressive line-by-line filtering that was truncating responses
+ # Remove common system prompt artifacts
+ clean_text = re.sub(r'You are Apollo AI.*?mode[^\n]*\n?', '', clean_text, flags=re.IGNORECASE)
+ clean_text = re.sub(r'Guidelines?:.*?\n', '', clean_text, flags=re.IGNORECASE)
+ clean_text = re.sub(r'Example:.*?\n', '', clean_text, flags=re.IGNORECASE)

- # Step 4: Final cleanup - preserve content structure
+ # Clean up excessive whitespace but preserve formatting
+ clean_text = re.sub(r'\n{4,}', '\n\n\n', clean_text)
  clean_text = clean_text.strip()

- # Only apply minimal fixes
+ # Step 4: Handle edge cases and fallbacks
  if not clean_text or len(clean_text) < 10:
- # Fallback for very short responses
- if user_message and any(math_term in user_message.lower() for math_term in ['2+2', '2 + 2', 'calculate', 'math']):
- return "4\n\nThe answer is 4."
- return "I understand your question. Could you please provide more details?"
+ # Special handling for simple math questions
+ if user_message and any(term in user_message.lower() for term in ['2+2', '2 + 2', 'calculate', 'what is']):
+ if '2+2' in user_message.lower() or '2 + 2' in user_message.lower():
+ return "4" if is_force_mode else "What do you think 2 + 2 equals? Try calculating it step by step."
+
+ # Generic fallback based on mode
+ if is_force_mode:
+ return "I understand your question. Could you please be more specific about what you need?"
+ else:
+ return "That's an interesting question! What approach do you think we should take to solve this? What's your initial thought?"
+
+ # Step 5: Mode-specific post-processing
+ if is_force_mode:
+ # For force mode, ensure response is concise
+ if len(clean_text) > 800: # If too long, truncate but keep it coherent
+ sentences = clean_text.split('. ')
+ if len(sentences) > 3:
+ clean_text = '. '.join(sentences[:3]) + '.'
+ else:
+ # For mentor mode, ensure it's not giving away complete solutions
+ # Check if response contains complete code without guidance
+ code_block_pattern = r'```[\w]*\n(.*?)\n```'
+ code_blocks = re.findall(code_block_pattern, clean_text, re.DOTALL)
+
+ for code in code_blocks:
+ # If code looks complete and there's no guidance, add mentor touch
+ if len(code.strip()) > 50 and 'try' not in clean_text.lower() and '?' not in clean_text:
+ clean_text += "\n\nTry implementing this step by step. What do you think each part does?"

  print(f"🧹 Final cleaned answer length: {len(clean_text)}")
- print(f"🧹 Final answer preview: {clean_text[:150]}...")
+ print(f"🧹 Preview: {clean_text[:150]}...")
+
  return clean_text

- def generate_response(messages: list, max_tokens: int = 400, temperature: float = 0.7) -> str:
+ def generate_response(messages: list, is_force_mode: bool = False, max_tokens: int = 400, temperature: float = 0.7) -> str:
  """
- FIXED: Generate response with higher token limits and better settings.
+ Generate response with mode-specific system prompts and proper settings.
  """
  try:
- # Create clean conversation
+ # Create clean conversation with mode-specific system prompt
  clean_messages = []

- # Add minimal system message
+ # Add mode-specific system message
+ system_prompt = get_system_prompt(is_force_mode)
  clean_messages.append({
  "role": "system",
- "content": "You are Apollo AI, a helpful coding assistant. Provide clear, complete explanations with proper code formatting."
+ "content": system_prompt
  })

- # Add recent conversation context (last 2-3 messages)
+ # Add recent conversation context (last 2-3 messages, but filter appropriately)
  recent_messages = messages[-3:] if len(messages) > 3 else messages
  for msg in recent_messages:
  if msg.get("role") in ["user", "assistant"]:
+ # Skip system messages from frontend to avoid conflicts
+ if msg.get("role") == "system":
+ continue
  clean_messages.append(msg)

- print(f"🔍 Processing {len(clean_messages)} messages")
+ print(f"🔍 Processing {len(clean_messages)} messages in {'FORCE' if is_force_mode else 'MENTOR'} mode")

  # Build conversation using tokenizer's chat template
  formatted_prompt = tokenizer.apply_chat_template(
@@ -129,22 +204,37 @@ def generate_response(messages: list, max_tokens: int = 400, temperature: float
  # Tokenize with proper length limits
  inputs = tokenizer(formatted_prompt, return_tensors="pt", truncation=True, max_length=1500)

- # FIXED: Generate with much higher token limits
+ # Mode-specific generation settings
+ generation_params = {
+ "input_ids": inputs.input_ids,
+ "attention_mask": inputs.attention_mask,
+ "pad_token_id": tokenizer.eos_token_id,
+ "eos_token_id": tokenizer.eos_token_id,
+ "do_sample": True,
+ }
+
+ if is_force_mode:
+ # Force mode: Direct, concise answers
+ generation_params.update({
+ "max_new_tokens": min(max_tokens, 300), # Shorter responses
+ "temperature": 0.3, # More focused
+ "top_p": 0.8,
+ "repetition_penalty": 1.1,
+ "length_penalty": 0.8, # Encourage shorter responses
+ })
+ else:
+ # Mentor mode: More thoughtful, questioning responses
+ generation_params.update({
+ "max_new_tokens": min(max_tokens, 500), # Allow longer explanations
+ "temperature": 0.7, # More creative for questions
+ "top_p": 0.9,
+ "repetition_penalty": 1.05,
+ "length_penalty": 1.0, # Neutral length
+ })
+
+ # Generate response
  with torch.no_grad():
- outputs = model.generate(
- inputs.input_ids,
- attention_mask=inputs.attention_mask,
- max_new_tokens=min(max_tokens, 500), # INCREASED from 150 to 500
- temperature=max(0.3, min(temperature, 0.9)),
- top_p=0.9,
- do_sample=True,
- pad_token_id=tokenizer.eos_token_id,
- eos_token_id=tokenizer.eos_token_id,
- repetition_penalty=1.05, # Reduced to allow natural repetition
- length_penalty=1.0, # Neutral length penalty
- early_stopping=False, # Don't stop early
- no_repeat_ngram_size=2, # Reduced to allow more natural flow
- )
+ outputs = model.generate(**generation_params)

  # Decode the full response
  full_response = tokenizer.decode(outputs[0], skip_special_tokens=False)
@@ -156,28 +246,33 @@ def generate_response(messages: list, max_tokens: int = 400, temperature: float
  user_message = msg.get("content", "")
  break

- # Clean and extract the answer with gentle approach
- clean_answer = extract_clean_answer(full_response, formatted_prompt, user_message)
+ # Clean and extract the answer using our SINGLE POWERFUL cleaning function
+ clean_answer = extract_clean_answer(full_response, formatted_prompt, user_message, is_force_mode)

  return clean_answer

  except Exception as e:
  print(f"❌ Generation error: {e}")
- return f"I encountered an error while processing your request. Please try again with a simpler question."
+ mode_text = "direct answer" if is_force_mode else "guided learning approach"
+ return f"I encountered an error while generating a {mode_text}. Please try rephrasing your question."

  # === Routes ===
  @app.get("/")
  def root():
  return {
- "message": "🤖 Apollo AI Backend is running!",
+ "message": "🤖 Apollo AI Backend v2.0 - Mode-Specific AI",
  "model": "Qwen2-0.5B-Instruct with LoRA",
  "status": "ready",
- "max_tokens": "500 (increased)"
+ "features": ["mentor_mode", "force_mode", "single_powerful_cleaning"],
+ "modes": {
+ "mentor": "Guides learning with questions and hints",
+ "force": "Provides direct answers and solutions"
+ }
  }

  @app.get("/health")
  def health():
- return {"status": "healthy", "model_loaded": True}
+ return {"status": "healthy", "model_loaded": True, "cleaning": "single_backend_only"}

  @app.post("/v1/chat/completions")
  async def chat_completions(request: Request):
@@ -200,9 +295,12 @@ async def chat_completions(request: Request):
  try:
  body = await request.json()
  messages = body.get("messages", [])
- max_tokens = body.get("max_tokens", 400) # INCREASED default
+ max_tokens = body.get("max_tokens", 400)
  temperature = body.get("temperature", 0.7)

+ # NEW: Get mode information from request
+ is_force_mode = body.get("force_mode", False) # Default to mentor mode
+
  if not messages or not isinstance(messages, list):
  raise ValueError("Messages field is required and must be a list")

@@ -221,11 +319,14 @@ async def chat_completions(request: Request):
  )

  try:
- # Generate response with higher limits
- print(f"📥 Processing {len(messages)} messages with max_tokens: {max_tokens}")
+ # Generate response with mode-specific behavior
+ print(f"📥 Processing {len(messages)} messages in {'FORCE' if is_force_mode else 'MENTOR'} mode")
+ print(f"📊 Settings: max_tokens={max_tokens}, temperature={temperature}")
+
  response_content = generate_response(
  messages=messages,
- max_tokens=min(max_tokens, 600), # INCREASED cap to 600
+ is_force_mode=is_force_mode,
+ max_tokens=min(max_tokens, 600),
  temperature=max(0.1, min(temperature, 1.0))
  )

@@ -234,7 +335,7 @@ async def chat_completions(request: Request):
  "id": f"chatcmpl-apollo-{hash(str(messages)) % 10000}",
  "object": "chat.completion",
  "created": int(torch.tensor(0).item()),
- "model": "qwen2-0.5b-instruct-lora",
+ "model": f"qwen2-0.5b-{'force' if is_force_mode else 'mentor'}-mode",
  "choices": [
  {
  "index": 0,
@@ -249,7 +350,8 @@ async def chat_completions(request: Request):
  "prompt_tokens": len(str(messages)),
  "completion_tokens": len(response_content),
  "total_tokens": len(str(messages)) + len(response_content)
- }
+ },
+ "apollo_mode": "force" if is_force_mode else "mentor"
  }

  except Exception as e:
@@ -259,27 +361,46 @@ async def chat_completions(request: Request):
  content={"error": f"Internal server error: {str(e)}"}
  )

- # === Test endpoint for debugging ===
+ # === Test endpoint for debugging modes ===
  @app.post("/test")
  async def test_generation(request: Request):
- """Test endpoint for debugging the model directly"""
+ """Test endpoint for debugging both modes"""
  try:
  body = await request.json()
- prompt = body.get("prompt", "Hello, how are you?")
+ prompt = body.get("prompt", "How do I print hello world in Python?")
  max_tokens = body.get("max_tokens", 300)
+ test_both_modes = body.get("test_both_modes", True)
+
+ results = {}

- messages = [
- {"role": "system", "content": "You are Apollo AI, a helpful assistant."},
+ # Test mentor mode
+ messages_mentor = [
  {"role": "user", "content": prompt}
  ]
+ mentor_response = generate_response(messages_mentor, is_force_mode=False, max_tokens=max_tokens, temperature=0.7)
+ results["mentor_mode"] = {
+ "response": mentor_response,
+ "length": len(mentor_response),
+ "mode": "mentor"
+ }

- response = generate_response(messages, max_tokens=max_tokens, temperature=0.7)
+ if test_both_modes:
+ # Test force mode
+ messages_force = [
+ {"role": "user", "content": prompt}
+ ]
+ force_response = generate_response(messages_force, is_force_mode=True, max_tokens=max_tokens, temperature=0.3)
+ results["force_mode"] = {
+ "response": force_response,
+ "length": len(force_response),
+ "mode": "force"
+ }

  return {
  "prompt": prompt,
- "response": response,
- "response_length": len(response),
- "status": "success"
+ "results": results,
+ "status": "success",
+ "cleaning": "single_backend_only"
  }

  except Exception as e:
@@ -290,6 +411,7 @@ async def test_generation(request: Request):

  if __name__ == "__main__":
  import uvicorn
- print("🚀 Starting Apollo AI Backend with FIXED response limits...")
- print("📊 Max tokens increased to 500+ for complete responses")
+ print("🚀 Starting Apollo AI Backend v2.0...")
+ print("📊 Features: Mode-specific prompts, Single powerful cleaning")
+ print("🎯 Modes: Mentor (guides learning) vs Force (direct answers)")
  uvicorn.run(app, host="0.0.0.0", port=7860)
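
Usage illustration (not part of the commit): this change makes `/v1/chat/completions` read an optional `force_mode` flag from the request body and adds an `apollo_mode` field to the response. The minimal client sketch below is an assumption-level example: the `requests` library, the localhost URL, and the OpenAI-style `choices[0]["message"]["content"]` layout are assumed (the visible hunks only show the surrounding fields such as "choices", "index", and the usage counters), while the port comes from the `uvicorn.run(app, host="0.0.0.0", port=7860)` line above.

# Hypothetical client sketch (not part of the commit): exercises the new
# force_mode flag on /v1/chat/completions. Assumes a locally running backend
# (uvicorn binds to port 7860 above) and the third-party `requests` package.
import requests

BASE_URL = "http://localhost:7860"  # assumption: local dev instance

payload = {
    "messages": [{"role": "user", "content": "How do I reverse a list in Python?"}],
    "max_tokens": 300,
    "temperature": 0.7,
    "force_mode": True,  # True -> direct answers; omit or False -> mentor mode
}

resp = requests.post(f"{BASE_URL}/v1/chat/completions", json=payload, timeout=120)
data = resp.json()

print(data.get("apollo_mode"))  # "force" or "mentor", added by this commit
# Assumption: the choices array follows the usual OpenAI chat-completion shape;
# only "choices", "index", and the usage fields are visible in the diff above.
print(data["choices"][0]["message"]["content"])

For a quick side-by-side comparison of the two behaviours, the reworked `/test` endpoint accepts `prompt`, `max_tokens`, and `test_both_modes`, and returns a `results` object with a `mentor_mode` entry and, when `test_both_modes` is true, a `force_mode` entry.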