Ais committed on
Commit b397650 · verified
Parent: fc679ee

Update app/main.py

Files changed (1)
  1. app/main.py +257 -150
app/main.py CHANGED
@@ -8,7 +8,7 @@ from starlette.middleware.cors import CORSMiddleware
8
  import re
9
 
10
  # === Setup FastAPI ===
11
- app = FastAPI(title="Apollo AI Backend", version="2.0.0")
12
 
13
  # === CORS ===
14
  app.add_middleware(
@@ -25,10 +25,12 @@ BASE_MODEL = "Qwen/Qwen2-0.5B-Instruct"
25
  ADAPTER_PATH = "adapter"
26
 
27
  # === Load Model ===
28
- print("🔧 Loading tokenizer...")
29
  tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
30
 
31
- print("🧠 Loading base model...")
32
  base_model = AutoModelForCausalLM.from_pretrained(
33
  BASE_MODEL,
34
  trust_remote_code=True,
@@ -36,48 +38,140 @@ base_model = AutoModelForCausalLM.from_pretrained(
36
  device_map="cpu"
37
  )
38
 
39
- print("🔗 Applying LoRA adapter...")
40
  model = PeftModel.from_pretrained(base_model, ADAPTER_PATH)
41
  model.eval()
42
 
43
- print("✅ Model ready!")
44
 
45
- def get_system_prompt(is_force_mode: bool) -> str:
46
  """
47
- Returns mode-specific system prompts for proper AI behavior.
48
  """
49
  if is_force_mode:
50
- return """You are Apollo AI in DIRECT ANSWER mode. Provide:
51
- - Clear, concise, direct answers
52
- - Complete working code when requested
53
- - Brief explanations (2-3 sentences max)
54
- - Immediate solutions without teaching moments
55
- - No lengthy tutorials or step-by-step guides
56
- - Get straight to the point
57
 
58
  Example:
59
- User: "How do I print hello world in Python?"
60
- You: "Use `print("Hello World")`. This function outputs text to the console."
61
  """
62
  else:
63
- return """You are Apollo AI in MENTOR mode. Your role is to guide learning:
64
- - Ask leading questions instead of giving direct answers
65
- - Provide hints and concepts, never complete solutions
66
- - Encourage thinking: "What do you think would happen if...?"
67
- - Give partial code with blanks: "Try filling in the _____ part"
68
- - Guide discovery: "Have you considered looking into...?"
69
- - Make students work for understanding
70
- - Never give full working code - always leave something for them to figure out
71
 
72
  Example:
73
- User: "How do I print hello world in Python?"
74
- You: "Great question! What function do you think might be used to display text on screen? Think about what action you want to perform. Try looking up Python's built-in functions for output."
75
  """
76
 
77
  def extract_clean_answer(full_response: str, formatted_prompt: str, user_message: str, is_force_mode: bool) -> str:
78
  """
79
- SINGLE POWERFUL CLEANING FUNCTION - The only place where response cleaning happens.
80
- All frontend cleaning is removed, this is the source of truth.
81
  """
82
  if not full_response or len(full_response.strip()) < 5:
83
  return "I apologize, but I couldn't generate a response. Please try again."
@@ -85,126 +179,135 @@ def extract_clean_answer(full_response: str, formatted_prompt: str, user_message
85
  print(f"🔍 Raw response length: {len(full_response)}")
86
  print(f"🔍 Mode: {'FORCE' if is_force_mode else 'MENTOR'}")
87
 
88
- # Step 1: Remove the input prompt to get only generated content
89
  generated_text = full_response
90
  if formatted_prompt in full_response:
91
  parts = full_response.split(formatted_prompt)
92
  if len(parts) > 1:
93
  generated_text = parts[-1]
94
 
95
- # Step 2: Extract assistant content using multiple strategies
96
  assistant_content = generated_text
97
 
98
- # Strategy A: Look for assistant tags
99
  if "<|im_start|>assistant" in generated_text:
100
  assistant_parts = generated_text.split("<|im_start|>assistant")
101
  if len(assistant_parts) > 1:
102
  assistant_content = assistant_parts[-1]
103
- # Remove end marker if present
104
  if "<|im_end|>" in assistant_content:
105
  assistant_content = assistant_content.split("<|im_end|>")[0]
106
 
107
- # Strategy B: Look for role-based prefixes
108
- elif "assistant:" in generated_text.lower():
109
- parts = generated_text.lower().split("assistant:")
110
- if len(parts) > 1:
111
- # Get the content after the last "assistant:" occurrence
112
- assistant_content = generated_text[generated_text.lower().rfind("assistant:") + 10:]
113
-
114
- # Step 3: POWERFUL CLEANING - Remove all template artifacts
115
  clean_text = assistant_content.strip()
116
 
117
- # Remove all chat template tokens
118
  clean_text = re.sub(r'<\|im_start\|>', '', clean_text)
119
  clean_text = re.sub(r'<\|im_end\|>', '', clean_text)
120
  clean_text = re.sub(r'<\|endoftext\|>', '', clean_text)
121
 
122
- # Remove role prefixes from anywhere in text
123
  clean_text = re.sub(r'^(system|user|assistant):\s*', '', clean_text, flags=re.MULTILINE)
124
  clean_text = re.sub(r'\n(system|user|assistant):\s*', '\n', clean_text, flags=re.MULTILINE)
125
 
126
- # Remove common system prompt artifacts
127
- clean_text = re.sub(r'You are Apollo AI.*?mode[^\n]*\n?', '', clean_text, flags=re.IGNORECASE)
128
- clean_text = re.sub(r'Guidelines?:.*?\n', '', clean_text, flags=re.IGNORECASE)
129
- clean_text = re.sub(r'Example:.*?\n', '', clean_text, flags=re.IGNORECASE)
130
-
131
- # Clean up excessive whitespace but preserve formatting
132
- clean_text = re.sub(r'\n{4,}', '\n\n\n', clean_text)
133
  clean_text = clean_text.strip()
134
 
135
- # Step 4: Handle edge cases and fallbacks
136
  if not clean_text or len(clean_text) < 10:
137
- # Special handling for simple math questions
138
- if user_message and any(term in user_message.lower() for term in ['2+2', '2 + 2', 'calculate', 'what is']):
139
- if '2+2' in user_message.lower() or '2 + 2' in user_message.lower():
140
- return "4" if is_force_mode else "What do you think 2 + 2 equals? Try calculating it step by step."
141
-
142
- # Generic fallback based on mode
143
  if is_force_mode:
144
- return "I understand your question. Could you please be more specific about what you need?"
145
  else:
146
- return "That's an interesting question! What approach do you think we should take to solve this? What's your initial thought?"
147
 
148
- # Step 5: Mode-specific post-processing
149
- if is_force_mode:
150
- # For force mode, ensure response is concise
151
- if len(clean_text) > 800: # If too long, truncate but keep it coherent
152
- sentences = clean_text.split('. ')
153
- if len(sentences) > 3:
154
- clean_text = '. '.join(sentences[:3]) + '.'
155
- else:
156
- # For mentor mode, ensure it's not giving away complete solutions
157
- # Check if response contains complete code without guidance
158
- code_block_pattern = r'```[\w]*\n(.*?)\n```'
159
- code_blocks = re.findall(code_block_pattern, clean_text, re.DOTALL)
160
-
161
- for code in code_blocks:
162
- # If code looks complete and there's no guidance, add mentor touch
163
- if len(code.strip()) > 50 and 'try' not in clean_text.lower() and '?' not in clean_text:
164
- clean_text += "\n\nTry implementing this step by step. What do you think each part does?"
165
 
166
  print(f"🧹 Final cleaned answer length: {len(clean_text)}")
167
- print(f"🧹 Preview: {clean_text[:150]}...")
168
 
169
  return clean_text
170
 
171
- def generate_response(messages: list, is_force_mode: bool = False, max_tokens: int = 400, temperature: float = 0.7) -> str:
172
  """
173
- Generate response with mode-specific system prompts and proper settings.
174
  """
175
  try:
176
- # Create clean conversation with mode-specific system prompt
177
  clean_messages = []
178
 
179
- # Add mode-specific system message
180
- system_prompt = get_system_prompt(is_force_mode)
181
  clean_messages.append({
182
  "role": "system",
183
  "content": system_prompt
184
  })
185
 
186
- # Add recent conversation context (last 2-3 messages, but filter appropriately)
187
- recent_messages = messages[-3:] if len(messages) > 3 else messages
188
- for msg in recent_messages:
189
- if msg.get("role") in ["user", "assistant"]:
190
- # Skip system messages from frontend to avoid conflicts
191
- if msg.get("role") == "system":
192
- continue
193
- clean_messages.append(msg)
194
 
195
- print(f"🔍 Processing {len(clean_messages)} messages in {'FORCE' if is_force_mode else 'MENTOR'} mode")
196
 
197
- # Build conversation using tokenizer's chat template
198
- formatted_prompt = tokenizer.apply_chat_template(
199
- clean_messages,
200
- tokenize=False,
201
- add_generation_prompt=True
202
- )
203
 
204
- # Tokenize with proper length limits
205
- inputs = tokenizer(formatted_prompt, return_tensors="pt", truncation=True, max_length=1500)
206
 
207
- # Mode-specific generation settings
208
  generation_params = {
209
  "input_ids": inputs.input_ids,
210
  "attention_mask": inputs.attention_mask,
@@ -214,29 +317,29 @@ def generate_response(messages: list, is_force_mode: bool = False, max_tokens: i
214
  }
215
 
216
  if is_force_mode:
217
- # Force mode: Direct, concise answers
218
  generation_params.update({
219
- "max_new_tokens": min(max_tokens, 300), # Shorter responses
220
- "temperature": 0.3, # More focused
221
- "top_p": 0.8,
222
- "repetition_penalty": 1.1,
223
- "length_penalty": 0.8, # Encourage shorter responses
224
  })
225
  else:
226
- # Mentor mode: More thoughtful, questioning responses
227
  generation_params.update({
228
- "max_new_tokens": min(max_tokens, 500), # Allow longer explanations
229
- "temperature": 0.7, # More creative for questions
230
- "top_p": 0.9,
231
- "repetition_penalty": 1.05,
232
- "length_penalty": 1.0, # Neutral length
233
  })
234
 
235
- # Generate response
236
  with torch.no_grad():
237
  outputs = model.generate(**generation_params)
238
 
239
- # Decode the full response
240
  full_response = tokenizer.decode(outputs[0], skip_special_tokens=False)
241
 
242
  # Extract user message for context
@@ -246,33 +349,39 @@ def generate_response(messages: list, is_force_mode: bool = False, max_tokens: i
246
  user_message = msg.get("content", "")
247
  break
248
 
249
- # Clean and extract the answer using our SINGLE POWERFUL cleaning function
250
  clean_answer = extract_clean_answer(full_response, formatted_prompt, user_message, is_force_mode)
251
 
252
  return clean_answer
253
 
254
  except Exception as e:
255
- print(f"❌ Generation error: {e}")
256
- mode_text = "direct answer" if is_force_mode else "guided learning approach"
257
- return f"I encountered an error while generating a {mode_text}. Please try rephrasing your question."
258
 
259
  # === Routes ===
260
  @app.get("/")
261
  def root():
262
  return {
263
- "message": "🤖 Apollo AI Backend v2.0 - Mode-Specific AI",
264
- "model": "Qwen2-0.5B-Instruct with LoRA",
265
  "status": "ready",
266
- "features": ["mentor_mode", "force_mode", "single_powerful_cleaning"],
267
  "modes": {
268
- "mentor": "Guides learning with questions and hints",
269
- "force": "Provides direct answers and solutions"
270
  }
271
  }
272
 
273
  @app.get("/health")
274
  def health():
275
- return {"status": "healthy", "model_loaded": True, "cleaning": "single_backend_only"}
276
 
277
  @app.post("/v1/chat/completions")
278
  async def chat_completions(request: Request):
@@ -295,11 +404,11 @@ async def chat_completions(request: Request):
295
  try:
296
  body = await request.json()
297
  messages = body.get("messages", [])
298
- max_tokens = body.get("max_tokens", 400)
299
- temperature = body.get("temperature", 0.7)
300
 
301
- # NEW: Get mode information from request
302
- is_force_mode = body.get("force_mode", False) # Default to mentor mode
303
 
304
  if not messages or not isinstance(messages, list):
305
  raise ValueError("Messages field is required and must be a list")
@@ -310,7 +419,7 @@ async def chat_completions(request: Request):
310
  content={"error": f"Invalid request body: {str(e)}"}
311
  )
312
 
313
- # Validate messages format
314
  for i, msg in enumerate(messages):
315
  if not isinstance(msg, dict) or "role" not in msg or "content" not in msg:
316
  return JSONResponse(
@@ -319,20 +428,19 @@ async def chat_completions(request: Request):
319
  )
320
 
321
  try:
322
- # Generate response with mode-specific behavior
323
- print(f"📥 Processing {len(messages)} messages in {'FORCE' if is_force_mode else 'MENTOR'} mode")
324
  print(f"📊 Settings: max_tokens={max_tokens}, temperature={temperature}")
325
 
326
  response_content = generate_response(
327
  messages=messages,
328
  is_force_mode=is_force_mode,
329
- max_tokens=min(max_tokens, 600),
330
- temperature=max(0.1, min(temperature, 1.0))
331
  )
332
 
333
  # Return OpenAI-compatible response
334
  return {
335
- "id": f"chatcmpl-apollo-{hash(str(messages)) % 10000}",
336
  "object": "chat.completion",
337
  "created": int(torch.tensor(0).item()),
338
  "model": f"qwen2-0.5b-{'force' if is_force_mode else 'mentor'}-mode",
@@ -351,7 +459,8 @@ async def chat_completions(request: Request):
351
  "completion_tokens": len(response_content),
352
  "total_tokens": len(str(messages)) + len(response_content)
353
  },
354
- "apollo_mode": "force" if is_force_mode else "mentor"
355
  }
356
 
357
  except Exception as e:
@@ -361,23 +470,21 @@ async def chat_completions(request: Request):
361
  content={"error": f"Internal server error: {str(e)}"}
362
  )
363
 
364
- # === Test endpoint for debugging modes ===
365
  @app.post("/test")
366
  async def test_generation(request: Request):
367
- """Test endpoint for debugging both modes"""
368
  try:
369
  body = await request.json()
370
  prompt = body.get("prompt", "How do I print hello world in Python?")
371
- max_tokens = body.get("max_tokens", 300)
372
  test_both_modes = body.get("test_both_modes", True)
373
 
374
  results = {}
375
 
376
  # Test mentor mode
377
- messages_mentor = [
378
- {"role": "user", "content": prompt}
379
- ]
380
- mentor_response = generate_response(messages_mentor, is_force_mode=False, max_tokens=max_tokens, temperature=0.7)
381
  results["mentor_mode"] = {
382
  "response": mentor_response,
383
  "length": len(mentor_response),
@@ -386,10 +493,8 @@ async def test_generation(request: Request):
386
 
387
  if test_both_modes:
388
  # Test force mode
389
- messages_force = [
390
- {"role": "user", "content": prompt}
391
- ]
392
- force_response = generate_response(messages_force, is_force_mode=True, max_tokens=max_tokens, temperature=0.3)
393
  results["force_mode"] = {
394
  "response": force_response,
395
  "length": len(force_response),
@@ -399,8 +504,9 @@ async def test_generation(request: Request):
399
  return {
400
  "prompt": prompt,
401
  "results": results,
402
- "status": "success",
403
- "cleaning": "single_backend_only"
404
  }
405
 
406
  except Exception as e:
@@ -411,7 +517,8 @@ async def test_generation(request: Request):
411
 
412
  if __name__ == "__main__":
413
  import uvicorn
414
- print("🚀 Starting Apollo AI Backend v2.0...")
415
- print("📊 Features: Mode-specific prompts, Single powerful cleaning")
416
- print("🎯 Modes: Mentor (guides learning) vs Force (direct answers)")
417
  uvicorn.run(app, host="0.0.0.0", port=7860)

app/main.py (after change)
8
  import re
9
 
10
  # === Setup FastAPI ===
11
+ app = FastAPI(title="Apollo AI Backend - Qwen2-0.5B Optimized", version="2.1.0")
12
 
13
  # === CORS ===
14
  app.add_middleware(
 
25
  ADAPTER_PATH = "adapter"
26
 
27
  # === Load Model ===
28
+ print("🔧 Loading tokenizer for Qwen2-0.5B...")
29
  tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
30
+ if tokenizer.pad_token is None:
31
+ tokenizer.pad_token = tokenizer.eos_token
32
 
33
+ print("🧠 Loading Qwen2-0.5B base model...")
34
  base_model = AutoModelForCausalLM.from_pretrained(
35
  BASE_MODEL,
36
  trust_remote_code=True,
 
38
  device_map="cpu"
39
  )
40
 
41
+ print("🔗 Applying LoRA adapter to Qwen2-0.5B...")
42
  model = PeftModel.from_pretrained(base_model, ADAPTER_PATH)
43
  model.eval()
44
 
45
+ print("✅ Qwen2-0.5B model ready with optimized settings!")
46
 
47
+ def get_simple_system_prompt(is_force_mode: bool) -> str:
48
  """
49
+ SIMPLIFIED system prompts optimized for Qwen2-0.5B's 500M parameters.
50
+ Shorter, clearer instructions that small models can follow better.
51
  """
52
  if is_force_mode:
53
+ return """You are Apollo AI. Give direct, complete answers.
54
+
55
+ Rules:
56
+ - Provide full working code
57
+ - Be concise, max 3 sentences explanation
58
+ - Never ask questions back
59
+ - Give complete solutions immediately
60
 
61
  Example:
62
+ User: "print hello world python"
63
+ You: "Use print('Hello World'). This outputs text to console."
64
  """
65
  else:
66
+ return """You are Apollo AI tutor. Guide learning with questions.
67
+
68
+ Rules:
69
+ - Ask guiding questions instead of giving answers
70
+ - Never give complete working code
71
+ - Use hints and partial examples only
72
+ - Make students think and discover
 
 
74
  Example:
75
+ User: "print hello world python"
76
+ You: "What function displays text in Python? Try looking up output functions."
77
  """
78
 
79
+ def create_simple_force_responses(user_message: str) -> str:
80
+ """
81
+ Pre-defined responses for common questions in force mode.
82
+ This helps the 0.5B model give consistent direct answers.
83
+ """
84
+ user_lower = user_message.lower()
85
+
86
+ # Python print
87
+ if 'print' in user_lower and ('hello' in user_lower or 'world' in user_lower):
88
+ return 'Use `print("Hello World")`. This function outputs text to the console.'
89
+
90
+ # Basic math
91
+ if '2+2' in user_lower or '2 + 2' in user_lower:
92
+ return '2 + 2 = 4. Addition combines two numbers to get their sum.'
93
+
94
+ # Python variable
95
+ if 'variable' in user_lower and ('python' in user_lower or 'create' in user_lower):
96
+ return 'Use `name = "value"`. Variables store data: `x = 5` or `text = "hello"`.'
97
+
98
+ # Python list
99
+ if 'list' in user_lower and 'python' in user_lower and 'create' in user_lower:
100
+ return 'Use square brackets: `my_list = [1, 2, 3]`. Lists store multiple items.'
101
+
102
+ # Python function
103
+ if 'function' in user_lower and 'python' in user_lower and ('create' in user_lower or 'define' in user_lower):
104
+ return '''Use def keyword:
105
+ ```python
106
+ def my_function():
107
+ return "Hello"
108
+ ```
109
+ Functions are reusable code blocks.'''
110
+
111
+ # Calculator
112
+ if 'calculator' in user_lower and ('create' in user_lower or 'make' in user_lower or 'build' in user_lower):
113
+ return '''Here's a simple calculator:
114
+ ```python
115
+ a = float(input("First number: "))
116
+ b = float(input("Second number: "))
117
+ op = input("Operator (+,-,*,/): ")
118
+ if op == '+': print(a + b)
119
+ elif op == '-': print(a - b)
120
+ elif op == '*': print(a * b)
121
+ elif op == '/': print(a / b)
122
+ ```
123
+ This performs basic math operations.'''
124
+
125
+ return None
126
+
127
+ def create_simple_mentor_responses(user_message: str) -> str:
128
+ """
129
+ Pre-defined mentor responses for common questions.
130
+ This helps the 0.5B model give consistent guided learning.
131
+ """
132
+ user_lower = user_message.lower()
133
+
134
+ # Python print
135
+ if 'print' in user_lower and ('hello' in user_lower or 'world' in user_lower):
136
+ return 'What function do you think displays text in Python? Think about showing output. What would it be called?'
137
+
138
+ # Basic math
139
+ if '2+2' in user_lower or '2 + 2' in user_lower:
140
+ return 'What do you think 2 + 2 equals? Try calculating it step by step.'
141
+
142
+ # Python variable
143
+ if 'variable' in user_lower and ('python' in user_lower or 'create' in user_lower):
144
+ return 'How do you think Python stores data? What symbol might assign a value to a name? Try: name = value'
145
+
146
+ # Python list
147
+ if 'list' in user_lower and 'python' in user_lower and 'create' in user_lower:
148
+ return 'What brackets do you think hold multiple items? Try making a list with [item1, item2]. What goes inside?'
149
+
150
+ # Python function
151
+ if 'function' in user_lower and 'python' in user_lower and ('create' in user_lower or 'define' in user_lower):
152
+ return '''What keyword defines a function in Python? Try this structure:
153
+ ```python
154
+ ___ function_name():
155
+ # your code here
156
+ ```
157
+ What goes in the blank? How would you call it?'''
158
+
159
+ # Calculator
160
+ if 'calculator' in user_lower and ('create' in user_lower or 'make' in user_lower or 'build' in user_lower):
161
+ return '''What steps would a calculator need?
162
+ 1. Get two numbers from user - what function gets input?
163
+ 2. Get operation (+,-,*,/) - how to choose?
164
+ 3. Calculate result - what structure handles choices?
165
+ 4. Show result - what displays output?
166
+
167
+ Try building step 1 first. What function gets user input?'''
168
+
169
+ return None
170
+
171
  def extract_clean_answer(full_response: str, formatted_prompt: str, user_message: str, is_force_mode: bool) -> str:
172
  """
173
+ Optimized cleaning for Qwen2-0.5B responses.
174
+ Simpler extraction since 0.5B models produce cleaner output.
175
  """
176
  if not full_response or len(full_response.strip()) < 5:
177
  return "I apologize, but I couldn't generate a response. Please try again."
 
179
  print(f"🔍 Raw response length: {len(full_response)}")
180
  print(f"🔍 Mode: {'FORCE' if is_force_mode else 'MENTOR'}")
181
 
182
+ # Check for pre-defined responses first
183
+ if is_force_mode:
184
+ predefined = create_simple_force_responses(user_message)
185
+ if predefined:
186
+ print("✅ Using predefined force response")
187
+ return predefined
188
+ else:
189
+ predefined = create_simple_mentor_responses(user_message)
190
+ if predefined:
191
+ print("✅ Using predefined mentor response")
192
+ return predefined
193
+
194
+ # Step 1: Remove the input prompt
195
  generated_text = full_response
196
  if formatted_prompt in full_response:
197
  parts = full_response.split(formatted_prompt)
198
  if len(parts) > 1:
199
  generated_text = parts[-1]
200
 
201
+ # Step 2: Extract assistant content - simplified for 0.5B
202
  assistant_content = generated_text
203
 
204
+ # Look for assistant markers
205
  if "<|im_start|>assistant" in generated_text:
206
  assistant_parts = generated_text.split("<|im_start|>assistant")
207
  if len(assistant_parts) > 1:
208
  assistant_content = assistant_parts[-1]
209
  if "<|im_end|>" in assistant_content:
210
  assistant_content = assistant_content.split("<|im_end|>")[0]
211
 
212
+ # Step 3: Basic cleaning - gentler for 0.5B
213
  clean_text = assistant_content.strip()
214
 
215
+ # Remove template tokens
216
  clean_text = re.sub(r'<\|im_start\|>', '', clean_text)
217
  clean_text = re.sub(r'<\|im_end\|>', '', clean_text)
218
  clean_text = re.sub(r'<\|endoftext\|>', '', clean_text)
219
 
220
+ # Remove role prefixes
221
  clean_text = re.sub(r'^(system|user|assistant):\s*', '', clean_text, flags=re.MULTILINE)
222
  clean_text = re.sub(r'\n(system|user|assistant):\s*', '\n', clean_text, flags=re.MULTILINE)
223
 
224
+ # Clean whitespace
225
+ clean_text = re.sub(r'\n{3,}', '\n\n', clean_text)
226
  clean_text = clean_text.strip()
227
 
228
+ # Step 4: Fallback handling for 0.5B
229
  if not clean_text or len(clean_text) < 10:
230
  if is_force_mode:
231
+ return "Could you please be more specific about what you need?"
232
  else:
233
+ return "What specific aspect would you like to explore? What's your approach?"
234
 
235
+ # Step 5: Length control for 0.5B
236
+ if len(clean_text) > 500: # Keep responses shorter for 0.5B
237
+ sentences = clean_text.split('. ')
238
+ if len(sentences) > 3:
239
+ clean_text = '. '.join(sentences[:3]) + '.'
240
 
241
  print(f"🧹 Final cleaned answer length: {len(clean_text)}")
242
 
243
  return clean_text
244
 
245
+ def generate_response(messages: list, is_force_mode: bool = False, max_tokens: int = 200, temperature: float = 0.7) -> str:
246
  """
247
+ Optimized generation for Qwen2-0.5B with shorter contexts and conservative settings.
248
  """
249
  try:
250
+ # Check for simple predefined responses first
251
+ if messages and len(messages) > 0:
252
+ last_user_msg = ""
253
+ for msg in reversed(messages):
254
+ if msg.get("role") == "user":
255
+ last_user_msg = msg.get("content", "")
256
+ break
257
+
258
+ if last_user_msg:
259
+ if is_force_mode:
260
+ predefined = create_simple_force_responses(last_user_msg)
261
+ if predefined:
262
+ return predefined
263
+ else:
264
+ predefined = create_simple_mentor_responses(last_user_msg)
265
+ if predefined:
266
+ return predefined
267
+
268
+ # Build simple conversation for 0.5B model
269
  clean_messages = []
270
 
271
+ # Add simple system prompt
272
+ system_prompt = get_simple_system_prompt(is_force_mode)
273
  clean_messages.append({
274
  "role": "system",
275
  "content": system_prompt
276
  })
277
 
278
+ # Add only the last user message to keep context short for 0.5B
279
+ if messages and len(messages) > 0:
280
+ for msg in reversed(messages):
281
+ if msg.get("role") == "user":
282
+ clean_messages.append({
283
+ "role": "user",
284
+ "content": msg.get("content", "")
285
+ })
286
+ break
287
 
288
+ print(f"🔍 Processing {len(clean_messages)} messages for Qwen2-0.5B in {'FORCE' if is_force_mode else 'MENTOR'} mode")
289
 
290
+ # Apply chat template
291
+ try:
292
+ formatted_prompt = tokenizer.apply_chat_template(
293
+ clean_messages,
294
+ tokenize=False,
295
+ add_generation_prompt=True
296
+ )
297
+ except Exception as e:
298
+ print(f"⚠️ Chat template failed, using simple format: {e}")
299
+ # Fallback to simple format
300
+ formatted_prompt = f"System: {clean_messages[0]['content']}\nUser: {clean_messages[1]['content']}\nAssistant:"
301
 
302
+ # Tokenize with conservative limits for 0.5B
303
+ inputs = tokenizer(
304
+ formatted_prompt,
305
+ return_tensors="pt",
306
+ truncation=True,
307
+ max_length=800 # Shorter context for 0.5B
308
+ )
309
 
310
+ # Conservative generation settings for 0.5B model
311
  generation_params = {
312
  "input_ids": inputs.input_ids,
313
  "attention_mask": inputs.attention_mask,
 
317
  }
318
 
319
  if is_force_mode:
320
+ # Force mode: Very conservative for 0.5B
321
  generation_params.update({
322
+ "max_new_tokens": min(max_tokens, 150), # Very short
323
+ "temperature": 0.1, # Very focused
324
+ "top_p": 0.7,
325
+ "top_k": 20,
326
+ "repetition_penalty": 1.05,
327
  })
328
  else:
329
+ # Mentor mode: Still conservative but allows more creativity
330
  generation_params.update({
331
+ "max_new_tokens": min(max_tokens, 200),
332
+ "temperature": 0.3, # Lower than original
333
+ "top_p": 0.8,
334
+ "top_k": 30,
335
+ "repetition_penalty": 1.02,
336
  })
337
 
338
+ # Generate with timeout for 0.5B
339
  with torch.no_grad():
340
  outputs = model.generate(**generation_params)
341
 
342
+ # Decode response
343
  full_response = tokenizer.decode(outputs[0], skip_special_tokens=False)
344
 
345
  # Extract user message for context
 
349
  user_message = msg.get("content", "")
350
  break
351
 
352
+ # Clean and return
353
  clean_answer = extract_clean_answer(full_response, formatted_prompt, user_message, is_force_mode)
354
 
355
  return clean_answer
356
 
357
  except Exception as e:
358
+ print(f"❌ Generation error with Qwen2-0.5B: {e}")
359
+ mode_text = "direct answer" if is_force_mode else "guided learning"
360
+ return f"I encountered an error generating a {mode_text}. Please try a simpler question."
361
 
362
  # === Routes ===
363
  @app.get("/")
364
  def root():
365
  return {
366
+ "message": "🤖 Apollo AI Backend v2.1 - Qwen2-0.5B Optimized",
367
+ "model": "Qwen/Qwen2-0.5B-Instruct with LoRA",
368
  "status": "ready",
369
+ "optimizations": ["short_contexts", "conservative_generation", "predefined_responses"],
370
+ "features": ["mentor_mode", "force_mode", "0.5B_optimized"],
371
  "modes": {
372
+ "mentor": "Guides learning with simple questions",
373
+ "force": "Provides direct answers quickly"
374
  }
375
  }
376
 
377
  @app.get("/health")
378
  def health():
379
+ return {
380
+ "status": "healthy",
381
+ "model_loaded": True,
382
+ "model_size": "0.5B",
383
+ "optimizations": "qwen2_0.5B_specific"
384
+ }
385
 
386
  @app.post("/v1/chat/completions")
387
  async def chat_completions(request: Request):
 
404
  try:
405
  body = await request.json()
406
  messages = body.get("messages", [])
407
+ max_tokens = min(body.get("max_tokens", 200), 300) # Cap at 300 for 0.5B
408
+ temperature = max(0.1, min(body.get("temperature", 0.5), 0.8)) # Conservative range
409
 
410
+ # Get mode information
411
+ is_force_mode = body.get("force_mode", False)
412
 
413
  if not messages or not isinstance(messages, list):
414
  raise ValueError("Messages field is required and must be a list")
 
419
  content={"error": f"Invalid request body: {str(e)}"}
420
  )
421
 
422
+ # Validate messages
423
  for i, msg in enumerate(messages):
424
  if not isinstance(msg, dict) or "role" not in msg or "content" not in msg:
425
  return JSONResponse(
 
428
  )
429
 
430
  try:
431
+ print(f"📥 Processing request for Qwen2-0.5B in {'FORCE' if is_force_mode else 'MENTOR'} mode")
432
  print(f"📊 Settings: max_tokens={max_tokens}, temperature={temperature}")
433
 
434
  response_content = generate_response(
435
  messages=messages,
436
  is_force_mode=is_force_mode,
437
+ max_tokens=max_tokens,
438
+ temperature=temperature
439
  )
440
 
441
  # Return OpenAI-compatible response
442
  return {
443
+ "id": f"chatcmpl-apollo-qwen05b-{hash(str(messages)) % 10000}",
444
  "object": "chat.completion",
445
  "created": int(torch.tensor(0).item()),
446
  "model": f"qwen2-0.5b-{'force' if is_force_mode else 'mentor'}-mode",
 
459
  "completion_tokens": len(response_content),
460
  "total_tokens": len(str(messages)) + len(response_content)
461
  },
462
+ "apollo_mode": "force" if is_force_mode else "mentor",
463
+ "model_optimizations": "qwen2_0.5B_specific"
464
  }
465
 
466
  except Exception as e:
 
470
  content={"error": f"Internal server error: {str(e)}"}
471
  )
472
 
473
+ # === Test endpoint optimized for 0.5B ===
474
  @app.post("/test")
475
  async def test_generation(request: Request):
476
+ """Test endpoint for debugging both modes with 0.5B optimizations"""
477
  try:
478
  body = await request.json()
479
  prompt = body.get("prompt", "How do I print hello world in Python?")
480
+ max_tokens = min(body.get("max_tokens", 200), 300)
481
  test_both_modes = body.get("test_both_modes", True)
482
 
483
  results = {}
484
 
485
  # Test mentor mode
486
+ messages_mentor = [{"role": "user", "content": prompt}]
487
+ mentor_response = generate_response(messages_mentor, is_force_mode=False, max_tokens=max_tokens, temperature=0.3)
488
  results["mentor_mode"] = {
489
  "response": mentor_response,
490
  "length": len(mentor_response),
 
493
 
494
  if test_both_modes:
495
  # Test force mode
496
+ messages_force = [{"role": "user", "content": prompt}]
497
+ force_response = generate_response(messages_force, is_force_mode=True, max_tokens=max_tokens, temperature=0.1)
498
  results["force_mode"] = {
499
  "response": force_response,
500
  "length": len(force_response),
 
504
  return {
505
  "prompt": prompt,
506
  "results": results,
507
+ "model": "Qwen2-0.5B-Instruct",
508
+ "optimizations": "0.5B_specific",
509
+ "status": "success"
510
  }
511
 
512
  except Exception as e:
 
517
 
518
  if __name__ == "__main__":
519
  import uvicorn
520
+ print("🚀 Starting Apollo AI Backend v2.1 - Qwen2-0.5B Optimized...")
521
+ print("🧠 Model: Qwen/Qwen2-0.5B-Instruct (500M parameters)")
522
+ print(" Optimizations: Short contexts, conservative generation, predefined responses")
523
+ print("🎯 Modes: Mentor (simple questions) vs Force (direct answers)")
524
  uvicorn.run(app, host="0.0.0.0", port=7860)
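
For quick verification of the two modes added in this commit, the sketch below shows one way a client could call the updated `/v1/chat/completions` endpoint. It is not part of the commit: the base URL, the `requests` dependency, and the helper name `ask` are assumptions, and the answer is read from the standard OpenAI "choices" layout that the handler's "Return OpenAI-compatible response" comment refers to.

```python
# Minimal usage sketch (assumptions: the backend above is running at
# http://localhost:7860, and the `requests` package is installed).
import requests

BASE_URL = "http://localhost:7860"  # port used by uvicorn.run() in app/main.py


def ask(prompt: str, force_mode: bool) -> str:
    payload = {
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 200,         # the handler caps this at 300 for the 0.5B model
        "temperature": 0.5,        # the handler clamps this to the 0.1-0.8 range
        "force_mode": force_mode,  # False = mentor mode, True = direct answers
    }
    resp = requests.post(f"{BASE_URL}/v1/chat/completions", json=payload, timeout=120)
    resp.raise_for_status()
    data = resp.json()
    # "apollo_mode" is the custom field returned by this version of the backend;
    # the generated text is assumed to sit in the usual OpenAI "choices" structure.
    print("mode:", data.get("apollo_mode"))
    return data["choices"][0]["message"]["content"]


if __name__ == "__main__":
    print(ask("How do I print hello world in Python?", force_mode=False))
    print(ask("How do I print hello world in Python?", force_mode=True))
```

The `/test` endpoint shown in the diff does a similar comparison server-side: POST a JSON body with `prompt` (and `test_both_modes`, which defaults to true) and it returns both the mentor-mode and force-mode responses in one reply.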