Ais committed · verified
Commit 973ee50 · 1 Parent(s): 4be81ed

Update app/main.py

Files changed (1)
  1. app/main.py +88 -131
app/main.py CHANGED
@@ -7,7 +7,7 @@ from peft import PeftModel
  from starlette.middleware.cors import CORSMiddleware

  # === Setup FastAPI ===
- app = FastAPI(title="Apollo AI Backend - Qwen2-0.5B", version="4.1.0-TRULY-FIXED")

  # === CORS ===
  app.add_middleware(
@@ -46,31 +46,35 @@ print("✅ Qwen2-0.5B model ready!")
  def create_conversation_prompt(messages: list, is_force_mode: bool) -> str:
      """Create a conversation prompt with clear mode instructions"""

-     # Get the last user message
-     last_message = messages[-1].get("content", "") if messages else ""
-
      if is_force_mode:
-         # FORCE MODE: Direct, complete answers
-         system_instruction = """You are a helpful programming assistant. Answer directly and completely. Provide clear explanations with code examples when relevant. Don't ask questions back to the user."""
-
-         prompt = f"""<|im_start|>system
- {system_instruction}<|im_end|>
- <|im_start|>user
- {last_message}<|im_end|>
- <|im_start|>assistant
- """
      else:
-         # MENTOR MODE: Guide with questions
-         system_instruction = """You are a programming mentor. Guide students to discover answers through questions and hints. Ask questions to help them think, rather than giving direct answers."""
-
-         prompt = f"""<|im_start|>system
- {system_instruction}<|im_end|>
- <|im_start|>user
- {last_message}<|im_end|>
- <|im_start|>assistant
- """

-     return prompt

  def generate_response(messages: list, is_force_mode: bool = False, max_tokens: int = 200, temperature: float = 0.7) -> str:
      """Generate response using the AI model"""
@@ -80,30 +84,19 @@ def generate_response(messages: list, is_force_mode: bool = False, max_tokens: i

          print(f"🎯 Generating {'FORCE (Direct)' if is_force_mode else 'MENTOR (Questions)'} response")
          print(f"🔍 Mode flag: {is_force_mode}")
-         print(f"📝 Prompt preview: {prompt[:200]}...")

          # Adjust parameters based on mode
          if is_force_mode:
-             generation_temp = 0.4 # More focused for direct answers
-             generation_tokens = min(max_tokens, 350)
-             top_p = 0.8
          else:
-             generation_temp = 0.6 # More creative for questions
              generation_tokens = min(max_tokens, 250)
-             top_p = 0.9
-
-         # Tokenize input with proper truncation
-         inputs = tokenizer(
-             prompt,
-             return_tensors="pt",
-             max_length=1024, # Shorter context for better responses
-             truncation=True,
-             padding=False
-         )

-         print(f"🔢 Input tokens: {inputs.input_ids.shape[1]}")

-         # Generate response with better parameters
          with torch.no_grad():
              outputs = model.generate(
                  inputs.input_ids,
@@ -112,77 +105,65 @@ def generate_response(messages: list, is_force_mode: bool = False, max_tokens: i
                  do_sample=True,
                  pad_token_id=tokenizer.eos_token_id,
                  eos_token_id=tokenizer.eos_token_id,
-                 top_p=top_p,
-                 repetition_penalty=1.05, # Reduced repetition penalty
-                 no_repeat_ngram_size=2, # Reduced n-gram size
-                 early_stopping=True
              )

-         # Decode response properly
-         generated_ids = outputs[0][inputs.input_ids.shape[1]:] # Only new tokens
-         response = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

-         print(f"🔍 Raw response: {response[:150]}...")

          # Clean up response
          response = response.replace("<|im_end|>", "").strip()

-         # Remove any leftover conversation markers
-         unwanted_prefixes = ["<|im_start|>", "assistant:", "user:", "system:"]
-         for prefix in unwanted_prefixes:
-             if response.startswith(prefix):
-                 response = response[len(prefix):].strip()

-         # Handle empty or very short responses
-         if not response or len(response) < 5:
-             if is_force_mode:
-                 return "I need more specific information to provide a helpful answer. Could you please clarify your question?"
-             else:
-                 return "That's an interesting question! What do you think the answer might be? Have you tried experimenting with it?"

-         # Truncate if too long but ensure complete sentences
-         if len(response) > max_tokens * 6: # Rough character to token ratio
-             sentences = response.split('. ')
-             truncated = ""
-             for sentence in sentences:
-                 if len(truncated + sentence + '. ') <= max_tokens * 5:
-                     truncated += sentence + '. '
-                 else:
-                     break
-             response = truncated.rstrip()

-         print(f"✅ Final response length: {len(response)}")
-         print(f"📝 Response preview: {response[:100]}...")

          return response

      except Exception as e:
          print(f"❌ Generation error: {e}")
-         import traceback
-         traceback.print_exc()
-
          if is_force_mode:
-             return "I encountered an error generating a response. Please try rephrasing your question."
          else:
-             return "That's a challenging question! What approach do you think might work? Let's explore this step by step."

  # === Routes ===
  @app.get("/")
  def root():
      return {
-         "message": "🤖 Apollo AI Backend v4.1-TRULY-FIXED - Qwen2-0.5B",
          "model": "Qwen/Qwen2-0.5B-Instruct with LoRA",
          "status": "ready",
          "modes": {
-             "mentor": "Guides learning with questions - FIXED GENERATION",
-             "force": "Provides direct answers - FIXED GENERATION"
          },
-         "fixes": [
-             "Fixed prompt truncation",
-             "Improved token generation",
-             "Better response cleaning",
-             "Proper mode detection"
-         ]
      }

  @app.get("/health")
@@ -191,7 +172,7 @@ def health():
          "status": "healthy",
          "model_loaded": True,
          "model_size": "0.5B",
-         "version": "4.1-TRULY-FIXED"
      }

  @app.post("/v1/chat/completions")
@@ -215,28 +196,19 @@ async def chat_completions(request: Request):
      try:
          body = await request.json()
          messages = body.get("messages", [])
-         max_tokens = min(body.get("max_tokens", 300), 500) # Increased default
          temperature = max(0.1, min(body.get("temperature", 0.7), 1.0))

-         # CRITICAL: Get force mode flag - check multiple possible names
-         is_force_mode = (
-             body.get("force_mode", False) or
-             body.get("forceMode", False) or
-             body.get("force", False)
-         )

-         print(f"🚨 REQUEST RECEIVED")
-         print(f"🎯 Force mode detected: {is_force_mode}")
-         print(f"📊 Max tokens: {max_tokens}, Temperature: {temperature}")
-         print(f"📝 Messages count: {len(messages)}")
-         if messages:
-             print(f"📝 Last message: {messages[-1].get('content', '')[:100]}...")

          if not messages or not isinstance(messages, list):
              raise ValueError("Messages field is required and must be a list")

      except Exception as e:
-         print(f"❌ Request parsing error: {e}")
          return JSONResponse(
              status_code=400,
              content={"error": f"Invalid request body: {str(e)}"}
@@ -251,10 +223,9 @@ async def chat_completions(request: Request):
          )

      try:
-         print(f"🔄 Processing with {len(messages)} messages")
-         print(f"🎯 Mode: {'FORCE (Direct Answer)' if is_force_mode else 'MENTOR (Guiding Questions)'}")

-         # Generate response
          response_content = generate_response(
              messages=messages,
              is_force_mode=is_force_mode,
@@ -262,19 +233,13 @@ async def chat_completions(request: Request):
              temperature=temperature
          )

-         # Validate response
-         if not response_content or len(response_content.strip()) < 10:
-             response_content = "I apologize, but I couldn't generate a proper response. Please try rephrasing your question."
-
-         print(f"✅ Response generated successfully")
-         print(f"📊 Response length: {len(response_content)}")
-         print(f"🔍 Mode used: {'force_direct' if is_force_mode else 'mentor_questions'}")

          return {
-             "id": f"chatcmpl-apollo-{abs(hash(str(messages))) % 10000}",
              "object": "chat.completion",
-             "created": 1704067200, # Fixed timestamp
-             "model": f"qwen2-0.5b-{'force' if is_force_mode else 'mentor'}-v4.1",
              "choices": [
                  {
                      "index": 0,
@@ -286,34 +251,26 @@ async def chat_completions(request: Request):
                  }
              ],
              "usage": {
-                 "prompt_tokens": sum(len(msg.get("content", "")) for msg in messages) // 4, # Rough estimate
-                 "completion_tokens": len(response_content) // 4, # Rough estimate
-                 "total_tokens": (sum(len(msg.get("content", "")) for msg in messages) + len(response_content)) // 4
              },
-             "apollo_mode": "force_direct_v4.1" if is_force_mode else "mentor_questions_v4.1",
-             "pure_ai_response": True,
-             "generation_success": True
          }

      except Exception as e:
          print(f"❌ Chat completion error: {e}")
-         import traceback
-         traceback.print_exc()
-
          return JSONResponse(
              status_code=500,
-             content={
-                 "error": f"Internal server error: {str(e)}",
-                 "type": "generation_error",
-                 "mode_requested": "force" if is_force_mode else "mentor"
-             }
          )

  if __name__ == "__main__":
      import uvicorn
-     print("🚀 Starting Apollo AI Backend v4.1-TRULY-FIXED")
      print("🧠 Model: Qwen/Qwen2-0.5B-Instruct (500M parameters)")
-     print("🔧 Fixed: Prompt generation, token handling, response cleaning")
-     print("🎯 Mentor Mode: Guides with questions")
-     print(" Force Mode: Provides direct answers")
      uvicorn.run(app, host="0.0.0.0", port=7860)
 
  from starlette.middleware.cors import CORSMiddleware

  # === Setup FastAPI ===
+ app = FastAPI(title="Apollo AI Backend - Qwen2-0.5B", version="4.0.0-TRULY-FIXED")

  # === CORS ===
  app.add_middleware(
 
  def create_conversation_prompt(messages: list, is_force_mode: bool) -> str:
      """Create a conversation prompt with clear mode instructions"""

      if is_force_mode:
+         system_prompt = """You are a helpful programming assistant. Give direct, complete answers with examples. Do not ask questions back to the user. Provide clear explanations and working code when relevant.
+ When asked about Python functions, provide:
+ 1. What the function does
+ 2. Clear examples with output
+ 3. Common use cases
+ Be direct and informative."""
      else:
+         system_prompt = """You are a programming teacher focused on helping students learn through discovery. Guide students with questions and hints rather than giving direct answers.
+ When asked about concepts:
+ 1. Ask what they think might happen
+ 2. Encourage them to try things out
+ 3. Guide them to discover patterns
+ 4. Ask follow-up questions to deepen understanding
+ Help them learn by thinking, not by giving answers directly."""
+
+     # Build conversation
+     conversation = f"<|im_start|>system\n{system_prompt}<|im_end|>\n"
+
+     # Add conversation history (last 4 messages for context)
+     recent_messages = messages[-4:] if len(messages) > 4 else messages

+     for msg in recent_messages:
+         role = msg.get("role", "")
+         content = msg.get("content", "")
+         conversation += f"<|im_start|>{role}\n{content}<|im_end|>\n"
+
+     conversation += "<|im_start|>assistant\n"
+     return conversation

  def generate_response(messages: list, is_force_mode: bool = False, max_tokens: int = 200, temperature: float = 0.7) -> str:
      """Generate response using the AI model"""
 

          print(f"🎯 Generating {'FORCE (Direct)' if is_force_mode else 'MENTOR (Questions)'} response")
          print(f"🔍 Mode flag: {is_force_mode}")

          # Adjust parameters based on mode
          if is_force_mode:
+             generation_temp = 0.3 # More focused for direct answers
+             generation_tokens = min(max_tokens, 300)
          else:
+             generation_temp = 0.5 # More creative for questions
              generation_tokens = min(max_tokens, 250)

+         # Tokenize input
+         inputs = tokenizer(prompt, return_tensors="pt", max_length=1500, truncation=True)

+         # Generate response
          with torch.no_grad():
              outputs = model.generate(
                  inputs.input_ids,
 
                  do_sample=True,
                  pad_token_id=tokenizer.eos_token_id,
                  eos_token_id=tokenizer.eos_token_id,
+                 top_p=0.9,
+                 repetition_penalty=1.1,
+                 no_repeat_ngram_size=3
              )

+         # Decode response
+         full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)

+         # Extract only the new generated part
+         response = full_response[len(prompt):].strip()

          # Clean up response
          response = response.replace("<|im_end|>", "").strip()

+         # Remove conversation artifacts
+         lines = response.split('\n')
+         clean_lines = []
+         for line in lines:
+             line = line.strip()
+             if not line.startswith(('<|im_start|>', '<|im_end|>', 'system:', 'user:', 'assistant:')):
+                 clean_lines.append(line)

+         response = '\n'.join(clean_lines).strip()

+         # Take first paragraph if too long
+         if len(response) > max_tokens * 4:
+             paragraphs = response.split('\n\n')
+             response = paragraphs[0] if paragraphs else response[:max_tokens * 4]

+         print(f"✅ Generated response: {response[:100]}...")
+
+         # Simple validation - no template injection
+         if not response or len(response) < 10:
+             if is_force_mode:
+                 return "I need more specific information to provide a direct answer. Could you clarify your question?"
+             else:
+                 return "That's a great question to explore! What do you think might be the answer? Try experimenting and see what you discover!"

          return response

      except Exception as e:
          print(f"❌ Generation error: {e}")
          if is_force_mode:
+             return "I encountered an error generating a direct response. Please try rephrasing your question."
          else:
+             return "Interesting challenge! What approach do you think might work here? Let's explore this together."
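Note: the string slice full_response[len(prompt):] assumes tokenizer.decode reproduces the prompt verbatim; if the ChatML markers are registered as special tokens, skip_special_tokens=True drops them from full_response and the slice can cut into the generated reply. The removed version sliced token ids instead; a minimal sketch of that variant, reusing the inputs, outputs, and tokenizer objects above:

# Keep only the tokens generated after the prompt, then decode just those
# (this is the approach the removed code path used).
generated_ids = outputs[0][inputs.input_ids.shape[1]:]
response = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()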
 
  # === Routes ===
  @app.get("/")
  def root():
      return {
+         "message": "🤖 Apollo AI Backend v4.0-TRULY-FIXED - Qwen2-0.5B",
          "model": "Qwen/Qwen2-0.5B-Instruct with LoRA",
          "status": "ready",
          "modes": {
+             "mentor": "Guides learning with questions - REALLY FIXED",
+             "force": "Provides direct answers - REALLY FIXED"
          },
+         "fixes": "Removed all template responses, pure AI generation"
      }

  @app.get("/health")
 
          "status": "healthy",
          "model_loaded": True,
          "model_size": "0.5B",
+         "version": "4.0-TRULY-FIXED"
      }

  @app.post("/v1/chat/completions")
 
      try:
          body = await request.json()
          messages = body.get("messages", [])
+         max_tokens = min(body.get("max_tokens", 200), 400)
          temperature = max(0.1, min(body.get("temperature", 0.7), 1.0))

+         # Get force mode flag
+         is_force_mode = body.get("force_mode", False)

+         print(f"🚨 REQUEST RECEIVED - force_mode: {is_force_mode}")
+         print(f"📝 Last user message: {messages[-1].get('content', '') if messages else 'None'}")

          if not messages or not isinstance(messages, list):
              raise ValueError("Messages field is required and must be a list")

      except Exception as e:
          return JSONResponse(
              status_code=400,
              content={"error": f"Invalid request body: {str(e)}"}
 
          )

      try:
+         print(f"📥 Processing in {'FORCE (Direct Answer)' if is_force_mode else 'MENTOR (Guiding Questions)'} mode")

+         # Generate response - NO POST-PROCESSING
          response_content = generate_response(
              messages=messages,
              is_force_mode=is_force_mode,
 
              temperature=temperature
          )

+         print(f"✅ Pure AI response generated: {response_content[:150]}...")

          return {
+             "id": f"chatcmpl-apollo-{hash(str(messages)) % 10000}",
              "object": "chat.completion",
+             "created": int(torch.tensor(0).item()),
+             "model": f"qwen2-0.5b-{'force' if is_force_mode else 'mentor'}-truly-fixed",
              "choices": [
                  {
                      "index": 0,
 
                  }
              ],
              "usage": {
+                 "prompt_tokens": len(str(messages)),
+                 "completion_tokens": len(response_content),
+                 "total_tokens": len(str(messages)) + len(response_content)
              },
+             "apollo_mode": "force_direct" if is_force_mode else "mentor_questions",
+             "pure_ai_response": True
          }

      except Exception as e:
          print(f"❌ Chat completion error: {e}")
          return JSONResponse(
              status_code=500,
+             content={"error": f"Internal server error: {str(e)}"}
          )

  if __name__ == "__main__":
      import uvicorn
+     print("🚀 Starting Apollo AI Backend v4.0-TRULY-FIXED")
      print("🧠 Model: Qwen/Qwen2-0.5B-Instruct (500M parameters)")
+     print("🎯 Mentor Mode: Pure AI questions and guidance")
+     print(" Force Mode: Pure AI direct answers")
+     print("🚫 NO MORE TEMPLATES - Pure AI responses only")
      uvicorn.run(app, host="0.0.0.0", port=7860)
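Note: the usage block in the chat completion response now reports character counts (len(str(messages)), len(response_content)) rather than real token counts. If accurate numbers are needed, one option (not part of this commit) is to count with the same tokenizer; a rough sketch, assuming the prompt and response_content strings are in scope:

# Sketch only: token-based usage accounting instead of character lengths.
prompt_token_count = len(tokenizer(prompt).input_ids)
completion_token_count = len(tokenizer(response_content).input_ids)
usage = {
    "prompt_tokens": prompt_token_count,
    "completion_tokens": completion_token_count,
    "total_tokens": prompt_token_count + completion_token_count,
}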