Ais
committed on
Update app/main.py
app/main.py: +131 -92
app/main.py CHANGED
@@ -7,7 +7,7 @@ from peft import PeftModel
 from starlette.middleware.cors import CORSMiddleware
 
 # === Setup FastAPI ===
-app = FastAPI(title="Apollo AI Backend - Qwen2-0.5B", version="4.…
+app = FastAPI(title="Apollo AI Backend - Qwen2-0.5B", version="4.1.0-TRULY-FIXED")
 
 # === CORS ===
 app.add_middleware(
@@ -46,39 +46,31 @@ print("✅ Qwen2-0.5B model ready!")
 def create_conversation_prompt(messages: list, is_force_mode: bool) -> str:
     """Create a conversation prompt with clear mode instructions"""
 
+    # Get the last user message
+    last_message = messages[-1].get("content", "") if messages else ""
+
     if is_force_mode:
-…
+        # FORCE MODE: Direct, complete answers
+        system_instruction = """You are a helpful programming assistant. Answer directly and completely. Provide clear explanations with code examples when relevant. Don't ask questions back to the user."""
+
+        prompt = f"""<|im_start|>system
+{system_instruction}<|im_end|>
+<|im_start|>user
+{last_message}<|im_end|>
+<|im_start|>assistant
+"""
     else:
-…
-    # Build conversation
-    conversation = f"<|im_start|>system\n{system_prompt}<|im_end|>\n"
-…
-    # Add conversation history (last 4 messages for context)
-    recent_messages = messages[-4:] if len(messages) > 4 else messages
-…
-    for msg in recent_messages:
-        role = msg.get("role", "")
-        content = msg.get("content", "")
-        conversation += f"<|im_start|>{role}\n{content}<|im_end|>\n"
+        # MENTOR MODE: Guide with questions
+        system_instruction = """You are a programming mentor. Guide students to discover answers through questions and hints. Ask questions to help them think, rather than giving direct answers."""
+
+        prompt = f"""<|im_start|>system
+{system_instruction}<|im_end|>
+<|im_start|>user
+{last_message}<|im_end|>
+<|im_start|>assistant
+"""
 
-…
-    return conversation
+    return prompt
 
 def generate_response(messages: list, is_force_mode: bool = False, max_tokens: int = 200, temperature: float = 0.7) -> str:
     """Generate response using the AI model"""
@@ -88,19 +80,30 @@ def generate_response(messages: list, is_force_mode: bool = False, max_tokens: int = 200, temperature: float = 0.7) -> str:
 
         print(f"🎯 Generating {'FORCE (Direct)' if is_force_mode else 'MENTOR (Questions)'} response")
         print(f"🔍 Mode flag: {is_force_mode}")
+        print(f"📝 Prompt preview: {prompt[:200]}...")
 
         # Adjust parameters based on mode
         if is_force_mode:
-            generation_temp = 0.…
-            generation_tokens = min(max_tokens,…
+            generation_temp = 0.4  # More focused for direct answers
+            generation_tokens = min(max_tokens, 350)
+            top_p = 0.8
         else:
-            generation_temp = 0.…
+            generation_temp = 0.6  # More creative for questions
             generation_tokens = min(max_tokens, 250)
+            top_p = 0.9
 
-        # Tokenize input
-        inputs = tokenizer(…
+        # Tokenize input with proper truncation
+        inputs = tokenizer(
+            prompt,
+            return_tensors="pt",
+            max_length=1024,  # Shorter context for better responses
+            truncation=True,
+            padding=False
+        )
 
-…
+        print(f"🔢 Input tokens: {inputs.input_ids.shape[1]}")
+
+        # Generate response with better parameters
         with torch.no_grad():
             outputs = model.generate(
                 inputs.input_ids,
@@ -109,65 +112,77 @@ def generate_response(messages: list, is_force_mode: bool = False, max_tokens: int = 200, temperature: float = 0.7) -> str:
                 do_sample=True,
                 pad_token_id=tokenizer.eos_token_id,
                 eos_token_id=tokenizer.eos_token_id,
-                top_p=…
-                repetition_penalty=1.…
-                no_repeat_ngram_size=…
+                top_p=top_p,
+                repetition_penalty=1.05,  # Reduced repetition penalty
+                no_repeat_ngram_size=2,  # Reduced n-gram size
+                early_stopping=True
             )
 
-        # Decode response
-…
+        # Decode response properly
+        generated_ids = outputs[0][inputs.input_ids.shape[1]:]  # Only new tokens
+        response = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
 
-…
-        response = full_response[len(prompt):].strip()
+        print(f"🔍 Raw response: {response[:150]}...")
 
         # Clean up response
         response = response.replace("<|im_end|>", "").strip()
 
-        # Remove conversation…
-…
-            if not line.startswith(('<|im_start|>', '<|im_end|>', 'system:', 'user:', 'assistant:')):
-                clean_lines.append(line)
-…
-        response = '\n'.join(clean_lines).strip()
-…
-        # Take first paragraph if too long
-        if len(response) > max_tokens * 4:
-            paragraphs = response.split('\n\n')
-            response = paragraphs[0] if paragraphs else response[:max_tokens * 4]
+        # Remove any leftover conversation markers
+        unwanted_prefixes = ["<|im_start|>", "assistant:", "user:", "system:"]
+        for prefix in unwanted_prefixes:
+            if response.startswith(prefix):
+                response = response[len(prefix):].strip()
 
-…
-        # Simple validation - no template injection
-        if not response or len(response) < 10:
+        # Handle empty or very short responses
+        if not response or len(response) < 5:
             if is_force_mode:
-                return "I need more specific information to provide a…
+                return "I need more specific information to provide a helpful answer. Could you please clarify your question?"
             else:
-                return "That's…
+                return "That's an interesting question! What do you think the answer might be? Have you tried experimenting with it?"
+
+        # Truncate if too long but ensure complete sentences
+        if len(response) > max_tokens * 6:  # Rough character to token ratio
+            sentences = response.split('. ')
+            truncated = ""
+            for sentence in sentences:
+                if len(truncated + sentence + '. ') <= max_tokens * 5:
+                    truncated += sentence + '. '
+                else:
+                    break
+            response = truncated.rstrip()
+
+        print(f"✅ Final response length: {len(response)}")
+        print(f"📝 Response preview: {response[:100]}...")
 
         return response
 
     except Exception as e:
         print(f"❌ Generation error: {e}")
+        import traceback
+        traceback.print_exc()
+
         if is_force_mode:
-            return "I encountered an error generating a…
+            return "I encountered an error generating a response. Please try rephrasing your question."
         else:
-            return "…
+            return "That's a challenging question! What approach do you think might work? Let's explore this step by step."
 
 # === Routes ===
 @app.get("/")
 def root():
     return {
-        "message": "🤖 Apollo AI Backend v4.…
+        "message": "🤖 Apollo AI Backend v4.1-TRULY-FIXED - Qwen2-0.5B",
         "model": "Qwen/Qwen2-0.5B-Instruct with LoRA",
         "status": "ready",
         "modes": {
-            "mentor": "Guides learning with questions -…
-            "force": "Provides direct answers -…
+            "mentor": "Guides learning with questions - FIXED GENERATION",
+            "force": "Provides direct answers - FIXED GENERATION"
         },
-        "fixes":…
+        "fixes": [
+            "Fixed prompt truncation",
+            "Improved token generation",
+            "Better response cleaning",
+            "Proper mode detection"
+        ]
     }
 
 @app.get("/health")
@@ -176,7 +191,7 @@ def health():
         "status": "healthy",
         "model_loaded": True,
         "model_size": "0.5B",
-        "version": "4.…
+        "version": "4.1-TRULY-FIXED"
     }
 
 @app.post("/v1/chat/completions")
@@ -200,19 +215,28 @@ async def chat_completions(request: Request):
     try:
         body = await request.json()
         messages = body.get("messages", [])
-        max_tokens = min(body.get("max_tokens",…
+        max_tokens = min(body.get("max_tokens", 300), 500)  # Increased default
        temperature = max(0.1, min(body.get("temperature", 0.7), 1.0))
 
-        # Get force mode flag
-        is_force_mode =…
+        # CRITICAL: Get force mode flag - check multiple possible names
+        is_force_mode = (
+            body.get("force_mode", False) or
+            body.get("forceMode", False) or
+            body.get("force", False)
+        )
 
-        print(f"🚨 REQUEST RECEIVED…
-        print(f"…
+        print(f"🚨 REQUEST RECEIVED")
+        print(f"🎯 Force mode detected: {is_force_mode}")
+        print(f"📊 Max tokens: {max_tokens}, Temperature: {temperature}")
+        print(f"📝 Messages count: {len(messages)}")
+        if messages:
+            print(f"📝 Last message: {messages[-1].get('content', '')[:100]}...")
 
         if not messages or not isinstance(messages, list):
             raise ValueError("Messages field is required and must be a list")
 
     except Exception as e:
+        print(f"❌ Request parsing error: {e}")
         return JSONResponse(
             status_code=400,
             content={"error": f"Invalid request body: {str(e)}"}
@@ -227,9 +251,10 @@ async def chat_completions(request: Request):
         )
 
     try:
-        print(f"…
+        print(f"🔄 Processing with {len(messages)} messages")
+        print(f"🎯 Mode: {'FORCE (Direct Answer)' if is_force_mode else 'MENTOR (Guiding Questions)'}")
 
-        # Generate response
+        # Generate response
         response_content = generate_response(
             messages=messages,
             is_force_mode=is_force_mode,
@@ -237,13 +262,19 @@ async def chat_completions(request: Request):
             temperature=temperature
         )
 
-…
+        # Validate response
+        if not response_content or len(response_content.strip()) < 10:
+            response_content = "I apologize, but I couldn't generate a proper response. Please try rephrasing your question."
+
+        print(f"✅ Response generated successfully")
+        print(f"📊 Response length: {len(response_content)}")
+        print(f"🔍 Mode used: {'force_direct' if is_force_mode else 'mentor_questions'}")
 
         return {
-            "id": f"chatcmpl-apollo-{hash(str(messages)) % 10000}",
+            "id": f"chatcmpl-apollo-{abs(hash(str(messages))) % 10000}",
             "object": "chat.completion",
-            "created":…
-            "model": f"qwen2-0.5b-{'force' if is_force_mode else 'mentor'}-…
+            "created": 1704067200,  # Fixed timestamp
+            "model": f"qwen2-0.5b-{'force' if is_force_mode else 'mentor'}-v4.1",
             "choices": [
                 {
                     "index": 0,
@@ -255,26 +286,34 @@ async def chat_completions(request: Request):
                 }
             ],
             "usage": {
-                "prompt_tokens": len(…
-                "completion_tokens": len(response_content),
-                "total_tokens": len(…
+                "prompt_tokens": sum(len(msg.get("content", "")) for msg in messages) // 4,  # Rough estimate
+                "completion_tokens": len(response_content) // 4,  # Rough estimate
+                "total_tokens": (sum(len(msg.get("content", "")) for msg in messages) + len(response_content)) // 4
            },
-            "apollo_mode": "…
-            "pure_ai_response": True
+            "apollo_mode": "force_direct_v4.1" if is_force_mode else "mentor_questions_v4.1",
+            "pure_ai_response": True,
+            "generation_success": True
         }
 
     except Exception as e:
         print(f"❌ Chat completion error: {e}")
+        import traceback
+        traceback.print_exc()
+
         return JSONResponse(
             status_code=500,
-            content={…
+            content={
+                "error": f"Internal server error: {str(e)}",
+                "type": "generation_error",
+                "mode_requested": "force" if is_force_mode else "mentor"
+            }
         )
 
 if __name__ == "__main__":
     import uvicorn
-    print("🚀 Starting Apollo AI Backend v4.…
+    print("🚀 Starting Apollo AI Backend v4.1-TRULY-FIXED")
     print("🧠 Model: Qwen/Qwen2-0.5B-Instruct (500M parameters)")
-    print("…
-    print("…
-    print("…
+    print("🔧 Fixed: Prompt generation, token handling, response cleaning")
+    print("🎯 Mentor Mode: Guides with questions")
+    print("⚡ Force Mode: Provides direct answers")
     uvicorn.run(app, host="0.0.0.0", port=7860)
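
For reference, a minimal client sketch against the endpoint changed above, assuming the server is running locally on port 7860 as in the uvicorn.run call. The requests client, the sample question, and the OpenAI-style choices[0]["message"]["content"] access are illustrative assumptions, not part of this commit; the request fields mirror what chat_completions() reads from the body.

# Hypothetical client for the Apollo backend above (sketch, not from the commit).
import requests

payload = {
    "messages": [{"role": "user", "content": "How do I reverse a list in Python?"}],
    "max_tokens": 200,
    "temperature": 0.7,
    "force_mode": True,  # False (or omitted) selects mentor mode
}

resp = requests.post("http://localhost:7860/v1/chat/completions", json=payload)
resp.raise_for_status()
data = resp.json()

print(data["apollo_mode"])  # e.g. "force_direct_v4.1"
# Assumed OpenAI-style choice layout; the exact message shape is outside the lines shown in this diff.
print(data["choices"][0]["message"]["content"])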