Ais committed · verified
Commit 973ee50 · 1 Parent(s): 4be81ed

Update app/main.py

Files changed (1)
  1. app/main.py +88 -131
app/main.py CHANGED
@@ -7,7 +7,7 @@ from peft import PeftModel
  from starlette.middleware.cors import CORSMiddleware

  # === Setup FastAPI ===
- app = FastAPI(title="Apollo AI Backend - Qwen2-0.5B", version="4.1.0-TRULY-FIXED")

  # === CORS ===
  app.add_middleware(
@@ -46,31 +46,35 @@ print("✅ Qwen2-0.5B model ready!")
  def create_conversation_prompt(messages: list, is_force_mode: bool) -> str:
      """Create a conversation prompt with clear mode instructions"""

-     # Get the last user message
-     last_message = messages[-1].get("content", "") if messages else ""
-
      if is_force_mode:
-         # FORCE MODE: Direct, complete answers
-         system_instruction = """You are a helpful programming assistant. Answer directly and completely. Provide clear explanations with code examples when relevant. Don't ask questions back to the user."""
-
-         prompt = f"""<|im_start|>system
- {system_instruction}<|im_end|>
- <|im_start|>user
- {last_message}<|im_end|>
- <|im_start|>assistant
- """
      else:
-         # MENTOR MODE: Guide with questions
-         system_instruction = """You are a programming mentor. Guide students to discover answers through questions and hints. Ask questions to help them think, rather than giving direct answers."""
-
-         prompt = f"""<|im_start|>system
- {system_instruction}<|im_end|>
- <|im_start|>user
- {last_message}<|im_end|>
- <|im_start|>assistant
- """

-     return prompt

  def generate_response(messages: list, is_force_mode: bool = False, max_tokens: int = 200, temperature: float = 0.7) -> str:
      """Generate response using the AI model"""
@@ -80,30 +84,19 @@ def generate_response(messages: list, is_force_mode: bool = False, max_tokens: i

          print(f"🎯 Generating {'FORCE (Direct)' if is_force_mode else 'MENTOR (Questions)'} response")
          print(f"🔍 Mode flag: {is_force_mode}")
-         print(f"📝 Prompt preview: {prompt[:200]}...")

          # Adjust parameters based on mode
          if is_force_mode:
-             generation_temp = 0.4 # More focused for direct answers
-             generation_tokens = min(max_tokens, 350)
-             top_p = 0.8
          else:
-             generation_temp = 0.6 # More creative for questions
              generation_tokens = min(max_tokens, 250)
-             top_p = 0.9
-
-         # Tokenize input with proper truncation
-         inputs = tokenizer(
-             prompt,
-             return_tensors="pt",
-             max_length=1024, # Shorter context for better responses
-             truncation=True,
-             padding=False
-         )

-         print(f"🔢 Input tokens: {inputs.input_ids.shape[1]}")

-         # Generate response with better parameters
          with torch.no_grad():
              outputs = model.generate(
                  inputs.input_ids,
@@ -112,77 +105,65 @@ def generate_response(messages: list, is_force_mode: bool = False, max_tokens: i
                  do_sample=True,
                  pad_token_id=tokenizer.eos_token_id,
                  eos_token_id=tokenizer.eos_token_id,
-                 top_p=top_p,
-                 repetition_penalty=1.05, # Reduced repetition penalty
-                 no_repeat_ngram_size=2, # Reduced n-gram size
-                 early_stopping=True
              )

-         # Decode response properly
-         generated_ids = outputs[0][inputs.input_ids.shape[1]:] # Only new tokens
-         response = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

-         print(f"🔍 Raw response: {response[:150]}...")

          # Clean up response
          response = response.replace("<|im_end|>", "").strip()

-         # Remove any leftover conversation markers
-         unwanted_prefixes = ["<|im_start|>", "assistant:", "user:", "system:"]
-         for prefix in unwanted_prefixes:
-             if response.startswith(prefix):
-                 response = response[len(prefix):].strip()

-         # Handle empty or very short responses
-         if not response or len(response) < 5:
-             if is_force_mode:
-                 return "I need more specific information to provide a helpful answer. Could you please clarify your question?"
-             else:
-                 return "That's an interesting question! What do you think the answer might be? Have you tried experimenting with it?"

-         # Truncate if too long but ensure complete sentences
-         if len(response) > max_tokens * 6: # Rough character to token ratio
-             sentences = response.split('. ')
-             truncated = ""
-             for sentence in sentences:
-                 if len(truncated + sentence + '. ') <= max_tokens * 5:
-                     truncated += sentence + '. '
-                 else:
-                     break
-             response = truncated.rstrip()

-         print(f"✅ Final response length: {len(response)}")
-         print(f"📝 Response preview: {response[:100]}...")

          return response

      except Exception as e:
          print(f"❌ Generation error: {e}")
-         import traceback
-         traceback.print_exc()
-
          if is_force_mode:
-             return "I encountered an error generating a response. Please try rephrasing your question."
          else:
-             return "That's a challenging question! What approach do you think might work? Let's explore this step by step."

  # === Routes ===
  @app.get("/")
  def root():
      return {
-         "message": "🤖 Apollo AI Backend v4.1-TRULY-FIXED - Qwen2-0.5B",
          "model": "Qwen/Qwen2-0.5B-Instruct with LoRA",
          "status": "ready",
          "modes": {
-             "mentor": "Guides learning with questions - FIXED GENERATION",
-             "force": "Provides direct answers - FIXED GENERATION"
          },
-         "fixes": [
-             "Fixed prompt truncation",
-             "Improved token generation",
-             "Better response cleaning",
-             "Proper mode detection"
-         ]
      }

  @app.get("/health")
@@ -191,7 +172,7 @@ def health():
          "status": "healthy",
          "model_loaded": True,
          "model_size": "0.5B",
-         "version": "4.1-TRULY-FIXED"
      }

  @app.post("/v1/chat/completions")
@@ -215,28 +196,19 @@ async def chat_completions(request: Request):
      try:
          body = await request.json()
          messages = body.get("messages", [])
-         max_tokens = min(body.get("max_tokens", 300), 500) # Increased default
          temperature = max(0.1, min(body.get("temperature", 0.7), 1.0))

-         # CRITICAL: Get force mode flag - check multiple possible names
-         is_force_mode = (
-             body.get("force_mode", False) or
-             body.get("forceMode", False) or
-             body.get("force", False)
-         )

-         print(f"🚨 REQUEST RECEIVED")
-         print(f"🎯 Force mode detected: {is_force_mode}")
-         print(f"📊 Max tokens: {max_tokens}, Temperature: {temperature}")
-         print(f"📝 Messages count: {len(messages)}")
-         if messages:
-             print(f"📝 Last message: {messages[-1].get('content', '')[:100]}...")

          if not messages or not isinstance(messages, list):
              raise ValueError("Messages field is required and must be a list")

      except Exception as e:
-         print(f"❌ Request parsing error: {e}")
          return JSONResponse(
              status_code=400,
              content={"error": f"Invalid request body: {str(e)}"}
@@ -251,10 +223,9 @@ async def chat_completions(request: Request):
          )

      try:
-         print(f"🔄 Processing with {len(messages)} messages")
-         print(f"🎯 Mode: {'FORCE (Direct Answer)' if is_force_mode else 'MENTOR (Guiding Questions)'}")

-         # Generate response
          response_content = generate_response(
              messages=messages,
              is_force_mode=is_force_mode,
@@ -262,19 +233,13 @@ async def chat_completions(request: Request):
              temperature=temperature
          )

-         # Validate response
-         if not response_content or len(response_content.strip()) < 10:
-             response_content = "I apologize, but I couldn't generate a proper response. Please try rephrasing your question."
-
-         print(f"✅ Response generated successfully")
-         print(f"📊 Response length: {len(response_content)}")
-         print(f"🔍 Mode used: {'force_direct' if is_force_mode else 'mentor_questions'}")

          return {
-             "id": f"chatcmpl-apollo-{abs(hash(str(messages))) % 10000}",
              "object": "chat.completion",
-             "created": 1704067200, # Fixed timestamp
-             "model": f"qwen2-0.5b-{'force' if is_force_mode else 'mentor'}-v4.1",
              "choices": [
                  {
                      "index": 0,
@@ -286,34 +251,26 @@ async def chat_completions(request: Request):
                  }
              ],
              "usage": {
-                 "prompt_tokens": sum(len(msg.get("content", "")) for msg in messages) // 4, # Rough estimate
-                 "completion_tokens": len(response_content) // 4, # Rough estimate
-                 "total_tokens": (sum(len(msg.get("content", "")) for msg in messages) + len(response_content)) // 4
              },
-             "apollo_mode": "force_direct_v4.1" if is_force_mode else "mentor_questions_v4.1",
-             "pure_ai_response": True,
-             "generation_success": True
          }

      except Exception as e:
          print(f"❌ Chat completion error: {e}")
-         import traceback
-         traceback.print_exc()
-
          return JSONResponse(
              status_code=500,
-             content={
-                 "error": f"Internal server error: {str(e)}",
-                 "type": "generation_error",
-                 "mode_requested": "force" if is_force_mode else "mentor"
-             }
          )

  if __name__ == "__main__":
      import uvicorn
-     print("🚀 Starting Apollo AI Backend v4.1-TRULY-FIXED")
      print("🧠 Model: Qwen/Qwen2-0.5B-Instruct (500M parameters)")
-     print("🔧 Fixed: Prompt generation, token handling, response cleaning")
-     print("🎯 Mentor Mode: Guides with questions")
-     print(" Force Mode: Provides direct answers")
      uvicorn.run(app, host="0.0.0.0", port=7860)
 
  from starlette.middleware.cors import CORSMiddleware

  # === Setup FastAPI ===
+ app = FastAPI(title="Apollo AI Backend - Qwen2-0.5B", version="4.0.0-TRULY-FIXED")

  # === CORS ===
  app.add_middleware(
 
  def create_conversation_prompt(messages: list, is_force_mode: bool) -> str:
      """Create a conversation prompt with clear mode instructions"""

      if is_force_mode:
+         system_prompt = """You are a helpful programming assistant. Give direct, complete answers with examples. Do not ask questions back to the user. Provide clear explanations and working code when relevant.
+ When asked about Python functions, provide:
+ 1. What the function does
+ 2. Clear examples with output
+ 3. Common use cases
+ Be direct and informative."""
      else:
+         system_prompt = """You are a programming teacher focused on helping students learn through discovery. Guide students with questions and hints rather than giving direct answers.
+ When asked about concepts:
+ 1. Ask what they think might happen
+ 2. Encourage them to try things out
+ 3. Guide them to discover patterns
+ 4. Ask follow-up questions to deepen understanding
+ Help them learn by thinking, not by giving answers directly."""
+
+     # Build conversation
+     conversation = f"<|im_start|>system\n{system_prompt}<|im_end|>\n"
+
+     # Add conversation history (last 4 messages for context)
+     recent_messages = messages[-4:] if len(messages) > 4 else messages

+     for msg in recent_messages:
+         role = msg.get("role", "")
+         content = msg.get("content", "")
+         conversation += f"<|im_start|>{role}\n{content}<|im_end|>\n"
+
+     conversation += "<|im_start|>assistant\n"
+     return conversation

  def generate_response(messages: list, is_force_mode: bool = False, max_tokens: int = 200, temperature: float = 0.7) -> str:
      """Generate response using the AI model"""
 

          print(f"🎯 Generating {'FORCE (Direct)' if is_force_mode else 'MENTOR (Questions)'} response")
          print(f"🔍 Mode flag: {is_force_mode}")

          # Adjust parameters based on mode
          if is_force_mode:
+             generation_temp = 0.3 # More focused for direct answers
+             generation_tokens = min(max_tokens, 300)
          else:
+             generation_temp = 0.5 # More creative for questions
              generation_tokens = min(max_tokens, 250)

+         # Tokenize input
+         inputs = tokenizer(prompt, return_tensors="pt", max_length=1500, truncation=True)

+         # Generate response
          with torch.no_grad():
              outputs = model.generate(
                  inputs.input_ids,
 
                  do_sample=True,
                  pad_token_id=tokenizer.eos_token_id,
                  eos_token_id=tokenizer.eos_token_id,
+                 top_p=0.9,
+                 repetition_penalty=1.1,
+                 no_repeat_ngram_size=3
              )

+         # Decode response
+         full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)

+         # Extract only the new generated part
+         response = full_response[len(prompt):].strip()

          # Clean up response
          response = response.replace("<|im_end|>", "").strip()

+         # Remove conversation artifacts
+         lines = response.split('\n')
+         clean_lines = []
+         for line in lines:
+             line = line.strip()
+             if not line.startswith(('<|im_start|>', '<|im_end|>', 'system:', 'user:', 'assistant:')):
+                 clean_lines.append(line)

+         response = '\n'.join(clean_lines).strip()

+         # Take first paragraph if too long
+         if len(response) > max_tokens * 4:
+             paragraphs = response.split('\n\n')
+             response = paragraphs[0] if paragraphs else response[:max_tokens * 4]

+         print(f"✅ Generated response: {response[:100]}...")
+
+         # Simple validation - no template injection
+         if not response or len(response) < 10:
+             if is_force_mode:
+                 return "I need more specific information to provide a direct answer. Could you clarify your question?"
+             else:
+                 return "That's a great question to explore! What do you think might be the answer? Try experimenting and see what you discover!"

          return response

      except Exception as e:
          print(f"❌ Generation error: {e}")
          if is_force_mode:
+             return "I encountered an error generating a direct response. Please try rephrasing your question."
          else:
+             return "Interesting challenge! What approach do you think might work here? Let's explore this together."
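Note: the string slice full_response[len(prompt):] assumes tokenizer.decode reproduces the prompt verbatim; if the ChatML markers are registered as special tokens, skip_special_tokens=True drops them from full_response and the slice can cut into the generated reply. The removed version sliced token ids instead; a minimal sketch of that variant, reusing the inputs, outputs, and tokenizer objects above:

# Keep only the tokens generated after the prompt, then decode just those
# (this is the approach the removed code path used).
generated_ids = outputs[0][inputs.input_ids.shape[1]:]
response = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()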
 
  # === Routes ===
  @app.get("/")
  def root():
      return {
+         "message": "🤖 Apollo AI Backend v4.0-TRULY-FIXED - Qwen2-0.5B",
          "model": "Qwen/Qwen2-0.5B-Instruct with LoRA",
          "status": "ready",
          "modes": {
+             "mentor": "Guides learning with questions - REALLY FIXED",
+             "force": "Provides direct answers - REALLY FIXED"
          },
+         "fixes": "Removed all template responses, pure AI generation"
      }

  @app.get("/health")
 
          "status": "healthy",
          "model_loaded": True,
          "model_size": "0.5B",
+         "version": "4.0-TRULY-FIXED"
      }

  @app.post("/v1/chat/completions")
 
      try:
          body = await request.json()
          messages = body.get("messages", [])
+         max_tokens = min(body.get("max_tokens", 200), 400)
          temperature = max(0.1, min(body.get("temperature", 0.7), 1.0))

+         # Get force mode flag
+         is_force_mode = body.get("force_mode", False)

+         print(f"🚨 REQUEST RECEIVED - force_mode: {is_force_mode}")
+         print(f"📝 Last user message: {messages[-1].get('content', '') if messages else 'None'}")

          if not messages or not isinstance(messages, list):
              raise ValueError("Messages field is required and must be a list")

      except Exception as e:
          return JSONResponse(
              status_code=400,
              content={"error": f"Invalid request body: {str(e)}"}
 
          )

      try:
+         print(f"📥 Processing in {'FORCE (Direct Answer)' if is_force_mode else 'MENTOR (Guiding Questions)'} mode")

+         # Generate response - NO POST-PROCESSING
          response_content = generate_response(
              messages=messages,
              is_force_mode=is_force_mode,
 
              temperature=temperature
          )

+         print(f"✅ Pure AI response generated: {response_content[:150]}...")

          return {
+             "id": f"chatcmpl-apollo-{hash(str(messages)) % 10000}",
              "object": "chat.completion",
+             "created": int(torch.tensor(0).item()),
+             "model": f"qwen2-0.5b-{'force' if is_force_mode else 'mentor'}-truly-fixed",
              "choices": [
                  {
                      "index": 0,
 
                  }
              ],
              "usage": {
+                 "prompt_tokens": len(str(messages)),
+                 "completion_tokens": len(response_content),
+                 "total_tokens": len(str(messages)) + len(response_content)
              },
+             "apollo_mode": "force_direct" if is_force_mode else "mentor_questions",
+             "pure_ai_response": True
          }

      except Exception as e:
          print(f"❌ Chat completion error: {e}")
          return JSONResponse(
              status_code=500,
+             content={"error": f"Internal server error: {str(e)}"}
          )

  if __name__ == "__main__":
      import uvicorn
+     print("🚀 Starting Apollo AI Backend v4.0-TRULY-FIXED")
      print("🧠 Model: Qwen/Qwen2-0.5B-Instruct (500M parameters)")
+     print("🎯 Mentor Mode: Pure AI questions and guidance")
+     print(" Force Mode: Pure AI direct answers")
+     print("🚫 NO MORE TEMPLATES - Pure AI responses only")
      uvicorn.run(app, host="0.0.0.0", port=7860)
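Note: the usage block in the chat completion response now reports character counts (len(str(messages)), len(response_content)) rather than real token counts. If accurate numbers are needed, one option (not part of this commit) is to count with the same tokenizer; a rough sketch, assuming the prompt and response_content strings are in scope:

# Sketch only: token-based usage accounting instead of character lengths.
prompt_token_count = len(tokenizer(prompt).input_ids)
completion_token_count = len(tokenizer(response_content).input_ids)
usage = {
    "prompt_tokens": prompt_token_count,
    "completion_tokens": completion_token_count,
    "total_tokens": prompt_token_count + completion_token_count,
}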