import os
import re
import time
from typing import Optional

import torch
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse
from peft import PeftModel
from starlette.middleware.cors import CORSMiddleware
from transformers import AutoModelForCausalLM, AutoTokenizer

# === Setup FastAPI ===
app = FastAPI(title="Apollo AI Backend - Qwen2-0.5B Optimized", version="2.1.0")

# === CORS ===
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# === Configuration ===
API_KEY = os.getenv("API_KEY", "aigenapikey1234567890")
BASE_MODEL = "Qwen/Qwen2-0.5B-Instruct"
ADAPTER_PATH = "adapter"

# === Load Model ===
print("🔧 Loading tokenizer for Qwen2-0.5B...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("🧠 Loading Qwen2-0.5B base model...")
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    trust_remote_code=True,
    torch_dtype=torch.float32,
    device_map="cpu",
)

print("🔗 Applying LoRA adapter to Qwen2-0.5B...")
model = PeftModel.from_pretrained(base_model, ADAPTER_PATH)
model.eval()
print("✅ Qwen2-0.5B model ready with optimized settings!")


def get_simple_system_prompt(is_force_mode: bool) -> str:
    """
    Simplified system prompts optimized for Qwen2-0.5B's 500M parameters.
    Shorter, clearer instructions that small models can follow better.
    """
    if is_force_mode:
        return """You are Apollo AI. Give direct, complete answers.

Rules:
- Provide full working code
- Be concise, max 3 sentences explanation
- Never ask questions back
- Give complete solutions immediately

Example:
User: "print hello world python"
You: "Use print('Hello World'). This outputs text to console."
"""
    else:
        return """You are Apollo AI tutor. Guide learning with questions.

Rules:
- Ask guiding questions instead of giving answers
- Never give complete working code
- Use hints and partial examples only
- Make students think and discover

Example:
User: "print hello world python"
You: "What function displays text in Python? Try looking up output functions."
"""


def create_simple_force_responses(user_message: str) -> Optional[str]:
    """
    Pre-defined responses for common questions in force mode.
    This helps the 0.5B model give consistent direct answers.
    Returns None when no pattern matches.
    """
    user_lower = user_message.lower()

    # Python print
    if 'print' in user_lower and ('hello' in user_lower or 'world' in user_lower):
        return 'Use `print("Hello World")`. This function outputs text to the console.'

    # Basic math
    if '2+2' in user_lower or '2 + 2' in user_lower:
        return '2 + 2 = 4. Addition combines two numbers to get their sum.'

    # Python variable
    if 'variable' in user_lower and ('python' in user_lower or 'create' in user_lower):
        return 'Use `name = "value"`. Variables store data: `x = 5` or `text = "hello"`.'

    # Python list
    if 'list' in user_lower and 'python' in user_lower and 'create' in user_lower:
        return 'Use square brackets: `my_list = [1, 2, 3]`. Lists store multiple items.'
    # Python function
    if 'function' in user_lower and 'python' in user_lower and ('create' in user_lower or 'define' in user_lower):
        return '''Use the def keyword:
```python
def my_function():
    return "Hello"
```
Functions are reusable code blocks.'''

    # Calculator
    if 'calculator' in user_lower and ('create' in user_lower or 'make' in user_lower or 'build' in user_lower):
        return '''Here's a simple calculator:
```python
a = float(input("First number: "))
b = float(input("Second number: "))
op = input("Operator (+,-,*,/): ")
if op == '+':
    print(a + b)
elif op == '-':
    print(a - b)
elif op == '*':
    print(a * b)
elif op == '/':
    print(a / b)
```
This performs basic math operations.'''

    return None


def create_simple_mentor_responses(user_message: str) -> Optional[str]:
    """
    Pre-defined mentor responses for common questions.
    This helps the 0.5B model give consistent guided learning.
    Returns None when no pattern matches.
    """
    user_lower = user_message.lower()

    # Python print
    if 'print' in user_lower and ('hello' in user_lower or 'world' in user_lower):
        return 'What function do you think displays text in Python? Think about showing output. What would it be called?'

    # Basic math
    if '2+2' in user_lower or '2 + 2' in user_lower:
        return 'What do you think 2 + 2 equals? Try calculating it step by step.'

    # Python variable
    if 'variable' in user_lower and ('python' in user_lower or 'create' in user_lower):
        return 'How do you think Python stores data? What symbol might assign a value to a name? Try: name = value'

    # Python list
    if 'list' in user_lower and 'python' in user_lower and 'create' in user_lower:
        return 'What brackets do you think hold multiple items? Try making a list with [item1, item2]. What goes inside?'

    # Python function
    if 'function' in user_lower and 'python' in user_lower and ('create' in user_lower or 'define' in user_lower):
        return '''What keyword defines a function in Python? Try this structure:
```python
___ function_name():
    # your code here
```
What goes in the blank? How would you call it?'''

    # Calculator
    if 'calculator' in user_lower and ('create' in user_lower or 'make' in user_lower or 'build' in user_lower):
        return '''What steps would a calculator need?
1. Get two numbers from user - what function gets input?
2. Get operation (+,-,*,/) - how to choose?
3. Calculate result - what structure handles choices?
4. Show result - what displays output?

Try building step 1 first. What function gets user input?'''

    return None


def extract_clean_answer(full_response: str, formatted_prompt: str, user_message: str, is_force_mode: bool) -> str:
    """
    Optimized cleaning for Qwen2-0.5B responses.
    Simpler extraction since 0.5B models produce cleaner output.
    """
    if not full_response or len(full_response.strip()) < 5:
        return "I apologize, but I couldn't generate a response. Please try again."
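
    # For reference, a raw decode of a Qwen2 ChatML conversation typically
    # looks like the sketch below (illustrative only; exact content and
    # whitespace vary), which is why the steps that follow split on
    # "<|im_start|>assistant" and "<|im_end|>":
    #
    #   <|im_start|>system
    #   You are Apollo AI. ...<|im_end|>
    #   <|im_start|>user
    #   print hello world python<|im_end|>
    #   <|im_start|>assistant
    #   Use print("Hello World"). ...<|im_end|>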
print(f"๐Ÿ” Raw response length: {len(full_response)}") print(f"๐Ÿ” Mode: {'FORCE' if is_force_mode else 'MENTOR'}") # Check for pre-defined responses first if is_force_mode: predefined = create_simple_force_responses(user_message) if predefined: print("โœ… Using predefined force response") return predefined else: predefined = create_simple_mentor_responses(user_message) if predefined: print("โœ… Using predefined mentor response") return predefined # Step 1: Remove the input prompt generated_text = full_response if formatted_prompt in full_response: parts = full_response.split(formatted_prompt) if len(parts) > 1: generated_text = parts[-1] # Step 2: Extract assistant content - simplified for 0.5B assistant_content = generated_text # Look for assistant markers if "<|im_start|>assistant" in generated_text: assistant_parts = generated_text.split("<|im_start|>assistant") if len(assistant_parts) > 1: assistant_content = assistant_parts[-1] if "<|im_end|>" in assistant_content: assistant_content = assistant_content.split("<|im_end|>")[0] # Step 3: Basic cleaning - gentler for 0.5B clean_text = assistant_content.strip() # Remove template tokens clean_text = re.sub(r'<\|im_start\|>', '', clean_text) clean_text = re.sub(r'<\|im_end\|>', '', clean_text) clean_text = re.sub(r'<\|endoftext\|>', '', clean_text) # Remove role prefixes clean_text = re.sub(r'^(system|user|assistant):\s*', '', clean_text, flags=re.MULTILINE) clean_text = re.sub(r'\n(system|user|assistant):\s*', '\n', clean_text, flags=re.MULTILINE) # Clean whitespace clean_text = re.sub(r'\n{3,}', '\n\n', clean_text) clean_text = clean_text.strip() # Step 4: Fallback handling for 0.5B if not clean_text or len(clean_text) < 10: if is_force_mode: return "Could you please be more specific about what you need?" else: return "What specific aspect would you like to explore? What's your approach?" # Step 5: Length control for 0.5B if len(clean_text) > 500: # Keep responses shorter for 0.5B sentences = clean_text.split('. ') if len(sentences) > 3: clean_text = '. '.join(sentences[:3]) + '.' print(f"๐Ÿงน Final cleaned answer length: {len(clean_text)}") return clean_text def generate_response(messages: list, is_force_mode: bool = False, max_tokens: int = 200, temperature: float = 0.7) -> str: """ Optimized generation for Qwen2-0.5B with shorter contexts and conservative settings. 
""" try: # Check for simple predefined responses first if messages and len(messages) > 0: last_user_msg = "" for msg in reversed(messages): if msg.get("role") == "user": last_user_msg = msg.get("content", "") break if last_user_msg: if is_force_mode: predefined = create_simple_force_responses(last_user_msg) if predefined: return predefined else: predefined = create_simple_mentor_responses(last_user_msg) if predefined: return predefined # Build simple conversation for 0.5B model clean_messages = [] # Add simple system prompt system_prompt = get_simple_system_prompt(is_force_mode) clean_messages.append({ "role": "system", "content": system_prompt }) # Add only the last user message to keep context short for 0.5B if messages and len(messages) > 0: for msg in reversed(messages): if msg.get("role") == "user": clean_messages.append({ "role": "user", "content": msg.get("content", "") }) break print(f"๐Ÿ” Processing {len(clean_messages)} messages for Qwen2-0.5B in {'FORCE' if is_force_mode else 'MENTOR'} mode") # Apply chat template try: formatted_prompt = tokenizer.apply_chat_template( clean_messages, tokenize=False, add_generation_prompt=True ) except Exception as e: print(f"โš ๏ธ Chat template failed, using simple format: {e}") # Fallback to simple format formatted_prompt = f"System: {clean_messages[0]['content']}\nUser: {clean_messages[1]['content']}\nAssistant:" # Tokenize with conservative limits for 0.5B inputs = tokenizer( formatted_prompt, return_tensors="pt", truncation=True, max_length=800 # Shorter context for 0.5B ) # Conservative generation settings for 0.5B model generation_params = { "input_ids": inputs.input_ids, "attention_mask": inputs.attention_mask, "pad_token_id": tokenizer.eos_token_id, "eos_token_id": tokenizer.eos_token_id, "do_sample": True, } if is_force_mode: # Force mode: Very conservative for 0.5B generation_params.update({ "max_new_tokens": min(max_tokens, 150), # Very short "temperature": 0.1, # Very focused "top_p": 0.7, "top_k": 20, "repetition_penalty": 1.05, }) else: # Mentor mode: Still conservative but allows more creativity generation_params.update({ "max_new_tokens": min(max_tokens, 200), "temperature": 0.3, # Lower than original "top_p": 0.8, "top_k": 30, "repetition_penalty": 1.02, }) # Generate with timeout for 0.5B with torch.no_grad(): outputs = model.generate(**generation_params) # Decode response full_response = tokenizer.decode(outputs[0], skip_special_tokens=False) # Extract user message for context user_message = "" for msg in reversed(clean_messages): if msg.get("role") == "user": user_message = msg.get("content", "") break # Clean and return clean_answer = extract_clean_answer(full_response, formatted_prompt, user_message, is_force_mode) return clean_answer except Exception as e: print(f"โŒ Generation error with Qwen2-0.5B: {e}") mode_text = "direct answer" if is_force_mode else "guided learning" return f"I encountered an error generating a {mode_text}. Please try a simpler question." 
# === Routes ===
@app.get("/")
def root():
    return {
        "message": "🤖 Apollo AI Backend v2.1 - Qwen2-0.5B Optimized",
        "model": "Qwen/Qwen2-0.5B-Instruct with LoRA",
        "status": "ready",
        "optimizations": ["short_contexts", "conservative_generation", "predefined_responses"],
        "features": ["mentor_mode", "force_mode", "0.5B_optimized"],
        "modes": {
            "mentor": "Guides learning with simple questions",
            "force": "Provides direct answers quickly",
        },
    }


@app.get("/health")
def health():
    return {
        "status": "healthy",
        "model_loaded": True,
        "model_size": "0.5B",
        "optimizations": "qwen2_0.5B_specific",
    }


@app.post("/v1/chat/completions")
async def chat_completions(request: Request):
    # Validate API key
    auth_header = request.headers.get("Authorization", "")
    if not auth_header.startswith("Bearer "):
        return JSONResponse(
            status_code=401,
            content={"error": "Missing or invalid Authorization header"},
        )

    token = auth_header.replace("Bearer ", "").strip()
    if token != API_KEY:
        return JSONResponse(
            status_code=401,
            content={"error": "Invalid API key"},
        )

    # Parse request body
    try:
        body = await request.json()
        messages = body.get("messages", [])
        max_tokens = min(body.get("max_tokens", 200), 300)  # Cap at 300 for 0.5B
        temperature = max(0.1, min(body.get("temperature", 0.5), 0.8))  # Conservative range

        # Get mode information
        is_force_mode = body.get("force_mode", False)

        if not messages or not isinstance(messages, list):
            raise ValueError("Messages field is required and must be a list")
    except Exception as e:
        return JSONResponse(
            status_code=400,
            content={"error": f"Invalid request body: {str(e)}"},
        )

    # Validate messages
    for i, msg in enumerate(messages):
        if not isinstance(msg, dict) or "role" not in msg or "content" not in msg:
            return JSONResponse(
                status_code=400,
                content={"error": f"Invalid message format at index {i}"},
            )

    try:
        print(f"📥 Processing request for Qwen2-0.5B in {'FORCE' if is_force_mode else 'MENTOR'} mode")
        print(f"📊 Settings: max_tokens={max_tokens}, temperature={temperature}")

        response_content = generate_response(
            messages=messages,
            is_force_mode=is_force_mode,
            max_tokens=max_tokens,
            temperature=temperature,
        )

        # Return OpenAI-compatible response
        return {
            "id": f"chatcmpl-apollo-qwen05b-{hash(str(messages)) % 10000}",
            "object": "chat.completion",
            "created": int(time.time()),
            "model": f"qwen2-0.5b-{'force' if is_force_mode else 'mentor'}-mode",
            "choices": [
                {
                    "index": 0,
                    "message": {
                        "role": "assistant",
                        "content": response_content,
                    },
                    "finish_reason": "stop",
                }
            ],
            # Approximate usage figures: character counts, not true token counts
            "usage": {
                "prompt_tokens": len(str(messages)),
                "completion_tokens": len(response_content),
                "total_tokens": len(str(messages)) + len(response_content),
            },
            "apollo_mode": "force" if is_force_mode else "mentor",
            "model_optimizations": "qwen2_0.5B_specific",
        }

    except Exception as e:
        print(f"❌ Chat completion error: {e}")
        return JSONResponse(
            status_code=500,
            content={"error": f"Internal server error: {str(e)}"},
        )


# === Test endpoint optimized for 0.5B ===
@app.post("/test")
async def test_generation(request: Request):
    """Test endpoint for debugging both modes with 0.5B optimizations."""
    try:
        body = await request.json()
        prompt = body.get("prompt", "How do I print hello world in Python?")
        max_tokens = min(body.get("max_tokens", 200), 300)
        test_both_modes = body.get("test_both_modes", True)

        results = {}

        # Test mentor mode
        messages_mentor = [{"role": "user", "content": prompt}]
        mentor_response = generate_response(messages_mentor, is_force_mode=False, max_tokens=max_tokens, temperature=0.3)
        results["mentor_mode"] = {
            "response": mentor_response,
"length": len(mentor_response), "mode": "mentor" } if test_both_modes: # Test force mode messages_force = [{"role": "user", "content": prompt}] force_response = generate_response(messages_force, is_force_mode=True, max_tokens=max_tokens, temperature=0.1) results["force_mode"] = { "response": force_response, "length": len(force_response), "mode": "force" } return { "prompt": prompt, "results": results, "model": "Qwen2-0.5B-Instruct", "optimizations": "0.5B_specific", "status": "success" } except Exception as e: return JSONResponse( status_code=500, content={"error": str(e)} ) if __name__ == "__main__": import uvicorn print("๐Ÿš€ Starting Apollo AI Backend v2.1 - Qwen2-0.5B Optimized...") print("๐Ÿง  Model: Qwen/Qwen2-0.5B-Instruct (500M parameters)") print("โšก Optimizations: Short contexts, conservative generation, predefined responses") print("๐ŸŽฏ Modes: Mentor (simple questions) vs Force (direct answers)") uvicorn.run(app, host="0.0.0.0", port=7860)