import os
import re
import time
from typing import Optional

import torch
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse
from peft import PeftModel
from starlette.middleware.cors import CORSMiddleware
from transformers import AutoModelForCausalLM, AutoTokenizer

app = FastAPI(title="Apollo AI Backend - Qwen2-0.5B Optimized", version="2.1.0")

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # NOTE: wildcard origins plus credentials is permissive; tighten for production
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

API_KEY = os.getenv("API_KEY", "aigenapikey1234567890")
BASE_MODEL = "Qwen/Qwen2-0.5B-Instruct"
ADAPTER_PATH = "adapter"
print("🔧 Loading tokenizer for Qwen2-0.5B...") |
|
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True) |
|
if tokenizer.pad_token is None: |
|
tokenizer.pad_token = tokenizer.eos_token |
|
|
|
print("🧠 Loading Qwen2-0.5B base model...") |
|
base_model = AutoModelForCausalLM.from_pretrained( |
|
BASE_MODEL, |
|
trust_remote_code=True, |
|
torch_dtype=torch.float32, |
|
device_map="cpu" |
|
) |
|
|
|
print("🔗 Applying LoRA adapter to Qwen2-0.5B...") |
|
model = PeftModel.from_pretrained(base_model, ADAPTER_PATH) |
|
model.eval() |
|
|
|
print("✅ Qwen2-0.5B model ready with optimized settings!") |
|
|
|
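
# Optional sketch (an assumption about deployment needs, not part of the original
# setup): PEFT's merge_and_unload() can fold the LoRA weights into the base model,
# which usually speeds up CPU inference at the cost of keeping the adapter separate:
#
#     model = model.merge_and_unload()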


def get_simple_system_prompt(is_force_mode: bool) -> str:
    """
    SIMPLIFIED system prompts optimized for Qwen2-0.5B's 500M parameters.
    Shorter, clearer instructions that small models can follow better.
    """
    if is_force_mode:
        return """You are Apollo AI. Give direct, complete answers.

Rules:
- Provide full working code
- Be concise, max 3 sentences explanation
- Never ask questions back
- Give complete solutions immediately

Example:
User: "print hello world python"
You: "Use print('Hello World'). This outputs text to console."
"""
    else:
        return """You are Apollo AI tutor. Guide learning with questions.

Rules:
- Ask guiding questions instead of giving answers
- Never give complete working code
- Use hints and partial examples only
- Make students think and discover

Example:
User: "print hello world python"
You: "What function displays text in Python? Try looking up output functions."
"""


def create_simple_force_responses(user_message: str) -> Optional[str]:
    """
    Pre-defined responses for common questions in force mode.
    This helps the 0.5B model give consistent direct answers.
    Returns None when no canned response matches.
    """
    user_lower = user_message.lower()

    if 'print' in user_lower and ('hello' in user_lower or 'world' in user_lower):
        return 'Use `print("Hello World")`. This function outputs text to the console.'

    if '2+2' in user_lower or '2 + 2' in user_lower:
        return '2 + 2 = 4. Addition combines two numbers to get their sum.'

    if 'variable' in user_lower and ('python' in user_lower or 'create' in user_lower):
        return 'Use `name = "value"`. Variables store data: `x = 5` or `text = "hello"`.'

    if 'list' in user_lower and 'python' in user_lower and 'create' in user_lower:
        return 'Use square brackets: `my_list = [1, 2, 3]`. Lists store multiple items.'

    if 'function' in user_lower and 'python' in user_lower and ('create' in user_lower or 'define' in user_lower):
        return '''Use the def keyword:
```python
def my_function():
    return "Hello"
```
Functions are reusable code blocks.'''

    if 'calculator' in user_lower and ('create' in user_lower or 'make' in user_lower or 'build' in user_lower):
        return '''Here's a simple calculator:
```python
a = float(input("First number: "))
b = float(input("Second number: "))
op = input("Operator (+,-,*,/): ")
if op == '+': print(a + b)
elif op == '-': print(a - b)
elif op == '*': print(a * b)
elif op == '/': print(a / b)
```
This performs basic math operations.'''

    return None


def create_simple_mentor_responses(user_message: str) -> Optional[str]:
    """
    Pre-defined mentor responses for common questions.
    This helps the 0.5B model give consistent guided learning.
    Returns None when no canned response matches.
    """
    user_lower = user_message.lower()

    if 'print' in user_lower and ('hello' in user_lower or 'world' in user_lower):
        return 'What function do you think displays text in Python? Think about showing output. What would it be called?'

    if '2+2' in user_lower or '2 + 2' in user_lower:
        return 'What do you think 2 + 2 equals? Try calculating it step by step.'

    if 'variable' in user_lower and ('python' in user_lower or 'create' in user_lower):
        return 'How do you think Python stores data? What symbol might assign a value to a name? Try: name = value'

    if 'list' in user_lower and 'python' in user_lower and 'create' in user_lower:
        return 'What brackets do you think hold multiple items? Try making a list with [item1, item2]. What goes inside?'

    if 'function' in user_lower and 'python' in user_lower and ('create' in user_lower or 'define' in user_lower):
        return '''What keyword defines a function in Python? Try this structure:
```python
___ function_name():
    # your code here
```
What goes in the blank? How would you call it?'''

    if 'calculator' in user_lower and ('create' in user_lower or 'make' in user_lower or 'build' in user_lower):
        return '''What steps would a calculator need?
1. Get two numbers from user - what function gets input?
2. Get operation (+,-,*,/) - how to choose?
3. Calculate result - what structure handles choices?
4. Show result - what displays output?

Try building step 1 first. What function gets user input?'''

    return None
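
# Illustrative usage of the canned-response helpers (not executed at import time):
#
#     create_simple_force_responses("How do I print hello world in Python?")
#     # -> 'Use `print("Hello World")`. This function outputs text to the console.'
#     create_simple_mentor_responses("How do I create a variable in Python?")
#     # -> a guiding question rather than a direct answer
#     create_simple_force_responses("Explain recursion")
#     # -> None (no canned match, so generation falls through to the model)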


def extract_clean_answer(full_response: str, formatted_prompt: str, user_message: str, is_force_mode: bool) -> str:
    """
    Optimized cleaning for Qwen2-0.5B responses.
    Simpler extraction, since the 0.5B model produces fairly clean output.
    """
    if not full_response or len(full_response.strip()) < 5:
        return "I apologize, but I couldn't generate a response. Please try again."

    print(f"🔍 Raw response length: {len(full_response)}")
    print(f"🔍 Mode: {'FORCE' if is_force_mode else 'MENTOR'}")

    # Prefer a canned response when one matches the question.
    if is_force_mode:
        predefined = create_simple_force_responses(user_message)
        if predefined:
            print("✅ Using predefined force response")
            return predefined
    else:
        predefined = create_simple_mentor_responses(user_message)
        if predefined:
            print("✅ Using predefined mentor response")
            return predefined

    # Drop the echoed prompt, keeping only newly generated text.
    # (Guarding against an empty prompt avoids str.split('')'s ValueError.)
    generated_text = full_response
    if formatted_prompt and formatted_prompt in full_response:
        parts = full_response.split(formatted_prompt)
        if len(parts) > 1:
            generated_text = parts[-1]

    # Isolate the last assistant turn from the ChatML markup.
    assistant_content = generated_text
    if "<|im_start|>assistant" in generated_text:
        assistant_parts = generated_text.split("<|im_start|>assistant")
        if len(assistant_parts) > 1:
            assistant_content = assistant_parts[-1]
            if "<|im_end|>" in assistant_content:
                assistant_content = assistant_content.split("<|im_end|>")[0]

    clean_text = assistant_content.strip()

    # Strip any leftover special tokens.
    clean_text = re.sub(r'<\|im_start\|>', '', clean_text)
    clean_text = re.sub(r'<\|im_end\|>', '', clean_text)
    clean_text = re.sub(r'<\|endoftext\|>', '', clean_text)

    # Remove stray role labels the model sometimes emits.
    clean_text = re.sub(r'^(system|user|assistant):\s*', '', clean_text, flags=re.MULTILINE)
    clean_text = re.sub(r'\n(system|user|assistant):\s*', '\n', clean_text, flags=re.MULTILINE)

    # Collapse runs of blank lines.
    clean_text = re.sub(r'\n{3,}', '\n\n', clean_text)
    clean_text = clean_text.strip()

    if not clean_text or len(clean_text) < 10:
        if is_force_mode:
            return "Could you please be more specific about what you need?"
        else:
            return "What specific aspect would you like to explore? What's your approach?"

    # Trim rambling answers to roughly three sentences.
    if len(clean_text) > 500:
        sentences = clean_text.split('. ')
        if len(sentences) > 3:
            clean_text = '. '.join(sentences[:3]) + '.'

    print(f"🧹 Final cleaned answer length: {len(clean_text)}")

    return clean_text
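
# Minimal sketch of what the cleaner does to a raw ChatML completion; the
# literals below are invented for illustration, not captured model output:
#
#     raw = "<|im_start|>assistant\nUse print('hi').<|im_end|>"
#     extract_clean_answer(raw, formatted_prompt="<prompt>",
#                          user_message="some unmatched question", is_force_mode=True)
#     # -> "Use print('hi')."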


def generate_response(messages: list, is_force_mode: bool = False, max_tokens: int = 200, temperature: float = 0.7) -> str:
    """
    Optimized generation for Qwen2-0.5B with shorter contexts and conservative settings.
    Note: the mode-specific presets below intentionally override the caller's
    `temperature`; the parameter is kept for API compatibility.
    """
    try:
        # Short-circuit with a canned response when one matches the last user message.
        if messages:
            last_user_msg = ""
            for msg in reversed(messages):
                if msg.get("role") == "user":
                    last_user_msg = msg.get("content", "")
                    break

            if last_user_msg:
                if is_force_mode:
                    predefined = create_simple_force_responses(last_user_msg)
                    if predefined:
                        return predefined
                else:
                    predefined = create_simple_mentor_responses(last_user_msg)
                    if predefined:
                        return predefined

        # Keep the context minimal for the 0.5B model: one system prompt
        # plus only the most recent user message.
        clean_messages = []
        system_prompt = get_simple_system_prompt(is_force_mode)
        clean_messages.append({
            "role": "system",
            "content": system_prompt
        })

        if messages:
            for msg in reversed(messages):
                if msg.get("role") == "user":
                    clean_messages.append({
                        "role": "user",
                        "content": msg.get("content", "")
                    })
                    break

        print(f"🔍 Processing {len(clean_messages)} messages for Qwen2-0.5B in {'FORCE' if is_force_mode else 'MENTOR'} mode")

        try:
            formatted_prompt = tokenizer.apply_chat_template(
                clean_messages,
                tokenize=False,
                add_generation_prompt=True
            )
        except Exception as e:
            print(f"⚠️ Chat template failed, using simple format: {e}")
            user_content = clean_messages[1]['content'] if len(clean_messages) > 1 else ""
            formatted_prompt = f"System: {clean_messages[0]['content']}\nUser: {user_content}\nAssistant:"

        inputs = tokenizer(
            formatted_prompt,
            return_tensors="pt",
            truncation=True,
            max_length=800
        )

        generation_params = {
            "input_ids": inputs.input_ids,
            "attention_mask": inputs.attention_mask,
            "pad_token_id": tokenizer.eos_token_id,
            "eos_token_id": tokenizer.eos_token_id,
            "do_sample": True,
        }

        if is_force_mode:
            # Near-deterministic settings for direct answers.
            generation_params.update({
                "max_new_tokens": min(max_tokens, 150),
                "temperature": 0.1,
                "top_p": 0.7,
                "top_k": 20,
                "repetition_penalty": 1.05,
            })
        else:
            # Slightly more exploratory settings for tutoring questions.
            generation_params.update({
                "max_new_tokens": min(max_tokens, 200),
                "temperature": 0.3,
                "top_p": 0.8,
                "top_k": 30,
                "repetition_penalty": 1.02,
            })

        with torch.no_grad():
            outputs = model.generate(**generation_params)

        # Keep special tokens so extract_clean_answer can split on the ChatML markers.
        full_response = tokenizer.decode(outputs[0], skip_special_tokens=False)

        user_message = ""
        for msg in reversed(clean_messages):
            if msg.get("role") == "user":
                user_message = msg.get("content", "")
                break

        clean_answer = extract_clean_answer(full_response, formatted_prompt, user_message, is_force_mode)
        return clean_answer

    except Exception as e:
        print(f"❌ Generation error with Qwen2-0.5B: {e}")
        mode_text = "direct answer" if is_force_mode else "guided learning"
        return f"I encountered an error generating a {mode_text}. Please try a simpler question."
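
# Illustrative call (assumes the model and adapter loaded successfully at startup):
#
#     generate_response([{"role": "user", "content": "How do I create a list in Python?"}],
#                       is_force_mode=True)
#     # -> 'Use square brackets: `my_list = [1, 2, 3]`. Lists store multiple items.'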
@app.get("/") |
|
def root(): |
|
return { |
|
"message": "🤖 Apollo AI Backend v2.1 - Qwen2-0.5B Optimized", |
|
"model": "Qwen/Qwen2-0.5B-Instruct with LoRA", |
|
"status": "ready", |
|
"optimizations": ["short_contexts", "conservative_generation", "predefined_responses"], |
|
"features": ["mentor_mode", "force_mode", "0.5B_optimized"], |
|
"modes": { |
|
"mentor": "Guides learning with simple questions", |
|
"force": "Provides direct answers quickly" |
|
} |
|
} |
|
|
|
@app.get("/health") |
|
def health(): |
|
return { |
|
"status": "healthy", |
|
"model_loaded": True, |
|
"model_size": "0.5B", |
|
"optimizations": "qwen2_0.5B_specific" |
|
} |
|
|
|
@app.post("/v1/chat/completions") |
|
async def chat_completions(request: Request): |
|
|
|
auth_header = request.headers.get("Authorization", "") |
|
if not auth_header.startswith("Bearer "): |
|
return JSONResponse( |
|
status_code=401, |
|
content={"error": "Missing or invalid Authorization header"} |
|
) |
|
|
|
token = auth_header.replace("Bearer ", "").strip() |
|
if token != API_KEY: |
|
return JSONResponse( |
|
status_code=401, |
|
content={"error": "Invalid API key"} |
|
) |
|
|
|
|
|
try: |
|
body = await request.json() |
|
messages = body.get("messages", []) |
|
max_tokens = min(body.get("max_tokens", 200), 300) |
|
temperature = max(0.1, min(body.get("temperature", 0.5), 0.8)) |
|
|
|
|
|
is_force_mode = body.get("force_mode", False) |
|
|
|
if not messages or not isinstance(messages, list): |
|
raise ValueError("Messages field is required and must be a list") |
|
|
|
except Exception as e: |
|
return JSONResponse( |
|
status_code=400, |
|
content={"error": f"Invalid request body: {str(e)}"} |
|
) |
|
|
|
|
|
for i, msg in enumerate(messages): |
|
if not isinstance(msg, dict) or "role" not in msg or "content" not in msg: |
|
return JSONResponse( |
|
status_code=400, |
|
content={"error": f"Invalid message format at index {i}"} |
|
) |
|
|
|
try: |
|
print(f"📥 Processing request for Qwen2-0.5B in {'FORCE' if is_force_mode else 'MENTOR'} mode") |
|
print(f"📊 Settings: max_tokens={max_tokens}, temperature={temperature}") |
|
|
|
response_content = generate_response( |
|
messages=messages, |
|
is_force_mode=is_force_mode, |
|
max_tokens=max_tokens, |
|
temperature=temperature |
|
) |
|
|
|
|
|
return { |
|
"id": f"chatcmpl-apollo-qwen05b-{hash(str(messages)) % 10000}", |
|
"object": "chat.completion", |
|
"created": int(torch.tensor(0).item()), |
|
"model": f"qwen2-0.5b-{'force' if is_force_mode else 'mentor'}-mode", |
|
"choices": [ |
|
{ |
|
"index": 0, |
|
"message": { |
|
"role": "assistant", |
|
"content": response_content |
|
}, |
|
"finish_reason": "stop" |
|
} |
|
], |
|
"usage": { |
|
"prompt_tokens": len(str(messages)), |
|
"completion_tokens": len(response_content), |
|
"total_tokens": len(str(messages)) + len(response_content) |
|
}, |
|
"apollo_mode": "force" if is_force_mode else "mentor", |
|
"model_optimizations": "qwen2_0.5B_specific" |
|
} |
|
|
|
except Exception as e: |
|
print(f"❌ Chat completion error: {e}") |
|
return JSONResponse( |
|
status_code=500, |
|
content={"error": f"Internal server error: {str(e)}"} |
|
) |
|
|
|
|
|
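
# Illustrative client call (assumptions: the server runs on localhost:7860 with the
# default API key, and the `requests` package is installed; none of this is pinned
# down by the file itself):
#
#     import requests
#     resp = requests.post(
#         "http://localhost:7860/v1/chat/completions",
#         headers={"Authorization": "Bearer aigenapikey1234567890"},
#         json={"messages": [{"role": "user", "content": "print hello world python"}],
#               "force_mode": True, "max_tokens": 150},
#     )
#     print(resp.json()["choices"][0]["message"]["content"])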
@app.post("/test") |
|
async def test_generation(request: Request): |
|
"""Test endpoint for debugging both modes with 0.5B optimizations""" |
|
try: |
|
body = await request.json() |
|
prompt = body.get("prompt", "How do I print hello world in Python?") |
|
max_tokens = min(body.get("max_tokens", 200), 300) |
|
test_both_modes = body.get("test_both_modes", True) |
|
|
|
results = {} |
|
|
|
|
|
messages_mentor = [{"role": "user", "content": prompt}] |
|
mentor_response = generate_response(messages_mentor, is_force_mode=False, max_tokens=max_tokens, temperature=0.3) |
|
results["mentor_mode"] = { |
|
"response": mentor_response, |
|
"length": len(mentor_response), |
|
"mode": "mentor" |
|
} |
|
|
|
if test_both_modes: |
|
|
|
messages_force = [{"role": "user", "content": prompt}] |
|
force_response = generate_response(messages_force, is_force_mode=True, max_tokens=max_tokens, temperature=0.1) |
|
results["force_mode"] = { |
|
"response": force_response, |
|
"length": len(force_response), |
|
"mode": "force" |
|
} |
|
|
|
return { |
|
"prompt": prompt, |
|
"results": results, |
|
"model": "Qwen2-0.5B-Instruct", |
|
"optimizations": "0.5B_specific", |
|
"status": "success" |
|
} |
|
|
|
except Exception as e: |
|
return JSONResponse( |
|
status_code=500, |
|
content={"error": str(e)} |
|
) |
|
|
|
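
# Illustrative test request (same localhost/`requests` assumptions as above; this
# endpoint deliberately skips authentication):
#
#     requests.post("http://localhost:7860/test",
#                   json={"prompt": "How do I create a function in Python?",
#                         "test_both_modes": True}).json()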


if __name__ == "__main__":
    import uvicorn
    print("🚀 Starting Apollo AI Backend v2.1 - Qwen2-0.5B Optimized...")
    print("🧠 Model: Qwen/Qwen2-0.5B-Instruct (500M parameters)")
    print("⚡ Optimizations: Short contexts, conservative generation, predefined responses")
    print("🎯 Modes: Mentor (simple questions) vs Force (direct answers)")
    uvicorn.run(app, host="0.0.0.0", port=7860)