|
import os |
|
import torch |
|
from fastapi import FastAPI, Request |
|
from fastapi.responses import JSONResponse |
|
from transformers import AutoModelForCausalLM, AutoTokenizer |
|
from peft import PeftModel |
|
from starlette.middleware.cors import CORSMiddleware |
|
|
|
|
|
app = FastAPI(title="Apollo AI Backend - Qwen2-0.5B", version="4.0.0-TRULY-FIXED") |
|
|
|
|
|
app.add_middleware( |
|
CORSMiddleware, |
|
allow_origins=["*"], |
|
allow_credentials=True, |
|
allow_methods=["*"], |
|
allow_headers=["*"], |
|
) |
|
|
|
|
|
API_KEY = os.getenv("API_KEY", "aigenapikey1234567890") |
|
BASE_MODEL = "Qwen/Qwen2-0.5B-Instruct" |
|
ADAPTER_PATH = "adapter" |
|
|
|
|
|
print("🔧 Loading tokenizer for Qwen2-0.5B...") |
|
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True) |
|
if tokenizer.pad_token is None: |
|
tokenizer.pad_token = tokenizer.eos_token |
|
|
|
print("🧠 Loading Qwen2-0.5B base model...") |
|
base_model = AutoModelForCausalLM.from_pretrained( |
|
BASE_MODEL, |
|
trust_remote_code=True, |
|
torch_dtype=torch.float32, |
|
device_map="cpu" |
|
) |
|
|
|
print("🔗 Applying LoRA adapter to Qwen2-0.5B...") |
|
model = PeftModel.from_pretrained(base_model, ADAPTER_PATH) |
|
model.eval() |
|
|
|
print("✅ Qwen2-0.5B model ready!") |
|
|
|
def create_conversation_prompt(messages: list, is_force_mode: bool) -> str: |
|
"""Create a conversation prompt with clear mode instructions""" |
|
|
|
if is_force_mode: |
|
system_prompt = """You are a helpful programming assistant. Give direct, complete answers with examples. Do not ask questions back to the user. Provide clear explanations and working code when relevant. |
|
|
|
When asked about Python functions, provide: |
|
1. What the function does |
|
2. Clear examples with output |
|
3. Common use cases |
|
|
|
Be direct and informative.""" |
|
else: |
|
system_prompt = """You are a programming teacher focused on helping students learn through discovery. Guide students with questions and hints rather than giving direct answers. |
|
|
|
When asked about concepts: |
|
1. Ask what they think might happen |
|
2. Encourage them to try things out |
|
3. Guide them to discover patterns |
|
4. Ask follow-up questions to deepen understanding |
|
|
|
Help them learn by thinking, not by giving answers directly.""" |
|
|
|
|
|
conversation = f"<|im_start|>system\n{system_prompt}<|im_end|>\n" |
|
|
|
|
|
recent_messages = messages[-4:] if len(messages) > 4 else messages |
|
|
|
for msg in recent_messages: |
|
role = msg.get("role", "") |
|
content = msg.get("content", "") |
|
conversation += f"<|im_start|>{role}\n{content}<|im_end|>\n" |
|
|
|
conversation += "<|im_start|>assistant\n" |
|
return conversation |
|
|
|
def generate_response(messages: list, is_force_mode: bool = False, max_tokens: int = 200, temperature: float = 0.7) -> str: |
|
"""Generate response using the AI model""" |
|
try: |
|
|
|
prompt = create_conversation_prompt(messages, is_force_mode) |
|
|
|
print(f"🎯 Generating {'FORCE (Direct)' if is_force_mode else 'MENTOR (Questions)'} response") |
|
print(f"🔍 Mode flag: {is_force_mode}") |
|
|
|
|
|
if is_force_mode: |
|
generation_temp = 0.3 |
|
generation_tokens = min(max_tokens, 300) |
|
else: |
|
generation_temp = 0.5 |
|
generation_tokens = min(max_tokens, 250) |
|
|
|
|
|
inputs = tokenizer(prompt, return_tensors="pt", max_length=1500, truncation=True) |
|
|
|
|
|
with torch.no_grad(): |
|
outputs = model.generate( |
|
inputs.input_ids, |
|
max_new_tokens=generation_tokens, |
|
temperature=generation_temp, |
|
do_sample=True, |
|
pad_token_id=tokenizer.eos_token_id, |
|
eos_token_id=tokenizer.eos_token_id, |
|
top_p=0.9, |
|
repetition_penalty=1.1, |
|
no_repeat_ngram_size=3 |
|
) |
|
|
|
|
|
full_response = tokenizer.decode(outputs[0], skip_special_tokens=True) |
|
|
|
|
|
response = full_response[len(prompt):].strip() |
|
|
|
|
|
response = response.replace("<|im_end|>", "").strip() |
|
|
|
|
|
lines = response.split('\n') |
|
clean_lines = [] |
|
for line in lines: |
|
line = line.strip() |
|
if not line.startswith(('<|im_start|>', '<|im_end|>', 'system:', 'user:', 'assistant:')): |
|
clean_lines.append(line) |
|
|
|
response = '\n'.join(clean_lines).strip() |
|
|
|
|
|
if len(response) > max_tokens * 4: |
|
paragraphs = response.split('\n\n') |
|
response = paragraphs[0] if paragraphs else response[:max_tokens * 4] |
|
|
|
print(f"✅ Generated response: {response[:100]}...") |
|
|
|
|
|
if not response or len(response) < 10: |
|
if is_force_mode: |
|
return "I need more specific information to provide a direct answer. Could you clarify your question?" |
|
else: |
|
return "That's a great question to explore! What do you think might be the answer? Try experimenting and see what you discover!" |
|
|
|
return response |
|
|
|
except Exception as e: |
|
print(f"❌ Generation error: {e}") |
|
if is_force_mode: |
|
return "I encountered an error generating a direct response. Please try rephrasing your question." |
|
else: |
|
return "Interesting challenge! What approach do you think might work here? Let's explore this together." |
|
|
|
|
|
@app.get("/") |
|
def root(): |
|
return { |
|
"message": "🤖 Apollo AI Backend v4.0-TRULY-FIXED - Qwen2-0.5B", |
|
"model": "Qwen/Qwen2-0.5B-Instruct with LoRA", |
|
"status": "ready", |
|
"modes": { |
|
"mentor": "Guides learning with questions - REALLY FIXED", |
|
"force": "Provides direct answers - REALLY FIXED" |
|
}, |
|
"fixes": "Removed all template responses, pure AI generation" |
|
} |
|
|
|
@app.get("/health") |
|
def health(): |
|
return { |
|
"status": "healthy", |
|
"model_loaded": True, |
|
"model_size": "0.5B", |
|
"version": "4.0-TRULY-FIXED" |
|
} |
|
|
|
@app.post("/v1/chat/completions") |
|
async def chat_completions(request: Request): |
|
|
|
auth_header = request.headers.get("Authorization", "") |
|
if not auth_header.startswith("Bearer "): |
|
return JSONResponse( |
|
status_code=401, |
|
content={"error": "Missing or invalid Authorization header"} |
|
) |
|
|
|
token = auth_header.replace("Bearer ", "").strip() |
|
if token != API_KEY: |
|
return JSONResponse( |
|
status_code=401, |
|
content={"error": "Invalid API key"} |
|
) |
|
|
|
|
|
try: |
|
body = await request.json() |
|
messages = body.get("messages", []) |
|
max_tokens = min(body.get("max_tokens", 200), 400) |
|
temperature = max(0.1, min(body.get("temperature", 0.7), 1.0)) |
|
|
|
|
|
is_force_mode = body.get("force_mode", False) |
|
|
|
print(f"🚨 REQUEST RECEIVED - force_mode: {is_force_mode}") |
|
print(f"📝 Last user message: {messages[-1].get('content', '') if messages else 'None'}") |
|
|
|
if not messages or not isinstance(messages, list): |
|
raise ValueError("Messages field is required and must be a list") |
|
|
|
except Exception as e: |
|
return JSONResponse( |
|
status_code=400, |
|
content={"error": f"Invalid request body: {str(e)}"} |
|
) |
|
|
|
|
|
for i, msg in enumerate(messages): |
|
if not isinstance(msg, dict) or "role" not in msg or "content" not in msg: |
|
return JSONResponse( |
|
status_code=400, |
|
content={"error": f"Invalid message format at index {i}"} |
|
) |
|
|
|
try: |
|
print(f"📥 Processing in {'FORCE (Direct Answer)' if is_force_mode else 'MENTOR (Guiding Questions)'} mode") |
|
|
|
|
|
response_content = generate_response( |
|
messages=messages, |
|
is_force_mode=is_force_mode, |
|
max_tokens=max_tokens, |
|
temperature=temperature |
|
) |
|
|
|
print(f"✅ Pure AI response generated: {response_content[:150]}...") |
|
|
|
return { |
|
"id": f"chatcmpl-apollo-{hash(str(messages)) % 10000}", |
|
"object": "chat.completion", |
|
"created": int(torch.tensor(0).item()), |
|
"model": f"qwen2-0.5b-{'force' if is_force_mode else 'mentor'}-truly-fixed", |
|
"choices": [ |
|
{ |
|
"index": 0, |
|
"message": { |
|
"role": "assistant", |
|
"content": response_content |
|
}, |
|
"finish_reason": "stop" |
|
} |
|
], |
|
"usage": { |
|
"prompt_tokens": len(str(messages)), |
|
"completion_tokens": len(response_content), |
|
"total_tokens": len(str(messages)) + len(response_content) |
|
}, |
|
"apollo_mode": "force_direct" if is_force_mode else "mentor_questions", |
|
"pure_ai_response": True |
|
} |
|
|
|
except Exception as e: |
|
print(f"❌ Chat completion error: {e}") |
|
return JSONResponse( |
|
status_code=500, |
|
content={"error": f"Internal server error: {str(e)}"} |
|
) |
|
|
|
if __name__ == "__main__": |
|
import uvicorn |
|
print("🚀 Starting Apollo AI Backend v4.0-TRULY-FIXED") |
|
print("🧠 Model: Qwen/Qwen2-0.5B-Instruct (500M parameters)") |
|
print("🎯 Mentor Mode: Pure AI questions and guidance") |
|
print("⚡ Force Mode: Pure AI direct answers") |
|
print("🚫 NO MORE TEMPLATES - Pure AI responses only") |
|
uvicorn.run(app, host="0.0.0.0", port=7860) |