Ais committed (verified)
Commit 0ee4730 · 1 Parent(s): 3afe501

Update app/main.py

Files changed (1): app/main.py +236 -133
app/main.py CHANGED
@@ -5,11 +5,12 @@ from fastapi.responses import JSONResponse
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from starlette.middleware.cors import CORSMiddleware

# === Setup FastAPI ===
- app = FastAPI()

- # === CORS (optional for frontend access) ===
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
@@ -18,190 +19,292 @@ app.add_middleware(
    allow_headers=["*"],
)

- # === Load API Key from Hugging Face Secrets ===
- API_KEY = os.getenv("API_KEY", "undefined")  # Add API_KEY in your HF Space Secrets
-
- # === Model Settings ===
BASE_MODEL = "Qwen/Qwen2-0.5B-Instruct"
ADAPTER_PATH = "adapter"

print("🔧 Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)

- print("🧠 Loading base model on CPU...")
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    trust_remote_code=True,
-     torch_dtype=torch.float32
- ).cpu()

print("🔗 Applying LoRA adapter...")
- model = PeftModel.from_pretrained(base_model, ADAPTER_PATH).cpu()
model.eval()

- print("✅ Model and adapter loaded successfully.")

- def clean_response(raw_response):
    """
-     Clean the model response by removing unwanted artifacts while preserving the actual answer.
    """
-     if not raw_response or len(raw_response.strip()) < 2:
-         return "I apologize, but I couldn't generate a proper response. Please try again."

-     # Remove common chat template artifacts
-     cleaned = raw_response.strip()

-     # Remove system/user/assistant prefixes that might leak through
-     prefixes_to_remove = [
-         "system\n", "user\n", "assistant\n",
-         "System:", "User:", "Assistant:",
-         "<|im_start|>", "<|im_end|>",
-         "You are Qwen, created by Alibaba Cloud. You are a helpful assistant.",
-         "You are a helpful assistant.",
-         "I am a helpful assistant.",
-         "As a helpful assistant,",
    ]

-     for prefix in prefixes_to_remove:
-         if cleaned.lower().startswith(prefix.lower()):
-             cleaned = cleaned[len(prefix):].strip()

-     # Remove any remaining template artifacts
-     lines = cleaned.split('\n')
-     filtered_lines = []

    for line in lines:
-         line_stripped = line.strip()

-         # Skip empty lines at the beginning
-         if not line_stripped and not filtered_lines:
            continue

-         # Skip obvious template artifacts
-         if line_stripped in ["system", "user", "assistant"]:
            continue

-         filtered_lines.append(line)

-     cleaned = '\n'.join(filtered_lines).strip()

-     # If we still have content, return it
-     if cleaned and len(cleaned) > 5:
-         return cleaned

-     # Fallback only if truly empty
-     return "I understand your question. Let me help you with that."

- # === Root Route ===
@app.get("/")
def root():
-     return {"message": "🧠 Qwen2.5-0.5B-Instruct API is running on CPU!"}

- # === Chat Completion API ===
@app.post("/v1/chat/completions")
- async def chat(request: Request):
-     # API Key Authorization
    auth_header = request.headers.get("Authorization", "")
    if not auth_header.startswith("Bearer "):
-         return JSONResponse(status_code=401, content={"error": "Missing Bearer token in Authorization header."})

    token = auth_header.replace("Bearer ", "").strip()
    if token != API_KEY:
-         return JSONResponse(status_code=401, content={"error": "Invalid API key."})

-     # Parse Request
    try:
        body = await request.json()
        messages = body.get("messages", [])
        if not messages or not isinstance(messages, list):
-             raise ValueError("Invalid or missing 'messages' field.")
    except Exception as e:
-         return JSONResponse(status_code=400, content={"error": f"Bad request: {str(e)}"})

-     # ✅ FIXED: Use proper Qwen2.5 chat template formatting
    try:
-         # Use the tokenizer's built-in chat template - this is the correct way!
-         formatted_prompt = tokenizer.apply_chat_template(
-             messages,
-             tokenize=False,
-             add_generation_prompt=True
        )

-         print(f"🔍 Formatted prompt: {formatted_prompt}")

    except Exception as e:
-         print(f"❌ Chat template error: {e}")
-         # Fallback to manual formatting if needed
-         formatted_prompt = ""
-         for msg in messages:
-             role = msg.get("role", "user")
-             content = msg.get("content", "")
-             if role == "system":
-                 formatted_prompt += f"<|im_start|>system\n{content}<|im_end|>\n"
-             elif role == "user":
-                 formatted_prompt += f"<|im_start|>user\n{content}<|im_end|>\n"
-             elif role == "assistant":
-                 formatted_prompt += f"<|im_start|>assistant\n{content}<|im_end|>\n"
-         formatted_prompt += "<|im_start|>assistant\n"
-
-     inputs = tokenizer(formatted_prompt, return_tensors="pt").to("cpu")
-
-     # ✅ Generate Response with optimized settings
-     with torch.no_grad():
-         outputs = model.generate(
-             **inputs,
-             max_new_tokens=512,  # Increased for better responses
-             temperature=0.7,
-             top_p=0.9,
-             do_sample=True,
-             pad_token_id=tokenizer.eos_token_id,
-             repetition_penalty=1.05,  # Slightly reduced
-             length_penalty=1.0,
-             early_stopping=True
        )

-     # FIXED: Better response extraction
-     full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-
-     print(f"🔍 Full generated response: {full_response}")
-
-     # Extract only the new generated part (after the prompt)
-     if formatted_prompt in full_response:
-         generated_part = full_response.split(formatted_prompt)[-1].strip()
-     else:
-         # If we can't find the exact prompt, try to extract the assistant's response
-         assistant_marker = "<|im_start|>assistant\n"
-         if assistant_marker in full_response:
-             parts = full_response.split(assistant_marker)
-             generated_part = parts[-1].split("<|im_end|>")[0].strip() if len(parts) > 1 else full_response
-         else:
-             generated_part = full_response
-
-     print(f"🔍 Extracted generated part: {generated_part}")
-
-     # ✅ Clean the response but keep it intact
-     final_answer = clean_response(generated_part)
-
-     print(f"🔍 Final cleaned answer: {final_answer}")
-
-     # ✅ OpenAI-style Response
-     return {
-         "id": "chatcmpl-local-001",
-         "object": "chat.completion",
-         "model": "Qwen2.5-0.5B-Instruct-LoRA",
-         "choices": [
-             {
-                 "index": 0,
-                 "message": {
-                     "role": "assistant",
-                     "content": final_answer
-                 },
-                 "finish_reason": "stop"
-             }
-         ],
-         "usage": {
-             "prompt_tokens": len(inputs.input_ids[0]),
-             "completion_tokens": len(outputs[0]) - len(inputs.input_ids[0]),
-             "total_tokens": len(outputs[0])
        }
-     }
@@ -5,11 +5,12 @@ from fastapi.responses import JSONResponse
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from starlette.middleware.cors import CORSMiddleware
+ import re

# === Setup FastAPI ===
+ app = FastAPI(title="Apollo AI Backend", version="1.0.0")

+ # === CORS ===
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
@@ -18,190 +19,292 @@ app.add_middleware(
    allow_headers=["*"],
)

+ # === Configuration ===
+ API_KEY = os.getenv("API_KEY", "aigenapikey1234567890")
BASE_MODEL = "Qwen/Qwen2-0.5B-Instruct"
ADAPTER_PATH = "adapter"

+ # === Load Model ===
print("🔧 Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)

+ print("🧠 Loading base model...")
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    trust_remote_code=True,
+     torch_dtype=torch.float32,
+     device_map="cpu"
+ )

print("🔗 Applying LoRA adapter...")
+ model = PeftModel.from_pretrained(base_model, ADAPTER_PATH)
model.eval()

+ print("✅ Model ready!")

+ def extract_clean_answer(full_response: str, formatted_prompt: str, user_message: str) -> str:
    """
+     Extract only the AI's response, removing all template artifacts and system prompt leaks.
    """
+     if not full_response or len(full_response.strip()) < 5:
+         return "I apologize, but I couldn't generate a response. Please try again."
+
+     print(f"🔍 Input full_response length: {len(full_response)}")
+     print(f"🔍 Input full_response preview: {full_response[:200]}...")
+
+     # Step 1: Remove the input prompt to isolate the generated part
+     generated_text = full_response
+     if formatted_prompt in full_response:
+         generated_text = full_response.split(formatted_prompt)[-1]
+
+     # Step 2: Extract content between assistant tags
+     assistant_pattern = r'<\|im_start\|>assistant\n(.*?)(?:<\|im_end\|>|$)'
+     assistant_matches = re.findall(assistant_pattern, generated_text, re.DOTALL)
+
+     if assistant_matches:
+         generated_text = assistant_matches[-1]  # Get the last (newest) assistant response
+
+     # Step 3: Remove common template artifacts
+     artifacts_to_remove = [
+         r'<\|im_start\|>.*?<\|im_end\|>',
+         r'<\|im_start\|>.*',
+         r'<\|im_end\|>.*',
+         r'^(system|user|assistant):\s*',
+         r'^\s*(system|user|assistant)\s*\n',
+     ]

+     for pattern in artifacts_to_remove:
+         generated_text = re.sub(pattern, '', generated_text, flags=re.MULTILINE | re.IGNORECASE)

+     # Step 4: Aggressive system prompt leak removal
+     system_leaks = [
+         r'You are.*?(?=\n\n|\n[A-Z]|\.|$)',
+         r'Guidelines:.*?(?=\n\n|\n[A-Z]|$)',
+         r'Response format:.*?(?=\n\n|\n[A-Z]|$)',
+         r'- Provide.*?(?=\n\n|\n[A-Z]|$)',
+         r'- Use.*?(?=\n\n|\n[A-Z]|$)',
+         r'NEVER include.*?(?=\n\n|\n[A-Z]|$)',
+         r'VS Code Context:.*?(?=\n\n|\n[A-Z]|$)',
+         r'\[VS Code Context:.*?\]',
    ]

+     for leak_pattern in system_leaks:
+         generated_text = re.sub(leak_pattern, '', generated_text, flags=re.DOTALL | re.IGNORECASE)

+     # Step 5: Clean up whitespace and format
+     lines = generated_text.split('\n')
+     clean_lines = []

    for line in lines:
+         line = line.strip()

+         # Skip empty lines at the start
+         if not line and not clean_lines:
            continue

+         # Skip lines that are obviously system prompts
+         skip_patterns = [
+             'you are a helpful', 'guidelines', 'response format', 'provide clear',
+             'use markdown', 'never include', 'vs code context', 'current request'
+         ]
+
+         if any(pattern in line.lower() for pattern in skip_patterns):
            continue

+         clean_lines.append(line)
+
+     # Step 6: Reconstruct the response
+     final_answer = '\n'.join(clean_lines).strip()

+     # Step 7: Handle edge cases
+     if not final_answer or len(final_answer) < 10:
+         return "I understand your question. Could you please rephrase it for a clearer answer?"

+     # Step 8: Remove any remaining question echoes
+     if user_message and len(user_message) > 10:
+         user_words = set(user_message.lower().split())
+         first_sentence = final_answer.split('.')[0]
+         if len(set(first_sentence.lower().split()) & user_words) > len(user_words) * 0.7:
+             # First sentence likely echoes the question, remove it
+             remaining = '.'.join(final_answer.split('.')[1:]).strip()
+             if remaining and len(remaining) > 20:
+                 final_answer = remaining

+     print(f"🧹 Final cleaned answer: {final_answer}")
+     return final_answer

+ def generate_response(messages: list, max_tokens: int = 300, temperature: float = 0.7) -> str:
+     """
+     Generate response using the model with proper chat formatting.
+     """
+     try:
+         # Build the conversation using tokenizer's chat template
+         formatted_prompt = tokenizer.apply_chat_template(
+             messages,
+             tokenize=False,
+             add_generation_prompt=True
+         )
+
+         print(f"🔍 Formatted prompt: {formatted_prompt}")
+
+         # Tokenize
+         inputs = tokenizer(formatted_prompt, return_tensors="pt", truncation=True, max_length=2048)
+
+         # Generate
+         with torch.no_grad():
+             outputs = model.generate(
+                 inputs.input_ids,
+                 attention_mask=inputs.attention_mask,
+                 max_new_tokens=max_tokens,
+                 temperature=temperature,
+                 top_p=0.9,
+                 do_sample=True,
+                 pad_token_id=tokenizer.eos_token_id,
+                 eos_token_id=tokenizer.eos_token_id,
+                 repetition_penalty=1.05,
+                 length_penalty=1.0,
+                 early_stopping=True
+             )
+
+         # Decode the full response
+         full_response = tokenizer.decode(outputs[0], skip_special_tokens=False)
+
+         # Extract user message for cleaning
+         user_message = ""
+         for msg in messages:
+             if msg.get("role") == "user":
+                 user_message = msg.get("content", "")
+
+         # Clean and extract the answer
+         clean_answer = extract_clean_answer(full_response, formatted_prompt, user_message)
+
+         return clean_answer
+
+     except Exception as e:
+         print(f"❌ Generation error: {e}")
+         return f"I encountered an error while processing your request. Please try again."
+
+ # === Routes ===
@app.get("/")
def root():
+     return {
+         "message": "🤖 Apollo AI Backend is running!",
+         "model": "Qwen2-0.5B-Instruct with LoRA",
+         "status": "ready"
+     }
+
+ @app.get("/health")
+ def health():
+     return {"status": "healthy", "model_loaded": True}

@app.post("/v1/chat/completions")
+ async def chat_completions(request: Request):
+     # Validate API key
    auth_header = request.headers.get("Authorization", "")
    if not auth_header.startswith("Bearer "):
+         return JSONResponse(
+             status_code=401,
+             content={"error": "Missing or invalid Authorization header"}
+         )

    token = auth_header.replace("Bearer ", "").strip()
    if token != API_KEY:
+         return JSONResponse(
+             status_code=401,
+             content={"error": "Invalid API key"}
+         )

+     # Parse request body
    try:
        body = await request.json()
        messages = body.get("messages", [])
+         max_tokens = body.get("max_tokens", 300)
+         temperature = body.get("temperature", 0.7)
+
        if not messages or not isinstance(messages, list):
+             raise ValueError("Messages field is required and must be a list")
+
    except Exception as e:
+         return JSONResponse(
+             status_code=400,
+             content={"error": f"Invalid request body: {str(e)}"}
+         )
+
+     # Validate messages format
+     for i, msg in enumerate(messages):
+         if not isinstance(msg, dict) or "role" not in msg or "content" not in msg:
+             return JSONResponse(
+                 status_code=400,
+                 content={"error": f"Invalid message format at index {i}"}
+             )

    try:
+         # Generate response
+         print(f"📥 Processing {len(messages)} messages")
+         response_content = generate_response(
+             messages=messages,
+             max_tokens=min(max_tokens, 500),  # Cap max tokens
+             temperature=max(0.1, min(temperature, 1.0))  # Clamp temperature
        )

+         # Return OpenAI-compatible response
+         return {
+             "id": f"chatcmpl-apollo-{hash(str(messages)) % 10000}",
+             "object": "chat.completion",
+             "created": int(torch.tensor(0).item()),  # Simple timestamp
+             "model": "qwen2-0.5b-instruct-lora",
+             "choices": [
+                 {
+                     "index": 0,
+                     "message": {
+                         "role": "assistant",
+                         "content": response_content
+                     },
+                     "finish_reason": "stop"
+                 }
+             ],
+             "usage": {
+                 "prompt_tokens": len(str(messages)),  # Approximate
+                 "completion_tokens": len(response_content),  # Approximate
+                 "total_tokens": len(str(messages)) + len(response_content)
+             }
+         }

    except Exception as e:
+         print(f"❌ Chat completion error: {e}")
+         return JSONResponse(
+             status_code=500,
+             content={"error": f"Internal server error: {str(e)}"}
        )

+ # === Test endpoint for debugging ===
+ @app.post("/test")
+ async def test_generation(request: Request):
+     """Test endpoint for debugging the model directly"""
+     try:
+         body = await request.json()
+         prompt = body.get("prompt", "Hello, how are you?")
+
+         messages = [
+             {"role": "system", "content": "You are a helpful assistant."},
+             {"role": "user", "content": prompt}
+         ]
+
+         response = generate_response(messages, max_tokens=200, temperature=0.7)
+
+         return {
+             "prompt": prompt,
+             "response": response,
+             "status": "success"
        }
+
+     except Exception as e:
+         return JSONResponse(
+             status_code=500,
+             content={"error": str(e)}
+         )
+
+ if __name__ == "__main__":
+     import uvicorn
+     print("🚀 Starting Apollo AI Backend...")
+     uvicorn.run(app, host="0.0.0.0", port=7860)
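
For reference, a minimal client sketch against the routes added in this commit could look like the following. It assumes the app is running locally on the port passed to uvicorn.run (7860) and that the API key matches the API_KEY value read from the environment; both the payload fields (messages, max_tokens, temperature) and the response shape (choices[0].message.content) come from the handler above.

# Minimal client sketch (assumed local deployment at http://localhost:7860;
# the API key must match the API_KEY environment variable used by app/main.py).
import requests

API_URL = "http://localhost:7860/v1/chat/completions"  # assumed base URL
API_KEY = "aigenapikey1234567890"                       # fallback value from os.getenv above

payload = {
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Explain what a LoRA adapter is in one sentence."}
    ],
    "max_tokens": 200,
    "temperature": 0.7,
}

# Call the OpenAI-compatible chat completions route with Bearer auth
resp = requests.post(
    API_URL,
    headers={"Authorization": f"Bearer {API_KEY}"},
    json=payload,
    timeout=120,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])

# The /test debugging route added in this commit takes a bare prompt and needs no auth header
test = requests.post(
    "http://localhost:7860/test",
    json={"prompt": "Say hello in French."},
    timeout=120,
)
print(test.json())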