Spaces:

Ais203
/

aigen

Sleeping

App Files Files Community

Ais committed 17 days ago

Commit

4ca2587

verified ·

1 Parent(s): 6df15e3

Update app/main.py

Browse files

Files changed (1) hide show

app/main.py +38 -34

app/main.py CHANGED Viewed

@@ -9,7 +9,7 @@ from starlette.middleware.cors import CORSMiddleware
 # === Setup FastAPI ===
 app = FastAPI()
-# === CORS for frontend testing (optional) ===
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
@@ -18,10 +18,10 @@ app.add_middleware(
     allow_headers=["*"],
 )
-# === Load Secret API Key from Hugging Face Secrets ===
-API_KEY = os.getenv("API_KEY", "undefined")
-# === Load Model and Adapter (CPU only) ===
 BASE_MODEL = "Qwen/Qwen2-0.5B-Instruct"
 ADAPTER_PATH = "adapter"
@@ -32,47 +32,51 @@ print("🧠 Loading base model on CPU...")
 base_model = AutoModelForCausalLM.from_pretrained(
     BASE_MODEL,
     trust_remote_code=True,
-    torch_dtype=torch.float32  # CPU only
 ).cpu()
 print("🔗 Applying LoRA adapter...")
 model = PeftModel.from_pretrained(base_model, ADAPTER_PATH).cpu()
 model.eval()
-print("✅ Model and adapter loaded.")
-# === Root route for test ===
 @app.get("/")
-def read_root():
     return {"message": "🧠 Qwen2.5-0.5B-Instruct API is running on CPU!"}
-# === POST /v1/chat/completions (OpenAI-style) ===
 @app.post("/v1/chat/completions")
 async def chat(request: Request):
-    # ✅ Check API key from headers
-    auth = request.headers.get("Authorization", "")
-    if not auth.startswith("Bearer "):
         return JSONResponse(status_code=401, content={"error": "Missing Bearer token in Authorization header."})
-    token = auth.replace("Bearer ", "").strip()
     if token != API_KEY:
         return JSONResponse(status_code=401, content={"error": "Invalid API key."})
-    # ✅ Parse user prompt
-    body = await request.json()
-    messages = body.get("messages", [])
-    if not messages or not isinstance(messages, list):
-        return JSONResponse(status_code=400, content={"error": "No messages provided."})
-    user_prompt = messages[-1]["content"]
-    # ✅ Format prompt for Qwen chat model
-    prompt = f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{user_prompt}<|im_end|>\n<|im_start|>assistant\n"
-    inputs = tokenizer(prompt, return_tensors="pt").to("cpu")
-    # ✅ Generate
     with torch.no_grad():
         outputs = model.generate(
             **inputs,
@@ -83,12 +87,12 @@ async def chat(request: Request):
             pad_token_id=tokenizer.eos_token_id
         )
-    full_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
-    answer = full_output.split("<|im_start|>assistant\n")[-1].strip()
-    # ✅ Return in OpenAI-style format
     return {
-        "id": "chatcmpl-custom-001",
         "object": "chat.completion",
         "model": "Qwen2.5-0.5B-Instruct-LoRA",
         "choices": [
@@ -96,9 +100,9 @@ async def chat(request: Request):
                 "index": 0,
                 "message": {
                     "role": "assistant",
-                    "content": answer
                 },
                 "finish_reason": "stop"
             }
         ]
-    }

 # === Setup FastAPI ===
 app = FastAPI()
+# === CORS (optional for frontend access) ===
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
     allow_headers=["*"],
 )
+# === Load API Key from Hugging Face Secrets ===
+API_KEY = os.getenv("API_KEY", "undefined")  # Add API_KEY in your HF Space Secrets
+# === Model Settings ===
 BASE_MODEL = "Qwen/Qwen2-0.5B-Instruct"
 ADAPTER_PATH = "adapter"
 base_model = AutoModelForCausalLM.from_pretrained(
     BASE_MODEL,
     trust_remote_code=True,
+    torch_dtype=torch.float32
 ).cpu()
 print("🔗 Applying LoRA adapter...")
 model = PeftModel.from_pretrained(base_model, ADAPTER_PATH).cpu()
 model.eval()
+print("✅ Model and adapter loaded successfully.")
+# === Root Route ===
 @app.get("/")
+def root():
     return {"message": "🧠 Qwen2.5-0.5B-Instruct API is running on CPU!"}
+# === Chat Completion API ===
 @app.post("/v1/chat/completions")
 async def chat(request: Request):
+    # ✅ API Key Authorization
+    auth_header = request.headers.get("Authorization", "")
+    if not auth_header.startswith("Bearer "):
         return JSONResponse(status_code=401, content={"error": "Missing Bearer token in Authorization header."})
+    token = auth_header.replace("Bearer ", "").strip()
     if token != API_KEY:
         return JSONResponse(status_code=401, content={"error": "Invalid API key."})
+    # ✅ Parse Request
+    try:
+        body = await request.json()
+        messages = body.get("messages", [])
+        if not messages or not isinstance(messages, list):
+            raise ValueError("Invalid or missing 'messages' field.")
+        user_prompt = messages[-1]["content"]
+    except Exception as e:
+        return JSONResponse(status_code=400, content={"error": f"Bad request: {str(e)}"})
+    # ✅ Format Prompt for Qwen
+    formatted_prompt = (
+        "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
+        f"<|im_start|>user\n{user_prompt}<|im_end|>\n"
+        "<|im_start|>assistant\n"
+    )
+    inputs = tokenizer(formatted_prompt, return_tensors="pt").to("cpu")
+    # ✅ Generate Response
     with torch.no_grad():
         outputs = model.generate(
             **inputs,
             pad_token_id=tokenizer.eos_token_id
         )
+    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
+    final_answer = decoded.split("<|im_start|>assistant\n")[-1].strip()
+    # ✅ OpenAI-style Response
     return {
+        "id": "chatcmpl-local-001",
         "object": "chat.completion",
         "model": "Qwen2.5-0.5B-Instruct-LoRA",
         "choices": [
                 "index": 0,
                 "message": {
                     "role": "assistant",
+                    "content": final_answer
                 },
                 "finish_reason": "stop"
             }
         ]
+    }