import os
import torch
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from starlette.middleware.cors import CORSMiddleware
# === Setup FastAPI ===
app = FastAPI()
# === CORS (optional for frontend access) ===
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# === Load API Key from Hugging Face Secrets ===
API_KEY = os.getenv("API_KEY", "undefined")
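# NOTE: the "undefined" fallback means that, if the Space secret is not configured,
# a request sending "Authorization: Bearer undefined" would still be accepted;
# setting API_KEY as a real secret is assumed.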
# === Model Settings ===
BASE_MODEL = "Qwen/Qwen2-0.5B-Instruct"
ADAPTER_PATH = "adapter"
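# NOTE: ADAPTER_PATH is assumed to point at a local directory bundled with the app
# that holds the PEFT LoRA files (adapter_config.json plus the adapter weights).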
print("🔧 Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
print("🧠 Loading base model on CPU...")
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    trust_remote_code=True,
    torch_dtype=torch.float32
).cpu()
print("🔗 Applying LoRA adapter...")
model = PeftModel.from_pretrained(base_model, ADAPTER_PATH).cpu()
model.eval()
print("✅ Model and adapter loaded successfully.")
# === Root Route ===
@app.get("/")
def root():
return {"message": "🧠 Qwen2.5-0.5B-Instruct API is running on CPU!"}
# === Chat Completion API ===
@app.post("/v1/chat/completions")
async def chat(request: Request):
    # ✅ API Key Authorization
    auth_header = request.headers.get("Authorization", "")
    if not auth_header.startswith("Bearer "):
        return JSONResponse(status_code=401, content={"error": "Missing Bearer token in Authorization header."})
    token = auth_header.replace("Bearer ", "").strip()
    if token != API_KEY:
        return JSONResponse(status_code=401, content={"error": "Invalid API key."})

    # ✅ Parse Request
    try:
        body = await request.json()
        messages = body.get("messages", [])
        if not messages or not isinstance(messages, list):
            raise ValueError("Invalid or missing 'messages' field.")
        temperature = body.get("temperature", 0.7)
        max_tokens = body.get("max_tokens", 512)
    except Exception as e:
        return JSONResponse(status_code=400, content={"error": f"Bad request: {str(e)}"})

    # ✅ FIXED: Only use last 4 messages to prevent stacking
    recent_messages = messages[-4:] if len(messages) > 4 else messages
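
    # NOTE: the loop below renders the messages in the ChatML layout that Qwen
    # instruct models expect ("<|im_start|>role\n...<|im_end|>\n"). As an alternative,
    # tokenizer.apply_chat_template(recent_messages, tokenize=False,
    # add_generation_prompt=True) should produce an equivalent prompt.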
    # ✅ Build clean conversation prompt
    formatted_prompt = ""
    for message in recent_messages:
        role = message.get("role", "")
        content = message.get("content", "")
        if role == "system":
            formatted_prompt += f"<|im_start|>system\n{content}<|im_end|>\n"
        elif role == "user":
            formatted_prompt += f"<|im_start|>user\n{content}<|im_end|>\n"
        elif role == "assistant":
            formatted_prompt += f"<|im_start|>assistant\n{content}<|im_end|>\n"

    # Add the assistant start token for generation
    formatted_prompt += "<|im_start|>assistant\n"
    print(f"🤖 Processing {len(recent_messages)} recent messages")

    inputs = tokenizer(formatted_prompt, return_tensors="pt").to("cpu")

    # ✅ Generate Response
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id
        )
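
    # NOTE: with do_sample=True, transformers' generate() rejects temperature == 0,
    # so clamping a client-supplied temperature to a small positive value (an
    # assumption, not implemented here) may be needed for strictly greedy requests.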

    # ✅ Extract ONLY the newly generated tokens; skip_special_tokens=True strips the
    # <|im_start|>/<|im_end|> markers, so slicing by prompt length is more reliable
    # than splitting the decoded string on the template tags.
    prompt_length = inputs["input_ids"].shape[-1]
    final_answer = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()

    # Remove any end tokens or artifacts
    if "<|im_end|>" in final_answer:
        final_answer = final_answer.split("<|im_end|>")[0].strip()

    # Remove any repeated system prompts or guidelines that leaked through
    if "Guidelines:" in final_answer:
        final_answer = final_answer.split("Guidelines:")[0].strip()
    if "Response format:" in final_answer:
        final_answer = final_answer.split("Response format:")[0].strip()

    # Remove VS Code context if it leaked through
    if "[VS Code Context:" in final_answer:
        lines = final_answer.split('\n')
        cleaned_lines = [line for line in lines if not line.strip().startswith('[VS Code Context:')]
        final_answer = '\n'.join(cleaned_lines).strip()

    print(f"✅ Clean response: {final_answer[:100]}...")

    # ✅ OpenAI-style Response
    return {
        "id": "chatcmpl-local-001",
        "object": "chat.completion",
        "model": "Qwen2-0.5B-Instruct-LoRA",
        "choices": [
            {
                "index": 0,
                "message": {
                    "role": "assistant",
                    "content": final_answer
                },
                "finish_reason": "stop"
            }
        ]
    }
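

# --- Example request (illustrative; the host/port depend on how this app is served,
# e.g. uvicorn on port 7860 for a Hugging Face Space) ---
# curl -X POST http://localhost:7860/v1/chat/completions \
#   -H "Authorization: Bearer $API_KEY" \
#   -H "Content-Type: application/json" \
#   -d '{"messages": [{"role": "user", "content": "Hello!"}], "temperature": 0.7, "max_tokens": 256}'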