import os
import torch
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from starlette.middleware.cors import CORSMiddleware

# === Setup FastAPI ===
app = FastAPI()

# === CORS (optional for frontend access) ===
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# === Load API Key from Hugging Face Secrets ===
API_KEY = os.getenv("API_KEY", "undefined")
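# NOTE: if the API_KEY secret is missing, the literal fallback "undefined" becomes
# the accepted bearer token, so make sure the secret is actually configured.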

# === Model Settings ===
BASE_MODEL = "Qwen/Qwen2-0.5B-Instruct"
ADAPTER_PATH = "adapter"

print("🔧 Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)

print("🧠 Loading base model on CPU...")
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    trust_remote_code=True,
    torch_dtype=torch.float32
).cpu()

print("🔗 Applying LoRA adapter...")
model = PeftModel.from_pretrained(base_model, ADAPTER_PATH).cpu()
model.eval()
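
# Optional sketch (an assumption, not part of the original setup): merging the LoRA
# weights into the base model via PeftModel.merge_and_unload() removes the adapter
# indirection and can speed up CPU inference, at the cost of keeping only the merged
# weights in memory.
# model = model.merge_and_unload()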

print("✅ Model and adapter loaded successfully.")

# === Root Route ===
@app.get("/")
def root():
    return {"message": "🧠 Qwen2.5-0.5B-Instruct API is running on CPU!"}

# === Chat Completion API ===
@app.post("/v1/chat/completions")
async def chat(request: Request):
    # ✅ API Key Authorization
    auth_header = request.headers.get("Authorization", "")
    if not auth_header.startswith("Bearer "):
        return JSONResponse(status_code=401, content={"error": "Missing Bearer token in Authorization header."})
    
    token = auth_header.replace("Bearer ", "").strip()
    if token != API_KEY:
        return JSONResponse(status_code=401, content={"error": "Invalid API key."})

    # ✅ Parse Request
    try:
        body = await request.json()
        messages = body.get("messages", [])
        if not messages or not isinstance(messages, list):
            raise ValueError("Invalid or missing 'messages' field.")
        
        temperature = body.get("temperature", 0.7)
        max_tokens = body.get("max_tokens", 512)
        
    except Exception as e:
        return JSONResponse(status_code=400, content={"error": f"Bad request: {str(e)}"})

    # ✅ FIXED: Only use last 4 messages to prevent stacking
    recent_messages = messages[-4:] if len(messages) > 4 else messages
    
    # ✅ Build clean conversation prompt
    formatted_prompt = ""
    
    for message in recent_messages:
        role = message.get("role", "")
        content = message.get("content", "")
        
        if role == "system":
            formatted_prompt += f"<|im_start|>system\n{content}<|im_end|>\n"
        elif role == "user":
            formatted_prompt += f"<|im_start|>user\n{content}<|im_end|>\n"
        elif role == "assistant":
            formatted_prompt += f"<|im_start|>assistant\n{content}<|im_end|>\n"
    
    # Add the assistant start token for generation
    formatted_prompt += "<|im_start|>assistant\n"
    
    print(f"🤖 Processing {len(recent_messages)} recent messages")
    
    inputs = tokenizer(formatted_prompt, return_tensors="pt").to("cpu")

    # ✅ Generate Response
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    # ✅ FIXED: decode only the newly generated tokens. Splitting the full decoded
    # text on "<|im_start|>assistant" does not work here because
    # skip_special_tokens=True strips those markers, which would leak the prompt
    # back into the reply.
    prompt_length = inputs["input_ids"].shape[-1]
    final_answer = tokenizer.decode(
        outputs[0][prompt_length:], skip_special_tokens=True
    ).strip()
    
    # Remove any end tokens or artifacts
    if "<|im_end|>" in final_answer:
        final_answer = final_answer.split("<|im_end|>")[0].strip()
    
    # Remove any repeated system prompts or guidelines that leaked through
    if "Guidelines:" in final_answer:
        final_answer = final_answer.split("Guidelines:")[0].strip()
    
    if "Response format:" in final_answer:
        final_answer = final_answer.split("Response format:")[0].strip()
        
    # Remove VS Code context if it leaked through
    if "[VS Code Context:" in final_answer:
        lines = final_answer.split('\n')
        cleaned_lines = [line for line in lines if not line.strip().startswith('[VS Code Context:')]
        final_answer = '\n'.join(cleaned_lines).strip()
    
    print(f"✅ Clean response: {final_answer[:100]}...")

    # ✅ OpenAI-style Response
    return {
        "id": "chatcmpl-local-001",
        "object": "chat.completion",
        "model": "Qwen2.5-0.5B-Instruct-LoRA",
        "choices": [
            {
                "index": 0,
                "message": {
                    "role": "assistant",
                    "content": final_answer
                },
                "finish_reason": "stop"
            }
        ]
    }
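
# === Local run sketch (assumptions: this file is saved as app.py, uvicorn is
# installed, and port 7860 is used as on Hugging Face Spaces) ===
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)

# Example request (hypothetical key and prompt):
#   curl -X POST http://localhost:7860/v1/chat/completions \
#     -H "Authorization: Bearer YOUR_API_KEY" \
#     -H "Content-Type: application/json" \
#     -d '{"messages": [{"role": "user", "content": "Hello!"}], "max_tokens": 64}'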