from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import os

# === CONFIG ===
HF_MODEL = "Qwen/Qwen2.5-0.5B-Instruct"
ADAPTER_PATH = "adapter"  # folder where your LoRA is saved
API_KEY = os.getenv("API_KEY", "your-secret-key")  # Set in HF Space secrets
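
# Example: set the secret before launching locally (illustrative value; on a
# Hugging Face Space, secrets are exposed to the app as environment variables):
#   export API_KEY="some-long-random-string"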

# === FastAPI Setup ===
app = FastAPI()

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # adjust if needed
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# === Load Model & Tokenizer (CPU only) ===
print("🔧 Loading model on CPU...")
tokenizer = AutoTokenizer.from_pretrained(HF_MODEL, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(HF_MODEL, torch_dtype=torch.float32, trust_remote_code=True)
model = PeftModel.from_pretrained(model, ADAPTER_PATH)
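# Optional: merge the LoRA weights into the base model for faster CPU
# inference; PeftModel.merge_and_unload() folds the adapter deltas into the
# base weights and returns a plain transformers model:
# model = model.merge_and_unload()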
model = model.to("cpu")
model.eval()
print("✅ Model ready on CPU.")

# === Request Schema ===
class ChatRequest(BaseModel):
    prompt: str
    api_key: str
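
# Example request body (illustrative; api_key must match the API_KEY secret):
# {"prompt": "Hello!", "api_key": "your-secret-key"}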

@app.get("/")
def root():
    return {"message": "✅ Qwen2.5 Chat API running."}

@app.post("/chat")
def chat(req: ChatRequest):
    if req.api_key != API_KEY:
        raise HTTPException(status_code=401, detail="Invalid API Key")

    input_text = f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{req.prompt}<|im_end|>\n<|im_start|>assistant\n"

    inputs = tokenizer(input_text, return_tensors="pt").to("cpu")
    outputs = model.generate(
        **inputs,
        max_new_tokens=512,
        temperature=0.7,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )
    # Decode only the newly generated tokens; skip_special_tokens=True strips
    # the <|im_start|>/<|im_end|> markers, so splitting the full decoded text
    # on the assistant tag would never match and the prompt would leak into
    # the response.
    new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
    final_resp = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
    return {"response": final_resp}