from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import os

# === CONFIG ===
HF_MODEL = "Qwen/Qwen2.5-0.5B-Instruct"
ADAPTER_PATH = "adapter"  # folder where your LoRA adapter is saved
API_KEY = os.getenv("API_KEY", "your-secret-key")  # set via HF Space secrets

# === FastAPI Setup ===
app = FastAPI()
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # adjust if needed
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# === Load Model & Tokenizer (CPU only) ===
print("🔧 Loading model on CPU...")
tokenizer = AutoTokenizer.from_pretrained(HF_MODEL, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    HF_MODEL, torch_dtype=torch.float32, trust_remote_code=True
)
model = PeftModel.from_pretrained(model, ADAPTER_PATH)
model = model.to("cpu")
model.eval()
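# Optional: merging the LoRA weights into the base model can speed up CPU
# inference (PEFT's merge_and_unload does this); left commented out as a
# sketch since it modifies the model object in place:
# model = model.merge_and_unload()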
print("✅ Model ready on CPU.")

# === Request Schema ===
class ChatRequest(BaseModel):
    prompt: str
    api_key: str
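
# Example request body (the api_key value must match the API_KEY env var):
#   {"prompt": "Hello!", "api_key": "your-secret-key"}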

@app.get("/")
def root():
    return {"message": "✅ Qwen2.5 Chat API running."}

@app.post("/chat")
def chat(req: ChatRequest):
    if req.api_key != API_KEY:
        raise HTTPException(status_code=401, detail="Invalid API Key")
    # Build the Qwen ChatML prompt by hand (tokenizer.apply_chat_template
    # would produce the same format).
    input_text = (
        "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
        f"<|im_start|>user\n{req.prompt}<|im_end|>\n"
        "<|im_start|>assistant\n"
    )
    inputs = tokenizer(input_text, return_tensors="pt").to("cpu")
    outputs = model.generate(
        **inputs,
        max_new_tokens=512,
        temperature=0.7,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )
    # Decode only the newly generated tokens; slicing off the prompt avoids
    # searching for the "<|im_start|>assistant" marker, which
    # skip_special_tokens=True strips from the decoded text anyway.
    new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
    final_resp = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
    return {"response": final_resp}
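
# Example call (hypothetical Space URL; api_key must match the API_KEY secret):
#   curl -X POST https://<your-space>.hf.space/chat \
#        -H "Content-Type: application/json" \
#        -d '{"prompt": "Hello!", "api_key": "your-secret-key"}'

# Optional local entry point (a sketch: HF Spaces conventionally serves on
# port 7860, so adjust host/port for your deployment):
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)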