# app/main.py
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import os
# === CONFIG ===
HF_MODEL = "Qwen/Qwen2.5-0.5B-Instruct"
ADAPTER_PATH = "adapter" # folder where your LoRA is saved
API_KEY = os.getenv("API_KEY", "your-secret-key") # Set in HF Space secrets
# === FastAPI Setup ===
app = FastAPI()
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # restrict to trusted origins in production
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
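# To serve this app locally (a sketch; HF Docker Spaces conventionally expose
# port 7860, but the actual entrypoint and port depend on the Space config):
#   uvicorn main:app --host 0.0.0.0 --port 7860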
# === Load Model & Tokenizer (CPU only) ===
print("🔧 Loading model on CPU...")
tokenizer = AutoTokenizer.from_pretrained(HF_MODEL, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    HF_MODEL,
    torch_dtype=torch.float32,  # full fp32: the safest dtype for CPU inference
    trust_remote_code=True,
)
model = PeftModel.from_pretrained(model, ADAPTER_PATH)  # attach the LoRA adapter
model = model.to("cpu")
model.eval()
print("✅ Model ready on CPU.")
# === Request Schema ===
class ChatRequest(BaseModel):
    prompt: str
    api_key: str
@app.get("/")
def root():
return {"message": "✅ Qwen2.5 Chat API running."}
@app.post("/chat")
def chat(req: ChatRequest):
if req.api_key != API_KEY:
raise HTTPException(status_code=401, detail="Invalid API Key")
input_text = f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{req.prompt}<|im_end|>\n<|im_start|>assistant\n"
inputs = tokenizer(input_text, return_tensors="pt").to("cpu")
outputs = model.generate(
**inputs,
max_new_tokens=512,
temperature=0.7,
do_sample=True,
pad_token_id=tokenizer.eos_token_id
)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
# Extract response after assistant tag
final_resp = response.split("<|im_start|>assistant\n")[-1].strip()
return {"response": final_resp}