from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import torch

app = FastAPI()

# Load the base tokenizer; Qwen has no dedicated pad token, so reuse EOS for padding.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct", trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

# Load the base model, then attach the fine-tuned LoRA adapter in inference-only mode.
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-0.5B-Instruct",
    torch_dtype=torch.float32,
    trust_remote_code=True
)
model = PeftModel.from_pretrained(model, "./adapter", is_trainable=False)
model.eval()


def build_prompt(messages):
    # Rebuild the same "### Role:" prompt format the adapter was fine-tuned on.
    prompt = ""
    for msg in messages:
        role = "User" if msg["role"] == "user" else "Assistant"
        prompt += f"### {role}:\n{msg['content']}\n"
    prompt += "### Assistant:\n"
    return prompt


class ChatRequest(BaseModel):
    messages: list  # [{"role": "user", "content": "..."}]


@app.post("/chat")
async def chat(req: ChatRequest):
    prompt = build_prompt(req.messages)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        top_p=0.95,
        eos_token_id=tokenizer.eos_token_id
    )
    # The decoded output contains the full prompt; keep only the newly generated reply.
    output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    reply = output_text.split("### Assistant:")[-1].strip()
    return {"response": reply}
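
# A quick way to exercise the /chat endpoint once the server is up. This is a
# sketch, assuming the file is saved as app.py, served with uvicorn on the
# default port 8000, and queried from the same machine:
#
#   uvicorn app:app --host 0.0.0.0 --port 8000
#
#   curl -X POST http://localhost:8000/chat \
#     -H "Content-Type: application/json" \
#     -d '{"messages": [{"role": "user", "content": "Hello!"}]}'
#
# The endpoint returns a JSON object of the form {"response": "..."}; pass the
# full conversation (alternating user/assistant messages) to keep multi-turn context.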