# app/main.py
from fastapi import FastAPI, Form
from fastapi.responses import HTMLResponse
from fastapi.middleware.cors import CORSMiddleware
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from peft import PeftModel
import torch
import os

from app.download_adapter import download_latest_adapter

# === Step 1: Download Adapter ===
download_latest_adapter()

# === Step 2: Load Model and Tokenizer ===
BASE_MODEL = "Qwen/Qwen2-0.5B-Instruct"
ADAPTER_FOLDER = "adapter"
HF_TOKEN = os.environ.get("HF_TOKEN", None)

print("🚀 Loading base model...")
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.float16,
    device_map="auto",
    token=HF_TOKEN,
    trust_remote_code=True,
)

print("🔧 Applying LoRA adapter...")
model = PeftModel.from_pretrained(base_model, ADAPTER_FOLDER)

print("🧠 Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

# === Step 3: FastAPI App ===
app = FastAPI()

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Allow all origins for testing
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


@app.get("/", response_class=HTMLResponse)
async def form():
    # Minimal HTML form; the input's name must match the `prompt` Form field below.
    return """
    <html>
        <head><title>Qwen Chat</title></head>
        <body>
            <form method="post">
                <label for="prompt">Ask something:</label><br>
                <input type="text" id="prompt" name="prompt" size="80"><br>
                <input type="submit" value="Ask">
            </form>
        </body>
    </html>
    """


@app.post("/", response_class=HTMLResponse)
async def generate(prompt: str = Form(...)):
    # Build a ChatML-formatted prompt, the format Qwen2-Instruct models expect.
    full_prompt = (
        "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
        f"<|im_start|>user\n{prompt}<|im_end|>\n"
        "<|im_start|>assistant\n"
    )
    output = pipe(full_prompt, max_new_tokens=256, do_sample=True, temperature=0.7)
    # The pipeline returns the prompt plus the completion; keep only the assistant part.
    response = output[0]["generated_text"].split("<|im_start|>assistant\n")[-1].strip()
    return f"""
    <html>
        <head><title>Qwen Chat</title></head>
        <body>
            <h3>Your Prompt:</h3>
            <p>{prompt}</p>
            <h3>Response:</h3>
            <p>{response}</p>
            <a href="/">Ask again</a>
        </body>
    </html>
    """
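
# --- Optional helper (an addition, not part of the original flow) ---
# The POST handler above builds the ChatML prompt string by hand. A minimal
# sketch of the same thing via the tokenizer's built-in chat template, assuming
# the Qwen2-Instruct tokenizer ships a ChatML template; the helper name is
# illustrative and nothing above calls it.
def build_chat_prompt(user_prompt: str) -> str:
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": user_prompt},
    ]
    # tokenize=False returns the formatted string; add_generation_prompt=True
    # appends the trailing "<|im_start|>assistant\n" turn for the model to complete.
    return tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )


# Local entry point (an addition, assuming the app is otherwise launched with
# something like `uvicorn app.main:app`; adjust host/port to your deployment):
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)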