# Page-header residue: RishitMishra · "Update app.py" · 7cbfd20 (verified)
import io

import torch
from fastapi import FastAPI, File, Form, HTTPException, UploadFile
from gui_actor.inference import inference
from gui_actor.modeling import Qwen2VLForConditionalGenerationWithPointer
from PIL import Image
from transformers import Qwen2VLProcessor
# Identifier of the pretrained checkpoint; both the processor and the model
# are fetched from it.
MODEL_NAME = "microsoft/GUI-Actor-2B-Qwen2-VL"

# Everything below runs once, at import time, so request handlers can reuse
# the already-loaded objects instead of loading them per request.
processor = Qwen2VLProcessor.from_pretrained(MODEL_NAME)
tokenizer = processor.tokenizer

# Weights are requested in float32 and device placement is delegated to
# `device_map="auto"`.
model = Qwen2VLForConditionalGenerationWithPointer.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float32,
    device_map="auto",
)
# Put the model in evaluation mode; `eval()` hands back the module itself.
model = model.eval()

# Application object that the route decorators below register against.
app = FastAPI()
@app.get("/")
def home():
    """Handle ``GET /`` with a fixed message showing the service is running."""
    status = dict(message="GUI-Actor Space is running")
    return status
@app.post("/predict/")
async def predict(
    instruction: str = Form(...),
    image: UploadFile = File(...),
):
    """Run GUI-Actor on an uploaded screenshot and return its prediction.

    Expects a multipart form with two required fields:

    * ``instruction`` -- the text instruction handed to the model.
    * ``image`` -- the uploaded image file.

    Returns a JSON object ``{"click_point": ...}`` whose value is whatever
    ``gui_actor.inference.inference`` returns.

    Raises:
        HTTPException: status 400 when the upload cannot be decoded as an
            image.
    """
    img_bytes = await image.read()

    # A corrupt, empty or non-image upload is a client error, not a server
    # fault, so report it as 400 instead of letting PIL's exception escape
    # as a 500. `UnidentifiedImageError` is a subclass of `OSError`.
    try:
        img = Image.open(io.BytesIO(img_bytes)).convert("RGB")
    except (OSError, Image.DecompressionBombError) as exc:
        raise HTTPException(
            status_code=400,
            detail="Uploaded file is not a readable image",
        ) from exc

    # Shrink in place, preserving aspect ratio, so neither side exceeds the
    # bounding box below. Images that already fit are left untouched.
    max_width, max_height = 480, 270
    if img.width > max_width or img.height > max_height:
        img.thumbnail((max_width, max_height))

    # NOTE(review): the keyword names passed here are taken on trust --
    # confirm them against the signature of `gui_actor.inference.inference`.
    # NOTE(review): the result is returned as-is; confirm it is
    # JSON-serialisable (e.g. plain numbers/lists rather than tensors).
    click_point = inference(
        instruction=instruction,
        image=img,
        model=model,
        processor=processor,
        tokenizer=tokenizer,
    )
    return {"click_point": click_point}
if __name__ == "__main__":
    import uvicorn

    # Hand uvicorn the already-built application object. With the import
    # string form ("app:app"), running this file as a script makes uvicorn
    # import it a second time under the module name "app", which would
    # execute the module-level model load twice.
    uvicorn.run(app, host="0.0.0.0", port=7860)