import io

import torch
from fastapi import FastAPI, File, Form, UploadFile
from PIL import Image
from transformers import Qwen2VLProcessor

from gui_actor.inference import inference
from gui_actor.modeling import Qwen2VLForConditionalGenerationWithPointer

app = FastAPI()

# Load the model and processor once at startup so every request reuses them.
MODEL_NAME = "microsoft/GUI-Actor-2B-Qwen2-VL"
processor = Qwen2VLProcessor.from_pretrained(MODEL_NAME)
tokenizer = processor.tokenizer
model = Qwen2VLForConditionalGenerationWithPointer.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float32,
    device_map="auto",
).eval()


@app.get("/")
def home():
    return {"message": "GUI-Actor Space is running"}


@app.post("/predict/")
async def predict(
    instruction: str = Form(...),
    image: UploadFile = File(...),  # file uploads require File(...), not Form(...)
):
    # Read the uploaded screenshot and normalize it to RGB.
    img_bytes = await image.read()
    img = Image.open(io.BytesIO(img_bytes)).convert("RGB")

    # Downscale large screenshots; thumbnail() preserves the aspect ratio.
    max_width, max_height = 480, 270
    if img.width > max_width or img.height > max_height:
        img.thumbnail((max_width, max_height))

    # Run inference to get the predicted click point for the instruction.
    click_point = inference(
        instruction=instruction,
        image=img,
        model=model,
        processor=processor,
        tokenizer=tokenizer,
    )

    return {"click_point": click_point}


if __name__ == "__main__":
    import uvicorn

    uvicorn.run("app:app", host="0.0.0.0", port=7860)
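# Example request, for reference only. This sketch assumes the server is
# running locally on port 7860 and that "screenshot.png" is a hypothetical
# file on disk; both form fields match the /predict/ parameters above:
#
#   curl -X POST http://localhost:7860/predict/ \
#        -F "instruction=Click the Submit button" \
#        -F "image=@screenshot.png"
#
# The response is a JSON object of the form {"click_point": ...}, where the
# value is whatever gui_actor.inference.inference returns for the pair.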