import io

import torch
from fastapi import FastAPI, File, Form, UploadFile
from PIL import Image
from transformers import Qwen2VLProcessor

from gui_actor.inference import inference
from gui_actor.modeling import Qwen2VLForConditionalGenerationWithPointer

app = FastAPI()

# Load the model and processor once at startup so every request reuses them.
MODEL_NAME = "microsoft/GUI-Actor-2B-Qwen2-VL"
processor = Qwen2VLProcessor.from_pretrained(MODEL_NAME)
tokenizer = processor.tokenizer
model = Qwen2VLForConditionalGenerationWithPointer.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float32,
    device_map="auto",
).eval()


@app.get("/")
def home():
    return {"message": "GUI-Actor Space is running"}


@app.post("/predict/")
async def predict(
    instruction: str = Form(...),
    image: UploadFile = File(...),  # file uploads require File(...), not Form(...)
):
    # Read the uploaded screenshot and normalize it to RGB.
    img_bytes = await image.read()
    img = Image.open(io.BytesIO(img_bytes)).convert("RGB")

    # Downscale large screenshots; thumbnail() preserves the aspect ratio.
    max_width, max_height = 480, 270
    if img.width > max_width or img.height > max_height:
        img.thumbnail((max_width, max_height))

    # Run inference to get the predicted click point for the instruction.
    click_point = inference(
        instruction=instruction,
        image=img,
        model=model,
        processor=processor,
        tokenizer=tokenizer,
    )

    return {"click_point": click_point}


if __name__ == "__main__":
    import uvicorn

    uvicorn.run("app:app", host="0.0.0.0", port=7860)
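# Example request, for reference only. This sketch assumes the server is
# running locally on port 7860 and that "screenshot.png" is a hypothetical
# file on disk; both form fields match the /predict/ parameters above:
#
#   curl -X POST http://localhost:7860/predict/ \
#        -F "instruction=Click the Submit button" \
#        -F "image=@screenshot.png"
#
# The response is a JSON object of the form {"click_point": ...}, where the
# value is whatever gui_actor.inference.inference returns for the pair.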