abiyyufahri committed
Commit 55b2cb1 · 1 Parent(s): 2cf117f

Install error fix

Files changed (3):
  1. Dockerfile +9 -5
  2. app.py +15 -58
  3. requirements.txt +1 -1
Dockerfile CHANGED
@@ -1,16 +1,20 @@
-FROM python:3.9
+FROM python:3.9-slim
 
-RUN apt-get update && apt-get install -y git && \
-    useradd -m -u 1000 user
+# Install dependencies
+RUN apt-get update && apt-get install -y git curl && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+
+# Create non-root user
+RUN useradd -m -u 1000 user
 USER user
 ENV PATH="/home/user/.local/bin:$PATH"
 
 WORKDIR /app
 
-COPY --chown=user requirements.txt .
+COPY --chown=user requirements.txt ./
 RUN pip install --no-cache-dir --upgrade pip && \
     pip install --no-cache-dir -r requirements.txt
 
 COPY --chown=user . .
 
-CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
app.py CHANGED
@@ -1,9 +1,10 @@
-from fastapi import FastAPI, UploadFile, Form
+from fastapi import FastAPI, Form
 from fastapi.responses import JSONResponse
+from pydantic import BaseModel
 from PIL import Image
 from io import BytesIO
-import torch
 import base64
+import torch
 
 from transformers import Qwen2VLProcessor
 from gui_actor.modeling import Qwen2VLForConditionalGenerationWithPointer
@@ -17,66 +18,22 @@ processor = Qwen2VLProcessor.from_pretrained(model_name)
 tokenizer = processor.tokenizer
 model = Qwen2VLForConditionalGenerationWithPointer.from_pretrained(
     model_name,
-    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
-    device_map="auto",
-    attn_implementation="flash_attention_2" if torch.cuda.is_available() else None,
+    torch_dtype=torch.float32,  # use float32 for CPU
+    device_map=None,  # don't map to cuda
+    attn_implementation=None,
 ).eval()
 
 
-@app.post("/click_base64")
-async def predict_click_base64(
-    image_base64: str = Form(...),
-    instruction: str = Form(...)
-):
-    # Decode base64 image
-    try:
-        if "," in image_base64:
-            image_base64 = image_base64.split(",")[1]
-        image_data = base64.b64decode(image_base64)
-        pil_image = Image.open(BytesIO(image_data)).convert("RGB")
-    except Exception as e:
-        return JSONResponse(status_code=400, content={"error": f"Invalid image format: {str(e)}"})
-
-    # Prepare conversation
-    conversation = [
-        {
-            "role": "system",
-            "content": [
-                {
-                    "type": "text",
-                    "text": "You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.",
-                }
-            ]
-        },
-        {
-            "role": "user",
-            "content": [
-                {
-                    "type": "image",
-                    "image": pil_image,
-                },
-                {
-                    "type": "text",
-                    "text": instruction,
-                },
-            ],
-        },
-    ]
-
-    # Inference
-    try:
-        pred = inference(conversation, model, tokenizer, processor, use_placeholder=True, topk=3)
-        px, py = pred["topk_points"][0]
-        return JSONResponse(content={"x": round(px, 4), "y": round(py, 4)})
-    except Exception as e:
-        return JSONResponse(status_code=500, content={"error": f"Inference failed: {str(e)}"})
+class Base64Request(BaseModel):
+    image_base64: str
+    instruction: str
 
 
-@app.post("/click")
-async def predict_click(image: UploadFile, instruction: str = Form(...)):
-    # Load image
-    contents = await image.read()
-    pil_image = Image.open(BytesIO(contents)).convert("RGB")
+@app.post("/click/base64")
+async def predict_click_base64(data: Base64Request):
+    # Decode base64 to image
+    image_data = base64.b64decode(data.image_base64.split(",")[-1])
+    pil_image = Image.open(BytesIO(image_data)).convert("RGB")
 
     conversation = [
         {
@@ -97,7 +54,7 @@ async def predict_click(image: UploadFile, instruction: str = Form(...)):
             },
             {
                 "type": "text",
-                "text": instruction,
+                "text": data.instruction,
            },
         ],
     },
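The route moves from /click_base64 to /click/base64, the multipart /click endpoint is dropped, and the handler now takes a JSON body validated by the Base64Request model instead of form fields, so callers must change accordingly. A minimal client sketch, assuming the server runs locally on port 7860, that the requests library is available, and that the response shape ({"x": ..., "y": ...}) is unchanged from the previous handler; the file path and instruction are illustrative:

# Hypothetical client for the reworked endpoint.
import base64
import requests

with open("screenshot.png", "rb") as f:
    encoded = base64.b64encode(f.read()).decode("ascii")

resp = requests.post(
    "http://localhost:7860/click/base64",
    # JSON body matching the Base64Request model (not multipart form fields)
    json={"image_base64": encoded, "instruction": "Click the search box"},
)
print(resp.json())  # e.g. {"x": 0.1234, "y": 0.5678}

Note that split(",")[-1] strips a data-URL prefix (e.g. data:image/png;base64,) when present while still accepting a bare base64 string.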
requirements.txt CHANGED
@@ -1,7 +1,7 @@
 fastapi
 uvicorn[standard]
 transformers
-torch
+torch==2.1.2
 datasets
 Pillow
 accelerate
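Pinning torch to an exact release is presumably the "install error fix" named in the commit message: left unpinned, pip resolves whatever torch version is newest at build time, which can pull a wheel that conflicts with the rest of the stack, while the pin makes the image build reproducible.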