Spaces:

abiyyufahri
/

GUI-Agent

Sleeping

App Files Files Community

abiyyufahri committed 16 days ago

Commit

563f88c

verified ·

1 Parent(s): 6f88ac4

Update app.py

Browse files

Files changed (1) hide show

app.py +141 -155

app.py CHANGED Viewed

@@ -1,14 +1,13 @@
-from fastapi import FastAPI, HTTPException
-from fastapi.responses import JSONResponse
-from pydantic import BaseModel
 from PIL import Image
-from io import BytesIO
-import base64
 import torch
 import re
 import logging
-import asyncio
-from contextlib import asynccontextmanager
 # Configure logging
 logging.basicConfig(level=logging.INFO)
@@ -21,7 +20,7 @@ tokenizer = None
 model_name = "microsoft/GUI-Actor-2B-Qwen2-VL"
 model_loaded = False
-async def load_model():
     """Load model with proper error handling and fallback strategies"""
     global model, processor, tokenizer, model_loaded
@@ -40,8 +39,8 @@ async def load_model():
             model = Qwen2VLForConditionalGeneration.from_pretrained(
                 model_name,
-                torch_dtype=torch.float32,
-                device_map=None,  # CPU only
                 trust_remote_code=True,
                 low_cpu_mem_usage=True
             ).eval()
@@ -53,17 +52,17 @@ async def load_model():
             logger.info("Trying AutoProcessor and AutoModel fallback...")
             try:
-                from transformers import AutoProcessor, AutoModel
                 processor = AutoProcessor.from_pretrained(
                     model_name,
                     trust_remote_code=True
                 )
-                model = AutoModel.from_pretrained(
                     model_name,
-                    torch_dtype=torch.float32,
-                    device_map=None,
                     trust_remote_code=True,
                     low_cpu_mem_usage=True
                 ).eval()
@@ -75,7 +74,7 @@ async def load_model():
                 logger.info("Trying generic transformers approach...")
                 # Last fallback - try loading as generic model
-                from transformers import AutoConfig, AutoTokenizer
                 import transformers
                 config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
@@ -97,8 +96,8 @@ async def load_model():
                 model = ModelClass.from_pretrained(
                     model_name,
                     config=config,
-                    torch_dtype=torch.float32,
-                    device_map=None,
                     trust_remote_code=True,
                     low_cpu_mem_usage=True
                 ).eval()
@@ -117,30 +116,8 @@ async def load_model():
         model_loaded = False
         return False
-@asynccontextmanager
-async def lifespan(app: FastAPI):
-    # Startup
-    logger.info("Starting up GUI-Actor API...")
-    await load_model()
-    yield
-    # Shutdown
-    logger.info("Shutting down GUI-Actor API...")
-# Initialize FastAPI app with lifespan
-app = FastAPI(
-    title="GUI-Actor API",
-    version="1.0.0",
-    lifespan=lifespan
-)
-class Base64Request(BaseModel):
-    image_base64: str
-    instruction: str
-def extract_coordinates(text):
-    """
-    Extract coordinates from model output text
-    """
     # Pattern untuk mencari koordinat dalam berbagai format
     patterns = [
         r'click\s*\(\s*(\d+(?:\.\d+)?)\s*,\s*(\d+(?:\.\d+)?)\s*\)',  # click(x, y)
@@ -166,11 +143,38 @@ def extract_coordinates(text):
     # Default ke center jika tidak ditemukan
     return [(0.5, 0.5)]
-def cpu_inference(conversation, model, tokenizer, processor):
-    """
-    Inference function untuk CPU with better error handling
-    """
     try:
         # Apply chat template
         text = processor.apply_chat_template(
             conversation,
@@ -186,11 +190,15 @@ def cpu_inference(conversation, model, tokenizer, processor):
             text=[text],
             images=[image],
             return_tensors="pt",
-            padding=True,  # Enable padding
-            truncation=True,  # Enable truncation for long texts
-            max_length=512  # Set reasonable max length
         )
         # Generate response with proper error handling
         with torch.no_grad():
             try:
@@ -218,119 +226,97 @@ def cpu_inference(conversation, model, tokenizer, processor):
         # Extract coordinates
         coordinates = extract_coordinates(response)
-        return {
-            "topk_points": coordinates,
-            "response": response,
-            "success": True
-        }
     except Exception as e:
         logger.error(f"Inference error: {e}")
-        return {
-            "topk_points": [(0.5, 0.5)],
-            "response": f"Error during inference: {str(e)}",
-            "success": False
-        }
-@app.get("/")
-async def root():
-    return {
-        "message": "GUI-Actor API is running",
-        "status": "healthy",
-        "model_loaded": model_loaded,
-        "model_name": model_name
-    }
-@app.post("/click/base64")
-async def predict_click_base64(data: Base64Request):
-    if not model_loaded:
-        raise HTTPException(
-            status_code=503,
-            detail="Model not loaded properly"
-        )
-    try:
-        # Decode base64 to image
-        try:
-            # Handle data URL format
-            if "," in data.image_base64:
-                image_data = base64.b64decode(data.image_base64.split(",")[-1])
-            else:
-                image_data = base64.b64decode(data.image_base64)
-        except Exception as e:
-            raise HTTPException(status_code=400, detail=f"Invalid base64 image: {e}")
-        try:
-            pil_image = Image.open(BytesIO(image_data)).convert("RGB")
-        except Exception as e:
-            raise HTTPException(status_code=400, detail=f"Invalid image format: {e}")
-        conversation = [
-            {
-                "role": "system",
-                "content": [
-                    {
-                        "type": "text",
-                        "text": "You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task. Please provide the click coordinates.",
-                    }
-                ]
-            },
-            {
-                "role": "user",
-                "content": [
-                    {
-                        "type": "image",
-                        "image": pil_image,
-                    },
-                    {
-                        "type": "text",
-                        "text": data.instruction,
-                    },
-                ],
-            },
-        ]
-        # Run inference
-        pred = cpu_inference(conversation, model, tokenizer, processor)
-        px, py = pred["topk_points"][0]
-        return JSONResponse(content={
-            "x": round(px, 4),
-            "y": round(py, 4),
-            "response": pred["response"],
-            "success": pred["success"]
-        })
-    except HTTPException:
-        raise
-    except Exception as e:
-        logger.error(f"Prediction error: {e}")
-        raise HTTPException(
-            status_code=500,
-            detail=f"Internal server error: {str(e)}"
-        )
-@app.get("/health")
-async def health_check():
-    return {
-        "status": "healthy" if model_loaded else "unhealthy",
-        "model": model_name,
-        "device": "cpu",
-        "torch_dtype": "float32",
-        "model_loaded": model_loaded
-    }
-@app.get("/debug")
-async def debug_info():
-    """Debug endpoint to check model loading status"""
-    import transformers
-    available_classes = [attr for attr in dir(transformers) if 'Qwen' in attr or 'VL' in attr]
-    return {
-        "model_loaded": model_loaded,
-        "processor_type": type(processor).__name__ if processor else None,
-        "model_type": type(model).__name__ if model else None,
-        "available_qwen_classes": available_classes,
-        "transformers_version": transformers.__version__
-    }

+import os
+import spaces
+import gradio as gr
 from PIL import Image
 import torch
 import re
 import logging
+from typing import Tuple, List
+import base64
+from io import BytesIO
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 model_name = "microsoft/GUI-Actor-2B-Qwen2-VL"
 model_loaded = False
+def load_model():
     """Load model with proper error handling and fallback strategies"""
     global model, processor, tokenizer, model_loaded
             model = Qwen2VLForConditionalGeneration.from_pretrained(
                 model_name,
+                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+                device_map="auto" if torch.cuda.is_available() else None,
                 trust_remote_code=True,
                 low_cpu_mem_usage=True
             ).eval()
             logger.info("Trying AutoProcessor and AutoModel fallback...")
             try:
+                from transformers import AutoProcessor, AutoModelForVision2Seq
                 processor = AutoProcessor.from_pretrained(
                     model_name,
                     trust_remote_code=True
                 )
+                model = AutoModelForVision2Seq.from_pretrained(
                     model_name,
+                    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+                    device_map="auto" if torch.cuda.is_available() else None,
                     trust_remote_code=True,
                     low_cpu_mem_usage=True
                 ).eval()
                 logger.info("Trying generic transformers approach...")
                 # Last fallback - try loading as generic model
+                from transformers import AutoConfig, AutoProcessor
                 import transformers
                 config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
                 model = ModelClass.from_pretrained(
                     model_name,
                     config=config,
+                    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+                    device_map="auto" if torch.cuda.is_available() else None,
                     trust_remote_code=True,
                     low_cpu_mem_usage=True
                 ).eval()
         model_loaded = False
         return False
+def extract_coordinates(text: str) -> List[Tuple[float, float]]:
+    """Extract coordinates from model output text"""
     # Pattern untuk mencari koordinat dalam berbagai format
     patterns = [
         r'click\s*\(\s*(\d+(?:\.\d+)?)\s*,\s*(\d+(?:\.\d+)?)\s*\)',  # click(x, y)
     # Default ke center jika tidak ditemukan
     return [(0.5, 0.5)]
+@spaces.GPU  # Decorator untuk menggunakan GPU di Hugging Face Spaces
+def inference(pil_image: Image.Image, instruction: str):
+    """Inference function with Spaces GPU support"""
+    if not model_loaded:
+        return "Model not loaded properly", 0.5, 0.5
     try:
+        conversation = [
+            {
+                "role": "system",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": "You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task. Please provide the click coordinates.",
+                    }
+                ]
+            },
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image",
+                        "image": pil_image,
+                    },
+                    {
+                        "type": "text",
+                        "text": instruction,
+                    },
+                ],
+            },
+        ]
         # Apply chat template
         text = processor.apply_chat_template(
             conversation,
             text=[text],
             images=[image],
             return_tensors="pt",
+            padding=True,
+            truncation=True,
+            max_length=512
         )
+        # Move inputs to the same device as model
+        if torch.cuda.is_available():
+            inputs = {k: v.cuda() if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
         # Generate response with proper error handling
         with torch.no_grad():
             try:
         # Extract coordinates
         coordinates = extract_coordinates(response)
+        px, py = coordinates[0]
+        return response, round(px, 4), round(py, 4)
     except Exception as e:
         logger.error(f"Inference error: {e}")
+        return f"Error during inference: {str(e)}", 0.5, 0.5
+def process_image(image: Image.Image, instruction: str):
+    """Process the uploaded image and instruction"""
+    if image is None:
+        return "Please upload an image", 0.5, 0.5
+    if not instruction.strip():
+        return "Please provide an instruction", 0.5, 0.5
+    # Convert image to RGB if needed
+    if image.mode != "RGB":
+        image = image.convert("RGB")
+    # Run inference
+    response, x, y = inference(image, instruction)
+    return response, x, y
+# Load the model once at import time. The return value of load_model() is ignored; the module-level model_loaded flag it sets is what the status line below and inference() consult.
+logger.info("Loading model...")
+load_model()
+# Build the Gradio interface: inputs in the first column, outputs in the second, then a status row, examples and the click handler.
+with gr.Blocks(title="GUI-Actor Click Prediction", theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# GUI-Actor Click Prediction")
+    gr.Markdown("Upload a screenshot and provide instructions to get click coordinates prediction.")
+    with gr.Row():
+        with gr.Column():  # input column: screenshot, instruction, submit button
+            image_input = gr.Image(
+                type="pil",  # the handler receives the upload as a PIL image
+                label="Upload Screenshot",
+                height=400
+            )
+            instruction_input = gr.Textbox(
+                label="Instruction",
+                placeholder="e.g., Click on the login button",
+                lines=3
+            )
+            submit_btn = gr.Button("Predict Click Location", variant="primary")
+        with gr.Column():  # output column: raw model text plus the two coordinates
+            response_output = gr.Textbox(
+                label="Model Response",
+                lines=5,
+                interactive=False  # display only
+            )
+            with gr.Row():  # X and Y shown side by side
+                x_output = gr.Number(
+                    label="X Coordinate (normalized)",
+                    precision=4,  # same number of decimals that inference() rounds to
+                    interactive=False
+                )
+                y_output = gr.Number(
+                    label="Y Coordinate (normalized)",
+                    precision=4,
+                    interactive=False
+                )
+    # Status indicator: both f-strings are evaluated once, while the layout is built, so they show the state at startup and do not refresh afterwards.
+    with gr.Row():
+        gr.Markdown(f"**Model Status:** {'✅ Loaded' if model_loaded else '❌ Not Loaded'}")
+        gr.Markdown(f"**Device:** {'GPU' if torch.cuda.is_available() else 'CPU'}")
+    # Examples: each row is [instruction, image], matching the order of `inputs` below; the image slot is None in every row.
+    gr.Examples(
+        examples=[
+            ["Click on the search button", None],
+            ["Select the dropdown menu", None],
+            ["Click on the submit form", None],
+        ],
+        inputs=[instruction_input, image_input],
+    )
+    # Event handlers: clicking the button calls process_image(image, instruction) and writes its (response, x, y) result to the three output components.
+    submit_btn.click(
+        fn=process_image,
+        inputs=[image_input, instruction_input],
+        outputs=[response_output, x_output, y_output]
+    )
+if __name__ == "__main__":
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        share=True
+    )