Spaces:

ttomy
/

proxy-lite-demo-v2

Paused

App Files Files Community

Trisha Tomy committed on Jun 27

Commit

6a0e448

1 Parent(s): 40f15d7

init

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

CODEOWNERS +1 -0
Dockerfile +59 -0
LICENSE +3 -0
Makefile +11 -0
Procfile +1 -0
README.md +3 -3
app.py +133 -0
pyproject.toml +65 -0
requirements.txt +6 -0
src/proxy_lite.egg-info/PKG-INFO +36 -0
src/proxy_lite.egg-info/SOURCES.txt +35 -0
src/proxy_lite.egg-info/dependency_links.txt +1 -0
src/proxy_lite.egg-info/entry_points.txt +2 -0
src/proxy_lite.egg-info/requires.txt +18 -0
src/proxy_lite.egg-info/top_level.txt +1 -0
src/proxy_lite/__init__.py +3 -0
src/proxy_lite/__pycache__/__init__.cpython-313.pyc +0 -0
src/proxy_lite/__pycache__/cli.cpython-313.pyc +0 -0
src/proxy_lite/__pycache__/client.cpython-313.pyc +0 -0
src/proxy_lite/__pycache__/gif_maker.cpython-313.pyc +0 -0
src/proxy_lite/__pycache__/history.cpython-313.pyc +0 -0
src/proxy_lite/__pycache__/logger.cpython-313.pyc +0 -0
src/proxy_lite/__pycache__/recorder.cpython-313.pyc +0 -0
src/proxy_lite/__pycache__/runner.cpython-313.pyc +0 -0
src/proxy_lite/__pycache__/serializer.cpython-313.pyc +0 -0
src/proxy_lite/agents/__init__.py +18 -0
src/proxy_lite/agents/__pycache__/__init__.cpython-313.pyc +0 -0
src/proxy_lite/agents/__pycache__/agent_base.cpython-313.pyc +0 -0
src/proxy_lite/agents/__pycache__/proxy_lite_agent.cpython-313.pyc +0 -0
src/proxy_lite/agents/agent_base.py +238 -0
src/proxy_lite/agents/proxy_lite_agent.py +54 -0
src/proxy_lite/app.py +239 -0
src/proxy_lite/browser/__init__.py +0 -0
src/proxy_lite/browser/__pycache__/__init__.cpython-313.pyc +0 -0
src/proxy_lite/browser/__pycache__/bounding_boxes.cpython-313.pyc +0 -0
src/proxy_lite/browser/__pycache__/browser.cpython-313.pyc +0 -0
src/proxy_lite/browser/add_custom_select.js +123 -0
src/proxy_lite/browser/bounding_boxes.py +210 -0
src/proxy_lite/browser/browser.py +508 -0
src/proxy_lite/browser/find_pois.js +397 -0
src/proxy_lite/cli.py +112 -0
src/proxy_lite/client.py +171 -0
src/proxy_lite/configs/default.yaml +23 -0
src/proxy_lite/environments/__init__.py +32 -0
src/proxy_lite/environments/__pycache__/__init__.cpython-313.pyc +0 -0
src/proxy_lite/environments/__pycache__/environment_base.cpython-313.pyc +0 -0
src/proxy_lite/environments/__pycache__/webbrowser.cpython-313.pyc +0 -0
src/proxy_lite/environments/environment_base.py +161 -0
src/proxy_lite/environments/webbrowser.py +194 -0
src/proxy_lite/gif_maker.py +122 -0

CODEOWNERS ADDED Viewed

	@@ -0,0 +1 @@


1	+ * @aptoul @Fraser-Greenlee @XanderJC

Dockerfile ADDED Viewed

	@@ -0,0 +1,59 @@

+# Use the official Playwright Docker image for Python (Playwright v1.53.0 on the Ubuntu 24.04 "noble" base)
+FROM mcr.microsoft.com/playwright/python:v1.53.0-noble
+# Set the working directory inside the container
+WORKDIR /app
+# The official Playwright image already ships most of the system libraries the browsers need,
+# so we only add git (for proxy-lite) and xvfb on top of it.
+# The long hand-maintained package list was removed because the base image makes it largely redundant.
+RUN apt-get update && apt-get install -y \
+    git \
+    xvfb \
+    # Clean up apt caches to reduce image size
+    && rm -rf /var/lib/apt/lists/*
+# Copy common Python dependencies first (needed for pip installs)
+COPY requirements.txt .
+# Copy your Flask application code (app.py) and other project files.
+COPY . .
+# --- START: Directory permission workaround ---
+# Create the directory proxy-lite's recorder insists on writing to
+# and grant full permissions. This addresses the PermissionError.
+# This line creates the directory *directly* under /app, which is now the correct path
+RUN mkdir -p /app/local_trajectories \
+    && chmod -R 777 /app/local_trajectories
+# --- END: Directory permission workaround ---
+# Upgrade pip, setuptools, and wheel for a robust Python build environment.
+RUN pip install --no-cache-dir --upgrade pip setuptools wheel
+# Install your local proxy-lite package in editable mode.
+RUN pip install --no-cache-dir --no-input -e .
+# Install the rest of the Python dependencies from requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt
+# Set environment variables required for Playwright at runtime
+ENV DISPLAY=:99
+ENV XDG_RUNTIME_DIR=/tmp
+# Removed PLAYWRIGHT_BROWSERS_PATH and PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD
+# as the official Playwright image manages these internally, defaulting to /ms-playwright.
+# --- Debugging: Check Playwright version and browser installation (moved AFTER install in the original setup) ---
+# Now checking the default Playwright browser installation path /ms-playwright
+RUN echo "--- Checking Playwright Version (from base image) ---"
+RUN python -m playwright --version
+RUN echo "--- Listing Playwright Browser Cache (Recursive, from base image) ---"
+RUN ls -alR /ms-playwright/
+RUN echo "-----------------------------------"
+# --- End Debugging ---
+# Expose the port your Flask app will listen on. Hugging Face Spaces requires 7860.
+EXPOSE 7860
+# Define the command to run your Flask application using Gunicorn for production.
+# Shell-form CMD with `exec` so gunicorn replaces the shell process instead of running as its child.
+# NOTE(review): DISPLAY=:99 is set above and xvfb is installed, but nothing in this Dockerfile
+# starts an Xvfb server. Confirm the application launches one itself; otherwise headed
+# (headless=False) browser runs have no X display to attach to.
+CMD exec gunicorn --bind 0.0.0.0:7860 --workers 2 --worker-class gevent app:app --timeout 300

LICENSE ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ Creative Commons Attribution-NonCommercial 4.0 International
2	+
3	+ This work is licensed under the Creative Commons Attribution-NonCommercial 4.0 International License. To view a copy of this license, visit https://creativecommons.org/licenses/by-nc/4.0/ or send a letter to Creative Commons, PO Box 1866, Mountain View, CA 94042, USA.

Makefile ADDED Viewed

	@@ -0,0 +1,11 @@

+# Both targets are commands, not files: declare them phony so a stray file or
+# directory named `proxy` or `app` can never make them look up to date.
+.PHONY: proxy app
+# Set up a development environment: uv-managed Python 3.11 venv, project deps, Playwright install.
+proxy:
+	pip install uv
+	uv venv --python 3.11 --python-preference managed
+	uv sync
+	uv pip install -e .
+	playwright install
+# Launch the Streamlit demo UI.
+app:
+	streamlit run src/proxy_lite/app.py

Procfile ADDED Viewed

	@@ -0,0 +1 @@


1	+ web: gunicorn --bind 0.0.0.0:7860 --workers 2 --worker-class gevent app:app --timeout 300

README.md CHANGED Viewed

@@ -1,8 +1,8 @@
 ---
-title: Proxy Lite Demo V2
-emoji: 📉
 colorFrom: indigo
-colorTo: indigo
 sdk: docker
 pinned: false
 ---

 ---
+title: Proxy Lite Demo For Setup
+emoji: 😻
 colorFrom: indigo
+colorTo: gray
 sdk: docker
 pinned: false
 ---

app.py ADDED Viewed

	@@ -0,0 +1,133 @@

+import gevent.monkey
+gevent.monkey.patch_all(asyncio=True) # Keep this at the very top
+import asyncio # Keep this
+from flask import Flask, request, jsonify
+from proxy_lite import Runner, RunnerConfig
+import os
+import logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+app = Flask(__name__)
+_runner = None
+async def initialize_runner():
+    global _runner
+    if _runner is None:
+        logger.info("Initializing Proxy-lite Runner...")
+        hf_api_token = os.environ.get("HF_API_TOKEN")
+        if not hf_api_token:
+            logger.error("HF_API_TOKEN environment variable not set. Cannot initialize Runner.")
+            raise ValueError("HF_API_TOKEN environment variable not set. Please set it as a Space secret.")
+        config = RunnerConfig.from_dict({
+            "environment": {
+                "name": "webbrowser",
+                # Set homepage to Salesforce's generic login URL to avoid premature waits for target page elements.
+                "homepage": "https://login.salesforce.com/",
+                "headless": False, # Keep this False for local testing
+                "launch_args": ["--no-sandbox", "--disable-setuid-sandbox"],
+                "screenshot_delay": 0.5, # Reduced for faster debugging cycles
+                "include_html": True,
+                "include_poi_text": True,
+            },
+            "solver": {
+                "name": "simple",
+                "agent": {
+                    "name": "proxy_lite",
+                    "client": {
+                        "name": "convergence",
+                        "model_id": "convergence-ai/proxy-lite-3b",
+                        "api_base": "https://convergence-ai-demo-api.hf.space/v1",
+                        "api_key": hf_api_token
+                    }
+                }
+            },
+            "environment_timeout": 1800.0,
+            "action_timeout": 1800.0,
+            "task_timeout": 18000.0,
+            "max_steps": 150,
+            "logger_level": "DEBUG",
+        })
+        logger.info(f"DEBUG: app.py - Initializing Runner with environment_timeout: {config.environment_timeout} seconds")
+        logger.info(f"DEBUG: app.py - Full config used: {config.model_dump_json(indent=2)}")
+        _runner = Runner(config=config)
+        logger.info("Proxy-lite Runner initialized successfully.")
+    return _runner
+@app.route('/run_proxy_task', methods=['POST'])
+async def run_proxy_task_endpoint():
+    data = request.json
+    request_task_instruction = data.get('task')
+    if not request_task_instruction:
+        logger.warning("Received request without 'task' field. Returning 400.")
+        return jsonify({"error": "No 'task' provided in request body"}), 400
+    logger.info(f"Received user request task: '{request_task_instruction}'")
+    salesforce_username = os.environ.get("SALESFORCE_USERNAME")
+    salesforce_password = os.environ.get("SALESFORCE_PASSWORD")
+    if not salesforce_username or not salesforce_password:
+        logger.error("Salesforce credentials (SALESFORCE_USERNAME, SALESFORCE_PASSWORD) environment variables not set.")
+        return jsonify({"error": "Salesforce credentials not configured. Please set SALESFORCE_USERNAME and SALESFORCE_PASSWORD as Space secrets."}), 500
+    # Define the specific Account Forecast Settings URL
+    account_forecast_url = "https://dwd000006jia1mae.lightning.force.com/lightning/setup/AccountForecastSettings/home"
+    # Define the tool code block to open a new tab and navigate after login
+    # Using a raw f-string for multiline tool code block
+    tool_code_block_new_tab = fr"""
+<tool_code>
+await browser.open_new_tab_and_go_to(url='{account_forecast_url}')
+</tool_code>
+"""
+    # Refined agent_task instruction to be sequential and robust to Salesforce redirects
+    agent_task = f"""
+    **Task Instructions for Proxy Lite Agent:**
+    1.  **Start on Login Page:** Navigate to the Salesforce login page.
+    2.  **Perform Login:** Log in to Salesforce using the provided username '{salesforce_username}' and password '{salesforce_password}'. Ensure all login fields are filled and the 'Log In' button is clicked.
+    3.  **Handle Post-Login Redirect:** After clicking the 'Log In' button:
+        * Observe the current URL. If the URL has changed from the initial login domain (e.g., from `login.salesforce.com` or `my.salesforce.com`) **immediately execute the following tool code block to open a new tab and navigate directly to the Account Forecast Settings page (`{account_forecast_url}`) to bypass any persistent loading issues or internal redirects:**
+        {tool_code_block_new_tab.strip()}
+    4.  **Confirm Target Page Load:** After successfully navigating to '{account_forecast_url}' (either directly after login or via the new tab strategy), ensure the page is fully loaded and stable. This means no loading spinners should be visible, and the main content for 'Account Forecast Settings' (like a clear heading, relevant toggles, or data tables) should be present and interactive.
+    5.  **Execute Main Task:** Once the Account Forecast Settings page is confirmed loaded and stable, proceed with the original user request: {request_task_instruction}.
+    6.  **Report Final Status:** Report the final status of the requested action, confirming both successful login and complete page load of the Account Forecast Settings.
+    """
+    logger.info(f"Executing agent task (truncated for log): '{agent_task[:500]}...'")
+    try:
+        runner = await initialize_runner()
+        result = await runner.run(agent_task)
+        logger.info(f"Proxy-lite task completed. Output (truncated for log): {result[:500]}...")
+        return jsonify({"output": result})
+    except Exception as e:
+        logger.exception(f"Error processing Salesforce task: {e}")
+        return jsonify({"error": f"An error occurred: {str(e)}. Check logs for details."}), 500
+@app.route('/')
+def root():
+    """Landing route: logs the access and returns a plain-text usage hint."""
+    logger.info("Root endpoint accessed.")
+    return "Proxy-lite API is running. Send POST requests to /run_proxy_task with a 'task' in JSON body."
+if __name__ == '__main__':
+    # It is crucial to set HF_API_TOKEN as an environment variable (e.g., in a .env file or directly)
+    # for local testing as well, otherwise initialize_runner will fail.
+    if not os.environ.get("HF_API_TOKEN"):
+        logger.error("HF_API_TOKEN environment variable is not set. Please set it for local testing.")
+        # Removed exit(1) to allow the Flask app to start for basic connectivity checks,
+        # but runner initialization will still fail if token is missing.
+        # For full functionality, the token is essential.
+    logger.info("Starting Flask development server on 0.0.0.0:7860...")
+    app.run(host='0.0.0.0', port=7860, debug=True)

pyproject.toml ADDED Viewed

	@@ -0,0 +1,65 @@

+[project]
+name = "proxy-lite"
+version = "0.1.0"
+description = "Proxy Lite - A mini, open-weights, version of the Convergence AI Proxy assistant."
+readme = "README.md"
+requires-python = ">=3.11"
+dependencies = [
+    "omegaconf>=2.3.0",
+    "openai>=1.61.1",
+    "opencv-python>=4.11.0.86",
+    "opencv-python-headless>=4.11.0.86",
+    "playwright-stealth>=1.0.6",
+    "playwright>=1.50.0",
+    "pydantic>=2.10.6",
+    "rich>=13.9.4",
+    "setuptools>=75.8.0",
+    "tenacity>=9.0.0",
+    "torch>=2.5.1",
+    "torchvision>=0.20.1",
+    "streamlit>=1.40.2",
+    "pre-commit>=4.1.0",
+]
+[project.scripts]
+proxy = "proxy_lite.cli:main"
+[project.optional-dependencies]
+serving = [
+    "transformers",
+    "vllm==0.7.2",
+]
+[build-system]
+requires = ["setuptools"]
+build-backend = "setuptools.build_meta"
+[tool.setuptools]
+packages = { find = { where = ["src"] } }
+[tool.setuptools.package-data]
+# Ship every non-Python asset that lives inside the package tree: besides *.json this covers
+# the browser helper scripts (*.js) and the default config (*.yaml), which a non-editable
+# install would otherwise leave out.
+proxy_lite = ["**/*.json", "**/*.js", "**/*.yaml"]
+[tool.ruff]
+line-length = 120
+[tool.ruff.lint]
+select = ["E", "F", "B", "I", "SIM"]
+ignore = [
+    "B028",
+    "E722", # ignore bare except
+    "B904", # ignore raise from requirement
+    "FA102",
+]
+[tool.ruff.lint.flake8-bugbear]
+extend-immutable-calls = [
+    "fastapi.Depends",
+    "fastapi.params.Depends",
+    "fastapi.Query",
+    "fastapi.params.Query",
+]
+[tool.uv.sources]
+transformers = { git = "https://github.com/huggingface/transformers.git", rev = "336dc69d63d56f232a183a3e7f52790429b871ef" }

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+Flask[async]
+-e .
+playwright
+playwright-stealth==1.0.6
+gunicorn
+gevent

src/proxy_lite.egg-info/PKG-INFO ADDED Viewed

	@@ -0,0 +1,36 @@

+Metadata-Version: 2.4
+Name: proxy-lite
+Version: 0.1.0
+Summary: Proxy Lite - A mini, open-weights, version of the Convergence AI Proxy assistant.
+Requires-Python: >=3.11
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: omegaconf>=2.3.0
+Requires-Dist: openai>=1.61.1
+Requires-Dist: opencv-python>=4.11.0.86
+Requires-Dist: opencv-python-headless>=4.11.0.86
+Requires-Dist: playwright-stealth>=1.0.6
+Requires-Dist: playwright>=1.50.0
+Requires-Dist: pydantic>=2.10.6
+Requires-Dist: rich>=13.9.4
+Requires-Dist: setuptools>=75.8.0
+Requires-Dist: tenacity>=9.0.0
+Requires-Dist: torch>=2.5.1
+Requires-Dist: torchvision>=0.20.1
+Requires-Dist: streamlit>=1.40.2
+Requires-Dist: pre-commit>=4.1.0
+Provides-Extra: serving
+Requires-Dist: transformers; extra == "serving"
+Requires-Dist: vllm==0.7.2; extra == "serving"
+Dynamic: license-file
+---
+title: Proxy Lite Demo For Setup
+emoji: 😻
+colorFrom: indigo
+colorTo: gray
+sdk: docker
+pinned: false
+---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

src/proxy_lite.egg-info/SOURCES.txt ADDED Viewed

	@@ -0,0 +1,35 @@

+LICENSE
+README.md
+pyproject.toml
+src/proxy_lite/__init__.py
+src/proxy_lite/app.py
+src/proxy_lite/cli.py
+src/proxy_lite/client.py
+src/proxy_lite/gif_maker.py
+src/proxy_lite/history.py
+src/proxy_lite/logger.py
+src/proxy_lite/recorder.py
+src/proxy_lite/runner.py
+src/proxy_lite/serializer.py
+src/proxy_lite.egg-info/PKG-INFO
+src/proxy_lite.egg-info/SOURCES.txt
+src/proxy_lite.egg-info/dependency_links.txt
+src/proxy_lite.egg-info/entry_points.txt
+src/proxy_lite.egg-info/requires.txt
+src/proxy_lite.egg-info/top_level.txt
+src/proxy_lite/agents/__init__.py
+src/proxy_lite/agents/agent_base.py
+src/proxy_lite/agents/proxy_lite_agent.py
+src/proxy_lite/browser/__init__.py
+src/proxy_lite/browser/bounding_boxes.py
+src/proxy_lite/browser/browser.py
+src/proxy_lite/environments/__init__.py
+src/proxy_lite/environments/environment_base.py
+src/proxy_lite/environments/webbrowser.py
+src/proxy_lite/solvers/__init__.py
+src/proxy_lite/solvers/simple_solver.py
+src/proxy_lite/solvers/solver_base.py
+src/proxy_lite/tools/__init__.py
+src/proxy_lite/tools/browser_tool.py
+src/proxy_lite/tools/return_tool.py
+src/proxy_lite/tools/tool_base.py

src/proxy_lite.egg-info/dependency_links.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+

src/proxy_lite.egg-info/entry_points.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ [console_scripts]
2	+ proxy = proxy_lite.cli:main

src/proxy_lite.egg-info/requires.txt ADDED Viewed

	@@ -0,0 +1,18 @@

+omegaconf>=2.3.0
+openai>=1.61.1
+opencv-python>=4.11.0.86
+opencv-python-headless>=4.11.0.86
+playwright-stealth>=1.0.6
+playwright>=1.50.0
+pydantic>=2.10.6
+rich>=13.9.4
+setuptools>=75.8.0
+tenacity>=9.0.0
+torch>=2.5.1
+torchvision>=0.20.1
+streamlit>=1.40.2
+pre-commit>=4.1.0
+[serving]
+transformers
+vllm==0.7.2

src/proxy_lite.egg-info/top_level.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ proxy_lite

src/proxy_lite/__init__.py ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ from .runner import Runner, RunnerConfig
2	+
3	+ __all__ = ["Runner", "RunnerConfig"]

src/proxy_lite/__pycache__/__init__.cpython-313.pyc ADDED Viewed

Binary file (258 Bytes). View file

src/proxy_lite/__pycache__/cli.cpython-313.pyc ADDED Viewed

Binary file (5.59 kB). View file

src/proxy_lite/__pycache__/client.cpython-313.pyc ADDED Viewed

Binary file (9.28 kB). View file

src/proxy_lite/__pycache__/gif_maker.cpython-313.pyc ADDED Viewed

Binary file (6.39 kB). View file

src/proxy_lite/__pycache__/history.cpython-313.pyc ADDED Viewed

Binary file (10.5 kB). View file

src/proxy_lite/__pycache__/logger.cpython-313.pyc ADDED Viewed

Binary file (3.56 kB). View file

src/proxy_lite/__pycache__/recorder.cpython-313.pyc ADDED Viewed

Binary file (6.73 kB). View file

src/proxy_lite/__pycache__/runner.cpython-313.pyc ADDED Viewed

Binary file (14.7 kB). View file

src/proxy_lite/__pycache__/serializer.cpython-313.pyc ADDED Viewed

Binary file (3.04 kB). View file

src/proxy_lite/agents/__init__.py ADDED Viewed

	@@ -0,0 +1,18 @@

+from typing import Union
+from .agent_base import Agents, BaseAgent, BaseAgentConfig
+from .proxy_lite_agent import ProxyLiteAgent, ProxyLiteAgentConfig
+# Union types over every class currently in the Agents registries. They are built once at
+# import time, so they only cover agents whose modules were imported (and thus registered)
+# above. Star-unpacking inside a subscript requires Python 3.11+.
+AgentTypes = Union[*list(Agents._agent_registry.values())]
+AgentConfigTypes = Union[*list(Agents._agent_config_registry.values())]
+__all__ = [
+    "AgentConfigTypes",
+    "AgentTypes",
+    "Agents",
+    "BaseAgent",
+    "BaseAgentConfig",
+    "ProxyLiteAgent",
+    "ProxyLiteAgentConfig",
+]

src/proxy_lite/agents/__pycache__/__init__.cpython-313.pyc ADDED Viewed

Binary file (758 Bytes). View file

src/proxy_lite/agents/__pycache__/agent_base.cpython-313.pyc ADDED Viewed

Binary file (12.8 kB). View file

src/proxy_lite/agents/__pycache__/proxy_lite_agent.cpython-313.pyc ADDED Viewed

Binary file (3.63 kB). View file

src/proxy_lite/agents/agent_base.py ADDED Viewed

	@@ -0,0 +1,238 @@

+import json
+import logging
+from abc import ABC, abstractmethod
+from contextlib import AsyncExitStack
+from functools import cached_property
+from typing import Any, Optional, Type, cast
+from pydantic import BaseModel, Field
+from tenacity import before_sleep_log, retry, stop_after_attempt, wait_exponential
+from proxy_lite.client import BaseClient, ClientConfigTypes, OpenAIClientConfig
+from proxy_lite.history import (
+    AssistantMessage,
+    MessageHistory,
+    MessageLabel,
+    SystemMessage,
+    Text,
+    ToolCall,
+    ToolMessage,
+    UserMessage,
+)
+from proxy_lite.logger import logger
+from proxy_lite.tools import Tool
+# if TYPE_CHECKING:
+#     from proxy_lite.tools import Tool
+class BaseAgentConfig(BaseModel):
+    """Base configuration for agents: which client to use and how much history to keep per message label."""
+    # Client configuration; defaults to a fresh OpenAIClientConfig.
+    client: ClientConfigTypes = Field(default_factory=OpenAIClientConfig)
+    # Per-label limits passed to MessageHistory.history_view when building the model's view.
+    history_messages_limit: dict[MessageLabel, int] = Field(default_factory=lambda: dict())
+    history_messages_include: Optional[dict[MessageLabel, int]] = Field(
+        default=None,
+        description="If set, overrides history_messages_limit by setting all message types to 0 except those specified",
+    )
+    def model_post_init(self, __context: Any) -> None:
+        """When history_messages_include is set, rebuild the limits: 0 for every label, then apply the includes."""
+        if self.history_messages_include is not None:
+            self.history_messages_limit = {label: 0 for label in MessageLabel}
+            self.history_messages_limit.update(self.history_messages_include)
+class BaseAgent(BaseModel, ABC):
+    """Abstract agent: owns a message history and a client, and turns model completions into AssistantMessages.
+    Subclasses must provide ``system_prompt`` and ``tools``.
+    """
+    config: BaseAgentConfig
+    # Sampling temperature forwarded to the client on every completion.
+    temperature: float = Field(default=0.7, ge=0, le=2)
+    history: MessageHistory = Field(default_factory=MessageHistory)
+    # Populated in model_post_init from config.client.
+    client: Optional[BaseClient] = None
+    env_tools: list[Tool] = Field(default_factory=list)
+    task: Optional[str] = Field(default=None)
+    # Seed forwarded to the client on every completion.
+    seed: Optional[int] = Field(default=None)
+    class Config:
+        arbitrary_types_allowed = True
+    def __init__(self, **data) -> None:
+        """Run pydantic initialisation, then attach the private exit stack and tools-init task slot."""
+        super().__init__(**data)
+        self._exit_stack = AsyncExitStack()
+        self._tools_init_task = None
+    def model_post_init(self, __context: Any) -> None:
+        """Build the client from ``config.client`` once the model fields are set."""
+        super().model_post_init(__context)
+        self.client = BaseClient.create(self.config.client)
+    @property
+    @abstractmethod
+    def system_prompt(self) -> str: ...
+    @cached_property
+    @abstractmethod
+    def tools(self) -> list[Tool]: ...
+    @cached_property
+    def tool_descriptions(self) -> str:
+        """Return a text listing of every tool function as ``- name: description`` lines.
+        Each tool's lines are prefixed with its class name only when there is more than one
+        tool; tools are separated by a blank line.
+        """
+        tool_descriptions = []
+        for tool in self.tools:
+            func_descriptions = "\n".join("- {name}: {description}".format(**schema) for schema in tool.schema)
+            tool_title = f"{tool.__class__.__name__}:\n" if len(self.tools) > 1 else ""
+            tool_descriptions.append(f"{tool_title}{func_descriptions}")
+        return "\n\n".join(tool_descriptions)
+    async def get_history_view(self) -> MessageHistory:
+        """Return the system prompt message followed by the history view limited by the config's per-label limits."""
+        return MessageHistory(
+            messages=[SystemMessage(content=[Text(text=self.system_prompt)])],
+        ) + self.history.history_view(
+            limits=self.config.history_messages_limit,
+        )
+    # Up to 3 attempts with exponential backoff (waits clamped to 4-10); the last error is
+    # re-raised and each retry is logged at ERROR level before sleeping.
+    @retry(
+        wait=wait_exponential(multiplier=1, min=4, max=10),
+        stop=stop_after_attempt(3),
+        reraise=True,
+        before_sleep=before_sleep_log(logger, logging.ERROR),
+    )
+    async def generate_output(
+        self,
+        use_tool: bool = False,
+        response_format: Optional[type[BaseModel]] = None,
+        append_assistant_message: bool = True,
+    ) -> AssistantMessage:
+        """Request a completion for the current history view and wrap it as an AssistantMessage.
+        Args:
+            use_tool: When True, pass ``self.tools`` to the client; otherwise pass no tools.
+            response_format: Optional model type forwarded to the client as ``response_format``.
+            append_assistant_message: When True, append the resulting message to the history.
+        Returns:
+            The AssistantMessage built from the first choice of the completion.
+        """
+        messages: MessageHistory = await self.get_history_view()
+        response_content = (
+            await self.client.create_completion(
+                messages=messages,
+                temperature=self.temperature,
+                seed=self.seed,
+                response_format=response_format,
+                tools=self.tools if use_tool else None,
+            )
+        ).model_dump()
+        # Only the first choice is used.
+        response_content = response_content["choices"][0]["message"]
+        assistant_message = AssistantMessage(
+            role=response_content["role"],
+            content=[Text(text=response_content["content"])] if response_content["content"] else [],
+            tool_calls=response_content["tool_calls"],
+        )
+        if append_assistant_message:
+            # NOTE(review): message_label is not declared on BaseAgent; subclasses are
+            # presumably expected to define it. Confirm.
+            self.history.append(message=assistant_message, label=self.message_label)
+        return assistant_message
+    def receive_user_message(
+        self,
+        text: Optional[str] = None,
+        image: list[bytes] = None,
+        label: MessageLabel = None,
+        is_base64: bool = False,
+    ) -> None:
+        """Build a UserMessage from the given text/images and append it to the history under ``label``."""
+        message = UserMessage.from_media(
+            text=text,
+            image=image,
+            is_base64=is_base64,
+        )
+        self.history.append(message=message, label=label)
+    def receive_system_message(
+        self,
+        text: Optional[str] = None,
+        label: MessageLabel = None,
+    ) -> None:
+        """Build a SystemMessage from ``text`` and append it to the history under ``label``."""
+        message = SystemMessage.from_media(text=text)
+        self.history.append(message=message, label=label)
+    def receive_assistant_message(
+        self,
+        content: Optional[str] = None,
+        tool_calls: Optional[list[ToolCall]] = None,
+        label: MessageLabel = None,
+    ) -> None:
+        """Append an AssistantMessage with the given text (empty content when falsy) and tool calls."""
+        message = AssistantMessage(
+            content=[Text(text=content)] if content else [],
+            tool_calls=tool_calls,
+        )
+        self.history.append(message=message, label=label)
+    async def use_tool(self, tool_call: ToolCall):
+        """Invoke the function named in ``tool_call`` on the first tool that has such an attribute.
+        The call's ``arguments`` string is JSON-decoded and passed as keyword arguments.
+        Raises:
+            ValueError: If no tool has an attribute with the requested name.
+        """
+        function = tool_call.function
+        for tool in self.tools:
+            # NOTE(review): hasattr matches any attribute on the tool object, not only the
+            # functions advertised in tool.schema.
+            if hasattr(tool, function["name"]):
+                return await getattr(tool, function["name"])(
+                    **json.loads(function["arguments"]),
+                )
+        msg = f'No tool function with name "{function["name"]}"'
+        raise ValueError(msg)
+    async def receive_tool_message(
+        self,
+        text: str,
+        tool_id: str,
+        label: MessageLabel = None,
+    ) -> None:
+        """Append a ToolMessage carrying ``text`` for the tool call ``tool_id`` to the history."""
+        self.history.append(
+            message=ToolMessage(content=[Text(text=text)], tool_call_id=tool_id),
+            label=label,
+        )
+class Agents:
+    _agent_registry: dict[str, type[BaseAgent]] = {}
+    _agent_config_registry: dict[str, type[BaseAgentConfig]] = {}
+    @classmethod
+    def register_agent(cls, name: str):
+        """
+        Decorator to register an Agent class under a given name.
+        Example:
+            @Agents.register_agent("browser")
+            class BrowserAgent(BaseAgent):
+                ...
+        """
+        def decorator(agent_cls: type[BaseAgent]) -> type[BaseAgent]:
+            cls._agent_registry[name] = agent_cls
+            return agent_cls
+        return decorator
+    @classmethod
+    def register_agent_config(cls, name: str):
+        """
+        Decorator to register a configuration class under a given name.
+        Example:
+            @Agents.register_agent_config("browser")
+            class BrowserAgentConfig(BaseAgentConfig):
+                ...
+        """
+        def decorator(config_cls: type[BaseAgentConfig]) -> type[BaseAgentConfig]:
+            cls._agent_config_registry[name] = config_cls
+            return config_cls
+        return decorator
+    @classmethod
+    def get(cls, name: str) -> type[BaseAgent]:
+        """
+        Retrieve a registered Agent class by its name.
+        Raises:
+            ValueError: If no such agent is found.
+        """
+        try:
+            return cast(Type[BaseAgent], cls._agent_registry[name])
+        except KeyError:
+            raise ValueError(f"Agent '{name}' not found.")
+    @classmethod
+    def get_config(cls, name: str) -> type[BaseAgentConfig]:
+        """
+        Retrieve a registered Agent configuration class by its name.
+        Raises:
+            ValueError: If no such config is found.
+        """
+        try:
+            return cast(type[BaseAgentConfig], cls._agent_config_registry[name])
+        except KeyError:
+            raise ValueError(f"Agent config for '{name}' not found.")

src/proxy_lite/agents/proxy_lite_agent.py ADDED Viewed

	@@ -0,0 +1,54 @@

+from functools import cached_property
+from typing import Literal
+from pydantic import Field
+from proxy_lite.history import MessageHistory, MessageLabel, SystemMessage, Text
+from proxy_lite.tools import Tool
+from .agent_base import Agents, BaseAgent, BaseAgentConfig
+MODEL_SYSTEM_PROMPT = """You are Proxy-Lite, an AI assistant that can perform actions on a computer screen.
+You were developed by Convergence AI.
+The user will instuct you to perform a task.
+You will be shown a screen as well as relevant interactable elements highlighted by mark_ids and you will be given a set of tools to use to perform the task.
+You should make observations about the screen, putting them in <observation></observation> tags.
+You should then reason about what needs to be done to complete the task, putting your thoughts in <thinking></thinking> tags.
+You should then use the tools to perform the task, putting the tool calls in <tool_call></tool_call> tags.
+"""  # noqa: E501
+MAX_MESSAGES_FOR_CONTEXT_WINDOW = {
+    MessageLabel.SCREENSHOT: 1,
+}
+@Agents.register_agent_config("proxy_lite")
+class ProxyLiteAgentConfig(BaseAgentConfig):
+    name: Literal["proxy_lite"] = "proxy_lite"
+    history_messages_limit: dict[MessageLabel, int] = Field(
+        default_factory=lambda: MAX_MESSAGES_FOR_CONTEXT_WINDOW,
+    )
+@Agents.register_agent("proxy_lite")
+class ProxyLiteAgent(BaseAgent):
+    """Agent registered as "proxy_lite": fixed MODEL_SYSTEM_PROMPT, tools taken from ``env_tools``."""
+    config: ProxyLiteAgentConfig
+    # Label for this agent's model responses.
+    message_label: MessageLabel = MessageLabel.AGENT_MODEL_RESPONSE
+    def __init__(self, **data):
+        super().__init__(**data)
+    @property
+    def system_prompt(self) -> str:
+        """Return the module-level MODEL_SYSTEM_PROMPT."""
+        return MODEL_SYSTEM_PROMPT
+    @cached_property
+    def tools(self) -> list[Tool]:
+        """Return ``env_tools``; cached after the first access."""
+        return self.env_tools
+    async def get_history_view(self) -> MessageHistory:
+        """Return the system prompt message followed by the history view limited by the config's per-label limits."""
+        # NOTE(review): this body matches BaseAgent.get_history_view; the override looks redundant.
+        return MessageHistory(
+            messages=[SystemMessage(content=[Text(text=self.system_prompt)])],
+        ) + self.history.history_view(
+            limits=self.config.history_messages_limit,
+        )

src/proxy_lite/app.py ADDED Viewed

	@@ -0,0 +1,239 @@

+import asyncio
+import base64
+from io import BytesIO
+import streamlit as st
+from PIL import Image
+from proxy_lite import Runner, RunnerConfig
+def get_user_config(config_expander):
+    """Build the runner config dict, letting the user override selected values via Streamlit widgets.
+    Args:
+        config_expander: Streamlit container used as a context manager; all widgets are rendered inside it.
+    Returns:
+        A nested dict with the defaults below, updated with the current widget values.
+    """
+    # Defaults; the widgets further down are seeded from these values.
+    config = {
+        "environment": {
+            "name": "webbrowser",
+            "annotate_image": True,
+            "screenshot_delay": 2.0,
+            "include_html": False,
+            "viewport_width": 1280,
+            "viewport_height": 1920,
+            "include_poi_text": True,
+            "homepage": "https://dwd000006jia1mae.lightning.force.com/lightning/setup/AccountForecastSettings/home",
+            "keep_original_image": False,
+            "headless": False,  # without proxies headless mode often results in getting bot blocked
+        },
+        "solver": {
+            "name": "simple",
+            "agent": {
+                "name": "proxy_lite",
+                "client": {
+                    "name": "convergence",
+                    "model_id": "convergence-ai/proxy-lite-3b",
+                    "api_base": "https://convergence-ai-demo-api.hf.space/v1",
+                },
+            },
+        },
+        "local_view": False,
+        "verbose": True,
+        "task_timeout": 1800,  # 30 minutes
+        "action_timeout": 300,
+        "environment_timeout": 120,
+    }
+    with config_expander:
+        st.subheader("Environment Settings")
+        # Two-column layout: observation toggles and homepage on the left, server URL and delay on the right.
+        col1, col2 = st.columns(2)
+        with col1:
+            config["environment"]["include_html"] = st.checkbox(
+                "Include HTML",
+                value=config["environment"]["include_html"],
+                help="Include HTML in observations",
+            )
+            config["environment"]["include_poi_text"] = st.checkbox(
+                "Include POI Text",
+                value=config["environment"]["include_poi_text"],
+                help="Include points of interest text in observations",
+            )
+            config["environment"]["homepage"] = st.text_input(
+                "Homepage",
+                value=config["environment"]["homepage"],
+                help="Homepage to start from",
+            )
+        with col2:
+            config["solver"]["agent"]["client"]["api_base"] = st.text_input(
+                "VLLM Server URL",
+                value=config["solver"]["agent"]["client"]["api_base"],
+                help="URL of a vllm server running proxy-lite",
+            )
+            config["environment"]["screenshot_delay"] = st.slider(
+                "Screenshot Delay (seconds)",
+                min_value=0.5,
+                max_value=10.0,
+                value=config["environment"]["screenshot_delay"],
+                step=0.5,
+                help="Delay before taking screenshots",
+            )
+        st.subheader("Advanced Settings")
+        # Timeouts are bounded by the widget ranges: task 60-3600 s, action and environment 10-300 s.
+        config["task_timeout"] = st.number_input(
+            "Task Timeout (seconds)",
+            min_value=60,
+            max_value=3600,
+            step=60,
+            value=config["task_timeout"],
+            help="Maximum time allowed for task completion",
+        )
+        config["action_timeout"] = st.number_input(
+            "Action Timeout (seconds)",
+            min_value=10,
+            max_value=300,
+            step=10,
+            value=config["action_timeout"],
+            help="Maximum time allowed for an action to complete",
+        )
+        config["environment_timeout"] = st.number_input(
+            "Environment Timeout (seconds)",
+            min_value=10,
+            max_value=300,
+            step=10,
+            value=config["environment_timeout"],
+            help="Maximum time allowed for environment to respond",
+        )
+    return config
+async def run_task_async(
+    task: str,
+    status_placeholder,
+    action_placeholder,
+    environment_placeholder,
+    image_placeholder,
+    history_placeholder,
+    config: dict,
+):
+    """Run ``task`` with a Runner and stream its progress into the given Streamlit placeholders.
+    Args:
+        task: task text passed to ``runner.run_generator``.
+        status_placeholder: receives the "resolving" banner and, at the end, the result line.
+        action_placeholder: receives the latest step text while the task runs.
+        environment_placeholder: receives the environment heading once a screenshot exists.
+        image_placeholder: receives the most recent screenshot.
+        history_placeholder: receives the expander listing the steps collected so far.
+        config: plain dict converted with ``RunnerConfig.from_dict``.
+    Returns:
+        None. If the config cannot be loaded, an error is shown and nothing is run.
+    """
+    try:
+        runner_config = RunnerConfig.from_dict(config)
+    except Exception as e:
+        st.error(f"Error loading RunnerConfig: {e!s}")
+        return
+    print(runner_config)
+    runner = Runner(config=runner_config)
+    # Add the spinning animation using HTML
+    status_placeholder.markdown(
+        """
+        <style>
+        @keyframes spin {
+            0% { content: "⚡"; }
+            25% { content: "⚡."; }
+            50% { content: "⚡.."; }
+            75% { content: "⚡..."; }
+        }
+        .spinner::before {
+            content: "⚡";
+            animation: spin 2s linear infinite;
+            display: inline-block;
+        }
+        </style>
+        <div><b>Resolving your task  </b><span class="spinner"></span></div>
+        """,
+        unsafe_allow_html=True,
+    )
+    all_steps = []
+    all_screenshots = []
+    all_soms = []
+    # Stays None until a run carries at least one action; the original left the name unbound in
+    # that case and crashed with UnboundLocalError when writing the final result line.
+    latest_step = None
+    async for run in runner.run_generator(task):
+        # Update status with latest step
+        if run.actions:
+            last_action = run.actions[-1]
+            latest_step = last_action.text
+            latest_step += "".join(
+                [
+                    f'<tool_call>{{"name": {tool_call.function["name"]}, "arguments": {tool_call.function["arguments"]}}}</tool_call>'  # noqa: E501
+                    for tool_call in last_action.tool_calls
+                ]
+            )
+            action_placeholder.write(f"⚡ **Latest Step:** {latest_step}")
+            all_steps.append(latest_step)
+        # Update image if available
+        if run.observations and run.observations[-1].state.image:
+            last_state = run.observations[-1].state
+            environment_placeholder.write("🌐 **Environment:**")
+            image_bytes = base64.b64decode(last_state.image)
+            image = Image.open(BytesIO(image_bytes))
+            image_placeholder.image(image, use_container_width=True)
+            all_screenshots.append(image)
+            all_soms.append(last_state.text)
+        # Update history; zip stops at the shortest list, so a step only shows up once it has
+        # both an action and a screenshot.
+        with history_placeholder, st.expander("🕝 **History**"):
+            for idx, (action, img, som) in enumerate(zip(all_steps, all_screenshots, all_soms, strict=False)):
+                st.write(f"**Step {idx + 1}**")
+                st.image(img, use_container_width=True)
+                st.markdown(som)
+                st.write(action)
+    action_placeholder.write(" ")
+    if latest_step is None:
+        status_placeholder.write("✨ **Result:** No steps were produced for this task.")
+    else:
+        status_placeholder.write(f"✨ **Result:** {latest_step}")
+def main():
+    """Render the Proxy-Lite page: title, configuration expander and the task submission form.
+    When the form is submitted with a non-empty task, the task is run to completion with
+    ``asyncio.run`` and progress is written into placeholders created inside the form.
+    """
+    st.title("⚡ Proxy-Lite")
+    st.markdown("Powered by **Proxy-Lite**", unsafe_allow_html=True)
+    # Seed the session flags on first load so later reads never hit a missing key.
+    if "config_expanded" not in st.session_state:
+        st.session_state.config_expanded = False
+    if "settings_expanded" not in st.session_state:
+        st.session_state.settings_expanded = False
+    config_expander = st.expander("⚙️ Proxy-Lite Configuration", expanded=st.session_state.config_expanded)
+    config = get_user_config(config_expander)
+    with st.form(key="run_task_form"):
+        task = st.text_input(
+            "Submit a task",
+            key="task_input",
+            help="Enter a task to be completed",
+        )
+        submit_button = st.form_submit_button("Submit a task", type="primary", use_container_width=True)
+        if submit_button:
+            st.session_state.config_expanded = False
+            if task:
+                # Create placeholders for dynamic updates
+                status_placeholder = st.empty()
+                st.write(" ")
+                action_placeholder = st.empty()
+                environment_placeholder = st.empty()
+                image_placeholder = st.empty()
+                history_placeholder = st.empty()
+                # Run the async task
+                asyncio.run(
+                    run_task_async(
+                        task,
+                        status_placeholder,
+                        action_placeholder,
+                        environment_placeholder,
+                        image_placeholder,
+                        history_placeholder,
+                        config,
+                    ),
+                )
+                st.success("Task completed!", icon="✨")
+            else:
+                st.error("Please give a task first!")
+# Build the page only when this file is executed as the main module, not when it is imported.
+if __name__ == "__main__":
+    main()

src/proxy_lite/browser/__init__.py ADDED Viewed

File without changes

src/proxy_lite/browser/__pycache__/__init__.cpython-313.pyc ADDED Viewed

Binary file (166 Bytes). View file

src/proxy_lite/browser/__pycache__/bounding_boxes.cpython-313.pyc ADDED Viewed

Binary file (8.86 kB). View file

src/proxy_lite/browser/__pycache__/browser.cpython-313.pyc ADDED Viewed

Binary file (30.3 kB). View file

src/proxy_lite/browser/add_custom_select.js ADDED Viewed

	@@ -0,0 +1,123 @@

+// <select> elements that already have the custom mousedown handler attached; checked below
+// so that repeated calls do not attach the handler twice.
+handledSelectElementsConvergence = new WeakSet();
+/**
+ * Replace the native dropdown of every single-value <select> under the root with an in-page
+ * list of <div> options, so the options are part of the page DOM.
+ *
+ * @param {Element|null} input - root to search for selects; defaults to document.documentElement.
+ */
+overwriteDefaultSelectConvergence = (input = null) => {
+    const CUSTOM_SELECT_ID = 'convergence-custom-select-element-X2EmudtLRN';
+    let activeSelectElement = null;
+    // Handle iframe input element
+    let rootElement = input ? input : document.documentElement;
+    function createCustomSelectElement() {
+        // Create the custom select container
+        const customSelect = document.createElement('div');
+        customSelect.id = CUSTOM_SELECT_ID;
+        customSelect.style.position = 'absolute'
+        customSelect.style.zIndex = 2147483647 - 1;
+        customSelect.style.display = 'none';
+        document.body.appendChild(customSelect);
+        // Create the select options list
+        const optionsList = document.createElement('div');
+        optionsList.style.border = '1px solid #ccc';
+        optionsList.style.backgroundColor = '#fff';
+        optionsList.style.color = 'black';
+        customSelect.appendChild(optionsList);
+        return customSelect;
+    }
+    // Return the shared popup container, creating it when it cannot be found. The container is
+    // appended to document.body, which is not necessarily inside rootElement, so the lookup
+    // also checks the document before creating a new one.
+    function getCustomSelectElement() {
+        return rootElement.querySelector(`#${CUSTOM_SELECT_ID}`)
+            || document.getElementById(CUSTOM_SELECT_ID)
+            || createCustomSelectElement();
+    }
+    function showCustomSelect(select) {
+        // Focus first: focusing this select may blur a previously opened one, and that
+        // select's hide handler would otherwise close the popup we are about to show.
+        select.focus();
+        activeSelectElement = select;
+        // Clear previous options
+        const customSelect = getCustomSelectElement();
+        let optionsList = customSelect.firstChild;
+        optionsList.innerHTML = '';
+        // Populate with new options
+        Array.from(select.options).forEach(option => {
+            const customOption = document.createElement('div');
+            customOption.className = 'custom-option';
+            customOption.style.padding = '8px';
+            customOption.style.cursor = 'pointer';
+            customOption.textContent = option.text;
+            customOption.dataset.value = option.value;
+            optionsList.appendChild(customOption);
+            customOption.addEventListener('mouseenter', function () {
+                customOption.style.backgroundColor = '#f0f0f0';
+            });
+            customOption.addEventListener('mouseleave', function () {
+                customOption.style.backgroundColor = '';
+            });
+            customOption.addEventListener('mousedown', (e) => {
+                e.stopPropagation();
+                select.value = customOption.dataset.value;
+                customSelect.style.display = 'none';
+                activeSelectElement = null;
+                // ensure we trigger all potential event listeners
+                select.dispatchEvent(new InputEvent('focus', { bubbles: true, cancelable: true }));
+                select.dispatchEvent(new InputEvent('input', { bubbles: true, cancelable: true }));
+                select.dispatchEvent(new InputEvent('change', { bubbles: true, cancelable: true }));
+                select.dispatchEvent(new InputEvent('blur', { bubbles: true, cancelable: true }));
+            });
+        });
+        // Position and show the custom select
+        const selectRect = select.getBoundingClientRect();
+        customSelect.style.top = `${selectRect.bottom + window.scrollY}px`;
+        customSelect.style.left = `${selectRect.left + window.scrollX}px`;
+        customSelect.style.width = `${selectRect.width}px`;
+        customSelect.style.display = 'block';
+        // Hide the popup on the next blur or change of this select. The handler removes itself
+        // from both events, so reopening the popup does not pile up listeners on the select.
+        const hideCustomSelect = () => {
+            customSelect.style.display = 'none';
+            activeSelectElement = null;
+            select.removeEventListener('blur', hideCustomSelect);
+            select.removeEventListener('change', hideCustomSelect);
+        };
+        select.addEventListener('blur', hideCustomSelect);
+        select.addEventListener('change', hideCustomSelect);
+    }
+    // Ensure we have a custom select element
+    getCustomSelectElement();
+    // Find selects in shadow DOMs
+    function findSelectInShadowRoot(element) {
+        if (element.shadowRoot) {
+            return element.shadowRoot.querySelectorAll('select');
+        }
+        return [];
+    }
+    let shadowSelects = [];
+    rootElement.querySelectorAll('*').forEach(el => {
+        shadowSelects.push(...findSelectInShadowRoot(el));
+    });
+    // Find selects in the regular (light) DOM
+    const lightSelects = Array.from(rootElement.querySelectorAll('select'));
+    // Add event listeners to all select elements
+    const allSelects = [...lightSelects, ...shadowSelects];
+    allSelects.forEach(select => {
+        if (select.hasAttribute('multiple')) {
+            // skip special multiple elements as our POI code already handles them
+            return;
+        }
+        if (!handledSelectElementsConvergence.has(select)) {
+            select.addEventListener('mousedown', (e) => {
+                // only use custom select when the default behaviour is being used
+                if (!e.defaultPrevented) {
+                    showCustomSelect(select);
+                    e.preventDefault();
+                }
+            });
+            handledSelectElementsConvergence.add(select);
+        }
+    });
+}

src/proxy_lite/browser/bounding_boxes.py ADDED Viewed

	@@ -0,0 +1,210 @@

+import math
+from typing import Any
+import cv2
+import numpy as np
+from pydantic import BaseModel, Field, field_validator
+class Point(BaseModel):
+    """Integer 2-D point that can also be unpacked and indexed like an ``(x, y)`` tuple."""
+    x: int
+    y: int
+    def __tuple__(self) -> tuple[int, int]:
+        """Return the coordinates as ``(x, y)``; this is an ordinary method, not a Python protocol hook."""
+        return (self.x, self.y)
+    def __iter__(self):
+        """Iterate over ``x`` then ``y`` so that ``px, py = point`` works."""
+        return iter(self.__tuple__())
+    def __getitem__(self, index) -> int:
+        """Index the point as a 2-tuple: ``point[0]`` is ``x`` and ``point[1]`` is ``y``."""
+        return self.__tuple__()[index]
+    def __repr__(self) -> str:
+        return f"Point(x={self.x}, y={self.y})"
+class BoundingBox(BaseModel):
+    """Labelled rectangle with integer edges.
+    Raw edge values are converted with ``float`` and rounded away from the box interior before
+    validation: ``left``/``top`` are floored and ``right``/``bottom`` are ceiled.
+    """
+    label: str = Field(..., description="The label that's given for this bounding box")
+    left: int = Field(..., description="Left coordinate of the bounding box")
+    right: int = Field(..., description="Right coordinate of the bounding box")
+    top: int = Field(..., description="Top coordinate of the bounding box")
+    bottom: int = Field(..., description="Bottom coordinate of the bounding box")
+    @field_validator("left", "top", mode="before")
+    @classmethod
+    def round_down(cls, v):
+        """Floor the raw ``left``/``top`` value before it is validated as ``int``."""
+        as_float = float(v)
+        return math.floor(as_float)
+    @field_validator("right", "bottom", mode="before")
+    @classmethod
+    def round_up(cls, v):
+        """Ceil the raw ``right``/``bottom`` value before it is validated as ``int``."""
+        as_float = float(v)
+        return math.ceil(as_float)
+class POI(BaseModel):
+    """A point of interest: free-form element info together with its centroid and bounding box."""
+    # Arbitrary element metadata; its keys are not defined in this module.
+    info: dict[str, Any]
+    element_centroid: Point
+    bounding_box: BoundingBox
+def calculate_dash_points(start, end, dash_length, gap_length):
+    """Split the segment ``start`` -> ``end`` into dashes and return their end points.
+    Args:
+        start: ``(x, y)`` of the segment start.
+        end: ``(x, y)`` of the segment end.
+        dash_length: length of each drawn dash, in the same units as the coordinates.
+        gap_length: length of the gap between consecutive dashes.
+    Returns:
+        A flat list of integer ``(x, y)`` tuples, two per dash (dash start, dash end). The last
+        dash is clipped to the segment end. Empty when ``start`` and ``end`` coincide.
+    Raises:
+        ValueError: if ``dash_length + gap_length`` is not positive, because the walk along the
+            segment would never advance.
+    """
+    x1, y1 = start
+    x2, y2 = end
+    dx = x2 - x1
+    dy = y2 - y1
+    dist = np.sqrt(dx * dx + dy * dy)
+    if dist == 0:
+        return []
+    stride = dash_length + gap_length
+    if stride <= 0:
+        # Without this guard the loop below spins forever on a non-advancing position.
+        raise ValueError("dash_length + gap_length must be positive")
+    unit_x = dx / dist
+    unit_y = dy / dist
+    dash_points = []
+    current_dist = 0
+    while current_dist < dist:
+        dash_end = min(current_dist + dash_length, dist)
+        dash_points.extend(
+            [
+                (int(x1 + unit_x * current_dist), int(y1 + unit_y * current_dist)),
+                (int(x1 + unit_x * dash_end), int(y1 + unit_y * dash_end)),
+            ],
+        )
+        current_dist += stride
+    return dash_points
+def draw_dashed_rectangle(
+    img,
+    bbox: BoundingBox,
+    color,
+    thickness=1,
+    dash_length=10,
+    gap_length=5,
+):
+    """Draw the outline of ``bbox`` on ``img`` as dashed lines, in place.
+    Every corner is shifted by 25 px on both axes before drawing. The edges are walked in the
+    order top, right, bottom, left, and all dashes are drawn with one ``cv2.polylines`` call.
+    """
+    shift = 25
+    top_left = (bbox.left + shift, bbox.top + shift)
+    top_right = (bbox.right + shift, bbox.top + shift)
+    bottom_right = (bbox.right + shift, bbox.bottom + shift)
+    bottom_left = (bbox.left + shift, bbox.bottom + shift)
+    edges = (
+        (top_left, top_right),
+        (top_right, bottom_right),
+        (bottom_right, bottom_left),
+        (bottom_left, top_left),
+    )
+    dash_endpoints = []
+    for edge_start, edge_end in edges:
+        dash_endpoints += calculate_dash_points(edge_start, edge_end, dash_length, gap_length)
+    if not dash_endpoints:
+        return
+    # One (2, 2) array per dash: [[x_start, y_start], [x_end, y_end]]
+    dashes = np.array(dash_endpoints).reshape((-1, 2, 2))
+    cv2.polylines(img, dashes, False, color, thickness)
+# @time_it(name='Annotate bounding box')
+def annotate_bounding_box(image: bytes, bbox: BoundingBox) -> None:
+    """Draw ``bbox`` as a dashed red rectangle with a semi-transparent label, editing ``image`` in place.
+    ``image`` is annotated as ``bytes`` but is used as an array here (``.shape`` and slice
+    assignment); ``annotate_bounding_boxes`` passes the array returned by ``cv2.copyMakeBorder``.
+    Box coordinates are shifted by 25 px, matching the border that caller adds.
+    """
+    draw_dashed_rectangle(image, bbox, color=(0, 0, 255), thickness=1, dash_length=10, gap_length=5)
+    # The label is rendered oversized and then shrunk, which smooths the glyph edges.
+    text_font = cv2.FONT_HERSHEY_SIMPLEX
+    text_scale = 0.4 * 4
+    text_thickness = 3
+    (text_width, text_height), _ = cv2.getTextSize(bbox.label, text_font, text_scale, text_thickness)
+    oversized_patch = np.zeros((text_height + 20, text_width + 20, 4), dtype=np.uint8)
+    oversized_patch[:, :, 0:3] = (0, 0, 255)  # BGR: red background
+    oversized_patch[:, :, 3] = 128  # alpha channel, 128/255
+    cv2.putText(
+        oversized_patch,
+        bbox.label,
+        (8, text_height + 8),
+        text_font,
+        text_scale,
+        (255, 255, 255, 128),  # white text with the same alpha as the background
+        text_thickness,
+    )
+    patch_size = (text_width // 4 + 5, text_height // 4 + 5)
+    patch = cv2.resize(oversized_patch, patch_size, interpolation=cv2.INTER_AREA)
+    # Place the label just above the top-left corner of the box, clamped to the image.
+    margin = 2
+    image_height, image_width = image.shape[0], image.shape[1]
+    x0 = min(image_width, max(0, int(bbox.left + 25) - margin))
+    y0 = min(image_height, max(0, int(bbox.top + 25) - patch.shape[0] - margin))
+    x1 = min(image_width, x0 + patch.shape[1])
+    y1 = min(image_height, y0 + patch.shape[0])
+    # Crop the patch to whatever part of it still fits inside the image.
+    patch = patch[: (y1 - y0), : (x1 - x0)]
+    # Per-pixel opacity in [0, 1], repeated across the three colour channels.
+    opacity = patch[:, :, 3] / 255.0
+    opacity = np.repeat(opacity[:, :, np.newaxis], 3, axis=2)
+    background = image[y0:y1, x0:x1]
+    mixed = (1 - opacity) * background + opacity * patch[:, :, 0:3]
+    image[y0:y1, x0:x1] = mixed.astype(np.uint8)
+def annotate_bounding_boxes(image: bytes, bounding_boxes: list[BoundingBox]) -> bytes:
+    """Decode an encoded image, draw every bounding box on a padded copy and return JPEG bytes.
+    Args:
+        image: encoded image bytes, decoded with ``cv2.imdecode``.
+        bounding_boxes: boxes to draw, in coordinates of the unpadded image.
+    Returns:
+        The annotated image, including a 25 px white border on every side, encoded as JPEG.
+    """
+    decoded = cv2.imdecode(np.frombuffer(image, np.uint8), cv2.IMREAD_COLOR)
+    border = 25  # Value chosen based on label size
+    canvas = cv2.copyMakeBorder(
+        decoded,
+        top=border,
+        bottom=border,
+        left=border,
+        right=border,
+        borderType=cv2.BORDER_CONSTANT,
+        value=(255, 255, 255),
+    )
+    # Each call draws onto ``canvas`` in place.
+    for box in bounding_boxes:
+        annotate_bounding_box(canvas, box)
+    _, encoded = cv2.imencode(".jpeg", canvas)
+    return encoded.tobytes()

src/proxy_lite/browser/browser.py ADDED Viewed

	@@ -0,0 +1,508 @@

+import asyncio
+import logging
+import platform
+import re
+from contextlib import AsyncExitStack
+from pathlib import Path
+from typing import Literal, Optional, Self
+from playwright.async_api import Browser, BrowserContext, Page, Playwright, async_playwright
+from playwright.async_api import TimeoutError as PlaywrightTimeoutError
+from playwright_stealth import StealthConfig, stealth_async
+from pydantic import Field
+from tenacity import before_sleep_log, retry, stop_after_delay, wait_exponential
+from proxy_lite.browser.bounding_boxes import POI, BoundingBox, Point, annotate_bounding_boxes
+from proxy_lite.logger import logger
+import base64
+# Tag names that element_as_text renders in self-closing form (`<tag .../>`) when the element
+# carries no text.
+SELF_CONTAINED_TAGS = [
+    # many of these are non-interactive but keeping them anyway
+    "area",
+    "base",
+    "br",
+    "col",
+    "embed",
+    "hr",
+    "img",
+    "input",
+    "link",
+    "meta",
+    "param",
+    "source",
+    "track",
+    "wbr",
+]
+def element_as_text(
+    mark_id: int,
+    tag: Optional[str] = None,
+    text: Optional[str] = None,
+    **raw_attributes,
+) -> str:
+    """Return a one-line, HTML-like text representation of a single page element.
+    Args:
+        mark_id: identifier printed in square brackets in front of the element.
+        tag: tag name; lower-cased before use. ``None`` is treated as an empty tag name.
+        text: inner text; ``None`` is treated as empty.
+        **raw_attributes: attributes to render. ``None`` values and ``False`` booleans are
+            dropped, ``True`` booleans are rendered as a bare name, anything else as
+            ``name="value"``.
+    Returns:
+        ``- [id] <tag attrs/>`` for a tag in ``SELF_CONTAINED_TAGS`` without text, otherwise
+        ``- [id] <tag attrs>text</tag>``. Attribute values and text longer than 2500 characters
+        are cut and end in "…"; line breaks are replaced by "⏎".
+    """
+    max_length = 2500
+    rendered = []
+    for name, value in raw_attributes.items():
+        if value is None:
+            continue
+        if isinstance(value, bool):
+            # we ignore False bool attributes
+            if value:
+                rendered.append(name)
+            continue
+        value = str(value)
+        if len(value) > max_length:
+            value = value[: max_length - 1] + "…"
+        rendered.append(f'{name}="{value}"')
+    # Leading space separates the tag name from the attributes; rstrip drops it when there are none.
+    attributes = (" " + " ".join(rendered)).rstrip()
+    # ``tag`` defaults to None in the signature, so guard before calling ``.lower()``.
+    tag = (tag or "").lower()
+    if text is None:
+        text = ""
+    if len(text) > max_length:
+        text = text[: max_length - 1] + "…"
+    # sub-out line breaks so elements are easier to distinguish
+    attributes = re.sub(r"\r\n|\r|\n", "⏎", attributes)
+    text = re.sub(r"\r\n|\r|\n", "⏎", text)
+    if tag in SELF_CONTAINED_TAGS:
+        if text:
+            # Unexpected, but keep the text: fall through to the open/close form below.
+            logger.warning(
+                f"Got self-contained element '{tag}' which contained text '{text}'.",
+            )
+        else:
+            return f"- [{mark_id}] <{tag}{attributes}/>"
+    return f"- [{mark_id}] <{tag}{attributes}>{text}</{tag}>"
+class BrowserSession:
+    def __init__(
+        self,
+        viewport_width: int = 1280,
+        viewport_height: int = 720,
+        headless: bool = True,
+    ):
+        """Store the launch settings; no browser is started until the session is entered.
+        Args:
+            viewport_width: viewport width passed to the browser context.
+            viewport_height: viewport height passed to the browser context.
+            headless: whether Chromium is launched headless.
+        """
+        self.viewport_width = viewport_width
+        self.viewport_height = viewport_height
+        self.headless = headless
+        self.playwright: Playwright | None = None
+        self.browser: Browser | None = None
+        self.context: BrowserContext | None = None
+        self._exit_stack: AsyncExitStack | None = None
+        # Plain lists: this class is not a pydantic model, so ``Field(default_factory=list)``
+        # would store a FieldInfo object here instead of an empty list.
+        self.poi_elements: list = []
+        self.poi_centroids: list[Point] = []
+        self.bounding_boxes: list[BoundingBox] = []
+        self.pois: list[POI] = []
+    async def __aenter__(self) -> Self:
+        """Start Playwright, launch Chromium, open a context with one page and register the init scripts.
+        Returns:
+            This session, ready for use.
+        Raises:
+            Any exception raised during startup, after whatever was already started has been
+            released.
+        """
+        self._exit_stack = AsyncExitStack()
+        try:
+            self.playwright = await async_playwright().start()
+            self.browser = await self.playwright.chromium.launch(headless=self.headless)
+            self.context = await self.browser.new_context(
+                viewport={"width": self.viewport_width, "height": self.viewport_height},
+            )
+            # Ensure there's at least one page open
+            if not self.context.pages:
+                await self.context.new_page()
+            self.context.set_default_timeout(60_000)
+            self.current_page.set_default_timeout(60_000)
+            await stealth_async(self.current_page, StealthConfig(navigator_user_agent=False))
+            await self.context.add_init_script(
+                path=Path(__file__).with_name("add_custom_select.js"),
+            )
+            await self.context.add_init_script(
+                path=Path(__file__).with_name("find_pois.js"),
+            )
+        except BaseException:
+            # ``async with`` does not call __aexit__ when __aenter__ raises, so release the
+            # browser and Playwright here instead of leaking them.
+            await self.__aexit__(None, None, None)
+            raise
+        return self
+    async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
+        """Close the browser, stop Playwright and close the exit stack, in that order.
+        Each step runs even if an earlier one raises, so a failing ``browser.close()`` no longer
+        leaves Playwright running. The exception from the failing step still propagates.
+        """
+        try:
+            if self.browser:
+                await self.browser.close()
+        finally:
+            try:
+                if self.playwright:
+                    await self.playwright.stop()
+            finally:
+                if self._exit_stack:
+                    await self._exit_stack.aclose()
+    @property
+    def current_page(self) -> Optional[Page]:
+        """Most recently opened page of the context, or ``None`` without a context or pages."""
+        open_pages = self.context.pages if self.context else None
+        return open_pages[-1] if open_pages else None
+    @property
+    def current_url(self) -> Optional[str]:
+        """URL of ``current_page``, or ``None`` when there is no current page."""
+        page = self.current_page
+        return page.url if page else None
+    # re-run for cases of mid-run redirects
+    # NOTE(review): the body catches every Exception and returns None, so this retry policy
+    # looks like it can never trigger for ordinary errors - confirm whether that is intended.
+    @retry(
+        wait=wait_exponential(multiplier=1, min=1, max=10),
+        stop=stop_after_delay(5),
+        reraise=True,
+        before_sleep=before_sleep_log(logger, logging.ERROR),
+    )
+    async def process_iframe(self, iframe) -> Optional[tuple[dict, dict]]:
+        """Collect the points of interest inside one iframe.
+        Args:
+            iframe: element handle of the iframe on the current page.
+        Returns:
+            ``(poi, offset)`` where ``poi`` is what ``findPOIsConvergence()`` returned inside the
+            frame and ``offset`` is the rounded ``{"x", "y"}`` of the iframe's bounding box.
+            ``None`` when the iframe has no bounding box, is smaller than 50 px in either
+            dimension, has no content frame, yields no POIs, or any error occurs (logged).
+        """
+        try:
+            # Check iframe visibility and size
+            box = await iframe.bounding_box()
+            if not box:
+                return None  # Skip if iframe is not visible
+            frame_width, frame_height = box["width"], box["height"]
+            if min(frame_width, frame_height) < 50:
+                return None
+            frame = await iframe.content_frame()
+            if not frame:
+                return None
+            poi = await frame.evaluate(
+                """() => {
+                    overwriteDefaultSelectConvergence();
+                    return findPOIsConvergence();
+                }""",
+            )
+            if not poi:
+                return None
+            return poi, {"x": round(box["x"]), "y": round(box["y"])}
+        except Exception as e:
+            logger.error(f"Error processing iframe: {e}")
+            return None
+    @retry(
+        wait=wait_exponential(multiplier=1, min=1, max=10),
+        stop=stop_after_delay(5),
+        reraise=True,
+        before_sleep=before_sleep_log(logger, logging.ERROR),
+    )
+    async def update_poi(self) -> None:
+        try:
+            # Wait for basic page load states to ensure the DOM is ready.
+            # This is a fundamental wait that should always apply.
+            await self.current_page.wait_for_load_state("domcontentloaded", timeout=60000)
+            logger.debug(f"DEBUG: wait_for_load_state('domcontentloaded') completed for {self.current_page.url}.")
+            current_url = self.current_page.url
+            # Define common Salesforce URL patterns for different states
+            login_url_patterns = [
+                "login.salesforce.com",
+                "identity.force.com",
+                "auth.lightning.force.com",
+                "setup.salesforce.com", # Sometimes a setup login redirects here temporarily
+                "my.salesforce.com" # Your specific custom domain login redirects here
+            ]
+            # This is the main Salesforce Lightning application base URL, typically seen after login.
+            # We treat this as an intermediate loading state before the specific target page.
+            intermediate_app_url_pattern = "/one/one.app"
+            # Check the current state of the page based on its URL
+            is_on_login_page = any(pattern in current_url for pattern in login_url_patterns)
+            is_on_intermediate_app_page = intermediate_app_url_pattern in current_url
+            # Note: is_on_target_forecast_page checks if the specific target path is in the URL
+            is_on_target_forecast_page = "/AccountForecastSettings/home" in current_url
+            # --- CONDITIONAL WAITING LOGIC BASED ON URL ---
+            if is_on_target_forecast_page:
+                logger.info(f"INFO: Detected target Account Forecast Settings page: {current_url}. Waiting for content.")
+                # When on the specific target page, wait for its content and spinners
+                spinner_selectors = [
+                    "div.slds-spinner_container",
+                    "div.auraLoadingBox",
+                    "div.dxp_axb_container", # Main overlay from your inspect screenshot
+                    "div.slds-sprite-astro-x-large" # Specific animated element itself
+                ]
+                for selector in spinner_selectors:
+                    try:
+                        await self.current_page.wait_for_selector(selector, state="hidden", timeout=5000) # Reduced timeout
+                        logger.debug(f"DEBUG: Spinner element '{selector}' became hidden for {self.current_page.url}.")
+                    except PlaywrightTimeoutError:
+                        logger.warning(f"DEBUGGING: Spinner element '{selector}' not detected or did not disappear on {self.current_page.url} within 5s.")
+                # Wait for a known element on the Account Forecast Settings page to ensure content is there.
+                try:
+                    # Added 'h2' for section headers, and a more generic 'div[data-aura-rendered-by]' for Lightning components
+                    await self.current_page.wait_for_selector("h1.slds-page-header__title, h2, .account-forecast-settings-component, div[data-aura-rendered-by]", state="visible", timeout=15000) # Increased timeout slightly for robust content load
+                    logger.debug(f"DEBUG: Confirmed main page element visible for {self.current_page.url}.")
+                except PlaywrightTimeoutError:
+                    logger.warning(f"DEBUGGING: Main page element not visible on {self.current_page.url} within 15s. This might indicate incomplete page load despite no spinner.")
+            elif is_on_login_page:
+                logger.info(f"INFO: Detected Salesforce login page: {current_url}. Waiting for login elements.")
+                # When on a login page, just wait for the login form elements to be visible
+                try:
+                    await self.current_page.wait_for_selector("input[type='email'], input[type='password'], input[type='submit'], #username, #password, #Login", state="visible", timeout=10000)
+                    logger.debug(f"DEBUG: Login page elements visible on {self.current_page.url}.")
+                except PlaywrightTimeoutError:
+                    logger.warning(f"DEBUGGING: Login page elements not visible on {self.current_page.url} within 10s. This may happen if elements are in an iframe or if page is extremely slow.")
+            elif is_on_intermediate_app_page:
+                logger.info(f"INFO: Detected intermediate Salesforce Lightning app loading page: {current_url}. Waiting for network idle and app spinner.")
+                # This is the /one/one.app page or similar. Don't wait for specific content, just general load.
+                try:
+                    await self.current_page.wait_for_load_state("networkidle", timeout=30000) # Give it more time for network to settle
+                    logger.debug(f"DEBUG: Network idle detected on intermediate app page: {current_url}.")
+                except PlaywrightTimeoutError:
+                    logger.warning(f"DEBUGGING: Network idle timeout on intermediate app page: {current_url}. Proceeding anyway.")
+                # Also try to wait for a common full-app spinner to disappear, if present
+                try:
+                    await self.current_page.wait_for_selector('div.app-spinner, div.auraLoadingBox', state='hidden', timeout=15000) # Added auraLoadingBox as it might reappear
+                    logger.debug(f"DEBUG: App spinner on intermediate page became hidden.")
+                except PlaywrightTimeoutError:
+                    logger.warning(f"DEBUGGING: App spinner on intermediate page not found or did not disappear.")
+            else:
+                logger.info(f"INFO: Detected unhandled URL type: {current_url}. Performing generic body wait.")
+                # Fallback for any other page, just wait for body to be visible
+                try:
+                    await self.current_page.wait_for_selector("body", timeout=5000, state="visible")
+                    logger.debug(f"DEBUG: wait_for_selector('body', state='visible') completed for {self.current_page.url}.")
+                except PlaywrightTimeoutError:
+                    logger.warning(f"DEBUGGING: Playwright Timeout (5s) on body selector for {self.current_page.url}. Continuing anyway.")
+                    pass
+        except PlaywrightTimeoutError as e:
+            logger.error(f"ERROR: Timeout waiting for page readiness for {self.current_page.url}: {e}")
+            raise # Re-raise if essential waits fail (e.g., initial domcontentloaded)
+        except Exception as e:
+            logger.error(f"ERROR: An unexpected error occurred during page readiness check for {self.current_page.url}: {e}")
+            raise
+        # Rest of update_poi: Run the bounding box javascript code to highlight the points of interest on the page
+        page_info = await self.current_page.evaluate(
+            """() => {
+                overwriteDefaultSelectConvergence();
+                return findPOIsConvergence();
+            }""",
+        )
+        # Get the points of interest on the page
+        self.poi_elements = page_info["element_descriptions"]
+        element_centroids = page_info["element_centroids"]
+        try:
+            # Select all iframes on the page
+            iframes = await self.current_page.query_selector_all("iframe")
+            max_iframes = 10
+            # Define an asynchronous function to process and filter each iframe
+            tasks = [asyncio.create_task(self.process_iframe(iframe)) for iframe in iframes[:max_iframes]]
+            results = await asyncio.gather(*tasks)
+            filtered_results = [result for result in results if result is not None]
+            iframes_pois = []
+            iframe_offsets = []
+            for poi, offset in filtered_results:
+                iframes_pois.append(poi)
+                iframe_offsets.append(offset)
+            # Combine the points of interest from the iframes with the main page and adjust the centroids
+            for index, iframe_poi in enumerate(iframes_pois):
+                self.poi_elements.extend(iframe_poi["element_descriptions"])
+                for centroid in iframe_poi["element_centroids"]:
+                    centroid["x"] += iframe_offsets[index]["x"]
+                    centroid["y"] += iframe_offsets[index]["y"]
+                    centroid["left"] += iframe_offsets[index]["x"]
+                    centroid["top"] += iframe_offsets[index]["y"]
+                    centroid["right"] += iframe_offsets[index]["x"]
+                    # Fix: Removed duplicate 'centroid["y"] += iframe_offsets[index]["y"]'
+                    centroid["bottom"] += iframe_offsets[index]["y"]
+                element_centroids.extend(iframe_poi["element_centroids"])
+        except Exception as e:
+            logger.error(f"Error in finding iframes: {e}")
+        # Get the centroids of the points of interest
+        self.poi_centroids = [Point(x=xy["x"], y=xy["y"]) for xy in element_centroids]
+        self.bounding_boxes = [BoundingBox(**xy, label=str(i)) for i, xy in enumerate(element_centroids)]
+        self.pois = [
+            POI(info=info, element_centroid=centroid, bounding_box=bbox)
+            for info, centroid, bbox in zip(
+                self.poi_elements,
+                self.poi_centroids,
+                self.bounding_boxes,
+                strict=False,
+            )
+        ]
+    @property
+    def poi_text(self) -> str:
+        # Get all points of interest on the page as text
+        texts = [element_as_text(mark_id=i, **element) for i, element in enumerate(self.poi_elements)]
+        # Return formatted text of points of interest on page
+        return "\n".join([txt for txt in texts if txt])
+    async def screenshot(
+        self,
+        delay: float = 0.0,
+        quality: int = 70,
+        type: str = "jpeg",
+        scale: str = "css",
+    ) -> tuple[bytes, bytes]:
+        if delay > 0.0:
+            await asyncio.sleep(delay)
+        await self.update_poi()
+        # Keep original logic if page is highly dynamic, but for static shots, simpler is faster
+        # old_poi_positions = [tuple(point) for point in self.poi_centroids]
+        img = await self.current_page.screenshot(type=type, quality=quality, scale=scale)
+        annotated_img = annotate_bounding_boxes(image=img, bounding_boxes=self.bounding_boxes)
+        # Re-evaluating this block for performance. Removed redundant update_poi and conditional screenshot.
+        # If precise screenshot timing is needed, the caller should manage delays and updates.
+        return img, annotated_img
+    async def goto(self, url: str) -> None:
+        await self.current_page.goto(url, wait_until="domcontentloaded")
+    async def reload(self) -> None:
+        await self.current_page.reload(wait_until="domcontentloaded")
+    async def click_tab(self, mark_id: int) -> None:
+        point: Point = self.poi_centroids[mark_id]
+        await self.hover(point)
+        await self.current_page.mouse.click(*point, button="middle")
+    async def click(self, mark_id: int) -> None:
+        point: Point = self.poi_centroids[mark_id]
+        await self.hover(point)
+        await self.current_page.mouse.click(*point)
+    async def enter_text(self, mark_id: int, text: str, submit: bool = False) -> None:
+        await self.clear_text_field(mark_id)
+        await self.click(mark_id)
+        await self.current_page.keyboard.type(text)
+        if submit:
+            await self.current_page.keyboard.press("Enter")
+    async def scroll(
+        self,
+        direction: Literal["up", "down", "left", "right"],
+        mark_id: Optional[int] = None,
+    ) -> None:
+        if mark_id is None:
+            point = Point(x=-1, y=-1)
+            max_scroll_x = self.viewport_width
+            max_scroll_y = self.viewport_height
+        else:
+            point: Point = self.poi_centroids[mark_id]
+            bbox: BoundingBox = self.bounding_boxes[mark_id]
+            max_scroll_x = bbox.right - bbox.left
+            max_scroll_y = bbox.bottom - bbox.top
+        await self.hover(point=point)
+        scroll_x = int(max_scroll_x * 0.8)
+        scroll_y = int(max_scroll_y * 0.8)
+        is_vertical = direction in ("up", "down")
+        reverse_scroll = direction in ("up", "left")
+        await self.current_page.mouse.wheel(
+            scroll_x * (-1 if reverse_scroll else 1) * (not is_vertical),
+            scroll_y * (-1 if reverse_scroll else 1) * is_vertical,
+        )
+    async def go_back(self) -> None:
+        # If there is no tab open then return
+        if not self.current_page:
+            return
+        await self.current_page.go_back(wait_until="domcontentloaded")
+        if self.current_page.url == "about:blank":
+            if not len(self.context.pages) > 1:
+                await self.current_page.go_forward(wait_until="domcontentloaded")
+                raise Exception("There is no previous page to go back to.")
+            await self.current_page.close()
+    async def hover(self, point: Point) -> None:
+        await self.current_page.mouse.move(*point)
+    async def focus(self, point: Point) -> None:
+        # Focus on the element on the page at point (x, y)
+        await self.current_page.evaluate(
+            """
+            ([x, y]) => {
+                const element = document.elementFromPoint(x, y);
+                if (element && element.focus) {
+                    element.focus();
+                }
+            }""",
+            tuple(point),
+        )
+    async def get_text(self, mark_id: int) -> str:
+        return await self.current_page.evaluate(
+            """
+            (mark_id) => {
+                const element = marked_elements_convergence[mark_id];
+                if (element && (element.value !== undefined || element.textContent !== undefined)) {
+                    return element.value || element.textContent;
+                }
+                return '';
+            }
+            """,
+            (mark_id,),
+        )
+    async def clear_text_field(self, mark_id: int) -> None:
+        existing_text = await self.get_text(mark_id)
+        if existing_text.strip():
+            # Clear existing text only if it exists
+            await self.click(mark_id)
+            if platform.system() == "Darwin":  # selecting all text is OS-specific
+                await self.click(mark_id)
+                await self.current_page.keyboard.press("Meta+a")
+                await self.current_page.keyboard.press("Backspace")
+            else:
+                await self.current_page.keyboard.press("Control+Home")
+                await self.current_page.keyboard.press("Control+Shift+End")
+            await self.current_page.keyboard.press("Backspace")
+    async def open_new_tab_and_go_to(self, url: str) -> None:
+        """
+        Opens a new browser tab/page and navigates to the specified URL.
+        Closes the old page if it's not the last one remaining.
+        """
+        logger.info(f"Attempting to open a new tab and navigate to: {url}")
+        new_page = await self.context.new_page()
+        # Close the previous page if it's not the only one left in the context
+        if len(self.context.pages) > 1 and self.current_page and self.current_page != new_page:
+            try:
+                await self.current_page.close()
+                logger.debug("Closed previous page.")
+            except Exception as e:
+                logger.warning(f"Could not close previous page (might already be closed or detached): {e}")
+        # After navigation, trigger POI update to reflect the new page's state
+        await new_page.goto(url, wait_until="domcontentloaded")
+        logger.info(f"Successfully navigated to {url} in a new tab.")
+        # Crucial: update_poi uses self.current_page, which is now new_page implicitly
+        await self.update_poi()
+if __name__ == "__main__":
+    async def dummy_test():
+        async with BrowserSession(headless=False) as s:
+            page = await s.context.new_page()
+            await page.goto("http://google.co.uk")
+            await asyncio.sleep(5)
+            await page.screenshot(path="example.png")
+            await s.update_poi()
+            _, annotated_image = await s.screenshot()
+            with open("output.png", "wb") as f:
+                f.write(annotated_image)
+    asyncio.run(dummy_test())

src/proxy_lite/browser/find_pois.js ADDED Viewed

	@@ -0,0 +1,397 @@

+// Elements marked by the latest findPOIsConvergence() run, in mark order.
+// Assigned without a declaration keyword, so it is a global; findPOIsConvergence resets
+// and refills it on each call.
+marked_elements_convergence = [];
+// Lower-case tag names that isInteractive() treats as interactive on their own.
+const interactiveTags = new Set([
+    'a', 'button', 'details', 'embed', 'input', 'label',
+    'menu', 'menuitem', 'object', 'select', 'textarea', 'summary',
+    'video', 'audio', 'option', 'iframe'
+]);
+// Values of the `role` / `aria-role` attributes that mark an element as interactive.
+// 'combobox' is listed twice; the Set keeps a single entry, so the repeat is harmless.
+const interactiveRoles = new Set([
+    'button', 'menu', 'menuitem', 'link', 'checkbox', 'radio',
+    'slider', 'tab', 'tabpanel', 'textbox', 'combobox', 'grid',
+    'listbox', 'option', 'progressbar', 'scrollbar', 'searchbox',
+    'switch', 'tree', 'treeitem', 'spinbutton', 'tooltip',
+    'a-button-inner', 'a-dropdown-button', 'click',
+    'menuitemcheckbox', 'menuitemradio', 'a-button-text',
+    'button-text', 'button-icon', 'button-icon-only',
+    'button-text-icon-only', 'dropdown', 'combobox'
+]);
+findPOIsConvergence = (input = null) => {
+    let rootElement = input ? input : document.documentElement;
+    function isScrollable(element) {
+        if ((input === null) && (element === document.documentElement)) {
+            // we can always scroll the full page
+            return false;
+        }
+        const style = window.getComputedStyle(element);
+        const hasScrollableYContent = element.scrollHeight > element.clientHeight
+        const overflowYScroll = style.overflowY === 'scroll' || style.overflowY === 'auto';
+        const hasScrollableXContent = element.scrollWidth > element.clientWidth;
+        const overflowXScroll = style.overflowX === 'scroll' || style.overflowX === 'auto';
+        return (hasScrollableYContent && overflowYScroll) || (hasScrollableXContent && overflowXScroll);
+    }
+    function getEventListeners(element) {
+        try {
+            return window.getEventListeners?.(element) || {};
+        } catch (e) {
+            return {};
+        }
+    }
+    function isInteractive(element) {
+        if (!element) return false;
+        return (hasInteractiveTag(element) ||
+            hasInteractiveAttributes(element) ||
+            hasInteractiveEventListeners(element)) ||
+            isScrollable(element);
+    }
+    function hasInteractiveTag(element) {
+        return interactiveTags.has(element.tagName.toLowerCase());
+    }
+    function hasInteractiveAttributes(element) {
+        const role = element.getAttribute('role');
+        const ariaRole = element.getAttribute('aria-role');
+        const tabIndex = element.getAttribute('tabindex');
+        const onAttribute = element.getAttribute('on');
+        if (element.getAttribute('contenteditable') === 'true') return true;
+        if ((role && interactiveRoles.has(role)) ||
+            (ariaRole && interactiveRoles.has(ariaRole))) return true;
+        if (tabIndex !== null && tabIndex !== '-1') return true;
+        // Add check for AMP's 'on' attribute that starts with 'tap:'
+        if (onAttribute && onAttribute.startsWith('tap:')) return true;
+        const hasAriaProps = element.hasAttribute('aria-expanded') ||
+            element.hasAttribute('aria-pressed') ||
+            element.hasAttribute('aria-selected') ||
+            element.hasAttribute('aria-checked');
+        return hasAriaProps;
+    }
+    function hasInteractiveEventListeners(element) {
+        const hasClickHandler = element.onclick !== null ||
+             element.getAttribute('onclick') !== null ||
+             element.hasAttribute('ng-click') ||
+             element.hasAttribute('@click') ||
+             element.hasAttribute('v-on:click');
+        if (hasClickHandler) return true;
+        const listeners = getEventListeners(element);
+        return listeners && (
+            listeners.click?.length > 0 ||
+            listeners.mousedown?.length > 0 ||
+            listeners.mouseup?.length > 0 ||
+            listeners.touchstart?.length > 0 ||
+            listeners.touchend?.length > 0
+        );
+    }
+    function calculateArea(rects) {
+        return rects.reduce((acc, rect) => acc + rect.width * rect.height, 0);
+    }
+    function getElementRects(element, context) {
+        const vw = Math.max(document.documentElement.clientWidth || 0, window.innerWidth || 0);
+        const vh = Math.max(document.documentElement.clientHeight || 0, window.innerHeight || 0);
+        let rects = [...element.getClientRects()];
+        // If rects are empty (likely due to Shadow DOM), try to estimate position
+        if (rects.length === 0 && element.getBoundingClientRect) {
+            rects = [element.getBoundingClientRect()];
+        }
+        // Get iframe offset if element is in an iframe
+        let iframeOffset = { x: 0, y: 0 };
+        if (context !== document && context?.defaultView?.frameElement) {
+            const iframe = context.defaultView.frameElement;
+            if (iframe) {
+                const iframeRect = iframe.getBoundingClientRect();
+                iframeOffset = {
+                    x: iframeRect.left,
+                    y: iframeRect.top
+                };
+            }
+        }
+        return rects.filter(bb => {
+            const center_x = bb.left + bb.width / 2 + iframeOffset.x;
+            const center_y = bb.top + bb.height / 2 + iframeOffset.y;
+            const elAtCenter = context.elementFromPoint(center_x - iframeOffset.x, center_y - iframeOffset.y);
+            return elAtCenter === element || element.contains(elAtCenter);
+        }).map(bb => {
+            const rect = {
+                left: Math.max(0, bb.left + iframeOffset.x),
+                top: Math.max(0, bb.top + iframeOffset.y),
+                right: Math.min(vw, bb.right + iframeOffset.x),
+                bottom: Math.min(vh, bb.bottom + iframeOffset.y)
+            };
+            return {
+                ...rect,
+                width: rect.right - rect.left,
+                height: rect.bottom - rect.top
+            };
+        });
+    }
+    function isElementVisible(element) {
+        const style = window.getComputedStyle(element);
+        return element.offsetWidth > 0 &&
+            element.offsetHeight > 0 &&
+            style.visibility !== 'hidden' &&
+            style.display !== 'none';
+    }
+    function isTopElement(element) {
+        let doc = element.ownerDocument;
+        if (doc !== window.document) {
+            // If in an iframe's document, treat as top
+            return true;
+        }
+        const shadowRoot = element.getRootNode();
+        if (shadowRoot instanceof ShadowRoot) {
+            const rect = element.getBoundingClientRect();
+            const point = { x: rect.left + rect.width / 2, y: rect.top + rect.height / 2 };
+            try {
+                const topEl = shadowRoot.elementFromPoint(point.x, point.y);
+                if (!topEl) return false;
+                let current = topEl;
+                while (current && current !== shadowRoot) {
+                    if (current === element) return true;
+                    current = current.parentElement;
+                }
+                return false;
+            } catch (e) {
+                return true;
+            }
+        }
+        const rect = element.getBoundingClientRect();
+        const point = { x: rect.left + rect.width / 2, y: rect.top + rect.height / 2 };
+        try {
+            const topEl = document.elementFromPoint(point.x, point.y);
+            if (!topEl) return false;
+            let current = topEl;
+            while (current && current !== document.documentElement) {
+                if (current === element) return true;
+                current = current.parentElement;
+            }
+            return false;
+        } catch (e) {
+            return true;
+        }
+    }
+    // Collects the visible text under `element` as a single whitespace-normalised string.
+    // Hidden nodes and <noscript> are skipped; <img> nodes contribute an
+    // `[img - alt="…" title="…" aria-label="…"]` summary when any of those attributes is set.
+    // NOTE(review): the second parameter shadows the global `marked_elements_convergence`.
+    // The only call visible in this file passes one argument, so the list defaults to []
+    // and the "stop at a marked descendant" check below never fires for that call --
+    // confirm whether the global list was meant to be used.
+    function getVisibleText(element, marked_elements_convergence = []) {
+        const blockLikeDisplays = [
+            // Basic block elements
+            'block', 'flow-root', 'inline-block',
+            // Lists
+            'list-item',
+            // Table elements
+            'table', 'inline-table', 'table-row', 'table-cell',
+            'table-caption', 'table-header-group', 'table-footer-group',
+            'table-row-group',
+            // Modern layouts
+            'flex', 'inline-flex', 'grid', 'inline-grid'
+        ];
+        // Check if element is hidden
+        const style = window.getComputedStyle(element);
+        if (style.display === 'none' || style.visibility === 'hidden') {
+            return '';
+        }
+        let collectedText = [];
+        function isMarkedInteractive(el) {
+            return marked_elements_convergence.includes(el);
+        }
+        // Depth-first walk. Returns false to abort the whole traversal (the value is
+        // propagated up through every enclosing call), true to continue with siblings.
+        function traverse(node) {
+            if (
+                node.nodeType === Node.ELEMENT_NODE &&
+                node !== element &&
+                isMarkedInteractive(node)
+            ) {
+                return false;
+            }
+            if (node.nodeType === Node.TEXT_NODE) {
+                const trimmed = node.textContent.trim();
+                if (trimmed) {
+                    collectedText.push(trimmed);
+                }
+            } else if (node.nodeType === Node.ELEMENT_NODE) {
+                // Skip noscript elements
+                if (node.tagName === 'NOSCRIPT') {
+                    return true;
+                }
+                const nodeStyle = window.getComputedStyle(node);
+                // Skip hidden elements
+                if (nodeStyle.display === 'none' || nodeStyle.visibility === 'hidden') {
+                    return true;
+                }
+                // Add newline before block elements if we have text
+                if (blockLikeDisplays.includes(nodeStyle.display) && collectedText.length > 0) {
+                    collectedText.push('\n');
+                }
+                if (node.tagName === 'IMG') {
+                    const textParts = [];
+                    const alt = node.getAttribute('alt');
+                    const title = node.getAttribute('title');
+                    const ariaLabel = node.getAttribute('aria-label');
+                    // Add more as needed (e.g., 'aria-describedby', 'data-caption', etc.)
+                    if (alt) textParts.push(`alt="${alt}"`);
+                    if (title) textParts.push(`title="${title}"`);
+                    if (ariaLabel) textParts.push(`aria-label="${ariaLabel}"`);
+                    if (textParts.length > 0) {
+                        collectedText.push(`[img - ${textParts.join(' ')}]`);
+                    }
+                    return true;
+                }
+                for (const child of node.childNodes) {
+                    const shouldContinue = traverse(child);
+                    if (shouldContinue === false) {
+                        return false;
+                    }
+                }
+                // Add newline after block elements
+                if (blockLikeDisplays.includes(nodeStyle.display)) {
+                    collectedText.push('\n');
+                }
+            }
+            return true;
+        }
+        traverse(element);
+        // Join text and normalize whitespace. Pieces are joined with spaces, so the '\n'
+        // markers pushed above sit inside whitespace runs that this replace collapses
+        // to a single space.
+        return collectedText.join(' ').trim().replace(/\s{2,}/g, ' ').trim();
+    }
+    // Walks the tree under `rootElement` and returns one record per element that is
+    // interactive, visible and top-most: { element, area, rects, is_scrollable }.
+    // Items are appended in traversal order, which is the order the caller marks them in.
+    function extractInteractiveItems(rootElement) {
+        const items = [];
+        // `context` is the document or shadow root handed to getElementRects for hit-testing.
+        function processElement(element, context) {
+            if (!element) return;
+            // Record the element itself before descending into its subtree
+            if (element.nodeType === Node.ELEMENT_NODE && isInteractive(element) && isElementVisible(element) && isTopElement(element)) {
+                const rects = getElementRects(element, context);
+                const area = calculateArea(rects);
+                items.push({
+                    element: element,
+                    area,
+                    rects,
+                    is_scrollable: isScrollable(element),
+                });
+            }
+            if (element.shadowRoot) {
+                // if it's shadow DOM, process elements in the shadow DOM
+                Array.from(element.shadowRoot.childNodes || []).forEach(child => {
+                    processElement(child, element.shadowRoot);
+                });
+            }
+            // Exactly one of the three branches below supplies the children to recurse into.
+            if (element.tagName === 'SLOT') {
+                // Handle both assigned elements and nodes
+                const assigned = element.assignedNodes ? element.assignedNodes() : element.assignedElements();
+                assigned.forEach(child => {
+                    processElement(child, context);
+                });
+            }
+            else if (element.tagName === 'IFRAME') {
+                try {
+                    const iframeDoc = element.contentDocument || element.contentWindow?.document;
+                    if (iframeDoc && iframeDoc.body) {
+                        // Process elements inside iframe
+                        processElement(iframeDoc.body, iframeDoc);
+                    }
+                } catch (e) {
+                    // Accessing the iframe's document threw; log and skip its contents.
+                    console.warn('Unable to access iframe contents:', e);
+                }
+            } else {
+                // if it's regular child elements, process regular child elements
+                Array.from(element.children || []).forEach(child => {
+                    processElement(child, context);
+                });
+            }
+        }
+        processElement(rootElement, document);
+        return items;
+    }
+    if (marked_elements_convergence) {
+        marked_elements_convergence = [];
+    }
+    let mark_centres = [];
+    let marked_element_descriptions = [];
+    var items = extractInteractiveItems(rootElement);
+    // Lets create a floating border on top of these elements that will always be visible
+    let index = 0;
+    items.forEach(function (item) {
+        item.rects.forEach((bbox) => {
+            marked_elements_convergence.push(item.element);
+            mark_centres.push({
+                x: Math.round((bbox.left + bbox.right) / 2),
+                y: Math.round((bbox.top + bbox.bottom) / 2),
+                left: bbox.left,
+                top: bbox.top,
+                right: bbox.right,
+                bottom: bbox.bottom,
+            });
+            marked_element_descriptions.push({
+                tag: item.element.tagName,
+                text: getVisibleText(item.element),
+                // NOTE: all other attributes will be shown to the model when present
+                // TODO: incorperate child attributes, e.g. <img alt="..."> when img is a child of the link element
+                value: item.element.value,
+                placeholder: item.element.getAttribute("placeholder"),
+                element_type: item.element.getAttribute("type"),
+                aria_label: item.element.getAttribute("aria-label"),
+                name: item.element.getAttribute("name"),
+                required: item.element.getAttribute("required"),
+                disabled: item.element.getAttribute("disabled"),
+                pattern: item.element.getAttribute("pattern"),
+                checked: item.element.getAttribute("checked"),
+                minlength: item.element.getAttribute("minlength"),
+                maxlength: item.element.getAttribute("maxlength"),
+                role: item.element.getAttribute("role"),
+                title: item.element.getAttribute("title"),
+                scrollable: item.is_scrollable
+            });
+            index++;
+        });
+    });
+    return {
+        element_descriptions: marked_element_descriptions,
+        element_centroids: mark_centres
+    };
+}

src/proxy_lite/cli.py ADDED Viewed

	@@ -0,0 +1,112 @@

+import argparse
+import asyncio
+import base64
+import os
+from pathlib import Path
+from proxy_lite import Runner, RunnerConfig
+from proxy_lite.gif_maker import create_run_gif
+from proxy_lite.logger import logger
+def update_config_from_env(config: RunnerConfig) -> RunnerConfig:
+    if os.getenv("PROXY_LITE_API_BASE"):
+        config.solver.agent.client.api_base = os.getenv("PROXY_LITE_API_BASE")
+    if os.getenv("PROXY_LITE_MODEL"):
+        config.solver.agent.client.model_id = os.getenv("PROXY_LITE_MODEL")
+    if os.getenv("PROXY_LITE_VIEWPORT_WIDTH"):
+        config.environment.viewport_width = int(os.getenv("PROXY_LITE_VIEWPORT_WIDTH"))
+    if os.getenv("PROXY_LITE_VIEWPORT_HEIGHT"):
+        config.environment.viewport_height = int(os.getenv("PROXY_LITE_VIEWPORT_HEIGHT"))
+    return config
+def do_command(args):
+    do_text = " ".join(args.task)
+    logger.info("🤖 Let me help you with that...")
+    # Take default config from YAML
+    config = RunnerConfig.from_yaml(args.config)
+    # Update config from environment variables
+    config = update_config_from_env(config)
+    # Update config from command-line arguments
+    if args.api_base:
+        config.solver.agent.client.api_base = args.api_base
+    if args.model:
+        config.solver.agent.client.model_id = args.model
+    if args.homepage:
+        config.environment.homepage = args.homepage
+    if args.viewport_width:
+        config.environment.viewport_width = args.viewport_width
+    if args.viewport_height:
+        config.environment.viewport_height = args.viewport_height
+    o = Runner(config=config)
+    result = asyncio.run(o.run(do_text))
+    final_screenshot = result.observations[-1].info["original_image"]
+    folder_path = Path(__file__).parent.parent.parent / "screenshots"
+    folder_path.mkdir(parents=True, exist_ok=True)
+    path = folder_path / f"{result.run_id}.png"
+    with open(path, "wb") as f:
+        f.write(base64.b64decode(final_screenshot))
+    logger.info(f"🤖 Final screenshot saved to {path}")
+    gif_folder_path = Path(__file__).parent.parent.parent / "gifs"
+    gif_folder_path.mkdir(parents=True, exist_ok=True)
+    gif_path = gif_folder_path / f"{result.run_id}.gif"
+    create_run_gif(result, gif_path, duration=1500)
+    logger.info(f"🤖 GIF saved to {gif_path}")
+def main():
+    parser = argparse.ArgumentParser(description="Proxy-Lite")
+    parser.add_argument(
+        "task",
+        type=str,
+        help="The task you want to accomplish",
+        nargs="*",
+    )
+    parser.add_argument(
+        "--model",
+        type=str,
+        default=None,
+        help="The model to use.",
+    )
+    parser.add_argument(
+        "--api_base",
+        type=str,
+        default=None,
+        help="The API base URL to use.",
+    )
+    # New option for setting a homepage URL:
+    parser.add_argument(
+        "--homepage",
+        type=str,
+        default=None,
+        help="The homepage URL to use.",
+    )
+    # New viewport controls:
+    parser.add_argument(
+        "--viewport-width",
+        type=int,
+        default=None,
+        help="Viewport width in pixels.",
+    )
+    parser.add_argument(
+        "--viewport-height",
+        type=int,
+        default=None,
+        help="Viewport height in pixels.",
+    )
+    parser.add_argument(
+        "--config",
+        type=Path,
+        default=Path(__file__).parent / "configs/default.yaml",
+        help="Path to config file (default: configs/default.yaml)",
+    )
+    args = parser.parse_args()
+    do_command(args)
+if __name__ == "__main__":
+    main()

src/proxy_lite/client.py ADDED Viewed

	@@ -0,0 +1,171 @@

+import os
+from abc import ABC, abstractmethod
+from functools import cached_property
+from typing import ClassVar, Literal, Optional, Union
+import httpx
+from httpx import Limits, Timeout
+from openai import AsyncOpenAI
+from openai.types.chat.chat_completion import (
+    ChatCompletion,
+)
+from pydantic import BaseModel
+from proxy_lite.history import MessageHistory
+from proxy_lite.logger import logger
+from proxy_lite.serializer import (
+    BaseSerializer,
+    OpenAICompatibleSerializer,
+)
+from proxy_lite.tools import Tool
+class BaseClientConfig(BaseModel):
+    # HTTP settings shared by every client config; read by BaseClient.http_client.
+    # Passed to httpx.Timeout (presumably seconds -- confirm against httpx).
+    http_timeout: float = 50
+    # Used for both max_connections and max_keepalive_connections of httpx.Limits.
+    http_concurrent_connections: int = 50
+class BaseClient(BaseModel, ABC):
+    config: BaseClientConfig
+    serializer: ClassVar[BaseSerializer]
+    @abstractmethod
+    async def create_completion(
+        self,
+        messages: MessageHistory,
+        temperature: float = 0.7,
+        seed: Optional[int] = None,
+        tools: Optional[list[Tool]] = None,
+        response_format: Optional[type[BaseModel]] = None,
+    ) -> ChatCompletion: ...
+    """
+    Create completion from model.
+    Expect subclasses to adapt from various endpoints that will handle
+    requests differently, make sure to raise appropriate warnings.
+    Returns:
+        ChatCompletion: OpenAI ChatCompletion format for consistency
+    """
+    @classmethod
+    def create(cls, config: BaseClientConfig) -> "BaseClient":
+        supported_clients = {
+            "openai-azure": OpenAIClient,
+            "convergence": ConvergenceClient,
+        }
+        if config.name not in supported_clients:
+            error_message = f"Unsupported model: {config.name}."
+            raise ValueError(error_message)
+        return supported_clients[config.name](config=config)
+    @property
+    def http_client(self) -> httpx.AsyncClient:
+        return httpx.AsyncClient(
+            timeout=Timeout(self.config.http_timeout),
+            limits=Limits(
+                max_connections=self.config.http_concurrent_connections,
+                max_keepalive_connections=self.config.http_concurrent_connections,
+            ),
+        )
+class OpenAIClientConfig(BaseClientConfig):
+    name: Literal["openai"] = "openai"
+    model_id: str = "gpt-4o"
+    api_key: str = os.environ.get("OPENAI_API_KEY")
+class OpenAIClient(BaseClient):
+    config: OpenAIClientConfig
+    serializer: ClassVar[OpenAICompatibleSerializer] = OpenAICompatibleSerializer()
+    @cached_property
+    def external_client(self) -> AsyncOpenAI:
+        return AsyncOpenAI(
+            api_key=self.config.api_key,
+            http_client=self.http_client,
+        )
+    async def create_completion(
+        self,
+        messages: MessageHistory,
+        temperature: float = 0.7,
+        seed: Optional[int] = None,
+        tools: Optional[list[Tool]] = None,
+        response_format: Optional[type[BaseModel]] = None,
+    ) -> ChatCompletion:
+        base_params = {
+            "model": self.config.model_id,
+            "messages": self.serializer.serialize_messages(messages),
+            "temperature": temperature,
+        }
+        optional_params = {
+            "seed": seed,
+            "tools": self.serializer.serialize_tools(tools) if tools else None,
+            "tool_choice": "required" if tools else None,
+            "response_format": {"type": "json_object"} if response_format else {"type": "text"},
+        }
+        base_params.update({k: v for k, v in optional_params.items() if v is not None})
+        return await self.external_client.chat.completions.create(**base_params)
+class ConvergenceClientConfig(BaseClientConfig):
+    """Settings for ``ConvergenceClient``, which talks to an OpenAI-compatible endpoint."""
+    name: Literal["convergence"] = "convergence"
+    # Model id that ConvergenceClient checks against the server's model list.
+    model_id: str = "convergence-ai/proxy-lite-7b"
+    # Passed to AsyncOpenAI as base_url.
+    api_base: str = "http://localhost:8000/v1"
+    # Placeholder default; presumably the local endpoint needs no real key (confirm).
+    api_key: str = "none"
+class ConvergenceClient(OpenAIClient):
+    config: ConvergenceClientConfig
+    serializer: ClassVar[OpenAICompatibleSerializer] = OpenAICompatibleSerializer()
+    _model_validated: bool = False
+    async def _validate_model(self) -> None:
+        try:
+            response = await self.external_client.models.list()
+            assert self.config.model_id in [model.id for model in response.data], (
+                f"Model {self.config.model_id} not found in {response.data}"
+            )
+            self._model_validated = True
+            logger.debug(f"Model {self.config.model_id} validated and connected to cluster")
+        except Exception as e:
+            logger.error(f"Error retrieving model: {e}")
+            raise e
+    @cached_property
+    def external_client(self) -> AsyncOpenAI:
+        return AsyncOpenAI(
+            api_key=self.config.api_key,
+            base_url=self.config.api_base,
+            http_client=self.http_client,
+        )
+    async def create_completion(
+        self,
+        messages: MessageHistory,
+        temperature: float = 0.7,
+        seed: Optional[int] = None,
+        tools: Optional[list[Tool]] = None,
+        response_format: Optional[type[BaseModel]] = None,
+    ) -> ChatCompletion:
+        if not self._model_validated:
+            await self._validate_model()
+        base_params = {
+            "model": self.config.model_id,
+            "messages": self.serializer.serialize_messages(messages),
+            "temperature": temperature,
+        }
+        optional_params = {
+            "seed": seed,
+            "tools": self.serializer.serialize_tools(tools) if tools else None,
+            "tool_choice": "auto" if tools else None,  # vLLM does not support "required"
+            "response_format": response_format if response_format else {"type": "text"},
+        }
+        base_params.update({k: v for k, v in optional_params.items() if v is not None})
+        return await self.external_client.chat.completions.create(**base_params)
+# Union aliases covering every concrete client config class and client class defined above.
+ClientConfigTypes = Union[OpenAIClientConfig, ConvergenceClientConfig]
+ClientTypes = Union[OpenAIClient, ConvergenceClient]

src/proxy_lite/configs/default.yaml ADDED Viewed

	@@ -0,0 +1,23 @@

+# Default run configuration: a web-browser environment driven by the proxy_lite agent.
+environment:
+  name: webbrowser
+  annotate_image: true
+  # Seconds; overrides the WebBrowserEnvironmentConfig default of 1.0.
+  screenshot_delay: 2.0
+  viewport_width: 1280
+  # Overrides the WebBrowserEnvironmentConfig default of 720.
+  viewport_height: 1920
+  include_poi_text: true
+  # Overrides the WebBrowserEnvironmentConfig default (true).
+  headless: false
+  homepage: https://www.google.co.uk
+  # Adds the un-annotated screenshot to each observation's info under "original_image".
+  keep_original_image: true
+solver:
+  name: simple
+  agent:
+    name: proxy_lite
+    client:
+      name: convergence
+      # Overrides the ConvergenceClientConfig defaults (proxy-lite-7b on localhost:8000).
+      model_id: convergence-ai/proxy-lite-3b
+      api_base: https://convergence-ai-demo-api.hf.space/v1
+local_view: true
+# NOTE(review): the three timeouts are presumably in seconds; confirm against the runner.
+task_timeout: 1800
+environment_timeout: 1800
+action_timeout: 1800
+verbose: true

src/proxy_lite/environments/__init__.py ADDED Viewed

	@@ -0,0 +1,32 @@

+from typing import Union
+from .environment_base import (
+    Action,
+    BaseEnvironment,
+    BaseEnvironmentConfig,
+    Environments,
+    Event,
+    EventType,
+    Observation,
+)
+from .webbrowser import (
+    WebBrowserEnvironment,
+    WebBrowserEnvironmentConfig,
+)
+EnvironmentConfigTypes = Union[*list(Environments._environment_config_registry.values())]
+EnvironmentTypes = Union[*list(Environments._environment_registry.values())]
+__all__ = [
+    "Action",
+    "BaseEnvironment",
+    "BaseEnvironmentConfig",
+    "EnvironmentConfigTypes",
+    "Environments",
+    "Event",
+    "EventType",
+    "Observation",
+    "WebBrowserEnvironment",
+    "WebBrowserEnvironmentConfig",
+]

src/proxy_lite/environments/__pycache__/__init__.cpython-313.pyc ADDED Viewed

Binary file (924 Bytes). View file

src/proxy_lite/environments/__pycache__/environment_base.cpython-313.pyc ADDED Viewed

Binary file (8.85 kB). View file

src/proxy_lite/environments/__pycache__/webbrowser.cpython-313.pyc ADDED Viewed

Binary file (12.2 kB). View file

src/proxy_lite/environments/environment_base.py ADDED Viewed

	@@ -0,0 +1,161 @@

+import json
+import logging
+from abc import ABC, abstractmethod
+from enum import Enum
+from functools import cached_property
+from typing import Any, Literal, Optional, Self
+from pydantic import BaseModel
+from proxy_lite.history import ToolCall
+from proxy_lite.tools import Tool, ToolExecutionResponse
+class EventType(str, Enum):
+    """String-valued tag identifying the kind of an ``Event``."""
+    OBSERVATION = "observation"
+    ACTION = "action"
+    MESSAGE = "message"
+class Event(BaseModel):
+    """Base model extended by ``Observation`` and ``Action``; ``type`` tags the event kind."""
+    type: EventType
+class State(BaseModel):
+    """Payload carried by an ``Observation``; every field is optional."""
+    text: Optional[str] = None
+    image: Optional[str] = None  # base64 encoded image
+    html: Optional[str] = None
+    # Results of the tool calls that were executed (or cancelled) for the preceding action.
+    tool_responses: Optional[list[ToolExecutionResponse]] = None
+class Observation(Event):
+    """Event emitted by an environment, wrapping a ``State`` plus bookkeeping fields."""
+    type: Literal[EventType.OBSERVATION] = EventType.OBSERVATION
+    state: State
+    # NOTE(review): presumably marks the end of an episode; confirm against the runner.
+    terminated: bool
+    reward: Optional[float] = None
+    # Free-form metadata supplied by the environment.
+    info: Optional[dict[str, Any]] = None
+class Action(Event):
+    """Event describing what should be done next: optional text and optional tool calls."""
+    type: Literal[EventType.ACTION] = EventType.ACTION
+    text: Optional[str] = None
+    tool_calls: Optional[list[ToolCall]] = None
+    info: Optional[dict[str, Any]] = None
+# Empty base class for environment configuration models; it declares no fields itself.
+class BaseEnvironmentConfig(BaseModel): ...
+class BaseEnvironment(BaseModel, ABC):
+    config: BaseEnvironmentConfig
+    logger: logging.Logger | None = None
+    class Config:
+        arbitrary_types_allowed = True
+    async def __aenter__(self) -> Self:
+        return self
+    async def __aexit__(self, exc_type, exc_value, traceback):
+        pass
+    @property
+    @abstractmethod
+    def info_for_user(self) -> str: ...
+    @cached_property
+    @abstractmethod
+    def tools(self) -> list[Tool]: ...
+    @abstractmethod
+    async def initialise(self) -> Observation: ...
+    @abstractmethod
+    async def execute_action(self, action: Action) -> Observation: ...
+    @abstractmethod
+    async def observe(self) -> Observation: ...
+    @abstractmethod
+    async def evaluate(self, **kwargs: dict[str, Any]) -> dict[str, Any]: ...
+    async def execute_tool(self, tool_call: ToolCall) -> None:
+        function = tool_call.function
+        for tool in self.tools:
+            if hasattr(tool, function["name"]):
+                arguments = json.loads(function["arguments"])
+                if isinstance(arguments, str):
+                    arguments = json.loads(arguments)
+                return await getattr(tool, function["name"])(
+                    **arguments,
+                )
+        msg = f'No tool function with name "{function["name"]}"'
+        raise ValueError(msg)
+    async def get_info(self) -> dict[str, Any]:
+        return {}
+class Environments:
+    _environment_registry: dict[str, type[BaseEnvironment]] = {}
+    _environment_config_registry: dict[str, type[BaseEnvironmentConfig]] = {}
+    @classmethod
+    def register_environment(cls, name: str):
+        """
+        Decorator to register an Environment class under a given name.
+        Example:
+            @Environments.register_environment("my_environment")
+            class MyEnvironment(BaseEnvironment):
+                ...
+        """
+        def decorator(env_cls: type[BaseEnvironment]) -> type[BaseEnvironment]:
+            cls._environment_registry[name] = env_cls
+            return env_cls
+        return decorator
+    @classmethod
+    def register_environment_config(cls, name: str):
+        """
+        Decorator to register an Environment configuration class under a given name.
+        Example:
+            @Environments.register_environment_config("my_environment")
+            class MyEnvironmentConfig(BaseEnvironmentConfig):
+                ...
+        """
+        def decorator(config_cls: type[BaseEnvironmentConfig]) -> type[BaseEnvironmentConfig]:
+            cls._environment_config_registry[name] = config_cls
+            return config_cls
+        return decorator
+    @classmethod
+    def get(cls, name: str) -> type[BaseEnvironment]:
+        """
+        Retrieve a registered Environment class by its name.
+        Raises:
+            ValueError: If no such environment is found.
+        """
+        try:
+            return cls._environment_registry[name]
+        except KeyError:
+            raise ValueError(f"Environment '{name}' not found.")
+    @classmethod
+    def get_config(cls, name: str) -> type[BaseEnvironmentConfig]:
+        """
+        Retrieve a registered Environment configuration class by its name.
+        Raises:
+            ValueError: If no such configuration is found.
+        """
+        try:
+            return cls._environment_config_registry[name]
+        except KeyError:
+            raise ValueError(f"Environment config for '{name}' not found.")

src/proxy_lite/environments/webbrowser.py ADDED Viewed

	@@ -0,0 +1,194 @@

+import base64
+from functools import cached_property
+from typing import Any, Literal, Optional, Self
+from proxy_lite.browser.browser import BrowserSession
+from proxy_lite.environments.environment_base import (
+    Action,
+    BaseEnvironment,
+    BaseEnvironmentConfig,
+    Environments,
+    Observation,
+    State,
+)
+from proxy_lite.tools import BrowserTool, Tool, ToolExecutionResponse
+# Import logger from proxy_lite.logger, or if it's already available via BaseEnvironment
+from proxy_lite.logger import logger # Assuming you want to use the same logger
+@Environments.register_environment_config("webbrowser")
+class WebBrowserEnvironmentConfig(BaseEnvironmentConfig):
+    """Settings for ``WebBrowserEnvironment``, registered under the name "webbrowser"."""
+    name: Literal["webbrowser"] = "webbrowser"
+    # Page the browser navigates to when the environment is initialised.
+    homepage: str = "https://google.com"
+    annotate_image: bool = True
+    screenshot_delay: float = 1.0  # seconds
+    # When true, the current page's HTML is included in each observation's state.
+    include_html: bool = True
+    # When true, the browser's POI text is appended to the observation text.
+    include_poi_text: bool = True
+    # When true, the browser's POIs are stored in the observation info under "pois".
+    record_pois: bool = True
+    viewport_width: int = 1280
+    viewport_height: int = 720
+    browserbase_timeout: int = 7200
+    headless: bool = True
+    # When true, the un-annotated screenshot is stored in info under "original_image".
+    keep_original_image: bool = False
+    # When true, the initial observation uses the un-annotated screenshot as its image.
+    no_pois_in_image: bool = False
+@Environments.register_environment("webbrowser")
+class WebBrowserEnvironment(BaseEnvironment):
+    config: WebBrowserEnvironmentConfig
+    browser: Optional[BrowserSession] = None
+    cancelled_last_action: bool = False
+    class Config:
+        arbitrary_types_allowed = True
+    async def __aenter__(self) -> Self:
+        # Initialize the BrowserSession
+        self.browser = self.browser_session(
+            viewport_width=self.config.viewport_width,
+            viewport_height=self.config.viewport_height,
+            headless=self.config.headless,
+        )
+        await self.browser.__aenter__()
+        # Initialize other resources if necessary
+        if self.cookies:
+            await self.browser.context.add_cookies(self.cookies)
+        self.logger.info("🌐 [bold blue]Browser session started.[/]")
+        return self
+    async def __aexit__(self, exc_type, exc_value, traceback):
+        # Clean up the BrowserSession
+        await self.browser.__aexit__(exc_type, exc_value, traceback)
+    @property
+    def info_for_user(self) -> str:
+        return "This is a web browser environment. You can navigate the web, search the web, and perform actions on the web."  # noqa: E501
+    @cached_property
+    def tools(self) -> list[Tool]:
+        return [BrowserTool(session=self.browser)]
+    @cached_property
+    def browser_session(self) -> type[BrowserSession]:
+        return BrowserSession
+    @property
+    def cookies(self) -> list[dict]:
+        return []
+    async def initialise(self) -> Observation:
+        self.logger.debug(f"DEBUG: Initialising WebBrowserEnvironment. Homepage: {self.config.homepage}")
+        try:
+            await self.browser.goto(self.config.homepage)
+            self.logger.debug(f"DEBUG: Browser navigated to homepage. Current URL: {self.browser.current_url}")
+        except Exception as e:
+            self.logger.error(f"ERROR: Failed to navigate to homepage {self.config.homepage}: {e}")
+            raise # Re-raise to propagate the error
+        original_img, annotated_img = await self.browser.screenshot(
+            delay=self.config.screenshot_delay,
+        )
+        if self.config.no_pois_in_image:
+            base64_image = base64.b64encode(original_img).decode("utf-8")
+        else:
+            base64_image = base64.b64encode(annotated_img).decode("utf-8")
+        html_content = await self.browser.current_page.content() if self.config.include_html else None
+        info = {"url": self.browser.current_url}
+        if self.config.record_pois:
+            info["pois"] = self.browser.pois
+        if self.config.keep_original_image:
+            info["original_image"] = base64.b64encode(original_img).decode("utf-8")
+        self.logger.debug(f"DEBUG: Initial observation captured. URL: {self.browser.current_url}")
+        return Observation(
+            state=State(
+                text=f"URL: {self.browser.current_url}"
+                + (f"\n{self.browser.poi_text}" if self.config.include_poi_text else ""),
+                image=base64_image,
+                html=html_content,
+            ),
+            terminated=False,
+            reward=None,
+            info=info,
+        )
+    async def should_perform_action(self) -> bool:
+        # if cancelled last action, run the action without updating POIs
+        if self.cancelled_last_action:
+            self.cancelled_last_action = False
+            return True
+        # check for page changes
+        old_points = [tuple(point) for point in self.browser.poi_centroids]
+        await self.browser.update_poi()
+        new_points = [tuple(point) for point in self.browser.poi_centroids]
+        page_changed_mid_action = old_points != new_points
+        # record if the last action was cancelled
+        if page_changed_mid_action:
+            self.cancelled_last_action = True
+            return False
+        return True
+    async def execute_action(self, action: Action) -> Observation:
+        responses = []
+        cancelled_tools_flag = False
+        if await self.should_perform_action():
+            for tool_call in action.tool_calls:
+                # Perform the chosen action
+                try:
+                    tool_response: ToolExecutionResponse = await self.execute_tool(
+                        tool_call,
+                    )
+                    tool_response.id = tool_call.id
+                    responses.append(tool_response)
+                except Exception as e:  # noqa: PERF203
+                    self.logger.warning("🌐 An error occurred taking action: %s", str(e), exc_info=False)
+                    tool_response = ToolExecutionResponse(content=str(e), id=tool_call.id)
+                    responses.append(tool_response)
+        else:
+            self.logger.warning("🌐 Page changed since last observation, cancelling action.")
+            self.cancelled_last_action = True
+            for tool_call in action.tool_calls:
+                tool_response = ToolExecutionResponse(
+                    content="The page changed before the action could be executed, instead of being ran it was cancelled.",  # noqa: E501
+                    id=tool_call.id,
+                )
+                responses.append(tool_response)
+                cancelled_tools_flag = True
+        original_img, annotated_img = await self.browser.screenshot(
+            delay=self.config.screenshot_delay,
+        )
+        base64_image = base64.b64encode(annotated_img).decode("utf-8")
+        info = {"url": self.browser.current_url, "cancelled_tools": cancelled_tools_flag}
+        if self.config.record_pois:
+            info["pois"] = self.browser.pois
+        if self.config.keep_original_image:
+            info["original_image"] = base64.b64encode(original_img).decode("utf-8")
+        html_content = await self.browser.current_page.content() if self.config.include_html else None
+        return Observation(
+            state=State(
+                text=f"URL: {self.browser.current_url}"
+                + (f"\n{self.browser.poi_text}" if self.config.include_poi_text else ""),
+                image=base64_image,
+                html=html_content,
+                tool_responses=responses,
+            ),
+            terminated=False,
+            reward=None,
+            info=info,
+        )
+    async def observe(self) -> Observation:
+        return await self.browser.observe()
+    async def evaluate(self, **kwargs: dict[str, Any]) -> dict[str, Any]:
+        return {}
+    async def get_info(self) -> dict[str, Any]:
+        info = {}
+        return info

src/proxy_lite/gif_maker.py ADDED Viewed

	@@ -0,0 +1,122 @@

+import base64
+import re
+import textwrap
+from io import BytesIO
+from PIL import Image, ImageDraw, ImageFont
+from proxy_lite.environments.environment_base import Action, Observation
+from proxy_lite.recorder import Run
+def create_run_gif(
+    run: Run, output_path: str, white_panel_width: int = 300, duration: int = 1500, resize_factor: int = 4
+) -> None:
+    """
+    Generate a gif from the Run object's history.
+    For each Observation record, the observation image is decoded from its base64
+    encoded string. If the next record is an Action, its text is drawn onto a
+    white panel. The observation image and the white panel are then concatenated
+    horizontally to produce a frame.
+    Parameters:
+        run (Run): A Run object with its history containing Observation and Action records.
+        output_path (str): The path where the GIF will be saved.
+        white_panel_width (int): The width of the white panel for displaying text.
+                                 Default increased to 400 for larger images.
+        duration (int): Duration between frames in milliseconds.
+                        Increased here to slow the FPS (default is 1000ms).
+        resize_factor (int): The factor to resize the image down by.
+    """
+    frames = []
+    history = run.history
+    i = 0
+    while i < len(history):
+        if isinstance(history[i], Observation):
+            observation = history[i]
+            image_data = observation.state.image
+            if not image_data:
+                i += 1
+                continue
+            # Decode the base64 image
+            image_bytes = base64.b64decode(image_data)
+            obs_img = Image.open(BytesIO(image_bytes)).convert("RGB")
+            # scale the image down
+            obs_img = obs_img.resize((obs_img.width // resize_factor, obs_img.height // resize_factor))
+            # Check if the next record is an Action and extract its text if available
+            action_text = ""
+            if i + 1 < len(history) and isinstance(history[i + 1], Action):
+                action = history[i + 1]
+                if action.text:
+                    action_text = action.text
+            # extract observation and thinking from tags in the action text
+            observation_match = re.search(r"<observation>(.*?)</observation>", action_text, re.DOTALL)
+            observation_content = observation_match.group(1).strip() if observation_match else None
+            # Extract text between thinking tags if present
+            thinking_match = re.search(r"<thinking>(.*?)</thinking>", action_text, re.DOTALL)
+            thinking_content = thinking_match.group(1).strip() if thinking_match else None
+            if observation_content and thinking_content:
+                action_text = f"**OBSERVATION**\n{observation_content}\n\n**THINKING**\n{thinking_content}"
+            # Create a white panel (same height as the observation image)
+            panel = Image.new("RGB", (white_panel_width, obs_img.height), "white")
+            draw = ImageDraw.Draw(panel)
+            font = ImageFont.load_default()
+            # Wrap the action text if it is too long
+            max_chars_per_line = 40  # Adjusted for larger font size
+            wrapped_text = textwrap.fill(action_text, width=max_chars_per_line)
+            # Calculate text block size and center it on the panel
+            try:
+                # Use multiline_textbbox if available (returns bounding box tuple)
+                bbox = draw.multiline_textbbox((0, 0), wrapped_text, font=font)
+                text_width, text_height = bbox[2] - bbox[0], bbox[3] - bbox[1]
+            except AttributeError:
+                # Fallback for older Pillow versions: compute size for each line
+                lines = wrapped_text.splitlines() or [wrapped_text]
+                line_sizes = [draw.textsize(line, font=font) for line in lines]
+                text_width = max(width for width, _ in line_sizes)
+                text_height = sum(height for _, height in line_sizes)
+            text_x = (white_panel_width - text_width) // 2
+            text_y = (obs_img.height - text_height) // 2
+            draw.multiline_text((text_x, text_y), wrapped_text, fill="black", font=font, align="center")
+            # Create the combined frame by concatenating the observation image and the panel
+            total_width = obs_img.width + white_panel_width
+            combined_frame = Image.new("RGB", (total_width, obs_img.height))
+            combined_frame.paste(obs_img, (0, 0))
+            combined_frame.paste(panel, (obs_img.width, 0))
+            frames.append(combined_frame)
+            # Skip the Action record since it has been processed with this Observation
+            if i + 1 < len(history) and isinstance(history[i + 1], Action):
+                i += 2
+            else:
+                i += 1
+        else:
+            i += 1
+    if frames:
+        frames[0].save(output_path, save_all=True, append_images=frames[1:], duration=duration, loop=0)
+    else:
+        raise ValueError("No frames were generated from the Run object's history.")
+# Example usage:
+if __name__ == "__main__":
+    from proxy_lite.recorder import Run
+    dummy_run = Run.load("0abdb4cb-f289-48b0-ba13-35ed1210f7c1")
+    num_steps = int(len(dummy_run.history) / 2)
+    print(f"Number of steps: {num_steps}")
+    output_gif_path = "trajectory.gif"
+    create_run_gif(dummy_run, output_gif_path, duration=1000)
+    print(f"Trajectory GIF saved to {output_gif_path}")