Spaces:

ttomy
/

proxy-lite-demo-for-setup

Paused

+![Proxy-Lite Logo](assets/proxy-lite.png)
+A mini, open-weights version of our Proxy assistant.
+![Proxy-Lite Demo](demo.gif)
+---
+## Getting Started
+### Installation
+Clone the repository:
+```bash
+git clone https://github.com/convergence-ai/proxy-lite.git
+```
+Set up the environment with:
+```bash
+make proxy
+```
+Or do it manually:
+```bash
+uv venv --python 3.11 --python-preference managed
+uv sync
+uv pip install -e .
+playwright install
+```
+### Usage
+```bash
+proxy --help
+```
+You can directly run the proxy with:
+```bash
+proxy "Book a table for 2 at an Italian restaurant in Kings Cross tonight at 7pm."
+```
+### Proxy-Lite Endpoint
+By default, Proxy-Lite will point to an endpoint set up on HuggingFace spaces. This is a demo endpoint and is not suitable for production use; it may be very slow when under heavy load.
+We recommend hosting your own endpoint with vLLM. You can use the following command:
+```bash
+vllm serve --model convergence-ai/proxy-lite-7b \
+    --trust-remote-code \
+    --enable-auto-tool-choice \
+    --tool-call-parser hermes \
+    --port 8008
+```
+You can set the `api_base` to point to your local endpoint when calling Proxy-Lite:
+```bash
+proxy --api-base http://localhost:8008/v1 "Book a table..."
+```
+or by setting the environment variable:
+```bash
+export PROXY_LITE_API_BASE=http://localhost:8008/v1
+```

pyproject.toml ADDED Viewed

	@@ -0,0 +1,53 @@

+[project]
+name = "proxy-lite"
+version = "0.1.0"
+description = "Proxy Lite - A mini, open-weights, version of the Convergence AI Proxy assistant."
+readme = "README.md"
+requires-python = ">=3.11"
+dependencies = [
+    "omegaconf>=2.3.0",
+    "openai>=1.61.1",
+    "opencv-python>=4.11.0.86",
+    "playwright-stealth>=1.0.6",
+    "playwright>=1.50.0",
+    "pydantic>=2.10.6",
+    "rich>=13.9.4",
+    "setuptools>=75.8.0",
+    "tenacity>=9.0.0",
+    "torch>=2.6.0",
+    "torchvision>=0.21.0",
+]
+[project.scripts]
+proxy = "proxy_lite.cli:main"
+[build-system]
+requires = ["setuptools"]
+build-backend = "setuptools.build_meta"
+[tool.setuptools]
+packages = { find = { where = ["src"] } }
+[tool.setuptools.package-data]
+proxy_lite = ["**/*.json"]
+[tool.ruff]
+line-length = 120
+[tool.ruff.lint]
+select = ["E", "F", "B", "I", "SIM"]
+ignore = [
+    "B028",
+    "E722", # ignore bare except
+    "B904", # ignore raise from requirement
+    "FA102",
+]
+[tool.ruff.lint.flake8-bugbear]
+extend-immutable-calls = [
+    "fastapi.Depends",
+    "fastapi.params.Depends",
+    "fastapi.Query",
+    "fastapi.params.Query",
+]

src/proxy_lite/__init__.py ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ from .runner import Runner, RunnerConfig
2	+
3	+ __all__ = ["Runner", "RunnerConfig"]

src/proxy_lite/agents/__init__.py ADDED Viewed

	@@ -0,0 +1,21 @@

+from typing import Union
+from .agent_base import Agents, BaseAgent, BaseAgentConfig
+from .browser_agent import BrowserAgent, BrowserAgentConfig
+from .proxy_lite_agent import ProxyLiteAgent, ProxyLiteAgentConfig
+# Union aliases built from whatever is currently in the Agents registries.
+# The registries are filled by the @Agents.register_agent / @Agents.register_agent_config
+# decorators, which run when the agent modules imported above are loaded, so these
+# two lines must stay below those imports.
+# The star-unpacking inside the subscript needs Python 3.11+.
+AgentTypes = Union[*list(Agents._agent_registry.values())]
+AgentConfigTypes = Union[*list(Agents._agent_config_registry.values())]
+__all__ = [
+    "AgentConfigTypes",
+    "AgentTypes",
+    "Agents",
+    "BaseAgent",
+    "BaseAgentConfig",
+    "BrowserAgent",
+    "BrowserAgentConfig",
+    "ProxyLiteAgent",
+    "ProxyLiteAgentConfig",
+]

src/proxy_lite/agents/agent_base.py ADDED Viewed

	@@ -0,0 +1,238 @@

+import json
+import logging
+from abc import ABC, abstractmethod
+from contextlib import AsyncExitStack
+from functools import cached_property
+from typing import Any, Optional, Type, cast
+from pydantic import BaseModel, Field
+from tenacity import before_sleep_log, retry, stop_after_attempt, wait_exponential
+from proxy_lite.client import BaseClient, ClientConfigTypes, OpenAIClientConfig
+from proxy_lite.history import (
+    AssistantMessage,
+    MessageHistory,
+    MessageLabel,
+    SystemMessage,
+    Text,
+    ToolCall,
+    ToolMessage,
+    UserMessage,
+)
+from proxy_lite.logger import logger
+from proxy_lite.tools import Tool
+# if TYPE_CHECKING:
+#     from proxy_lite.tools import Tool
+class BaseAgentConfig(BaseModel):
+    client: ClientConfigTypes = Field(default_factory=OpenAIClientConfig)
+    history_messages_limit: dict[MessageLabel, int] = Field(default_factory=lambda: dict())
+    history_messages_include: Optional[dict[MessageLabel, int]] = Field(
+        default=None,
+        description="If set, overrides history_messages_limit by setting all message types to 0 except those specified",
+    )
+    def model_post_init(self, __context: Any) -> None:
+        if self.history_messages_include is not None:
+            self.history_messages_limit = {label: 0 for label in MessageLabel}
+            self.history_messages_limit.update(self.history_messages_include)
+class BaseAgent(BaseModel, ABC):
+    # Abstract pydantic base for agents. It owns the message history and the
+    # completion client, and offers helpers to append messages, request a model
+    # response and dispatch a tool call. Subclasses provide `system_prompt` and `tools`.
+    # NOTE(review): generate_output reads `self.message_label`, which is not declared on
+    # this class; the concrete agents in this package declare it as a field.
+    config: BaseAgentConfig
+    temperature: float = Field(default=0.7, ge=0, le=2)
+    history: MessageHistory = Field(default_factory=MessageHistory)
+    client: Optional[BaseClient] = None
+    env_tools: list[Tool] = Field(default_factory=list)
+    task: Optional[str] = Field(default=None)
+    seed: Optional[int] = Field(default=None)
+    class Config:
+        arbitrary_types_allowed = True
+    def __init__(self, **data) -> None:
+        """Validate the model fields, then attach non-field helper attributes."""
+        super().__init__(**data)
+        # Set after pydantic initialisation; neither attribute is a model field.
+        self._exit_stack = AsyncExitStack()
+        self._tools_init_task = None
+    def model_post_init(self, __context: Any) -> None:
+        """Build the completion client from ``config.client`` once fields are validated."""
+        super().model_post_init(__context)
+        self.client = BaseClient.create(self.config.client)
+    # System prompt placed at the start of every history view; supplied by subclasses.
+    @property
+    @abstractmethod
+    def system_prompt(self) -> str: ...
+    # Tools offered to the model and searched by use_tool; supplied by subclasses.
+    @cached_property
+    @abstractmethod
+    def tools(self) -> list[Tool]: ...
+    @cached_property
+    def tool_descriptions(self) -> str:
+        """Return a bullet list of ``- name: description`` lines for every tool schema entry.
+
+        Each tool contributes one paragraph; paragraphs are separated by a blank line.
+        The tool's class name is used as a heading only when there is more than one tool.
+        The value is cached after the first access.
+        """
+        tool_descriptions = []
+        for tool in self.tools:
+            func_descriptions = "\n".join("- {name}: {description}".format(**schema) for schema in tool.schema)
+            tool_title = f"{tool.__class__.__name__}:\n" if len(self.tools) > 1 else ""
+            tool_descriptions.append(f"{tool_title}{func_descriptions}")
+        return "\n\n".join(tool_descriptions)
+    async def get_history_view(self) -> MessageHistory:
+        """Return the system prompt followed by the history filtered by the configured limits."""
+        return MessageHistory(
+            messages=[SystemMessage(content=[Text(text=self.system_prompt)])],
+        ) + self.history.history_view(
+            limits=self.config.history_messages_limit,
+        )
+    # Retried up to 3 attempts with an exponential wait bounded to 4-10 seconds; each
+    # retry is logged at ERROR level and the final failure is re-raised unchanged.
+    @retry(
+        wait=wait_exponential(multiplier=1, min=4, max=10),
+        stop=stop_after_attempt(3),
+        reraise=True,
+        before_sleep=before_sleep_log(logger, logging.ERROR),
+    )
+    async def generate_output(
+        self,
+        use_tool: bool = False,
+        response_format: Optional[type[BaseModel]] = None,
+        append_assistant_message: bool = True,
+    ) -> AssistantMessage:
+        """Request one completion for the current history view and wrap it as a message.
+
+        Args:
+            use_tool: When true, ``self.tools`` is passed to the client; otherwise no tools.
+            response_format: Optional model class forwarded to the client as ``response_format``.
+            append_assistant_message: When true, the reply is appended to ``self.history``
+                under ``self.message_label``.
+
+        Returns:
+            The assistant message built from the first choice of the completion.
+        """
+        messages: MessageHistory = await self.get_history_view()
+        response_content = (
+            await self.client.create_completion(
+                messages=messages,
+                temperature=self.temperature,
+                seed=self.seed,
+                response_format=response_format,
+                tools=self.tools if use_tool else None,
+            )
+        ).model_dump()
+        # Only the first choice is used.
+        response_content = response_content["choices"][0]["message"]
+        assistant_message = AssistantMessage(
+            role=response_content["role"],
+            # Empty or missing content becomes an empty content list.
+            content=[Text(text=response_content["content"])] if response_content["content"] else [],
+            tool_calls=response_content["tool_calls"],
+        )
+        if append_assistant_message:
+            self.history.append(message=assistant_message, label=self.message_label)
+        return assistant_message
+    def receive_user_message(
+        self,
+        text: Optional[str] = None,
+        image: Optional[list[bytes]] = None,
+        label: Optional[MessageLabel] = None,
+        is_base64: bool = False,
+    ) -> None:
+        """Append a user message built by ``UserMessage.from_media`` to the history."""
+        message = UserMessage.from_media(
+            text=text,
+            image=image,
+            is_base64=is_base64,
+        )
+        self.history.append(message=message, label=label)
+    def receive_system_message(
+        self,
+        text: Optional[str] = None,
+        label: Optional[MessageLabel] = None,
+    ) -> None:
+        """Append a system message built by ``SystemMessage.from_media`` to the history."""
+        message = SystemMessage.from_media(text=text)
+        self.history.append(message=message, label=label)
+    def receive_assistant_message(
+        self,
+        content: Optional[str] = None,
+        tool_calls: Optional[list[ToolCall]] = None,
+        label: Optional[MessageLabel] = None,
+    ) -> None:
+        """Append an assistant message (optional text plus tool calls) to the history."""
+        message = AssistantMessage(
+            content=[Text(text=content)] if content else [],
+            tool_calls=tool_calls,
+        )
+        self.history.append(message=message, label=label)
+    async def use_tool(self, tool_call: ToolCall):
+        """Await the first tool attribute named like the requested function.
+
+        The call's ``arguments`` JSON string is decoded and passed as keyword arguments.
+
+        Raises:
+            ValueError: If no tool has an attribute with the requested name.
+        """
+        function = tool_call.function
+        # NOTE(review): the lookup is a plain hasattr/getattr, so any attribute of a tool
+        # with a matching name is invoked, not only functions listed in its schema - confirm
+        # this is acceptable for model-supplied names.
+        for tool in self.tools:
+            if hasattr(tool, function["name"]):
+                return await getattr(tool, function["name"])(
+                    **json.loads(function["arguments"]),
+                )
+        msg = f'No tool function with name "{function["name"]}"'
+        raise ValueError(msg)
+    async def receive_tool_message(
+        self,
+        text: str,
+        tool_id: str,
+        label: Optional[MessageLabel] = None,
+    ) -> None:
+        """Append a tool result message tied to ``tool_id`` to the history."""
+        self.history.append(
+            message=ToolMessage(content=[Text(text=text)], tool_call_id=tool_id),
+            label=label,
+        )
+class Agents:
+    _agent_registry: dict[str, type[BaseAgent]] = {}
+    _agent_config_registry: dict[str, type[BaseAgentConfig]] = {}
+    @classmethod
+    def register_agent(cls, name: str):
+        """
+        Decorator to register an Agent class under a given name.
+        Example:
+            @Agents.register_agent("browser")
+            class BrowserAgent(BaseAgent):
+                ...
+        """
+        def decorator(agent_cls: type[BaseAgent]) -> type[BaseAgent]:
+            cls._agent_registry[name] = agent_cls
+            return agent_cls
+        return decorator
+    @classmethod
+    def register_agent_config(cls, name: str):
+        """
+        Decorator to register a configuration class under a given name.
+        Example:
+            @Agents.register_agent_config("browser")
+            class BrowserAgentConfig(BaseAgentConfig):
+                ...
+        """
+        def decorator(config_cls: type[BaseAgentConfig]) -> type[BaseAgentConfig]:
+            cls._agent_config_registry[name] = config_cls
+            return config_cls
+        return decorator
+    @classmethod
+    def get(cls, name: str) -> type[BaseAgent]:
+        """
+        Retrieve a registered Agent class by its name.
+        Raises:
+            ValueError: If no such agent is found.
+        """
+        try:
+            return cast(Type[BaseAgent], cls._agent_registry[name])
+        except KeyError:
+            raise ValueError(f"Agent '{name}' not found.")
+    @classmethod
+    def get_config(cls, name: str) -> type[BaseAgentConfig]:
+        """
+        Retrieve a registered Agent configuration class by its name.
+        Raises:
+            ValueError: If no such config is found.
+        """
+        try:
+            return cast(type[BaseAgentConfig], cls._agent_config_registry[name])
+        except KeyError:
+            raise ValueError(f"Agent config for '{name}' not found.")

src/proxy_lite/agents/browser_agent.py ADDED Viewed

	@@ -0,0 +1,133 @@

+from datetime import datetime
+from functools import cached_property
+from typing import Literal
+from pydantic import Field
+from proxy_lite.agents.agent_base import Agents, BaseAgent, BaseAgentConfig
+from proxy_lite.history import MessageHistory, MessageLabel, SystemMessage, Text
+from proxy_lite.tools import Tool
+# System prompt template for BrowserAgent. It is rendered with str.format, which fills
+# the {date_time_with_day} and {tool_descriptions} placeholders (see BrowserAgent.system_prompt).
+# The text is sent to the model verbatim, so treat wording changes as behaviour changes.
+BROWSER_AGENT_SYSTEM_PROMPT = """ **You are Proxy Lite, the Web-Browsing Agent.** You are developed by Convergence.
+**Current date:** {date_time_with_day}.
+You are given:
+1. A user task that you are trying to complete.
+2. Relevant facts we have at our disposal.
+3. A high level plan to complete the task.
+4. A history of previous actions and observations.
+5. An annotated webpage screenshot and text description of what's visible in the browser before and after the last action.
+## Objective
+You are an expert at controlling the web browser.
+You will be assisting a user with a task they are trying to complete on the web.
+## Web Screenshots
+Each iteration of your browsing loop, you'll be provided with a screenshot of the browser.
+The screenshot will have red rectangular annotations. These annotations highlight the marked elements you can interact with.
+## Mark IDs
+Each annotated element is labeled with a "mark id" in the top-left corner.
+When using tools like typing or clicking, specify the "mark id" to indicate which element you want to interact with.
+If an element is not annotated, you cannot interact with it. This is a limitation of the software. Focus on marked elements only.
+## Text Snippets
+Along with the screenshot, you will receive text snippets describing each annotated element.
+Here’s an example of different element types:
+- [0] `<a>text</a>` → Mark 0 is a link (`<a>` tag) containing the text "text".
+- [1] `<button>text</button>` → Mark 1 is a button (`<button>` tag) containing the text "text".
+- [2] `<input value="text"/>` → Mark 2 is an input field (`<input>` tag) with the value "text".
+- [3] `<select>text</select>` → Mark 3 is a dropdown menu (`<select>` tag) with the option "text" selected.
+- [4] `<textarea>text</textarea>` → Mark 4 is a text area (`<textarea>` tag) containing the text "text".
+- [5] `<li>text</li>` → Mark 5 is a list item (`<li>` tag) containing the text "text".
+- [6] `<div scrollable>text</div>` → Mark 6 is a division (`<div>` tag) containing the text "text" and is scrollable.
+- [7] `<td>text</td>` → Mark 7 is a table cell (`<td>` tag) containing the text "text".
+Note that these text snippets may be incomplete.
+## History
+You will see your past actions and observations but not old annotated webpages.
+This means annotated webpages showing useful information will not be visible in future actions.
+To get around this, key details from each webpage are stored in observations.
+## Web Browser Actions
+You can only take the following actions with the web browser:
+{tool_descriptions}
+## Important Browsing Tips
+If there is a modal overlay that is unresponsive on the page try reloading the webpage.
+If there is a cookie consent form covering part of the page just click accept on the form.
+When typing into a text field be sure to click one of the dropdown options (when present). Not selecting a dropdown option will result in the field being cleared after the next action.
+You do not have access any internet accounts (outside of those provided by the user).
+The browser has a built in CAPTCHA solver, if you are asked to solve one just wait and it will be solved for you.
+## Don't Repeat the Same Actions Continuously
+If you find yourself repeating an action without making progress, try another action.
+## Task
+You will now be connected to the user, who will give you their task."""  # noqa: E501
+# Default per-label message limits for BrowserAgentConfig.history_messages_limit.
+# Only SCREENSHOT is limited (to 1); the commented-out labels are currently not limited here.
+MAX_MESSAGES_FOR_CONTEXT_WINDOW = {
+    MessageLabel.SCREENSHOT: 1,
+    # MessageLabel.REASONING_INDUCTION: 1,
+    # MessageLabel.FORMAT_INSTRUCTIONS: 1,
+    # MessageLabel.ACTION: 1,
+}
+@Agents.register_agent_config("browser")
+class BrowserAgentConfig(BaseAgentConfig):
+    name: Literal["browser"] = "browser"
+    history_messages_limit: dict[MessageLabel, int] = Field(
+        default_factory=lambda: MAX_MESSAGES_FOR_CONTEXT_WINDOW,
+    )
+@Agents.register_agent("browser")
+class BrowserAgent(BaseAgent):
+    config: BrowserAgentConfig
+    message_label: MessageLabel = MessageLabel.AGENT_MODEL_RESPONSE
+    def __init__(self, **data):
+        super().__init__(**data)
+    @property
+    def system_prompt(self) -> str:
+        return BROWSER_AGENT_SYSTEM_PROMPT.format(
+            date_time_with_day=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+            tool_descriptions=self.tool_descriptions,
+            memories="",
+        )
+    @cached_property
+    def tools(self) -> list[Tool]:
+        return self.env_tools
+    async def get_history_view(self) -> MessageHistory:
+        return MessageHistory(
+            messages=[SystemMessage(content=[Text(text=self.system_prompt)])],
+        ) + self.history.history_view(
+            limits=self.config.history_messages_limit,
+        )

src/proxy_lite/agents/proxy_lite_agent.py ADDED Viewed

	@@ -0,0 +1,54 @@

+from functools import cached_property
+from typing import Literal
+from pydantic import Field
+from proxy_lite.history import MessageHistory, MessageLabel, SystemMessage, Text
+from proxy_lite.tools import Tool
+from .agent_base import Agents, BaseAgent, BaseAgentConfig
+# System prompt returned verbatim by ProxyLiteAgent.system_prompt (no placeholders).
+# The text is sent to the model as-is, so treat wording changes as behaviour changes.
+MODEL_SYSTEM_PROMPT = """You are Proxy-Lite, an AI assistant that can perform actions on a computer screen.
+You were developed by Convergence AI.
+The user will instuct you to perform a task.
+You will be shown a screen as well as relevant interactable elements highlighted by mark_ids and you will be given a set of tools to use to perform the task.
+You should make observations about the screen, putting them in <observation></observation> tags.
+You should then reason about what needs to be done to complete the task, putting your thoughts in <thinking></thinking> tags.
+You should then use the tools to perform the task, putting the tool calls in <tool_call></tool_call> tags.
+"""  # noqa: E501
+# Default per-label message limits for ProxyLiteAgentConfig.history_messages_limit.
+MAX_MESSAGES_FOR_CONTEXT_WINDOW = {
+    MessageLabel.SCREENSHOT: 1,
+}
+@Agents.register_agent_config("proxy_lite")
+class ProxyLiteAgentConfig(BaseAgentConfig):
+    name: Literal["proxy_lite"] = "proxy_lite"
+    history_messages_limit: dict[MessageLabel, int] = Field(
+        default_factory=lambda: MAX_MESSAGES_FOR_CONTEXT_WINDOW,
+    )
+@Agents.register_agent("proxy_lite")
+class ProxyLiteAgent(BaseAgent):
+    config: ProxyLiteAgentConfig
+    message_label: MessageLabel = MessageLabel.AGENT_MODEL_RESPONSE
+    def __init__(self, **data):
+        super().__init__(**data)
+    @property
+    def system_prompt(self) -> str:
+        return MODEL_SYSTEM_PROMPT
+    @cached_property
+    def tools(self) -> list[Tool]:
+        return self.env_tools
+    async def get_history_view(self) -> MessageHistory:
+        return MessageHistory(
+            messages=[SystemMessage(content=[Text(text=self.system_prompt)])],
+        ) + self.history.history_view(
+            limits=self.config.history_messages_limit,
+        )

src/proxy_lite/browser/__init__.py ADDED Viewed

File without changes

src/proxy_lite/browser/add_custom_select.js ADDED Viewed

	@@ -0,0 +1,123 @@

+// Tracks <select> elements that already have the custom mousedown handler attached,
+// so repeated calls of overwriteDefaultSelectConvergence do not attach it twice.
+handledSelectElementsConvergence = new WeakSet();
+/**
+ * Replace the native dropdown of every single-choice <select> under the given root
+ * with an in-page list of <div> options.
+ *
+ * @param {Element|null} input Root element to scan (the inline comment below mentions
+ *     iframe input); when null, document.documentElement is used.
+ */
+overwriteDefaultSelectConvergence = (input = null) => {
+    let activeSelectElement = null;
+    // Handle iframe input element
+    let rootElement = input ? input : document.documentElement;
+    // Builds the hidden container (with one child that holds the options) and appends
+    // it to document.body.
+    function createCustomSelectElement() {
+        // Create the custom select container
+        const customSelect = document.createElement('div');
+        customSelect.id = 'convergence-custom-select-element-X2EmudtLRN';
+        customSelect.style.position = 'absolute'
+        // One below the value 2147483647.
+        customSelect.style.zIndex = 2147483647 - 1;
+        customSelect.style.display = 'none';
+        document.body.appendChild(customSelect);
+        // Create the select options list
+        const optionsList = document.createElement('div');
+        optionsList.style.border = '1px solid #ccc';
+        optionsList.style.backgroundColor = '#fff';
+        optionsList.style.color = 'black';
+        customSelect.appendChild(optionsList);
+        return customSelect;
+    }
+    // Fills the container with the options of `select`, positions it under the
+    // select and shows it.
+    function showCustomSelect(select) {
+        activeSelectElement = select;
+        // Clear previous options
+        // NOTE(review): the container is appended to document.body but looked up under
+        // rootElement; if rootElement does not contain document.body this lookup returns
+        // null and the next line throws - confirm against callers that pass `input`.
+        const customSelect = rootElement.querySelector('#convergence-custom-select-element-X2EmudtLRN');
+        let optionsList = customSelect.firstChild;
+        optionsList.innerHTML = '';
+        // Populate with new options
+        Array.from(select.options).forEach(option => {
+            const customOption = document.createElement('div');
+            customOption.className = 'custom-option';
+            customOption.style.padding = '8px';
+            customOption.style.cursor = 'pointer';
+            customOption.textContent = option.text;
+            customOption.dataset.value = option.value;
+            optionsList.appendChild(customOption);
+            // Hover highlight
+            customOption.addEventListener('mouseenter', function () {
+                customOption.style.backgroundColor = '#f0f0f0';
+            });
+            customOption.addEventListener('mouseleave', function () {
+                customOption.style.backgroundColor = '';
+            });
+            // Picking an option: write the value back to the real <select>, hide the
+            // list, then fire the events listeners on the select may be waiting for.
+            customOption.addEventListener('mousedown', (e) => {
+                e.stopPropagation();
+                select.value = customOption.dataset.value;
+                customSelect.style.display = 'none';
+                activeSelectElement = null;
+                // ensure we trigger all potential event listeners
+                select.dispatchEvent(new InputEvent('focus', { bubbles: true, cancelable: true }));
+                select.dispatchEvent(new InputEvent('input', { bubbles: true, cancelable: true }));
+                select.dispatchEvent(new InputEvent('change', { bubbles: true, cancelable: true }));
+                select.dispatchEvent(new InputEvent('blur', { bubbles: true, cancelable: true }));
+            });
+        });
+        // Position and show the custom select
+        const selectRect = select.getBoundingClientRect();
+        customSelect.style.top = `${selectRect.bottom + window.scrollY}px`;
+        customSelect.style.left = `${selectRect.left + window.scrollX}px`;
+        customSelect.style.width = `${selectRect.width}px`;
+        customSelect.style.display = 'block';
+        select.focus();
+        // Hide the list again when the select loses focus or changes value.
+        // NOTE(review): these two listeners are added on every call and never removed,
+        // so they accumulate each time the same select is opened - consider { once: true }
+        // or registering them a single time.
+        select.addEventListener('blur', function (e) {
+            customSelect.style.display = 'none';
+            activeSelectElement = null;
+        });
+        select.addEventListener('change', function (e) {
+            customSelect.style.display = 'none';
+            activeSelectElement = null;
+        });
+    }
+    // Ensure we have a custom select element
+    let customSelect = rootElement.querySelector(`#convergence-custom-select-element-X2EmudtLRN`);
+    if (!customSelect) {
+        customSelect = createCustomSelectElement();
+    }
+    // Find selects in shadow DOMs
+    // Only the element's own shadow root is queried; shadow roots nested inside it are
+    // not traversed.
+    function findSelectInShadowRoot(element) {
+        if (element.shadowRoot) {
+            return element.shadowRoot.querySelectorAll('select');
+        }
+        return [];
+    }
+    let shadowSelects = [];
+    rootElement.querySelectorAll('*').forEach(el => {
+        shadowSelects.push(...findSelectInShadowRoot(el));
+    });
+    // Find selects in the regular (light) DOM
+    const lightSelects = Array.from(rootElement.querySelectorAll('select'));
+    // Add event listeners to all select elements
+    const allSelects = [...lightSelects, ...shadowSelects];
+    allSelects.forEach(select => {
+        if (select.hasAttribute('multiple')) {
+            // skip special multiple elements as our POI code already handles them
+            return;
+        }
+        if (!handledSelectElementsConvergence.has(select)) {
+            select.addEventListener('mousedown', (e) => {
+                // only use custom select when the default behaviour is being used
+                if (!e.defaultPrevented) {
+                    showCustomSelect(select);
+                    // Suppress the native dropdown so only the custom list opens.
+                    e.preventDefault();
+                }
+            });
+            handledSelectElementsConvergence.add(select);
+        }
+    });
+}

src/proxy_lite/browser/bounding_boxes.py ADDED Viewed

	@@ -0,0 +1,210 @@

+import math
+from typing import Any
+import cv2
+import numpy as np
+from pydantic import BaseModel, Field, field_validator
+class Point(BaseModel):
+    x: int
+    y: int
+    def __iter__(self):
+        return iter((self.x, self.y))
+    def __getitem__(self, index) -> int:
+        return (self.x, self.y)[index]
+    def __tuple__(self) -> tuple[int, int]:
+        return (self.x, self.y)
+    def __repr__(self) -> str:
+        return f"Point(x={self.x}, y={self.y})"
+class BoundingBox(BaseModel):
+    label: str = Field(..., description="The label that's given for this bounding box")
+    left: int = Field(..., description="Left coordinate of the bounding box")
+    right: int = Field(..., description="Right coordinate of the bounding box")
+    top: int = Field(..., description="Top coordinate of the bounding box")
+    bottom: int = Field(..., description="Bottom coordinate of the bounding box")
+    @field_validator("left", "top", mode="before")
+    @classmethod
+    def round_down(cls, v):
+        return math.floor(float(v))
+    @field_validator("right", "bottom", mode="before")
+    @classmethod
+    def round_up(cls, v):
+        return math.ceil(float(v))
+class POI(BaseModel):
+    # One point of interest: free-form info about the element, its centroid and its box.
+    info: dict[str, Any]
+    element_centroid: Point
+    bounding_box: BoundingBox
+def calculate_dash_points(start, end, dash_length, gap_length):
+    x1, y1 = start
+    x2, y2 = end
+    dx = x2 - x1
+    dy = y2 - y1
+    dist = np.sqrt(dx * dx + dy * dy)
+    if dist == 0:
+        return []
+    unit_x = dx / dist
+    unit_y = dy / dist
+    dash_points = []
+    current_dist = 0
+    while current_dist < dist:
+        dash_end = min(current_dist + dash_length, dist)
+        dash_points.extend(
+            [
+                (int(x1 + unit_x * current_dist), int(y1 + unit_y * current_dist)),
+                (int(x1 + unit_x * dash_end), int(y1 + unit_y * dash_end)),
+            ],
+        )
+        current_dist += dash_length + gap_length
+    return dash_points
+def draw_dashed_rectangle(
+    img,
+    bbox: BoundingBox,
+    color,
+    thickness=1,
+    dash_length=10,
+    gap_length=5,
+):
+    # Calculate dash points for all sides
+    top_points = calculate_dash_points(
+        (bbox.left + 25, bbox.top + 25),
+        (bbox.right + 25, bbox.top + 25),
+        dash_length,
+        gap_length,
+    )
+    right_points = calculate_dash_points(
+        (bbox.right + 25, bbox.top + 25),
+        (bbox.right + 25, bbox.bottom + 25),
+        dash_length,
+        gap_length,
+    )
+    bottom_points = calculate_dash_points(
+        (bbox.right + 25, bbox.bottom + 25),
+        (bbox.left + 25, bbox.bottom + 25),
+        dash_length,
+        gap_length,
+    )
+    left_points = calculate_dash_points(
+        (bbox.left + 25, bbox.bottom + 25),
+        (bbox.left + 25, bbox.top + 25),
+        dash_length,
+        gap_length,
+    )
+    # Combine all points
+    all_points = top_points + right_points + bottom_points + left_points
+    # Draw all lines at once
+    if all_points:
+        all_points = np.array(all_points).reshape((-1, 2, 2))
+        cv2.polylines(img, all_points, False, color, thickness)
+# @time_it(name='Annotate bounding box')
+def annotate_bounding_box(image: np.ndarray, bbox: BoundingBox) -> None:
+    """Draw ``bbox`` and its label onto ``image`` in place.
+
+    The box is drawn as a dashed outline and a small semi-transparent label patch with
+    ``bbox.label`` is blended in just above the box's top-left corner. Coordinates are
+    shifted by 25 pixels, matching the 25-pixel border annotate_bounding_boxes adds
+    before calling this function.
+
+    Args:
+        image: Image array modified in place (indexed as rows, columns, 3 colour channels).
+        bbox: Box to draw; its ``label`` is used as the label text.
+    """
+    # Draw dashed bounding box
+    draw_dashed_rectangle(
+        image,
+        bbox,
+        color=(0, 0, 255),
+        thickness=1,
+        dash_length=10,
+        gap_length=5,
+    )
+    # Prepare label
+    font_scale = 0.4 * 4  # Increased by 4x for the larger patch
+    font = cv2.FONT_HERSHEY_SIMPLEX
+    thickness = 3  # Increased thickness for the larger patch
+    # Get text size for the larger patch
+    (label_width, label_height), _ = cv2.getTextSize(
+        bbox.label,
+        font,
+        font_scale,
+        thickness,
+    )
+    # Create the oversized patch: text size plus 20 pixels in each dimension, 4 channels
+    large_label_patch = np.zeros(
+        (label_height + 20, label_width + 20, 4),
+        dtype=np.uint8,
+    )
+    large_label_patch[:, :, 0:3] = (0, 0, 255)  # BGR color format: Red background
+    large_label_patch[:, :, 3] = 128  # Alpha channel: 50% opacity (128/255 = 0.5)
+    # Draw text on the larger patch
+    cv2.putText(
+        large_label_patch,
+        bbox.label,
+        (8, label_height + 8),  # Adjusted position for the larger patch
+        font,
+        font_scale,
+        (255, 255, 255, 128),  # White text, 50% opaque (128/255 = 0.5)
+        thickness,
+    )
+    # Scale down the patch to improve anti-aliasing
+    label_patch = cv2.resize(
+        large_label_patch,
+        (label_width // 4 + 5, label_height // 4 + 5),
+        interpolation=cv2.INTER_AREA,
+    )
+    # Calculate position for top-left alignment: the patch sits directly above the box
+    offset = 2  # Small offset to prevent touching the bounding box edge
+    x = min(image.shape[1], max(0, int(bbox.left + 25) - offset))
+    y = min(image.shape[0], max(0, int(bbox.top + 25) - label_patch.shape[0] - offset))
+    # Ensure we're not out of bounds: crop the patch to the part that fits in the image
+    x_end = min(image.shape[1], x + label_patch.shape[1])
+    y_end = min(image.shape[0], y + label_patch.shape[0])
+    label_patch = label_patch[: (y_end - y), : (x_end - x)]
+    # Create a mask for the label patch: alpha scaled to 0..1 and repeated per colour channel
+    alpha_mask = label_patch[:, :, 3] / 255.0
+    alpha_mask = np.repeat(alpha_mask[:, :, np.newaxis], 3, axis=2)
+    # Blend the label patch with the image
+    image_section = image[y:y_end, x:x_end]
+    blended = (1 - alpha_mask) * image_section + alpha_mask * label_patch[:, :, 0:3]
+    image[y:y_end, x:x_end] = blended.astype(np.uint8)
+def annotate_bounding_boxes(image: bytes, bounding_boxes: list[BoundingBox]) -> bytes:
+    # Read the image
+    nparr = np.frombuffer(image, np.uint8)
+    # Decode the image
+    img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
+    padded_img = cv2.copyMakeBorder(
+        img,
+        top=25,  # Value chosen based on label size
+        bottom=25,  # Value chosen based on label size
+        left=25,  # Value chosen based on label size
+        right=25,  # Value chosen based on label size
+        borderType=cv2.BORDER_CONSTANT,
+        value=(255, 255, 255),
+    )
+    for bounding_box in bounding_boxes:
+        # Annotate the image in place with the bounding box and the bounding box label
+        annotate_bounding_box(padded_img, bounding_box)
+    _, buffer = cv2.imencode(".jpeg", padded_img)
+    return buffer.tobytes()

src/proxy_lite/browser/browser.py ADDED Viewed

	@@ -0,0 +1,398 @@

+import asyncio
+import logging
+import re
+from contextlib import AsyncExitStack
+from pathlib import Path
+from typing import Literal, Optional, Self
+from playwright.async_api import Browser, BrowserContext, Page, Playwright, async_playwright
+from playwright.async_api import TimeoutError as PlaywrightTimeoutError
+from playwright_stealth import stealth_async
+from pydantic import Field
+from tenacity import before_sleep_log, retry, stop_after_delay, wait_exponential
+from proxy_lite.browser.bounding_boxes import POI, BoundingBox, Point, annotate_bounding_boxes
+from proxy_lite.logger import logger
+# Lower-case tag names that element_as_text renders in the self-closing form `<tag .../>`
+# when the element carries no text.
+SELF_CONTAINED_TAGS = [
+    # many of these are non-interactive but keeping them anyway
+    "area",
+    "base",
+    "br",
+    "col",
+    "embed",
+    "hr",
+    "img",
+    "input",
+    "link",
+    "meta",
+    "param",
+    "source",
+    "track",
+    "wbr",
+]
+def element_as_text(
+    mark_id: int,
+    tag: Optional[str] = None,
+    text: Optional[str] = None,
+    **raw_attributes,
+) -> str:
+    """Return a text representation of all elements on the page."""
+    attributes = []
+    for k, v in raw_attributes.items():
+        if v is None:
+            continue
+        if isinstance(v, bool):
+            if v:
+                attributes.append(k)
+            # we ignore False bool attributes
+        else:
+            v = str(v)
+            if len(v) > 2500:
+                v = v[: 2500 - 1] + "…"
+            attributes.append(f'{k}="{v}"')
+    attributes = " ".join(attributes)
+    attributes = (" " + attributes).rstrip()
+    tag = tag.lower()
+    if text is None:
+        text = ""
+    if len(text) > 2500:
+        text = text[: 2500 - 1] + "…"
+    # sub-out line breaks so elements are easier to distinguish
+    attributes = re.sub(r"\r\n|\r|\n", "⏎", attributes)
+    text = re.sub(r"\r\n|\r|\n", "⏎", text)
+    if tag in SELF_CONTAINED_TAGS:
+        if text:
+            logger.warning(
+                f"Got self-contained element '{tag}' which contained text '{text}'.",
+            )
+        else:
+            return f"- [{mark_id}] <{tag}{attributes}/>"
+    return f"- [{mark_id}] <{tag}{attributes}>{text}</{tag}>"
+class BrowserSession:
+    def __init__(
+        self,
+        viewport_width: int = 1280,
+        viewport_height: int = 720,
+        headless: bool = True,
+    ):
+        self.viewport_width = viewport_width
+        self.viewport_height = viewport_height
+        self.headless = headless
+        self.playwright: Playwright | None = None
+        self.browser: Browser | None = None
+        self.context: BrowserContext | None = None
+        self._exit_stack: AsyncExitStack | None = None
+        self.poi_elements: list = Field(default_factory=list)
+        self.poi_centroids: list[Point] = Field(default_factory=list)
+        self.bounding_boxes: list[BoundingBox] = Field(default_factory=list)
+        self.pois: list[POI] = Field(default_factory=list)
+    async def __aenter__(self) -> Self:
+        self._exit_stack = AsyncExitStack()
+        self.playwright = await async_playwright().start()
+        self.browser = await self.playwright.chromium.launch(headless=self.headless)
+        self.context = await self.browser.new_context(
+            viewport={"width": self.viewport_width, "height": self.viewport_height},
+        )
+        await self.context.new_page()
+        self.context.set_default_timeout(60_000)
+        self.current_page.set_default_timeout(60_000)
+        await stealth_async(self.current_page)
+        await self.context.add_init_script(
+            path=Path(__file__).with_name("add_custom_select.js"),
+        )
+        await self.context.add_init_script(
+            path=Path(__file__).with_name("find_pois.js"),
+        )
+        return self
+    async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
+        if self.browser:
+            await self.browser.close()
+        if self.playwright:
+            await self.playwright.stop()
+        if self._exit_stack:
+            await self._exit_stack.aclose()
+    @property
+    def current_page(self) -> Optional[Page]:
+        if self.context.pages:
+            return self.context.pages[-1]
+        return None
+    @property
+    def current_url(self) -> Optional[str]:
+        if self.current_page:
+            return self.current_page.url
+        return None
+    # re-run for cases of mid-run redirects
+    @retry(
+        wait=wait_exponential(multiplier=1, min=1, max=10),
+        stop=stop_after_delay(5),
+        reraise=True,
+        before_sleep=before_sleep_log(logger, logging.ERROR),
+    )
+    async def process_iframe(self, iframe) -> Optional[tuple[dict, dict]]:
+        try:
+            # Check iframe visibility and size
+            bounding_box = await iframe.bounding_box()
+            if not bounding_box:
+                return None  # Skip if iframe is not visible
+            width, height = bounding_box["width"], bounding_box["height"]
+            if width < 50 or height < 50:
+                return None
+            frame = await iframe.content_frame()
+            if not frame:
+                return None
+            poi = await frame.evaluate(
+                """() => {
+                    overwriteDefaultSelectConvergence();
+                    return findPOIsConvergence();
+                }""",
+            )
+            if not poi:
+                return None
+            iframe_offset = {"x": round(bounding_box["x"]), "y": round(bounding_box["y"])}
+            return poi, iframe_offset
+        except Exception as e:
+            logger.error(f"Error processing iframe: {e}")
+            return None
+    # re-run for cases of mid-run redirects
+    @retry(
+        wait=wait_exponential(multiplier=1, min=1, max=10),
+        stop=stop_after_delay(5),
+        reraise=True,
+        before_sleep=before_sleep_log(logger, logging.ERROR),
+    )
+    async def update_poi(self) -> None:
+        try:
+            await self.current_page.wait_for_load_state(timeout=60000)
+        except PlaywrightTimeoutError:
+            logger.error(f"Timeout waiting for website load state: {self.current_url}")
+        await self.current_page.wait_for_selector("body", timeout=60000, state="visible")
+        # Run the bounding box javascript code to highlight the points of interest on the page
+        page_info = await self.current_page.evaluate(
+            """() => {
+                overwriteDefaultSelectConvergence();
+                return findPOIsConvergence();
+            }""",
+        )
+        # Get the points of interest on the page
+        self.poi_elements = page_info["element_descriptions"]
+        element_centroids = page_info["element_centroids"]
+        try:
+            # Select all iframes on the page
+            iframes = await self.current_page.query_selector_all("iframe")
+            max_iframes = 10
+            # Define an asynchronous function to process and filter each iframe
+            tasks = [asyncio.create_task(self.process_iframe(iframe)) for iframe in iframes[:max_iframes]]
+            results = await asyncio.gather(*tasks)
+            filtered_results = [result for result in results if result is not None]
+            iframes_pois = []
+            iframe_offsets = []
+            for poi, offset in filtered_results:
+                iframes_pois.append(poi)
+                iframe_offsets.append(offset)
+            # Combine the points of interest from the iframes with the main page and adjust the centroids
+            for index, iframe_poi in enumerate(iframes_pois):
+                self.poi_elements.extend(iframe_poi["element_descriptions"])
+                for centroid in iframe_poi["element_centroids"]:
+                    centroid["x"] += iframe_offsets[index]["x"]
+                    centroid["y"] += iframe_offsets[index]["y"]
+                    centroid["left"] += iframe_offsets[index]["x"]
+                    centroid["top"] += iframe_offsets[index]["y"]
+                    centroid["right"] += iframe_offsets[index]["x"]
+                    centroid["bottom"] += iframe_offsets[index]["y"]
+                element_centroids.extend(iframe_poi["element_centroids"])
+        except Exception as e:
+            logger.error(f"Error in finding iframes: {e}")
+        # Get the centroids of the points of interest
+        self.poi_centroids = [Point(x=xy["x"], y=xy["y"]) for xy in element_centroids]
+        self.bounding_boxes = [BoundingBox(**xy, label=str(i)) for i, xy in enumerate(element_centroids)]
+        self.pois = [
+            POI(info=info, element_centroid=centroid, bounding_box=bbox)
+            for info, centroid, bbox in zip(
+                self.poi_elements,
+                self.poi_centroids,
+                self.bounding_boxes,
+                strict=False,
+            )
+        ]
+    @property
+    def poi_text(self) -> str:
+        # Get all points of interest on the page as text
+        texts = [element_as_text(mark_id=i, **element) for i, element in enumerate(self.poi_elements)]
+        # Return formatted text of points of interest on page
+        return "\n".join([txt for txt in texts if txt])
+    async def screenshot(
+        self,
+        delay: float = 0.0,
+        quality: int = 70,
+        type: str = "jpeg",
+        scale: str = "css",
+    ) -> tuple[bytes, bytes]:
+        if delay > 0.0:
+            await asyncio.sleep(delay)
+        await self.update_poi()
+        old_poi_positions = [tuple(point) for point in self.poi_centroids]
+        img = await self.current_page.screenshot(type=type, quality=quality, scale=scale)
+        annotated_img = annotate_bounding_boxes(image=img, bounding_boxes=self.bounding_boxes)
+        # check page has not changed since the screenshot was taken
+        await self.update_poi()
+        new_poi_positions = [tuple(point) for point in self.poi_centroids]
+        if new_poi_positions != old_poi_positions:
+            # if it has changed, take another
+            img = await self.current_page.screenshot(type=type, quality=quality, scale=scale)
+            await self.update_poi()
+            annotated_img = annotate_bounding_boxes(image=img, bounding_boxes=self.bounding_boxes)
+        return img, annotated_img
+    async def goto(self, url: str) -> None:
+        await self.current_page.goto(url, wait_until="domcontentloaded")
+    async def reload(self) -> None:
+        await self.current_page.reload(wait_until="domcontentloaded")
+    async def click_tab(self, mark_id: int) -> None:
+        point: Point = self.poi_centroids[mark_id]
+        await self.hover(point)
+        await self.current_page.mouse.click(*point, button="middle")
+    async def click(self, mark_id: int) -> None:
+        point: Point = self.poi_centroids[mark_id]
+        await self.hover(point)
+        await self.current_page.mouse.click(*point)
+    async def enter_text(self, mark_id: int, text: str, submit: bool = False) -> None:
+        await self.clear_text_field(mark_id)
+        await self.click(mark_id)
+        await self.current_page.keyboard.type(text)
+        if submit:
+            await self.current_page.keyboard.press("Enter")
+    async def scroll(
+        self,
+        direction: Literal["up", "down", "left", "right"],
+        mark_id: Optional[int] = None,
+    ) -> None:
+        if mark_id is None:
+            point = Point(x=-1, y=-1)
+            max_scroll_x = self.viewport_width
+            max_scroll_y = self.viewport_height
+        else:
+            point: Point = self.poi_centroids[mark_id]
+            bbox: BoundingBox = self.bounding_boxes[mark_id]
+            max_scroll_x = bbox.right - bbox.left
+            max_scroll_y = bbox.bottom - bbox.top
+        await self.hover(point=point)
+        scroll_x = int(max_scroll_x * 0.8)
+        scroll_y = int(max_scroll_y * 0.8)
+        is_vertical = direction in ("up", "down")
+        reverse_scroll = direction in ("up", "left")
+        await self.current_page.mouse.wheel(
+            scroll_x * (-1 if reverse_scroll else 1) * (not is_vertical),
+            scroll_y * (-1 if reverse_scroll else 1) * is_vertical,
+        )
+    async def go_back(self) -> None:
+        # If there is no tab open then return
+        if not self.current_page:
+            return
+        await self.current_page.go_back(wait_until="domcontentloaded")
+        if self.current_page.url == "about:blank":
+            if not len(self.context.pages) > 1:
+                await self.current_page.go_forward(wait_until="domcontentloaded")
+                raise Exception("There is no previous page to go back to.")
+            await self.current_page.close()
+    async def hover(self, point: Point) -> None:
+        await self.current_page.mouse.move(*point)
+    async def focus(self, point: Point) -> None:
+        # Focus on the element on the page at point (x, y)
+        await self.current_page.evaluate(
+            """
+            ([x, y]) => {
+                const element = document.elementFromPoint(x, y);
+                if (element && element.focus) {
+                    element.focus();
+                }
+            }""",
+            tuple(point),
+        )
+    async def get_text(self, mark_id: int) -> str:
+        return await self.current_page.evaluate(
+            """
+            (mark_id) => {
+                const element = marked_elements_convergence[mark_id];
+                if (element && (element.value !== undefined || element.textContent !== undefined)) {
+                    return element.value || element.textContent;
+                }
+                return '';
+            }
+            """,
+            (mark_id,),
+        )
+    async def clear_text_field(self, mark_id: int) -> None:
+        existing_text = await self.get_text(mark_id)
+        if existing_text.strip():
+            # Clear existing text only if it exists
+            await self.click(mark_id)
+            await self.current_page.keyboard.press("Control+Home")
+            await self.current_page.keyboard.press("Control+Shift+End")
+            await self.current_page.keyboard.press("Backspace")
+if __name__ == "__main__":
+    import json
+    test = """{"name": "return_value", "arguments": {'value': 'The most downloaded French speech recognition model on Hugging Face is DeepSeek-R1. Here are its evaluation metrics:\n\n- Claude-3.5-1022: MMLU 88.3, MMLU-Redux 88.9\n- GPT-4.0-5013: MMLU 87.2, MMLU-Redux 88.0\n- DeepSeek-01-3013: MMLU 88.5, MMLU-Redux 89.1\n- OpenAI-01-mini: MMLU 91.0, MMLU-Redux 88.7\n\nPlease see the attached screenshot for more details.'}}"""
+    test = json.loads(test)
+    print(test)
+    exit()
+    async def dummy_test():
+        async with BrowserSession(headless=False) as s:
+            page = await s.context.new_page()
+            await page.goto("http://google.co.uk")
+            await asyncio.sleep(5)
+            await page.screenshot(path="example.png")
+            await s.update_poi()
+            _, annotated_image = await s.screenshot()
+            with open("output.png", "wb") as f:
+                f.write(annotated_image)
+    asyncio.run(dummy_test())

src/proxy_lite/browser/find_pois.js ADDED Viewed

	@@ -0,0 +1,397 @@

+// Elements marked by the last findPOIsConvergence() run, in mark order.
+// Assigned without a declaration, so (in non-strict code) it becomes a global that
+// later page-side scripts can index by mark id.
+marked_elements_convergence = [];
+// Lower-case tag names that hasInteractiveTag() treats as interactive.
+const interactiveTags = new Set([
+    'a', 'button', 'details', 'embed', 'input', 'label',
+    'menu', 'menuitem', 'object', 'select', 'textarea', 'summary',
+    'video', 'audio', 'option', 'iframe'
+]);
+// Values of the `role` / `aria-role` attributes that hasInteractiveAttributes() treats
+// as interactive. 'combobox' is listed twice, which is harmless in a Set.
+const interactiveRoles = new Set([
+    'button', 'menu', 'menuitem', 'link', 'checkbox', 'radio',
+    'slider', 'tab', 'tabpanel', 'textbox', 'combobox', 'grid',
+    'listbox', 'option', 'progressbar', 'scrollbar', 'searchbox',
+    'switch', 'tree', 'treeitem', 'spinbutton', 'tooltip',
+    'a-button-inner', 'a-dropdown-button', 'click',
+    'menuitemcheckbox', 'menuitemradio', 'a-button-text',
+    'button-text', 'button-icon', 'button-icon-only',
+    'button-text-icon-only', 'dropdown', 'combobox'
+]);
+// Find the visible, interactive or scrollable elements under `input` (default: the whole
+// document) and describe them.
+// Side effect: resets and refills the global `marked_elements_convergence`.
+// Returns `{ element_descriptions, element_centroids }`: two parallel arrays with one entry
+// per client rect of every element found (so one element can produce several marks).
+findPOIsConvergence = (input = null) => {
+    let rootElement = input ? input : document.documentElement;
+    // True when the element has overflowing content on an axis whose overflow is
+    // 'scroll' or 'auto'. The document root is excluded when scanning the whole page.
+    function isScrollable(element) {
+        if ((input === null) && (element === document.documentElement)) {
+            // we can always scroll the full page
+            return false;
+        }
+        const style = window.getComputedStyle(element);
+        const hasScrollableYContent = element.scrollHeight > element.clientHeight
+        const overflowYScroll = style.overflowY === 'scroll' || style.overflowY === 'auto';
+        const hasScrollableXContent = element.scrollWidth > element.clientWidth;
+        const overflowXScroll = style.overflowX === 'scroll' || style.overflowX === 'auto';
+        return (hasScrollableYContent && overflowYScroll) || (hasScrollableXContent && overflowXScroll);
+    }
+    // Listeners reported by window.getEventListeners when that function exists, else {}.
+    // NOTE(review): getEventListeners is normally a DevTools console utility; confirm it is
+    // ever defined in the pages this runs in.
+    function getEventListeners(element) {
+        try {
+            return window.getEventListeners?.(element) || {};
+        } catch (e) {
+            return {};
+        }
+    }
+    // An element qualifies if any of the tag / attribute / listener checks pass, or it scrolls.
+    function isInteractive(element) {
+        if (!element) return false;
+        return (hasInteractiveTag(element) ||
+            hasInteractiveAttributes(element) ||
+            hasInteractiveEventListeners(element)) ||
+            isScrollable(element);
+    }
+    function hasInteractiveTag(element) {
+        return interactiveTags.has(element.tagName.toLowerCase());
+    }
+    // Checks contenteditable, role / aria-role, tabindex (anything but -1), AMP `on="tap:..."`
+    // and the presence of aria-expanded / -pressed / -selected / -checked.
+    function hasInteractiveAttributes(element) {
+        const role = element.getAttribute('role');
+        const ariaRole = element.getAttribute('aria-role');
+        const tabIndex = element.getAttribute('tabindex');
+        const onAttribute = element.getAttribute('on');
+        if (element.getAttribute('contenteditable') === 'true') return true;
+        if ((role && interactiveRoles.has(role)) ||
+            (ariaRole && interactiveRoles.has(ariaRole))) return true;
+        if (tabIndex !== null && tabIndex !== '-1') return true;
+        // Add check for AMP's 'on' attribute that starts with 'tap:'
+        if (onAttribute && onAttribute.startsWith('tap:')) return true;
+        const hasAriaProps = element.hasAttribute('aria-expanded') ||
+            element.hasAttribute('aria-pressed') ||
+            element.hasAttribute('aria-selected') ||
+            element.hasAttribute('aria-checked');
+        return hasAriaProps;
+    }
+    // Click handlers set via the onclick property/attribute or framework attributes
+    // (ng-click, @click, v-on:click), plus anything getEventListeners() reports.
+    function hasInteractiveEventListeners(element) {
+        const hasClickHandler = element.onclick !== null ||
+             element.getAttribute('onclick') !== null ||
+             element.hasAttribute('ng-click') ||
+             element.hasAttribute('@click') ||
+             element.hasAttribute('v-on:click');
+        if (hasClickHandler) return true;
+        const listeners = getEventListeners(element);
+        return listeners && (
+            listeners.click?.length > 0 ||
+            listeners.mousedown?.length > 0 ||
+            listeners.mouseup?.length > 0 ||
+            listeners.touchstart?.length > 0 ||
+            listeners.touchend?.length > 0
+        );
+    }
+    // Sum of width * height over the given rects.
+    function calculateArea(rects) {
+        return rects.reduce((acc, rect) => acc + rect.width * rect.height, 0);
+    }
+    // Client rects of `element`, keeping only those whose centre hit-tests (within `context`)
+    // to the element or one of its descendants, shifted by the enclosing iframe's offset when
+    // `context` is an iframe document, and clamped to the viewport.
+    function getElementRects(element, context) {
+        const vw = Math.max(document.documentElement.clientWidth || 0, window.innerWidth || 0);
+        const vh = Math.max(document.documentElement.clientHeight || 0, window.innerHeight || 0);
+        let rects = [...element.getClientRects()];
+        // If rects are empty (likely due to Shadow DOM), try to estimate position
+        if (rects.length === 0 && element.getBoundingClientRect) {
+            rects = [element.getBoundingClientRect()];
+        }
+        // Get iframe offset if element is in an iframe
+        let iframeOffset = { x: 0, y: 0 };
+        if (context !== document && context?.defaultView?.frameElement) {
+            const iframe = context.defaultView.frameElement;
+            if (iframe) {
+                const iframeRect = iframe.getBoundingClientRect();
+                iframeOffset = {
+                    x: iframeRect.left,
+                    y: iframeRect.top
+                };
+            }
+        }
+        return rects.filter(bb => {
+            // The offset is added and then subtracted again below, so the hit test runs in
+            // the context's own coordinates
+            const center_x = bb.left + bb.width / 2 + iframeOffset.x;
+            const center_y = bb.top + bb.height / 2 + iframeOffset.y;
+            const elAtCenter = context.elementFromPoint(center_x - iframeOffset.x, center_y - iframeOffset.y);
+            return elAtCenter === element || element.contains(elAtCenter);
+        }).map(bb => {
+            const rect = {
+                left: Math.max(0, bb.left + iframeOffset.x),
+                top: Math.max(0, bb.top + iframeOffset.y),
+                right: Math.min(vw, bb.right + iframeOffset.x),
+                bottom: Math.min(vh, bb.bottom + iframeOffset.y)
+            };
+            return {
+                ...rect,
+                width: rect.right - rect.left,
+                height: rect.bottom - rect.top
+            };
+        });
+    }
+    // Non-zero offset size and neither visibility:hidden nor display:none.
+    function isElementVisible(element) {
+        const style = window.getComputedStyle(element);
+        return element.offsetWidth > 0 &&
+            element.offsetHeight > 0 &&
+            style.visibility !== 'hidden' &&
+            style.display !== 'none';
+    }
+    // True when the element (or a descendant of it) is what elementFromPoint returns at the
+    // centre of its bounding rect. Elements of another document are always accepted, and a
+    // failing hit test counts as "top".
+    function isTopElement(element) {
+        let doc = element.ownerDocument;
+        if (doc !== window.document) {
+            // If in an iframe's document, treat as top
+            return true;
+        }
+        const shadowRoot = element.getRootNode();
+        if (shadowRoot instanceof ShadowRoot) {
+            // Inside a shadow tree: hit-test against the shadow root instead of the document
+            const rect = element.getBoundingClientRect();
+            const point = { x: rect.left + rect.width / 2, y: rect.top + rect.height / 2 };
+            try {
+                const topEl = shadowRoot.elementFromPoint(point.x, point.y);
+                if (!topEl) return false;
+                let current = topEl;
+                while (current && current !== shadowRoot) {
+                    if (current === element) return true;
+                    current = current.parentElement;
+                }
+                return false;
+            } catch (e) {
+                return true;
+            }
+        }
+        const rect = element.getBoundingClientRect();
+        const point = { x: rect.left + rect.width / 2, y: rect.top + rect.height / 2 };
+        try {
+            const topEl = document.elementFromPoint(point.x, point.y);
+            if (!topEl) return false;
+            let current = topEl;
+            while (current && current !== document.documentElement) {
+                if (current === element) return true;
+                current = current.parentElement;
+            }
+            return false;
+        } catch (e) {
+            return true;
+        }
+    }
+    // Whitespace-normalised visible text of `element`: text nodes of non-hidden descendants,
+    // with images rendered as `[img - alt="..." title="..." aria-label="..."]`.
+    // Traversal stops at the first descendant found in the `marked_elements_convergence`
+    // argument.
+    // NOTE(review): this parameter shadows the global of the same name and the only call in
+    // this function passes no second argument, so the list is empty there and the early stop
+    // never happens — confirm whether that is intended.
+    function getVisibleText(element, marked_elements_convergence = []) {
+        const blockLikeDisplays = [
+            // Basic block elements
+            'block', 'flow-root', 'inline-block',
+            // Lists
+            'list-item',
+            // Table elements
+            'table', 'inline-table', 'table-row', 'table-cell',
+            'table-caption', 'table-header-group', 'table-footer-group',
+            'table-row-group',
+            // Modern layouts
+            'flex', 'inline-flex', 'grid', 'inline-grid'
+        ];
+        // Check if element is hidden
+        const style = window.getComputedStyle(element);
+        if (style.display === 'none' || style.visibility === 'hidden') {
+            return '';
+        }
+        let collectedText = [];
+        function isMarkedInteractive(el) {
+            return marked_elements_convergence.includes(el);
+        }
+        // Returns false to abort the whole traversal, true to continue with the next sibling.
+        function traverse(node) {
+            if (
+                node.nodeType === Node.ELEMENT_NODE &&
+                node !== element &&
+                isMarkedInteractive(node)
+            ) {
+                return false;
+            }
+            if (node.nodeType === Node.TEXT_NODE) {
+                const trimmed = node.textContent.trim();
+                if (trimmed) {
+                    collectedText.push(trimmed);
+                }
+            } else if (node.nodeType === Node.ELEMENT_NODE) {
+                // Skip noscript elements
+                if (node.tagName === 'NOSCRIPT') {
+                    return true;
+                }
+                const nodeStyle = window.getComputedStyle(node);
+                // Skip hidden elements
+                if (nodeStyle.display === 'none' || nodeStyle.visibility === 'hidden') {
+                    return true;
+                }
+                // Add newline before block elements if we have text
+                if (blockLikeDisplays.includes(nodeStyle.display) && collectedText.length > 0) {
+                    collectedText.push('\n');
+                }
+                if (node.tagName === 'IMG') {
+                    const textParts = [];
+                    const alt = node.getAttribute('alt');
+                    const title = node.getAttribute('title');
+                    const ariaLabel = node.getAttribute('aria-label');
+                    // Add more as needed (e.g., 'aria-describedby', 'data-caption', etc.)
+                    if (alt) textParts.push(`alt="${alt}"`);
+                    if (title) textParts.push(`title="${title}"`);
+                    if (ariaLabel) textParts.push(`aria-label="${ariaLabel}"`);
+                    if (textParts.length > 0) {
+                        collectedText.push(`[img - ${textParts.join(' ')}]`);
+                    }
+                    return true;
+                }
+                for (const child of node.childNodes) {
+                    const shouldContinue = traverse(child);
+                    if (shouldContinue === false) {
+                        return false;
+                    }
+                }
+                // Add newline after block elements
+                if (blockLikeDisplays.includes(nodeStyle.display)) {
+                    collectedText.push('\n');
+                }
+            }
+            return true;
+        }
+        traverse(element);
+        // Join text and normalize whitespace (the '\n' separators collapse to single spaces too)
+        return collectedText.join(' ').trim().replace(/\s{2,}/g, ' ').trim();
+    }
+    // Walk the tree under `rootElement` and return `{ element, area, rects, is_scrollable }`
+    // for every element node that is interactive, visible and top-most.
+    function extractInteractiveItems(rootElement) {
+        const items = [];
+        // `context` is the document or shadow root used for hit-testing the element's rects.
+        function processElement(element, context) {
+            if (!element) return;
+            // Record the element itself if it qualifies, then recurse into its subtree
+            if (element.nodeType === Node.ELEMENT_NODE && isInteractive(element) && isElementVisible(element) && isTopElement(element)) {
+                const rects = getElementRects(element, context);
+                const area = calculateArea(rects);
+                items.push({
+                    element: element,
+                    area,
+                    rects,
+                    is_scrollable: isScrollable(element),
+                });
+            }
+            if (element.shadowRoot) {
+                // if it's shadow DOM, process elements in the shadow DOM
+                Array.from(element.shadowRoot.childNodes || []).forEach(child => {
+                    processElement(child, element.shadowRoot);
+                });
+            }
+            if (element.tagName === 'SLOT') {
+                // Handle both assigned elements and nodes
+                const assigned = element.assignedNodes ? element.assignedNodes() : element.assignedElements();
+                assigned.forEach(child => {
+                    processElement(child, context);
+                });
+            }
+            else if (element.tagName === 'IFRAME') {
+                try {
+                    // Reading the iframe's document may throw; that case is logged and skipped
+                    const iframeDoc = element.contentDocument || element.contentWindow?.document;
+                    if (iframeDoc && iframeDoc.body) {
+                        // Process elements inside iframe
+                        processElement(iframeDoc.body, iframeDoc);
+                    }
+                } catch (e) {
+                    console.warn('Unable to access iframe contents:', e);
+                }
+            } else {
+                // if it's regular child elements, process regular child elements
+                Array.from(element.children || []).forEach(child => {
+                    processElement(child, context);
+                });
+            }
+        }
+        processElement(rootElement, document);
+        return items;
+    }
+    // Start from an empty list of marked elements on every run
+    if (marked_elements_convergence) {
+        marked_elements_convergence = [];
+    }
+    let mark_centres = [];
+    let marked_element_descriptions = [];
+    var items = extractInteractiveItems(rootElement);
+    // Emit one mark (element, centre + box, description) per rect of every item found.
+    // NOTE(review): `index` is incremented below but never read.
+    let index = 0;
+    items.forEach(function (item) {
+        item.rects.forEach((bbox) => {
+            marked_elements_convergence.push(item.element);
+            mark_centres.push({
+                x: Math.round((bbox.left + bbox.right) / 2),
+                y: Math.round((bbox.top + bbox.bottom) / 2),
+                left: bbox.left,
+                top: bbox.top,
+                right: bbox.right,
+                bottom: bbox.bottom,
+            });
+            marked_element_descriptions.push({
+                tag: item.element.tagName,
+                text: getVisibleText(item.element),
+                // NOTE: all other attributes will be shown to the model when present
+                // TODO: incorporate child attributes, e.g. <img alt="..."> when img is a child of the link element
+                value: item.element.value,
+                placeholder: item.element.getAttribute("placeholder"),
+                element_type: item.element.getAttribute("type"),
+                aria_label: item.element.getAttribute("aria-label"),
+                name: item.element.getAttribute("name"),
+                required: item.element.getAttribute("required"),
+                disabled: item.element.getAttribute("disabled"),
+                pattern: item.element.getAttribute("pattern"),
+                checked: item.element.getAttribute("checked"),
+                minlength: item.element.getAttribute("minlength"),
+                maxlength: item.element.getAttribute("maxlength"),
+                role: item.element.getAttribute("role"),
+                title: item.element.getAttribute("title"),
+                scrollable: item.is_scrollable
+            });
+            index++;
+        });
+    });
+    return {
+        element_descriptions: marked_element_descriptions,
+        element_centroids: mark_centres
+    };
+}

src/proxy_lite/cli.py ADDED Viewed

	@@ -0,0 +1,93 @@

+import argparse
+import asyncio
+import os
+from pathlib import Path
+from typing import Optional
+from proxy_lite import Runner, RunnerConfig
+from proxy_lite.logger import logger
+def update_config_from_env(config: RunnerConfig) -> RunnerConfig:
+    if os.getenv("PROXY_LITE_API_BASE"):
+        config.solver.client.api_base = os.getenv("PROXY_LITE_API_BASE")
+    if os.getenv("PROXY_LITE_MODEL"):
+        config.solver.client.model_id = os.getenv("PROXY_LITE_MODEL")
+    return config
+def do_command(args):
+    do_text = " ".join(args.task)
+    logger.info("🤖 Let me help you with that...")
+    # Take default config from YAML
+    config = RunnerConfig.from_yaml(args.config)
+    # Update config from environment variables
+    config = update_config_from_env(config)
+    # Update config from command-line arguments
+    if args.api_base:
+        config.solver.client.api_base = args.api_base
+    if args.model:
+        config.solver.client.model_id = args.model
+    if args.homepage:
+        config.homepage = args.homepage
+    if args.viewport_width:
+        config.viewport_width = args.viewport_width
+    if args.viewport_height:
+        config.viewport_height = args.viewport_height
+    o = Runner(config=config)
+    asyncio.run(o.run(do_text))
+def main():
+    parser = argparse.ArgumentParser(description="Proxy-Lite")
+    parser.add_argument(
+        "task",
+        type=str,
+        help="The task you want to accomplish",
+        nargs="*",
+    )
+    parser.add_argument(
+        "--model",
+        type=Optional[str],
+        default=None,
+        help="The model to use.",
+    )
+    parser.add_argument(
+        "--api_base",
+        type=Optional[str],
+        default=None,
+        help="The API base URL to use.",
+    )
+    # New option for setting a homepage URL:
+    parser.add_argument(
+        "--homepage",
+        type=Optional[str],
+        default=None,
+        help="The homepage URL to use.",
+    )
+    # New viewport controls:
+    parser.add_argument(
+        "--viewport-width",
+        type=Optional[int],
+        default=None,
+        help="Viewport width in pixels.",
+    )
+    parser.add_argument(
+        "--viewport-height",
+        type=Optional[int],
+        default=None,
+        help="Viewport height in pixels.",
+    )
+    parser.add_argument(
+        "--config",
+        type=Path,
+        default=Path(__file__).parent / "configs/default.yaml",
+        help="Path to config file (default: configs/default.yaml)",
+    )
+    args = parser.parse_args()
+    do_command(args)
+# Run the CLI only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    main()

src/proxy_lite/client.py ADDED Viewed

	@@ -0,0 +1,170 @@

+import os
+from abc import ABC, abstractmethod
+from functools import cached_property
+from typing import ClassVar, Literal, Optional, Union
+import httpx
+from httpx import Limits, Timeout
+from openai import AsyncOpenAI
+from openai.types.chat.chat_completion import (
+    ChatCompletion,
+)
+from pydantic import BaseModel
+from proxy_lite.history import MessageHistory
+from proxy_lite.logger import logger
+from proxy_lite.serializer import (
+    BaseSerializer,
+    OpenAISerializer,
+)
+from proxy_lite.tools import Tool
+class BaseClientConfig(BaseModel):
+    """HTTP settings shared by every client configuration."""
+    # Passed to httpx ``Timeout`` by ``BaseClient.http_client`` (presumably seconds, per httpx convention).
+    http_timeout: float = 50
+    # Used for both ``max_connections`` and ``max_keepalive_connections`` of the httpx client.
+    http_concurrent_connections: int = 50
+class BaseClient(BaseModel, ABC):
+    config: BaseClientConfig
+    serializer: ClassVar[BaseSerializer]
+    @abstractmethod
+    async def create_completion(
+        self,
+        messages: MessageHistory,
+        temperature: float = 0.7,
+        seed: Optional[int] = None,
+        tools: Optional[list[Tool]] = None,
+        response_format: Optional[type[BaseModel]] = None,
+    ) -> ChatCompletion: ...
+    """
+    Create completion from model.
+    Expect subclasses to adapt from various endpoints that will handle
+    requests differently, make sure to raise appropriate warnings.
+    Returns:
+        ChatCompletion: OpenAI ChatCompletion format for consistency
+    """
+    @classmethod
+    def create(cls, config: BaseClientConfig) -> "BaseClient":
+        supported_clients = {
+            "openai-azure": OpenAIClient,
+            "convergence": ConvergenceClient,
+        }
+        if config.name not in supported_clients:
+            error_message = f"Unsupported model: {config.name}."
+            raise ValueError(error_message)
+        return supported_clients[config.name](config=config)
+    @property
+    def http_client(self) -> httpx.AsyncClient:
+        return httpx.AsyncClient(
+            timeout=Timeout(self.config.http_timeout),
+            limits=Limits(
+                max_connections=self.config.http_concurrent_connections,
+                max_keepalive_connections=self.config.http_concurrent_connections,
+            ),
+        )
+class OpenAIClientConfig(BaseClientConfig):
+    name: Literal["openai"] = "openai"
+    model_id: str = "gpt-4o"
+    api_key: str = os.environ["OPENAI_API_KEY"]
+class OpenAIClient(BaseClient):
+    config: OpenAIClientConfig
+    serializer: ClassVar[OpenAISerializer] = OpenAISerializer()
+    @cached_property
+    def external_client(self) -> AsyncOpenAI:
+        return AsyncOpenAI(
+            api_key=self.config.api_key,
+            http_client=self.http_client,
+        )
+    async def create_completion(
+        self,
+        messages: MessageHistory,
+        temperature: float = 0.7,
+        seed: Optional[int] = None,
+        tools: Optional[list[Tool]] = None,
+        response_format: Optional[type[BaseModel]] = None,
+    ) -> ChatCompletion:
+        base_params = {
+            "model": self.config.model_id,
+            "messages": self.serializer.serialize_messages(messages),
+            "temperature": temperature,
+        }
+        optional_params = {
+            "seed": seed,
+            "tools": self.serializer.serialize_tools(tools) if tools else None,
+            "tool_choice": "required" if tools else None,
+            "response_format": {"type": "json_object"} if response_format else {"type": "text"},
+        }
+        base_params.update({k: v for k, v in optional_params.items() if v is not None})
+        return await self.external_client.chat.completions.create(**base_params)
+class ConvergenceClientConfig(BaseClientConfig):
+    """Settings for a Convergence (OpenAI-compatible) endpoint."""
+    name: Literal["convergence"] = "convergence"
+    model_id: str = "convergence-ai/proxy-lite-7b"
+    # Base URL handed to the OpenAI SDK client as ``base_url``.
+    api_base: str = "http://localhost:8000/v1"
+    # Placeholder key; NOTE(review): presumably the endpoint does not check it - confirm.
+    api_key: str = "none"
+class ConvergenceClient(OpenAIClient):
+    config: ConvergenceClientConfig
+    serializer: ClassVar[OpenAISerializer] = OpenAISerializer()
+    _model_validated: bool = False
+    async def _validate_model(self) -> None:
+        try:
+            await self.external_client.beta.chat.completions.parse(
+                model=self.config.model_id,
+                messages=[{"role": "user", "content": "Hello"}],
+            )
+            self._model_validated = True
+            logger.debug(f"Model {self.config.model_id} validated and connected to cluster")
+        except Exception as e:
+            logger.error(f"Error retrieving model: {e}")
+            raise e
+    @cached_property
+    def external_client(self) -> AsyncOpenAI:
+        return AsyncOpenAI(
+            base_url=self.config.api_base,
+            http_client=self.http_client,
+        )
+    async def create_completion(
+        self,
+        messages: MessageHistory,
+        temperature: float = 0.7,
+        seed: Optional[int] = None,
+        tools: Optional[list[Tool]] = None,
+        response_format: Optional[type[BaseModel]] = None,
+    ) -> ChatCompletion:
+        if not self._model_validated:
+            await self._validate_model()
+        base_params = {
+            "model": self.config.model_id,
+            "messages": self.serializer.serialize_messages(messages),
+            "temperature": temperature,
+        }
+        optional_params = {
+            "seed": seed,
+            "tools": self.serializer.serialize_tools(tools) if tools else None,
+            "tool_choice": "auto" if tools else None,  # vLLM does not support "required"
+            "response_format": response_format if response_format else {"type": "text"},
+        }
+        base_params.update({k: v for k, v in optional_params.items() if v is not None})
+        return await self.external_client.chat.completions.create(**base_params)
+# Union aliases over the concrete client config / client classes defined in this module.
+ClientConfigTypes = Union[OpenAIClientConfig, ConvergenceClientConfig]
+ClientTypes = Union[OpenAIClient, ConvergenceClient]

src/proxy_lite/configs/default.yaml ADDED Viewed

	@@ -0,0 +1,20 @@

+# Default runner configuration (loaded through RunnerConfig.from_yaml).
+environment:
+  name: webbrowser
+  annotate_image: true
+  # Passed to the browser screenshot call as ``delay``; seconds.
+  screenshot_delay: 2.0
+  viewport_width: 1280
+  viewport_height: 1920
+  include_poi_text: true
+  headless: false
+  # Page opened when the environment is initialised.
+  homepage: https://www.google.co.uk
+solver:
+  name: simple
+  agent:
+    name: proxy_lite
+    client:
+      name: convergence
+      model_id: convergence-ai/subset-distill-tools-7b-15-02-2025
+      # NOTE(review): looks like an internal cluster hostname; the README says the
+      # default points to a HuggingFace Spaces endpoint - confirm before release.
+      api_base: http://slurm1-a3nodeset-4-1:8002/v1
+# NOTE(review): local_view and verbose are not among the RunnerConfig fields
+# visible in runner.py - confirm they are consumed somewhere.
+local_view: true
+task_timeout: 1800
+verbose: true

src/proxy_lite/environments/__init__.py ADDED Viewed

	@@ -0,0 +1,32 @@

+from typing import Union
+from .environment_base import (
+    Action,
+    BaseEnvironment,
+    BaseEnvironmentConfig,
+    Environments,
+    Event,
+    EventType,
+    Observation,
+)
+from .webbrowser import (
+    WebBrowserEnvironment,
+    WebBrowserEnvironmentConfig,
+)
+EnvironmentConfigTypes = Union[*list(Environments._environment_config_registry.values())]
+EnvironmentTypes = Union[*list(Environments._environment_registry.values())]
+__all__ = [
+    "Action",
+    "BaseEnvironment",
+    "BaseEnvironmentConfig",
+    "EnvironmentConfigTypes",
+    "Environments",
+    "Event",
+    "EventType",
+    "Observation",
+    "WebBrowserEnvironment",
+    "WebBrowserEnvironmentConfig",
+]

src/proxy_lite/environments/environment_base.py ADDED Viewed

	@@ -0,0 +1,161 @@

+import json
+import logging
+from abc import ABC, abstractmethod
+from enum import Enum
+from functools import cached_property
+from typing import Any, Literal, Optional, Self
+from pydantic import BaseModel
+from proxy_lite.history import ToolCall
+from proxy_lite.tools import Tool, ToolExecutionResponse
+class EventType(str, Enum):
+    """Kinds of events; the ``str`` mixin makes each member compare equal to its string value."""
+    OBSERVATION = "observation"
+    ACTION = "action"
+    MESSAGE = "message"
+class Event(BaseModel):
+    """Base model for events; subclasses pin ``type`` to a single ``EventType``."""
+    type: EventType
+class State(BaseModel):
+    text: Optional[str] = None
+    image: Optional[str] = None  # base64 encoded image
+    html: Optional[str] = None
+    tool_responses: Optional[list[ToolExecutionResponse]] = None
+class Observation(Event):
+    type: Literal[EventType.OBSERVATION] = EventType.OBSERVATION
+    state: State
+    terminated: bool
+    reward: Optional[float] = None
+    info: Optional[dict[str, Any]] = None
+class Action(Event):
+    type: Literal[EventType.ACTION] = EventType.ACTION
+    text: Optional[str] = None
+    tool_calls: Optional[list[ToolCall]] = None
+    info: Optional[dict[str, Any]] = None
+# Empty base model that concrete environment configs subclass.
+class BaseEnvironmentConfig(BaseModel): ...
+class BaseEnvironment(BaseModel, ABC):
+    config: BaseEnvironmentConfig
+    logger: logging.Logger | None = None
+    class Config:
+        arbitrary_types_allowed = True
+    async def __aenter__(self) -> Self:
+        return self
+    async def __aexit__(self, exc_type, exc_value, traceback):
+        pass
+    @property
+    @abstractmethod
+    def info_for_user(self) -> str: ...
+    @cached_property
+    @abstractmethod
+    def tools(self) -> list[Tool]: ...
+    @abstractmethod
+    async def initialise(self) -> Observation: ...
+    @abstractmethod
+    async def execute_action(self, action: Action) -> Observation: ...
+    @abstractmethod
+    async def observe(self) -> Observation: ...
+    @abstractmethod
+    async def evaluate(self, **kwargs: dict[str, Any]) -> dict[str, Any]: ...
+    async def execute_tool(self, tool_call: ToolCall) -> None:
+        function = tool_call.function
+        for tool in self.tools:
+            if hasattr(tool, function["name"]):
+                arguments = json.loads(function["arguments"])
+                if type(arguments) == str:
+                    arguments = json.loads(arguments)
+                return await getattr(tool, function["name"])(
+                    **arguments,
+                )
+        msg = f'No tool function with name "{function["name"]}"'
+        raise ValueError(msg)
+    async def get_info(self) -> dict[str, Any]:
+        return {}
+class Environments:
+    _environment_registry: dict[str, type[BaseEnvironment]] = {}
+    _environment_config_registry: dict[str, type[BaseEnvironmentConfig]] = {}
+    @classmethod
+    def register_environment(cls, name: str):
+        """
+        Decorator to register an Environment class under a given name.
+        Example:
+            @Environments.register_environment("my_environment")
+            class MyEnvironment(BaseEnvironment):
+                ...
+        """
+        def decorator(env_cls: type[BaseEnvironment]) -> type[BaseEnvironment]:
+            cls._environment_registry[name] = env_cls
+            return env_cls
+        return decorator
+    @classmethod
+    def register_environment_config(cls, name: str):
+        """
+        Decorator to register an Environment configuration class under a given name.
+        Example:
+            @Environments.register_environment_config("my_environment")
+            class MyEnvironmentConfig(BaseEnvironmentConfig):
+                ...
+        """
+        def decorator(config_cls: type[BaseEnvironmentConfig]) -> type[BaseEnvironmentConfig]:
+            cls._environment_config_registry[name] = config_cls
+            return config_cls
+        return decorator
+    @classmethod
+    def get(cls, name: str) -> type[BaseEnvironment]:
+        """
+        Retrieve a registered Environment class by its name.
+        Raises:
+            ValueError: If no such environment is found.
+        """
+        try:
+            return cls._environment_registry[name]
+        except KeyError:
+            raise ValueError(f"Environment '{name}' not found.")
+    @classmethod
+    def get_config(cls, name: str) -> type[BaseEnvironmentConfig]:
+        """
+        Retrieve a registered Environment configuration class by its name.
+        Raises:
+            ValueError: If no such configuration is found.
+        """
+        try:
+            return cls._environment_config_registry[name]
+        except KeyError:
+            raise ValueError(f"Environment config for '{name}' not found.")

src/proxy_lite/environments/webbrowser.py ADDED Viewed

	@@ -0,0 +1,182 @@

+import base64
+from functools import cached_property
+from typing import Any, Literal, Optional, Self
+from proxy_lite.browser.browser import BrowserSession
+from proxy_lite.environments.environment_base import (
+    Action,
+    BaseEnvironment,
+    BaseEnvironmentConfig,
+    Environments,
+    Observation,
+    State,
+)
+from proxy_lite.tools import BrowserTool, Tool, ToolExecutionResponse
+@Environments.register_environment_config("webbrowser")
+class WebBrowserEnvironmentConfig(BaseEnvironmentConfig):
+    """Settings for the ``webbrowser`` environment."""
+    name: Literal["webbrowser"] = "webbrowser"
+    # Page the browser navigates to when the environment is initialised.
+    homepage: str = "https://google.com"
+    # NOTE(review): not read in the environment code visible here - confirm where it is used.
+    annotate_image: bool = True
+    screenshot_delay: float = 1.0  # seconds
+    # Attach the page HTML to each observation's state.
+    include_html: bool = True
+    # Append the browser's POI text to the observation text.
+    include_poi_text: bool = True
+    # Store the browser's POIs under ``info["pois"]``.
+    record_pois: bool = True
+    # Viewport size and headless flag are forwarded to the browser session.
+    viewport_width: int = 1280
+    viewport_height: int = 720
+    # NOTE(review): not read in the environment code visible here - confirm where it is used.
+    browserbase_timeout: int = 7200
+    headless: bool = True
+    # Also store the un-annotated screenshot (base64) under ``info["original_image"]``.
+    keep_original_image: bool = False
+@Environments.register_environment("webbrowser")
+class WebBrowserEnvironment(BaseEnvironment):
+    config: WebBrowserEnvironmentConfig
+    browser: Optional[BrowserSession] = None
+    cancelled_last_action: bool = False
+    class Config:
+        arbitrary_types_allowed = True
+    async def __aenter__(self) -> Self:
+        # Initialize the BrowserSession
+        self.browser = self.browser_session(
+            viewport_width=self.config.viewport_width,
+            viewport_height=self.config.viewport_height,
+            headless=self.config.headless,
+        )
+        await self.browser.__aenter__()
+        # Initialize other resources if necessary
+        if self.cookies:
+            await self.browser.context.add_cookies(self.cookies)
+        self.logger.info("🌐 [bold blue]Browser session started.[/]")
+        return self
+    async def __aexit__(self, exc_type, exc_value, traceback):
+        # Clean up the BrowserSession
+        await self.browser.__aexit__(exc_type, exc_value, traceback)
+    @property
+    def info_for_user(self) -> str:
+        return "This is a web browser environment. You can navigate the web, search the web, and perform actions on the web."  # noqa: E501
+    @cached_property
+    def tools(self) -> list[Tool]:
+        return [BrowserTool(session=self.browser)]
+    @cached_property
+    def browser_session(self) -> type[BrowserSession]:
+        return BrowserSession
+    @property
+    def cookies(self) -> list[dict]:
+        return []
+    async def initialise(self) -> Observation:
+        await self.browser.goto(self.config.homepage)
+        original_img, annotated_img = await self.browser.screenshot(
+            delay=self.config.screenshot_delay,
+        )
+        base64_image = base64.b64encode(annotated_img).decode("utf-8")
+        html_content = await self.browser.current_page.content() if self.config.include_html else None
+        info = {"url": self.browser.current_url}
+        if self.config.record_pois:
+            info["pois"] = self.browser.pois
+        if self.config.keep_original_image:
+            info["original_image"] = base64.b64encode(original_img).decode("utf-8")
+        return Observation(
+            state=State(
+                text=f"URL: {self.browser.current_url}"
+                + (f"\n{self.browser.poi_text}" if self.config.include_poi_text else ""),
+                image=base64_image,
+                html=html_content,
+            ),
+            terminated=False,
+            reward=None,
+            info=info,
+        )
+    async def should_perform_action(self) -> bool:
+        # if cancelled last action, run the action without updating POIs
+        if self.cancelled_last_action:
+            self.cancelled_last_action = False
+            return True
+        # check for page changes
+        old_points = [tuple(point) for point in self.browser.poi_centroids]
+        await self.browser.update_poi()
+        new_points = [tuple(point) for point in self.browser.poi_centroids]
+        page_changed_mid_action = old_points != new_points
+        # record if the last action was cancelled
+        if page_changed_mid_action:
+            self.cancelled_last_action = True
+            return False
+        return True
+    async def execute_action(self, action: Action) -> Observation:
+        responses = []
+        cancelled_tools_flag = False
+        if await self.should_perform_action():
+            for tool_call in action.tool_calls:
+                # Perform the chosen action
+                try:
+                    tool_response: ToolExecutionResponse = await self.execute_tool(
+                        tool_call,
+                    )
+                    tool_response.id = tool_call.id
+                    responses.append(tool_response)
+                except Exception as e:  # noqa: PERF203
+                    self.logger.warning("🌐 An error occurred taking action: %s", str(e), exc_info=False)
+                    tool_response = ToolExecutionResponse(content=str(e), id=tool_call.id)
+                    responses.append(tool_response)
+        else:
+            self.logger.warning("🌐 Page changed since last observation, cancelling action.")
+            self.cancelled_last_action = True
+            for tool_call in action.tool_calls:
+                tool_response = ToolExecutionResponse(
+                    content="The page changed before the action could be executed, instead of being ran it was cancelled.",  # noqa: E501
+                    id=tool_call.id,
+                )
+                responses.append(tool_response)
+                cancelled_tools_flag = True
+        original_img, annotated_img = await self.browser.screenshot(
+            delay=self.config.screenshot_delay,
+        )
+        base64_image = base64.b64encode(annotated_img).decode("utf-8")
+        info = {"url": self.browser.current_url, "cancelled_tools": cancelled_tools_flag}
+        if self.config.record_pois:
+            info["pois"] = self.browser.pois
+        if self.config.keep_original_image:
+            info["original_image"] = base64.b64encode(original_img).decode("utf-8")
+        html_content = await self.browser.current_page.content() if self.config.include_html else None
+        return Observation(
+            state=State(
+                text=f"URL: {self.browser.current_url}"
+                + (f"\n{self.browser.poi_text}" if self.config.include_poi_text else ""),
+                image=base64_image,
+                html=html_content,
+                tool_responses=responses,
+            ),
+            terminated=False,
+            reward=None,
+            info=info,
+        )
+    async def observe(self) -> Observation:
+        return await self.browser.observe()
+    async def evaluate(self, **kwargs: dict[str, Any]) -> dict[str, Any]:
+        return {}
+    async def get_info(self) -> dict[str, Any]:
+        info = {}
+        return info

src/proxy_lite/history.py ADDED Viewed

	@@ -0,0 +1,183 @@

+from __future__ import annotations
+import base64
+from collections.abc import Iterator
+from enum import Enum
+from typing import Any, Literal, Optional, Set, Union
+from pydantic import BaseModel, Field, TypeAdapter, field_validator
+class MessageLabel(str, Enum):
+    """Labels attached to messages; ``MessageHistory.history_view`` filters by them."""
+    SYSTEM = "system"
+    USER_INPUT = "user_input"
+    SCREENSHOT = "screenshot"
+    AGENT_MODEL_RESPONSE = "agent_model_response"
+# Default per-label cap used by ``MessageHistory.history_view``: only the most
+# recent message with each listed label is kept; unlisted labels are not limited.
+MAX_MESSAGES_FOR_CONTEXT_WINDOW = {
+    MessageLabel.SCREENSHOT: 1,
+}
+class MessageContent(BaseModel):
+    """Base class for the content parts of a message."""
+    pass
+class Text(MessageContent):
+    """Text content part; ``type`` is fixed to "text"."""
+    type: Literal["text"] = Field(default="text", init=False)
+    text: str
+class ImageUrl(BaseModel):
+    """Wrapper around an image URL (``Message.from_media`` stores a base64 data URL here)."""
+    url: str
+class Image(MessageContent):
+    """Image content part; ``type`` is fixed to "image_url"."""
+    type: Literal["image_url"] = Field(default="image_url", init=False)
+    image_url: ImageUrl
+class Message(BaseModel):
+    label: Optional[MessageLabel] = None
+    content: list[Union[Text, Image]] = Field(default_factory=list)
+    class Config:
+        use_enum_values = True
+    @property
+    def images(self) -> list[Image]:
+        return [content for content in self.content if isinstance(content, Image)]
+    @property
+    def texts(self) -> list[Text]:
+        return [content for content in self.content if isinstance(content, Text)]
+    @property
+    def first_image(self) -> Optional[Image]:
+        return self.images[0] if self.images else None
+    @property
+    def first_text(self) -> Optional[Text]:
+        return self.texts[0] if self.texts else None
+    def __len__(self):
+        return len(self.content)
+    @classmethod
+    def from_media(
+        cls,
+        text: Optional[str] = None,
+        image: Optional[bytes | str] = None,
+        is_base64: bool = False,
+    ) -> Message:
+        if text is not None:
+            text = Text(text=text)
+        if image is not None:
+            base64_image = image if is_base64 else base64.b64encode(image).decode("utf-8")
+            data_url = f"data:image/jpeg;base64,{base64_image}"
+            image = Image(image_url=ImageUrl(url=data_url))
+            content = [text, image] if text is not None else [image]
+        else:
+            content = [text]
+        return cls(content=content)
+class SystemMessage(Message):
+    """Message whose ``role`` is fixed to "system"."""
+    role: Literal["system"] = Field(default="system", init=False)
+class UserMessage(Message):
+    """Message whose ``role`` is fixed to "user"."""
+    role: Literal["user"] = Field(default="user", init=False)
+class ToolCall(BaseModel):
+    """A tool invocation requested by the model."""
+    id: str
+    type: str
+    # Consumers in this project read the "name" and "arguments" keys.
+    function: dict[str, Any]
+class AssistantMessage(Message):
+    role: Literal["assistant"] = Field(default="assistant", init=False)
+    tool_calls: list[ToolCall] = Field(default_factory=list)
+    def model_dump(self, **kwargs):
+        data = super().model_dump(**kwargs)
+        if not self.tool_calls:
+            data.pop("tool_calls")
+        return data
+    @field_validator("tool_calls", mode="before")
+    @classmethod
+    def ensure_list(cls, v):
+        return [] if v is None else v
+class ToolMessage(Message):
+    """Message whose ``role`` is fixed to "tool"; carries a tool call id."""
+    role: Literal["tool"] = Field(default="tool", init=False)
+    # presumably the id of the ToolCall this message answers - confirm against callers
+    tool_call_id: str
+# Union of the concrete message classes, plus a pydantic TypeAdapter built over it.
+MessageTypes = Union[SystemMessage, UserMessage, AssistantMessage, ToolMessage]
+MessageAdapter = TypeAdapter(MessageTypes)
+class MessageHistory(BaseModel):
+    messages: list[MessageTypes] = Field(default_factory=list)
+    def append(self, message: MessageTypes, label: Optional[str] = None):
+        if label is not None:
+            message.label = label
+        self.messages.append(message)
+    def pop(self) -> MessageTypes:
+        return self.messages.pop()
+    def extend(self, history: MessageHistory):
+        self.messages.extend(history.messages)
+    def __reversed__(self):
+        return MessageHistory(messages=self.messages[::-1])
+    def __getitem__(self, index):
+        return self.messages[index]
+    def __len__(self):
+        return len(self.messages)
+    def __iter__(self) -> Iterator[MessageTypes]:
+        return iter(self.messages)
+    def to_dict(self, exclude: Set[str] | None = None) -> list[dict]:
+        exclude = exclude or set()
+        return [message.model_dump(exclude=exclude) for message in self.messages]
+    def history_view(
+        self,
+        limits: dict = MAX_MESSAGES_FOR_CONTEXT_WINDOW,
+    ) -> MessageHistory:
+        """Context window management.
+        Filters messages in reverse order, retaining a limited number of recent screenshots and prompts.
+        """
+        label_counts = {label: 0 for label in limits}
+        filtered_messages = []
+        for message in reversed(self.messages):
+            if message.label in limits:
+                maximum_count = limits[message.label]
+                if label_counts[message.label] < maximum_count:
+                    filtered_messages.append(message)
+                    label_counts[message.label] += 1
+            else:
+                filtered_messages.append(message)
+        return MessageHistory(messages=reversed(filtered_messages))
+    def __add__(self, other: MessageHistory) -> MessageHistory:
+        new_history = MessageHistory()
+        new_history.extend(self)
+        new_history.extend(other)
+        return new_history
+    def __iadd__(self, other: MessageHistory) -> MessageHistory:
+        self.extend(other)
+        return self

src/proxy_lite/logger.py ADDED Viewed

	@@ -0,0 +1,81 @@

+import logging
+import sys
+from typing import Literal
+from uuid import uuid4
+from rich.logging import RichHandler
+class StructuredLogger(logging.Logger):
+    def _log(
+        self,
+        level,
+        msg,
+        args,
+        exc_info=None,
+        extra=None,
+        stack_info=False,
+        stacklevel=1,
+    ):
+        if extra is None:
+            extra = {}
+        json_fields = {
+            "logger_name": self.name,
+            "message": msg % args if args else msg,
+        }
+        exc_type, exc_value, exc_traceback = sys.exc_info()
+        if exc_type is not None:
+            json_fields["exception_class"] = exc_type.__name__
+            json_fields["exception_message"] = str(exc_value)
+        json_fields.update(extra)
+        super()._log(
+            level,
+            msg,
+            args,
+            exc_info,
+            {"json_fields": json_fields},
+            stack_info,
+            stacklevel + 1,
+        )
+def create_logger(
+    name: str,
+    level: Literal["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"] = "INFO",
+    detailed_name: bool = False,
+) -> logging.Logger:
+    unique_name = f"{name}-{str(uuid4())[:8]}"
+    logger = logging.getLogger(unique_name)
+    logger.setLevel(level)
+    handler = RichHandler(
+        rich_tracebacks=True,
+        markup=True,
+        show_path=False,
+        show_time=False,
+        log_time_format="[%s]",
+    )
+    if detailed_name:
+        handler.setFormatter(logging.Formatter("%(name)s:\n%(message)s\n------"))
+    else:
+        handler.setFormatter(logging.Formatter("%(message)s\n------"))
+    logger.addHandler(handler)
+    logger.propagate = False
+    return logger
+# Set StructuredLogger as the default logger class
+# (applies to loggers created after this call, including those from create_logger).
+logging.setLoggerClass(StructuredLogger)
+# Module-level logger shared by the package, named after this module.
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+# Unlike loggers from create_logger, this one also passes records to ancestor loggers.
+logger.propagate = True
+handler = RichHandler(
+    rich_tracebacks=True,
+    markup=True,
+    show_path=False,
+    show_time=False,
+)
+logger.addHandler(handler)

src/proxy_lite/recorder.py ADDED Viewed

	@@ -0,0 +1,99 @@

+from __future__ import annotations
+import datetime
+import json
+import os
+import sys
+import uuid
+from pathlib import Path
+from typing import Any, Optional, Self
+from pydantic import BaseModel, Field
+from proxy_lite.environments import EnvironmentConfigTypes
+from proxy_lite.environments.environment_base import Action, Observation
+from proxy_lite.history import MessageHistory
+from proxy_lite.solvers import SolverConfigTypes
+class Run(BaseModel):
+    run_id: str  # uuid.UUID
+    task: str
+    created_at: str  # datetime.datetime
+    complete: bool = False
+    terminated_at: str | None = None  # datetime.datetime
+    evaluation: dict[str, Any] | None = None
+    history: list[Observation | Action] = Field(default_factory=list)
+    solver_history: MessageHistory | None = None
+    result: str | None = None
+    env_info: dict[str, Any] = Field(default_factory=dict)
+    environment: Optional[EnvironmentConfigTypes] = None
+    solver: Optional[SolverConfigTypes] = None
+    @classmethod
+    def initialise(cls, task: str) -> Self:
+        run_id = str(uuid.uuid4())
+        return cls(
+            run_id=run_id,
+            task=task,
+            created_at=str(datetime.datetime.now(datetime.UTC)),
+        )
+    @property
+    def observations(self) -> list[Observation]:
+        return [h for h in self.history if isinstance(h, Observation)]
+    @property
+    def actions(self) -> list[Action]:
+        return [h for h in self.history if isinstance(h, Action)]
+    @property
+    def last_action(self) -> Action | None:
+        return self.actions[-1] if self.actions else None
+    @property
+    def last_observation(self) -> Observation | None:
+        return self.observations[-1] if self.observations else None
+    def record(
+        self,
+        observation: Optional[Observation] = None,
+        action: Optional[Action] = None,
+        solver_history: Optional[MessageHistory] = None,
+    ) -> None:
+        # expect only one of observation and action to be provided in order to handle ordering
+        if observation and action:
+            raise ValueError("Only one of observation and action can be provided")
+        if observation:
+            self.history.append(observation)
+        if action:
+            self.history.append(action)
+        if solver_history:
+            self.solver_history = solver_history
+    def terminate(self) -> None:
+        self.terminated_at = str(datetime.datetime.now(datetime.UTC))
+class DataRecorder:
+    def __init__(self, local_folder: str | None = None):
+        self.local_folder = local_folder
+    def initialise_run(self, task: str) -> Run:
+        self.local_folder = Path(os.path.abspath(sys.path[0])) / "local_trajectories"
+        os.makedirs(self.local_folder, exist_ok=True)
+        return Run.initialise(task)
+    async def terminate(
+        self,
+        run: Run,
+        save: bool = True,
+    ) -> None:
+        run.terminate()
+        if save:
+            await self.save(run)
+    async def save(self, run: Run) -> None:
+        json_payload = run.model_dump()
+        with open(self.local_folder / f"{run.run_id}.json", "w") as f:
+            json.dump(json_payload, f)

src/proxy_lite/runner.py ADDED Viewed

	@@ -0,0 +1,245 @@

+import asyncio
+import logging
+from collections.abc import AsyncIterator
+from contextlib import asynccontextmanager
+from typing import Any, Literal, Self
+from omegaconf import OmegaConf
+from pydantic import BaseModel
+from proxy_lite.environments import (
+    Action,
+    BaseEnvironment,
+    EnvironmentConfigTypes,
+    Environments,
+    EventType,
+    Observation,
+)
+from proxy_lite.logger import create_logger
+from proxy_lite.recorder import DataRecorder, Run
+from proxy_lite.solvers import (
+    BaseSolver,
+    SolverConfigTypes,
+    Solvers,
+)
+@asynccontextmanager
+async def async_timeout(timeout: float, task_name: str = "timeout"):
+    """Async context manager that fails the wrapped block after ``timeout`` seconds.
+    A background task sleeps for ``timeout`` and then raises ``TimeoutError``
+    naming ``task_name``; it is cancelled as soon as the wrapped block exits.
+    NOTE(review): the ``except*`` handlers appear intended to surface individual
+    exceptions rather than the TaskGroup's exception group - confirm against the
+    asyncio.TaskGroup / ``except*`` documentation before changing them.
+    """
+    try:
+        async with asyncio.TaskGroup() as tg:
+            async def timeout_task():
+                await asyncio.sleep(timeout)
+                raise TimeoutError(
+                    f"Operation {task_name} timed out after {timeout} seconds",
+                )
+            # Create the timeout task
+            timeout_handle = tg.create_task(timeout_task())
+            try:
+                # Hand control to the body of the ``async with`` block.
+                yield
+            finally:
+                # The body is done (or failed); stop the timer either way.
+                timeout_handle.cancel()
+    except* asyncio.TimeoutError as eg:
+        # Raises on the first iteration, i.e. the first matched timeout error.
+        for e in eg.exceptions:
+            raise e
+    except* Exception as eg:
+        # Same treatment for any other exception from the group.
+        for e in eg.exceptions:
+            raise e
+class RunnerConfig(BaseModel):
+    environment: EnvironmentConfigTypes
+    solver: SolverConfigTypes
+    save_every_step: bool = True
+    max_steps: int = 100
+    action_timeout: float = 60.0
+    environment_timeout: float = 30.0
+    task_timeout: float = 1800.0
+    logger_level: Literal["DEBUG", "INFO", "WARNING", "ERROR"] = "INFO"
+    detailed_logger_name: bool = False
+    @classmethod
+    def from_dict(cls, config_dict: dict) -> Self:
+        conf = OmegaConf.create(config_dict)
+        config_dict = OmegaConf.to_container(conf, resolve=True)
+        return cls(**config_dict)
+    @classmethod
+    def from_yaml(cls, yaml_path: str) -> Self:
+        conf = OmegaConf.load(yaml_path)
+        config_dict = OmegaConf.to_container(conf, resolve=True)
+        return cls(**config_dict)
+# Drives one solver against one environment: observations and actions are passed
+# back and forth through an event queue until the solver reports completion or
+# the step budget is exhausted.
+class Runner(BaseModel):
+    config: RunnerConfig
+    # The four attributes below are assigned in model_post_init.
+    recorder: DataRecorder | None = None
+    environment: type[BaseEnvironment] | None = None
+    solver: type[BaseSolver] | None = None
+    logger: logging.Logger | None = None
+    # Latest Run seen by run(); backs the complete / run_id / run_result properties.
+    _run: Run | None = None
+    class Config:
+        arbitrary_types_allowed = True
+    def model_post_init(self, __context: Any) -> None:
+        """Resolve the environment and solver classes by name and create the recorder and logger."""
+        super().model_post_init(__context)
+        self.environment = Environments.get(self.config.environment.name)
+        self.solver = Solvers.get(self.config.solver.name)
+        self.recorder = DataRecorder()
+        self.logger = create_logger(
+            name=f"([bold purple]{self.config.solver.name}[/]-[bold blue]{self.config.environment.name}[/])",
+            level=self.config.logger_level,
+            detailed_name=self.config.detailed_logger_name,
+        )
+    async def run_generator(self, task: str) -> AsyncIterator[Run]:
+        """Execute ``task`` and yield the ``Run`` after every processed event.
+        The whole run is wrapped in ``config.task_timeout``; each solver decision
+        is wrapped in ``config.action_timeout`` and each environment action in
+        ``config.environment_timeout``. The run is yielded one final time after
+        the recorder has terminated it.
+        """
+        async with (
+            async_timeout(self.config.task_timeout, "Task"),
+        ):
+            if self.config.logger_level is not None:
+                self.logger.setLevel(self.config.logger_level)
+            run = self.recorder.initialise_run(task)
+            run.environment = self.config.environment
+            run.solver = self.config.solver
+            self.logger.debug(f"Run intialised: {run.run_id}")
+            # Observations and actions alternate through this queue.
+            event_queue = asyncio.Queue()
+            async with (
+                self.environment(
+                    config=self.config.environment,
+                    logger=self.logger,
+                ) as environment,
+                self.solver(config=self.config.solver, logger=self.logger) as solver,
+            ):
+                run.env_info = await environment.get_info()
+                await solver.initialise(
+                    task,
+                    environment.tools,
+                    environment.info_for_user,
+                )
+                self.logger.debug("Solver initialised.")
+                run.solver_history = solver.history
+                # Seed the loop with the environment's first observation.
+                observation: Observation = await environment.initialise()
+                await event_queue.put(observation)
+                self.logger.debug("Environment initialised.")
+                step_count = 0
+                while step_count < self.config.max_steps:
+                    event = await event_queue.get()
+                    self.logger.debug(f"🤖 [bold purple]Processing event:[/] {event.type}")
+                    match event.type:
+                        case EventType.OBSERVATION:
+                            # Record the observation, then ask the solver for the next action.
+                            observation: Observation = event
+                            run.record(
+                                observation=observation,
+                                solver_history=solver.history,
+                            )
+                            async with async_timeout(
+                                self.config.action_timeout,
+                                "Action decision",
+                            ):
+                                action: Action = await solver.act(observation)
+                            await event_queue.put(action)
+                        case EventType.ACTION:
+                            action: Action = event
+                            self.logger.debug(f"Tool calls: {action.tool_calls}")
+                            run.record(action=action, solver_history=solver.history)
+                            # `observation` is the one bound by the most recent
+                            # OBSERVATION event (or the initial one).
+                            run.complete = await solver.is_complete(observation)
+                            if self.config.save_every_step:
+                                await self.recorder.save(run)
+                            if run.complete:
+                                run.result = action.text
+                                self.logger.info(f"🤖 [bold purple]Task complete.[/] ✨ \n{run.result}")
+                                # Leaves the loop without the per-event yield below;
+                                # the final yield after terminate() still happens.
+                                break
+                            async with async_timeout(
+                                self.config.environment_timeout,
+                                "Environment response",
+                            ):
+                                observation: Observation = await environment.execute_action(action)
+                                # Only executed environment actions count towards max_steps.
+                                step_count += 1
+                            await event_queue.put(observation)
+                    yield run
+                if not run.complete:
+                    self.logger.warning("🤖 [bold purple]Ran out of steps!")
+                await self.recorder.terminate(run, save=True)
+        yield run
+    async def run(self, task: str) -> Run:
+        """Run ``task`` to the end and return the last ``Run`` yielded by ``run_generator``."""
+        async for run in self.run_generator(task):  # noqa: B007
+            # Keep the newest snapshot so the properties below can read it.
+            self._run = run
+        return run
+    def run_concurrent(self, tasks: list[str]) -> list[Run]:
+        """Run every task with ``asyncio.gather`` inside a fresh event loop.
+        Because ``return_exceptions=True`` is passed, an entry of the returned
+        list is the raised exception when that task failed.
+        NOTE(review): every task writes to the same ``self._run`` - confirm the
+        properties below are not relied on after a concurrent run.
+        """
+        async def gather_runs():
+            return await asyncio.gather(
+                *[self.run(task) for task in tasks],
+                return_exceptions=True,
+            )
+        return asyncio.run(gather_runs())
+    @property
+    def complete(self) -> bool:
+        """Completion flag of the latest run; raises RuntimeError before any run has started."""
+        if self._run is None:
+            raise RuntimeError("Run not initialised")
+        return self._run.complete
+    @property
+    def run_id(self) -> str:
+        """Identifier of the latest run; raises RuntimeError before any run has started."""
+        if self._run is None:
+            raise RuntimeError("Run not initialised")
+        return self._run.run_id
+    @property
+    def run_result(self) -> str:
+        """Result of the latest run; raises RuntimeError before any run has started."""
+        if self._run is None:
+            raise RuntimeError("Run not initialised")
+        return self._run.result
+# Manual smoke test: build a config in code, run one task and print the outcome.
+if __name__ == "__main__":
+    from proxy_lite.logger import logger
+    config = RunnerConfig.from_dict(
+        {
+            "environment": {
+                "name": "webbrowser",
+                "homepage": "https://www.google.com",
+                "viewport_width": 1920,
+                "viewport_height": 1080,
+                "screenshot_delay": 1,
+                "headless": False,
+            },
+            "solver": {
+                "name": "simple",
+                "agent": {
+                    "name": "proxy_lite",
+                    "client": {
+                        "name": "convergence",
+                        "model_id": "convergence-ai/all-distill-tools-7b-16-02-2025",
+                        # NOTE(review): this looks like an internal cluster host name -
+                        # confirm it is meant to ship in a public example.
+                        "api_base": "http://slurm1-a3nodeset-4-1:8009/v1",
+                        # Alternative local endpoint, kept for reference:
+                        #     # "model_id": "Qwen/Qwen2.5-VL-3B-Instruct",
+                        #     # "api_base": "http://0.0.0.0:8000/v1",
+                    },
+                },
+            },
+            # These limits are far larger than the RunnerConfig defaults.
+            "max_steps": 150,
+            "action_timeout": 1800,
+            "environment_timeout": 1800,
+            "task_timeout": 18000,
+            "logger_level": "DEBUG",
+        },
+    )
+    logger.info(f"🤖 [bold purple]Config:[/] {config}")
+    runner = Runner(config=config)
+    # `result` is not read afterwards; the outcome is printed through the runner properties.
+    result = asyncio.run(
+        runner.run(
+            "Tell me the tesla stock price"  # noqa: E501
+        )
+    )
+    print(runner.run_result)
+    print(runner.complete)

src/proxy_lite/serializer.py ADDED Viewed

	@@ -0,0 +1,39 @@

+import itertools
+from abc import ABC, abstractmethod
+from pydantic import BaseModel
+from proxy_lite.history import MessageAdapter, MessageHistory
+from proxy_lite.tools import Tool
+class BaseSerializer(BaseModel, ABC):
+    """Base class for serializers.
+    Serializers are responsible for converting between the internal MessageHistory/Tool
+    objects and the external API format. Deserialise is not always possible, so raise
+    appropriate warnings.
+    """
+    # Convert an internal MessageHistory into a list of API-format message dicts.
+    @abstractmethod
+    def serialize_messages(self, message_history: MessageHistory) -> list[dict]: ...
+    # Rebuild a MessageHistory from a list of API-format message dicts.
+    @abstractmethod
+    def deserialize_messages(self, data: list[dict]) -> MessageHistory: ...
+    # Convert Tool objects into a list of API-format tool description dicts.
+    @abstractmethod
+    def serialize_tools(self, tools: list[Tool]) -> list[dict]: ...
+class OpenAISerializer(BaseSerializer):
+    def serialize_messages(self, message_history: MessageHistory) -> list[dict]:
+        return message_history.to_dict(exclude={"label"})
+    def deserialize_messages(self, data: list[dict]) -> MessageHistory:
+        return MessageHistory(
+            messages=[MessageAdapter.validate_python(message) for message in data],
+        )
+    def serialize_tools(self, tools: list[Tool]) -> list[dict]:
+        tool_schemas = [[{"type": "function", "function": schema} for schema in tool.schema] for tool in tools]
+        return list(itertools.chain.from_iterable(tool_schemas))

src/proxy_lite/solvers/__init__.py ADDED Viewed

	@@ -0,0 +1,23 @@

+from __future__ import annotations
+from typing import Union
+from .simple_solver import SimpleSolver, SimpleSolverConfig
+from .solver_base import BaseSolver, BaseSolverConfig, Solvers
+from .structured_solver import StructuredSolver, StructuredSolverConfig
+# Both unions are built from the Solvers registries as they stand at import time,
+# so they only contain solvers whose modules were imported above (the registration
+# decorators run when those modules are imported).
+SolverConfigTypes = Union[*Solvers._solver_config_registry.values()]
+SolverTypes = Union[*Solvers._solver_registry.values()]
+# Public names re-exported by the solvers package.
+__all__ = [
+    "BaseSolver",
+    "BaseSolverConfig",
+    "SimpleSolver",
+    "SimpleSolverConfig",
+    "StructuredSolver",
+    "StructuredSolverConfig",
+    "SolverConfigTypes",
+    "SolverTypes",
+    "Solvers",
+]

src/proxy_lite/solvers/simple_solver.py ADDED Viewed

	@@ -0,0 +1,96 @@

+# ruff: noqa: E501
+import json
+import re
+from functools import cached_property
+from typing import Literal, Optional
+from proxy_lite.agents import AgentConfigTypes, Agents, BaseAgent
+from proxy_lite.environments.environment_base import Action, Observation
+from proxy_lite.history import (
+    MessageHistory,
+    MessageLabel,
+    SystemMessage,
+)
+from proxy_lite.solvers.solver_base import BaseSolver, BaseSolverConfig, Solvers
+from proxy_lite.tools import ReturnValueTool, Tool
+# Canned tool-turn text. NOTE(review): nothing else in this module references it -
+# confirm whether it is still needed.
+WEB_TOOL_TURN = """The action has been attempted in the computer."""
+# Configuration for SimpleSolver, registered under the name "simple".
+@Solvers.register_solver_config("simple")
+class SimpleSolverConfig(BaseSolverConfig):
+    # Discriminator value that selects this config type.
+    name: Literal["simple"] = "simple"
+    # Configuration of the agent the solver will create.
+    agent: AgentConfigTypes
+@Solvers.register_solver("simple")
+class SimpleSolver(BaseSolver):
+    task: Optional[str] = None
+    complete: bool = False
+    @cached_property
+    def tools(self) -> list[Tool]:
+        return [ReturnValueTool()] + self.env_tools
+    @cached_property
+    def agent(self) -> BaseAgent:
+        self.logger.debug(f"Tools: {self.tools}")
+        return Agents.get(self.config.agent.name)(
+            config=self.config.agent,
+            env_tools=self.tools,
+        )
+    @property
+    def history(self) -> MessageHistory:
+        return MessageHistory(
+            messages=[SystemMessage.from_media(text=self.agent.system_prompt)] + self.agent.history.messages,
+        )
+    async def initialise(self, task: str, env_tools: list[Tool], env_info: str) -> None:
+        self.env_tools = env_tools
+        self.task = task
+        self.agent.receive_user_message(
+            text=f"Task: {task}",
+            label=MessageLabel.USER_INPUT,
+        )
+        self.logger.debug(f"Initialised with task: {task}")
+    async def act(self, observation: Observation) -> Action:
+        self.agent.receive_user_message(
+            image=observation.state.image,
+            text=observation.state.text,
+            label=MessageLabel.SCREENSHOT,
+            is_base64=True,
+        )
+        message = await self.agent.generate_output(use_tool=True)
+        self.logger.debug(f"Assistant message generated: {message}")
+        # check tool calls for return_value
+        if any(tool_call.function["name"] == "return_value" for tool_call in message.tool_calls):
+            self.complete = True
+            arguments = json.loads(message.tool_calls[0].function["arguments"])
+            if isinstance(arguments, str):
+                arguments = json.loads(arguments)
+            return_value = arguments["value"]
+            return Action(tool_calls=[], text=return_value)
+        text_content = message.content[0].text
+        observation_match = re.search(r"<observation>(.*?)</observation>", text_content, re.DOTALL)
+        observation_content = observation_match.group(1).strip() if observation_match else ""
+        self.logger.info(f"🌐 [bold blue]Observation:[/] {observation_content}")
+        # Extract text between thinking tags if present
+        thinking_match = re.search(r"<thinking>(.*?)</thinking>", text_content, re.DOTALL)
+        thinking_content = thinking_match.group(1).strip() if thinking_match else text_content
+        self.logger.info(f"🤖 [bold purple]Action:[/] {thinking_content}")
+        return Action(tool_calls=message.tool_calls, text=text_content)
+    async def is_complete(self, observation: Observation) -> bool:
+        env_terminated = observation.terminated
+        return self.complete or env_terminated

src/proxy_lite/solvers/solver_base.py ADDED Viewed

	@@ -0,0 +1,123 @@

+import logging
+from abc import ABC, abstractmethod
+from functools import cached_property
+from typing import Optional, Self, Type, cast
+from pydantic import BaseModel, Field
+from proxy_lite.environments.environment_base import Action, Observation
+from proxy_lite.tools import Tool
+# Common base for solver configuration models; it declares no fields of its own.
+class BaseSolverConfig(BaseModel):
+    pass
+# Abstract base for solvers. A solver is used as an async context manager,
+# initialised with a task, and then asked for one action per observation.
+class BaseSolver(BaseModel, ABC):
+    task: Optional[str] = None
+    # Tools supplied by the environment.
+    env_tools: list[Tool] = Field(default_factory=list)
+    config: BaseSolverConfig
+    logger: logging.Logger | None = None
+    class Config:
+        arbitrary_types_allowed = True
+    async def __aenter__(self) -> Self:
+        """Enter the solver context; the base implementation just returns ``self``."""
+        return self
+    async def __aexit__(self, exc_type, exc_value, traceback) -> None:
+        """Leave the solver context; the base implementation does nothing."""
+        pass
+    # Tools the solver exposes to its agent; subclasses must provide this.
+    @cached_property
+    @abstractmethod
+    def tools(self) -> list[Tool]: ...
+    @abstractmethod
+    async def initialise(
+        self,
+        task: str,
+        env_tools: list[Tool],
+        env_info: str,
+    ) -> None:
+        """
+        Initialise the solver with the task, the environment's tools and the
+        environment's information text.
+        """
+        ...
+    @abstractmethod
+    async def act(self, observation: Observation) -> Action:
+        """
+        Return an action for interacting with the environment.
+        """
+        ...
+    async def is_complete(self, observation: Observation) -> bool:
+        """
+        Return a boolean indicating if the task is complete.
+        The base implementation reports the observation's ``terminated`` flag.
+        """
+        return observation.terminated
+class Solvers:
+    _solver_registry: dict[str, type[BaseSolver]] = {}
+    _solver_config_registry: dict[str, type[BaseSolverConfig]] = {}
+    @classmethod
+    def register_solver(cls, name: str):
+        """
+        Decorator to register a Solver class under a given name.
+        Example:
+            @Solvers.register_solver("my_solver")
+            class MySolver(BaseSolver):
+                ...
+        """
+        def decorator(solver_cls: type[BaseSolver]) -> type[BaseSolver]:
+            cls._solver_registry[name] = solver_cls
+            return solver_cls
+        return decorator
+    @classmethod
+    def register_solver_config(cls, name: str):
+        """
+        Decorator to register a Solver configuration class under a given name.
+        Example:
+            @Solvers.register_solver_config("my_solver")
+            class MySolverConfig(BaseSolverConfig):
+                ...
+        """
+        def decorator(config_cls: type[BaseSolverConfig]) -> type[BaseSolverConfig]:
+            cls._solver_config_registry[name] = config_cls
+            return config_cls
+        return decorator
+    @classmethod
+    def get(cls, name: str) -> type[BaseSolver]:
+        """
+        Retrieve a registered Solver class by its name.
+        Raises:
+            ValueError: If no such solver is found.
+        """
+        try:
+            return cast(Type[BaseSolver], cls._solver_registry[name])
+        except KeyError:
+            raise ValueError(f"Solver '{name}' not found.")
+    @classmethod
+    def get_config(cls, name: str) -> type[BaseSolverConfig]:
+        """
+        Retrieve a registered Solver configuration class by its name.
+        Raises:
+            ValueError: If no such config is found.
+        """
+        try:
+            return cast(Type[BaseSolverConfig], cls._solver_config_registry[name])
+        except KeyError:
+            raise ValueError(f"Solver config for '{name}' not found.")

src/proxy_lite/solvers/structured_solver.py ADDED Viewed

	@@ -0,0 +1,178 @@

+# ruff: noqa: E501
+from functools import cached_property
+from typing import Literal, Optional
+from pydantic import BaseModel, Field
+from proxy_lite.agents import AgentConfigTypes, Agents, BaseAgent
+from proxy_lite.environments.environment_base import Action, Observation
+from proxy_lite.history import (
+    MessageHistory,
+    MessageLabel,
+    SystemMessage,
+)
+from proxy_lite.tools import Tool
+from .solver_base import BaseSolver, BaseSolverConfig, Solvers
+# Prefix placed in front of every tool response forwarded to the agent.
+WEB_TOOL_TURN = """The browser action has been attempted. Please double check if the action was successful."""
+# Sent once during initialisation when the config asks for an up-front plan.
+PLAN_USER_PROMPT = "First create a high-level plan to help solve the task on the web."
+# Sent after a "continue" reflection, right before the agent is asked for a tool call.
+ACTION_PROMPT = """Now take the most-promising next action in the browser.
+Only refer to the latest web elements from the latest screenshot.
+Using mark ids from older turns will lead to errors as they are no longer valid.
+Only interact with elements visible on the current webpage. Do not make up numbers or elements."""
+# Sent after each screenshot, right before the structured Reflection is requested.
+REASONING_PROMPT = """You will now follow these steps.
+1. **Make observations about the state of the webpage**:
+   - Consider the previous screenshot, your attempted previous action, and the current screenshot.
+   - Describe any changes you observe, and try to determine if the previous action succeeded.
+   - For example, if a form is being filled out, check whether the correct information is now displayed.
+2. **Write down any helpful facts you have gathered**:
+   - Describe any useful information on the webpage that might be helpful for completing the task.
+   - For example, if you are viewing a document, you may wish to note down any information you want to refer back to later.
+3. **Reason about the system's status**:
+   - Have you fully completed the task?
+4. **Select one of the following statuses**:
+   - "complete": if the task has been completed.
+   - "continue": if you are ready to continue without information or help.
+5. **Reason through next steps**:
+    - If the status is "continue", write down your reasoning for the next action you will take. You can only take one action at a time.
+    - If the status is not "continue", return an empty string.
+6. **Write a message to the user**:
+   - If the status is "complete", write a message to the user. If they asked a question in the task, make sure the answer is here. Otherwise, just provide other useful information about how the task went or if there was a problem in completing it.
+   - If the status is not "complete", set this to an empty string.
+Tips:
+- If you have already provided a response, don't provide it again.
+- If you notice you are repeating previous actions, you're likely stuck. Try something different."""
+class Reflection(BaseModel):
+    observation: str = Field(
+        ...,
+        description="Observation of the current browser state, including an assessment on the success of the last action (previous actions and observations are often wrong).",
+    )
+    fact_updates: list[str] = Field(
+        "",
+        description="List of new information relevant to the task that was found on the page, ignore input fields holding content you wrote.",
+    )
+    status_reasoning: str = Field(
+        ...,
+        description="Reasoning about the current state of the task.",
+    )
+    status: Literal["complete", "continue"] = Field(
+        ...,
+        description="Choose a system status based on your status reasoning.",
+    )
+    next_step_reasoning: str = Field(
+        ...,
+        description='If status is "continue", reason through the next action you will be taking (do not repeat actions over and over). Otherwise set to "".',
+    )
+    ending_message: str = Field(
+        ...,
+        description="If status is 'complete', write a message to the user. If they asked a question in the task, make sure the answer is here. Otherwise, just provide other useful information about how the task went or if there was a problem in completing it. If status is 'continue', set to ''.",
+    )
+# Configuration for StructuredSolver, registered under the name "structured".
+@Solvers.register_solver_config("structured")
+class StructuredSolverConfig(BaseSolverConfig):
+    # Discriminator value that selects this config type.
+    name: Literal["structured"] = "structured"
+    # Configuration of the agent the solver will create.
+    agent: AgentConfigTypes
+    # When True, the solver asks the agent for a high-level plan during initialisation.
+    start_with_plan: bool = True
+@Solvers.register_solver("structured")
+class StructuredSolver(BaseSolver):
+    task: Optional[str] = None
+    complete: bool = False
+    @cached_property
+    def tools(self) -> list[Tool]:
+        return self.env_tools
+    @cached_property
+    def local_tools(self) -> list[Tool]:
+        if self.sandbox:
+            return self.sandbox.tools
+        return []
+    @cached_property
+    def agent(self) -> BaseAgent:
+        self.logger.debug(f"Tools: {self.tools}")
+        return Agents.get(self.config.agent.name)(
+            config=self.config.agent,
+            env_tools=self.tools,
+        )
+    @property
+    def history(self) -> MessageHistory:
+        return MessageHistory(
+            messages=[SystemMessage.from_media(text=self.agent.system_prompt)] + self.agent.history.messages,
+        )
+    async def initialise(self, task: str, env_tools: list[Tool], env_info: str) -> None:
+        self.env_tools = env_tools
+        self.agent.receive_user_message(
+            text=env_info,
+            label=MessageLabel.USER_INPUT,
+        )
+        self.task = task
+        self.agent.receive_user_message(
+            text=f"Task: {task}",
+            label=MessageLabel.USER_INPUT,
+        )
+        if self.config.start_with_plan:
+            self.agent.receive_user_message(text=PLAN_USER_PROMPT, label=MessageLabel.PLAN)
+            await self.agent.generate_output(use_tool=False)
+    async def act(self, observation: Observation) -> Action:
+        if observation.state.tool_responses:
+            for tool_response in observation.state.tool_responses:
+                await self.agent.receive_tool_message(
+                    text=f"{WEB_TOOL_TURN}\n{tool_response.content}",
+                    tool_id=tool_response.id,
+                    label=MessageLabel.TOOL_RESULT_INDUCTION,
+                )
+        self.agent.receive_user_message(
+            image=observation.state.image,
+            text=observation.state.text,
+            label=MessageLabel.SCREENSHOT,
+            is_base64=True,
+        )
+        self.agent.receive_user_message(
+            text=REASONING_PROMPT,
+            label=MessageLabel.REASONING_INDUCTION,
+        )
+        message = await self.agent.generate_structured_output(model=Reflection)
+        self.logger.info(f"🌐 [bold blue]Observation:[/] {message.observation}")
+        if message.status == "complete":
+            self.complete = True
+            return Action(tool_calls=[], text=message.ending_message)
+        next_step = message.next_step_reasoning
+        self.agent.receive_user_message(
+            text=ACTION_PROMPT,
+            label=MessageLabel.ACTION,
+            is_base64=True,
+        )
+        message = await self.agent.generate_output(use_tool=True)
+        return Action(tool_calls=message.tool_calls, text=next_step)
+    async def is_complete(self, observation: Observation) -> bool:
+        env_terminated = observation.terminated
+        return self.complete or env_terminated

src/proxy_lite/tools/__init__.py ADDED Viewed

	@@ -0,0 +1,5 @@

+from .browser_tool import BrowserTool
+from .return_tool import ReturnValueTool
+from .tool_base import Tool, ToolExecutionResponse, attach_param_schema
+# Public names re-exported by the tools package.
+__all__ = ["BrowserTool", "ReturnValueTool", "Tool", "ToolExecutionResponse", "attach_param_schema"]

src/proxy_lite/tools/browser_tool.py ADDED Viewed

	@@ -0,0 +1,221 @@

+import asyncio
+from contextlib import AsyncExitStack
+from typing import List, Literal, Optional
+from pydantic import BaseModel, Field
+from proxy_lite.browser.browser import BrowserSession
+from proxy_lite.logger import logger
+from .tool_base import Tool, ToolExecutionResponse, attach_param_schema
+# Tag names that element_as_text renders in the self-closing form `<tag .../>`
+# when the element carries no text.
+SELF_CONTAINED_TAGS = [
+    # many of these are non-interactive but keeping them anyway
+    "area",
+    "base",
+    "br",
+    "col",
+    "embed",
+    "hr",
+    "img",
+    "input",
+    "link",
+    "meta",
+    "param",
+    "source",
+    "track",
+    "wbr",
+]
+def element_as_text(
+    mark_id: int,
+    tag: Optional[str] = None,
+    text: Optional[str] = None,
+    **raw_attributes,
+) -> str:
+    """Return a text representation of all elements on the page"""
+    attributes = []
+    for k, v in raw_attributes.items():
+        if v is None:
+            continue
+        if isinstance(v, bool):
+            if v:
+                attributes.append(k)
+            # we ignore False bool attributes
+        else:
+            v = str(v)
+            if len(v) > 2500:
+                v = v[: 2500 - 1] + "…"
+            attributes.append(f'{k}="{v}"')
+    attributes = " ".join(attributes)
+    attributes = (" " + attributes).rstrip()
+    tag = tag.lower()
+    if text is None:
+        text = ""
+    if len(text) > 2500:
+        text = text[: 2500 - 1] + "…"
+    if tag in SELF_CONTAINED_TAGS:
+        if text:
+            logger.warning(
+                f"Got self-contained element '{tag}' which contained text '{text}'.",
+            )
+        else:
+            return f"<{tag} id={mark_id}{attributes}/>"
+    return f"<{tag} id={mark_id}{attributes}>{text}</{tag}>"
+# Parameter schema attached to BrowserTool.goto.
+class GotoParams(BaseModel):
+    url: str = Field(..., description="The web address to visit. Must be a valid URL.")
+# Parameter schema attached to BrowserTool.google_search.
+class GoogleSearchParams(BaseModel):
+    # Free-text planning field; google_search accepts it but only uses `query`.
+    query_plan: str = Field(
+        ...,
+        description="Plan out the query you will make. Re-write queries in a way that will yield the best results.",
+    )
+    query: str = Field(..., description="The Google search to perform.")
+# Parameter schema attached to BrowserTool.click.
+class ClickParams(BaseModel):
+    mark_id: int = Field(..., description="Element Mark ID.")
+# One (element, text) pair inside TypeParams.entries.
+class TypeEntry(BaseModel):
+    mark_id: int = Field(..., description="Element Mark ID.")
+    content: str = Field(..., description="The text to type into the element.")
+# Parameter schema attached to BrowserTool.type.
+class TypeParams(BaseModel):
+    entries: List[TypeEntry] = Field(
+        ...,
+        description="A list of elements and contents to type.",
+    )
+    submit: bool = Field(
+        ...,
+        description='Whether to press the "Enter" key after typing in the last entry.',
+    )
+# Parameter schema attached to BrowserTool.scroll.
+class ScrollParams(BaseModel):
+    direction: Literal["up", "down", "left", "right"] = Field(
+        ...,
+        description='Direction to scroll. Must be one of "up", "down", "left" or "right".',
+    )
+    # BrowserTool.scroll converts -1 into None before calling the browser.
+    mark_id: int = Field(
+        ...,
+        description="What to scroll. Use -1 to scroll the whole page otherwise give the mark ID of an element that is `scrollable`.",  # noqa: E501
+    )
+# Parameter schema attached to BrowserTool.back; the tool takes no arguments.
+class BackParams(BaseModel):
+    pass
+# Parameter schema attached to BrowserTool.wait; the tool takes no arguments.
+class WaitParams(BaseModel):
+    pass
+# Parameter schema attached to BrowserTool.reload; the tool takes no arguments.
+class ReloadParams(BaseModel):
+    pass
+# Parameter schema attached to BrowserTool.do_nothing_tool; the tool takes no arguments.
+class DoNothingParams(BaseModel):
+    pass
+class BrowserTool(Tool):
+    def __init__(self, session: BrowserSession) -> None:
+        super().__init__()
+        self.browser = session
+    async def __aenter__(self):
+        self._exit_stack = AsyncExitStack()
+        await self._exit_stack.enter_async_context(self.browser)
+        return self
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        await self._exit_stack.aclose()
+    @property
+    def poi_text(self) -> str:
+        # Get all points of interest on the page as text
+        texts = [element_as_text(mark_id=i, **element) for i, element in enumerate(self.browser.poi_elements)]
+        # Return formatted text of points of interest on page
+        return "\n".join([txt for txt in texts if txt])
+    @attach_param_schema(GotoParams)
+    async def goto(self, url: str) -> ToolExecutionResponse:
+        """Go directly to a specific web url. Specify the exact URL."""
+        await self.browser.goto(url)
+        return ToolExecutionResponse()
+    @attach_param_schema(GoogleSearchParams)
+    async def google_search(self, query_plan: str, query: str) -> ToolExecutionResponse:
+        """Perform a generic web search using Google.
+        Results may not be relevant. If you see poor results, you can try another query.
+        """
+        url = f"https://www.google.com/search?q={query}"
+        await self.browser.goto(url)
+        return ToolExecutionResponse()
+    @attach_param_schema(ClickParams)
+    async def click(self, mark_id: int) -> ToolExecutionResponse:
+        """Click on an element of the page."""
+        await self.browser.click(mark_id=mark_id)
+        return ToolExecutionResponse()
+    @attach_param_schema(TypeParams)
+    async def type(self, entries: List[dict], submit: bool) -> ToolExecutionResponse:
+        """Type text.
+        You can type into one or more elements.
+        Note that the text inside an element is cleared before typing.
+        """
+        for i, entry_dict in enumerate(entries):
+            entry = TypeEntry(**entry_dict)
+            last_entry = i == len(entries) - 1
+            old_poi_positions = [tuple(point) for point in self.browser.poi_centroids]
+            await self.browser.enter_text(
+                mark_id=entry.mark_id,
+                text=entry.content,
+                submit=submit and last_entry,
+            )
+            await self.browser.update_poi()
+            new_poi_positions = [tuple(point) for point in self.browser.poi_centroids]
+            if not last_entry and old_poi_positions != new_poi_positions:
+                logger.error(
+                    "POI positions changed mid-typing, cancelling future type entries.",
+                )
+                break
+        return ToolExecutionResponse()
+    @attach_param_schema(ScrollParams)
+    async def scroll(self, direction: str, mark_id: int) -> ToolExecutionResponse:
+        """Scroll the page (or a scrollable element) up, down, left or right."""
+        if mark_id == -1:
+            mark_id = None
+        await self.browser.scroll(direction=direction, mark_id=mark_id)
+        return ToolExecutionResponse()
+    @attach_param_schema(BackParams)
+    async def back(self) -> ToolExecutionResponse:
+        """Go back to the previous page."""
+        await self.browser.go_back()
+        return ToolExecutionResponse()
+    @attach_param_schema(WaitParams)
+    async def wait(self) -> ToolExecutionResponse:
+        """Wait three seconds. Useful when the page appears to still be loading, or if there are any unfinished webpage processes."""  # noqa: E501
+        await asyncio.sleep(3)
+        return ToolExecutionResponse()
+    @attach_param_schema(ReloadParams)
+    async def reload(self) -> ToolExecutionResponse:
+        """Reload the current page. Useful when the page seems unresponsive, broken, outdated, or if you want to reset the page to its initial state."""  # noqa: E501
+        await self.browser.reload()
+        return ToolExecutionResponse()
+    @attach_param_schema(DoNothingParams)
+    async def do_nothing_tool(self) -> ToolExecutionResponse:
+        """Do nothing. Use this if you have no need for the browser at this time."""
+        return ToolExecutionResponse()

src/proxy_lite/tools/return_tool.py ADDED Viewed

	@@ -0,0 +1,17 @@

+from pydantic import BaseModel, Field
+from proxy_lite.tools.tool_base import Tool, attach_param_schema
+# Parameter schema attached to ReturnValueTool.return_value.
+class ReturnValueParams(BaseModel):
+    value: str = Field(description="The value to return to the user.")
+class ReturnValueTool(Tool):
+    # Tool exposing a single `return_value` function. The method docstring below
+    # is used verbatim as the tool description, so treat it as prompt text.
+    def __init__(self):
+        pass
+    @attach_param_schema(ReturnValueParams)
+    def return_value(self, value: str):
+        """Return a value to the user. Use this tool when you have finished the task in order to provide any information the user has requested."""
+        # Executing the tool just prints the value and returns nothing.
+        print(value)

src/proxy_lite/tools/tool_base.py ADDED Viewed

	@@ -0,0 +1,54 @@

+import inspect
+from functools import cached_property, wraps
+from typing import Any, Callable, Optional
+from pydantic import BaseModel, Field
+class Tool:
+    async def __aenter__(self):
+        pass
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        pass
+    @cached_property
+    def schema(self) -> list[dict[str, Any]]:
+        schema = []
+        for name, method in self.__class__.__dict__.items():
+            # If function is not callable and isn't decorated using attach_param_schema
+            if not isinstance(method, Callable) or not hasattr(method, "param_model"):
+                continue
+            docstring = inspect.getdoc(method)
+            if not docstring:
+                raise ValueError(f"The tool function '{name}' is missing a docstring.")
+            # Handle multi-line docstirngs
+            description = " ".join(line.strip() for line in docstring.split("\n"))
+            tool_json = {
+                "name": name,
+                "description": description,
+                "parameters": method.param_model.model_json_schema(),
+            }
+            schema.append(tool_json)
+        return schema
+def attach_param_schema(param_model: type[BaseModel]):
+    def decorator(func: Callable) -> Callable:
+        @wraps(func)
+        def wrapper(self, **kwargs):
+            # Throw an error if there's a mismatch between the function parameters and pydantic model's fields.
+            validated_params = param_model(**kwargs)
+            return func(self, **validated_params.model_dump())
+        wrapper.param_model = param_model
+        return wrapper
+    return decorator
+# Result object returned by tool methods; both fields are optional and default to None.
+class ToolExecutionResponse(BaseModel):
+    content: Optional[str] = None
+    id: Optional[str] = None