Spaces:

ttomy
/

proxy-lite-experimental

Running

App Files Files Community

Trisha Tomy committed 28 days ago

Commit

c9803a3

0 Parent(s):

Stretch goal experimentation

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.DS_Store +0 -0
proxy-lite-demo-v2/.gitattributes +35 -0
proxy-lite-demo-v2/.gitignore +177 -0
proxy-lite-demo-v2/.idea/.gitignore +14 -0
proxy-lite-demo-v2/.idea/libraries/my_test_package.xml +9 -0
proxy-lite-demo-v2/.idea/misc.xml +6 -0
proxy-lite-demo-v2/.idea/modules.xml +8 -0
proxy-lite-demo-v2/.idea/proxy-lite-demo-v2.iml +9 -0
proxy-lite-demo-v2/.idea/vcs.xml +6 -0
proxy-lite-demo-v2/CODEOWNERS +1 -0
proxy-lite-demo-v2/Dockerfile +59 -0
proxy-lite-demo-v2/LICENSE +3 -0
proxy-lite-demo-v2/Makefile +11 -0
proxy-lite-demo-v2/Procfile +1 -0
proxy-lite-demo-v2/README.md +10 -0
proxy-lite-demo-v2/app.py +350 -0
proxy-lite-demo-v2/pyproject.toml +65 -0
proxy-lite-demo-v2/requirements.txt +6 -0
proxy-lite-demo-v2/src/proxy_lite/__init__.py +3 -0
proxy-lite-demo-v2/src/proxy_lite/agents/__init__.py +18 -0
proxy-lite-demo-v2/src/proxy_lite/agents/agent_base.py +238 -0
proxy-lite-demo-v2/src/proxy_lite/agents/proxy_lite_agent.py +61 -0
proxy-lite-demo-v2/src/proxy_lite/app.py +239 -0
proxy-lite-demo-v2/src/proxy_lite/browser/__init__.py +0 -0
proxy-lite-demo-v2/src/proxy_lite/browser/add_custom_select.js +123 -0
proxy-lite-demo-v2/src/proxy_lite/browser/bounding_boxes.py +210 -0
proxy-lite-demo-v2/src/proxy_lite/browser/browser.py +508 -0
proxy-lite-demo-v2/src/proxy_lite/browser/find_pois.js +397 -0
proxy-lite-demo-v2/src/proxy_lite/cli.py +112 -0
proxy-lite-demo-v2/src/proxy_lite/client.py +405 -0
proxy-lite-demo-v2/src/proxy_lite/configs/default.yaml +23 -0
proxy-lite-demo-v2/src/proxy_lite/environments/__init__.py +32 -0
proxy-lite-demo-v2/src/proxy_lite/environments/environment_base.py +161 -0
proxy-lite-demo-v2/src/proxy_lite/environments/webbrowser.py +205 -0
proxy-lite-demo-v2/src/proxy_lite/gif_maker.py +122 -0
proxy-lite-demo-v2/src/proxy_lite/history.py +183 -0
proxy-lite-demo-v2/src/proxy_lite/logger.py +92 -0
proxy-lite-demo-v2/src/proxy_lite/recorder.py +103 -0
proxy-lite-demo-v2/src/proxy_lite/runner.py +240 -0
proxy-lite-demo-v2/src/proxy_lite/serializer.py +39 -0
proxy-lite-demo-v2/src/proxy_lite/solvers/__init__.py +20 -0
proxy-lite-demo-v2/src/proxy_lite/solvers/simple_solver.py +117 -0
proxy-lite-demo-v2/src/proxy_lite/solvers/solver_base.py +123 -0
proxy-lite-demo-v2/src/proxy_lite/tools/__init__.py +5 -0
proxy-lite-demo-v2/src/proxy_lite/tools/browser_tool.py +374 -0
proxy-lite-demo-v2/src/proxy_lite/tools/return_tool.py +17 -0
proxy-lite-demo-v2/src/proxy_lite/tools/tool_base.py +54 -0
proxy-lite-demo-v2/test_tool_calling.py +65 -0
proxy-lite-demo-v2/uv.lock +0 -0
proxy-lite-work/.forceignore +12 -0

.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

proxy-lite-demo-v2/.gitattributes ADDED Viewed

	@@ -0,0 +1,35 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

proxy-lite-demo-v2/.gitignore ADDED Viewed

	@@ -0,0 +1,177 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# UV
+#   Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#uv.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+.pdm.toml
+.pdm-python
+.pdm-build/
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+# PyPI configuration file
+.pypirc
+logs/
+local_trajectories/
+screenshots/
+gifs/
+.DS_Store

proxy-lite-demo-v2/.idea/.gitignore ADDED Viewed

	@@ -0,0 +1,14 @@

+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
+# Environment-dependent path to Maven home directory
+/mavenHomeManager.xml
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
+# Core Dev Booster ignored files
+/compile.flag
+/coreModuleDependants.csv
+/.mavenCleaned

proxy-lite-demo-v2/.idea/libraries/my_test_package.xml ADDED Viewed

	@@ -0,0 +1,9 @@

+<component name="libraryTable">
+  <library name="my-test-package">
+    <CLASSES>
+      <root url="jar://$PROJECT_DIR$/venv/lib/python3.13/site-packages/pkg_resources/tests/data/my-test-package-zip/my-test-package.zip!/" />
+    </CLASSES>
+    <JAVADOC />
+    <SOURCES />
+  </library>
+</component>

proxy-lite-demo-v2/.idea/misc.xml ADDED Viewed

	@@ -0,0 +1,6 @@

+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectRootManager" version="2" languageLevel="JDK_24" default="true" project-jdk-name="24" project-jdk-type="JavaSDK">
+    <output url="file://$PROJECT_DIR$/out" />
+  </component>
+</project>

proxy-lite-demo-v2/.idea/modules.xml ADDED Viewed

	@@ -0,0 +1,8 @@

+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/proxy-lite-demo-v2.iml" filepath="$PROJECT_DIR$/.idea/proxy-lite-demo-v2.iml" />
+    </modules>
+  </component>
+</project>

proxy-lite-demo-v2/.idea/proxy-lite-demo-v2.iml ADDED Viewed

	@@ -0,0 +1,9 @@

+<?xml version="1.0" encoding="UTF-8"?>
+<module type="JAVA_MODULE" version="4">
+  <component name="NewModuleRootManager" inherit-compiler-output="true">
+    <exclude-output />
+    <content url="file://$MODULE_DIR$" />
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+</module>

proxy-lite-demo-v2/.idea/vcs.xml ADDED Viewed

	@@ -0,0 +1,6 @@

+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="" vcs="Git" />
+  </component>
+</project>

proxy-lite-demo-v2/CODEOWNERS ADDED Viewed

	@@ -0,0 +1 @@


1	+ * @aptoul @Fraser-Greenlee @XanderJC

proxy-lite-demo-v2/Dockerfile ADDED Viewed

	@@ -0,0 +1,59 @@

+# Use the official Playwright image for Python; the tag pins Playwright 1.53.0 on an Ubuntu 24.04 (noble) base
+FROM mcr.microsoft.com/playwright/python:v1.53.0-noble
+# Set the working directory inside the container
+WORKDIR /app
+# The official Playwright image already ships most of the system dependencies the browsers need,
+# so we only add git (for proxy-lite) and xvfb (a virtual X display for non-headless browser runs).
+# The long list of browser libraries was removed because the Playwright base image already provides them.
+RUN apt-get update && apt-get install -y \
+    git \
+    xvfb \
+    # Clean up apt caches to reduce image size
+    && rm -rf /var/lib/apt/lists/*
+# Copy common Python dependencies first (needed for pip installs)
+COPY requirements.txt .
+# Copy your Flask application code (app.py) and other project files.
+COPY . .
+# --- START: Directory permission workaround ---
+# Create the directory proxy-lite's recorder insists on writing to
+# and grant full permissions. This addresses the PermissionError.
+# This line creates the directory *directly* under /app, which is now the correct path
+RUN mkdir -p /app/local_trajectories \
+    && chmod -R 777 /app/local_trajectories
+# --- END: Directory permission workaround ---
+# Upgrade pip, setuptools, and wheel for a robust Python build environment.
+RUN pip install --no-cache-dir --upgrade pip setuptools wheel
+# Install your local proxy-lite package in editable mode.
+RUN pip install --no-cache-dir --no-input -e .
+# Install the rest of the Python dependencies from requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt
+# Set environment variables required for Playwright at runtime
+ENV DISPLAY=:99
+ENV XDG_RUNTIME_DIR=/tmp
+# Removed PLAYWRIGHT_BROWSERS_PATH and PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD
+# as the official Playwright image manages these internally, defaulting to /ms-playwright.
+# --- Debugging: Check Playwright version and browser installation (moved AFTER install in the original setup) ---
+# Now checking the default Playwright browser installation path /ms-playwright
+RUN echo "--- Checking Playwright Version (from base image) ---"
+RUN python -m playwright --version
+RUN echo "--- Listing Playwright Browser Cache (Recursive, from base image) ---"
+RUN ls -alR /ms-playwright/
+RUN echo "-----------------------------------"
+# --- End Debugging ---
+# Expose the port your Flask app will listen on. Hugging Face Spaces requires 7860.
+EXPOSE 7860
+# Define the command to run your Flask application using Gunicorn for production.
+CMD exec gunicorn --bind 0.0.0.0:7860 --workers 2 --worker-class gevent app:app --timeout 300

proxy-lite-demo-v2/LICENSE ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ Creative Commons Attribution-NonCommercial 4.0 International
2	+
3	+ This work is licensed under the Creative Commons Attribution-NonCommercial 4.0 International License. To view a copy of this license, visit https://creativecommons.org/licenses/by-nc/4.0/ or send a letter to Creative Commons, PO Box 1866, Mountain View, CA 94042, USA.

proxy-lite-demo-v2/Makefile ADDED Viewed

	@@ -0,0 +1,11 @@

+# Both targets are commands, not files: declare them phony so a file or directory
+# named `proxy` or `app` can never make `make` treat them as up to date.
+.PHONY: proxy app
+# Create the uv-managed Python 3.11 environment, install the package in editable
+# mode, and download the Playwright browsers.
+proxy:
+	pip install uv
+	uv venv --python 3.11 --python-preference managed
+	uv sync
+	uv pip install -e .
+	playwright install
+# Launch the Streamlit demo UI.
+app:
+	streamlit run src/proxy_lite/app.py

proxy-lite-demo-v2/Procfile ADDED Viewed

	@@ -0,0 +1 @@


1	+ web: gunicorn --bind 0.0.0.0:7860 --workers 2 --worker-class gevent app:app --timeout 300

proxy-lite-demo-v2/README.md ADDED Viewed

	@@ -0,0 +1,10 @@

+---
+title: Proxy Lite Demo For Setup
+emoji: 😻
+colorFrom: indigo
+colorTo: gray
+sdk: docker
+pinned: false
+---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

proxy-lite-demo-v2/app.py ADDED Viewed

	@@ -0,0 +1,350 @@

+import gevent.monkey
+gevent.monkey.patch_all(asyncio=True) # Keep this at the very top
+import asyncio
+from flask import Flask, request, jsonify
+from proxy_lite import Runner, RunnerConfig
+import os
+import logging
+from datetime import datetime
+from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+app = Flask(__name__)
+_runner = None
+async def perform_hardcoded_salesforce_login_and_get_cookies(username, password, login_url, target_url):
+    logger.info("Attempting hardcoded Salesforce login with Playwright to obtain cookies...")
+    async with async_playwright() as p:
+        browser = await p.chromium.launch(headless=False, args=["--no-sandbox", "--disable-setuid-sandbox"])
+        context = await browser.new_context()
+        page = await context.new_page()
+        try:
+            await page.goto(login_url, wait_until="domcontentloaded", timeout=60000)
+            logger.info(f"Playwright: Navigated to Salesforce login page: {page.url}")
+            await page.fill("#username", username)
+            await page.fill("#password", password)
+            await page.click("#Login")
+            logger.info("Playwright: Filled credentials and clicked Login. Waiting for post-login state...")
+            try:
+                await page.wait_for_url(lambda url: "login.salesforce.com" not in url and "unauthorized" not in url.lower(), timeout=60000)
+                logger.info(f"Playwright: Successfully redirected from login page. Current URL: {page.url}")
+                await page.wait_for_selector('button[title="App Launcher"]', timeout=30000)
+                logger.info("Playwright: Main Salesforce Lightning UI (e.g., App Launcher) detected after login.")
+            except PlaywrightTimeoutError:
+                logger.error(f"Playwright: Did not detect main UI or expected URL change within timeout after login. Current URL: {page.url}. Login might have failed or stuck on a redirect loop.")
+                raise Exception("Salesforce login redirection failed or main UI not detected.")
+            logger.info(f"Playwright: Navigating to target URL: {target_url} to ensure all relevant cookies are captured.")
+            await page.goto(target_url, wait_until="domcontentloaded", timeout=60000)
+            try:
+                # Wait for generic Salesforce setup page elements to load
+                await page.wait_for_selector('.setupPage, .slds-page-header, .slds-card, [data-aura-class*="setup"], .forcePageBlockSectionView', timeout=30000)
+                logger.info("Playwright: Detected Salesforce setup page elements loaded successfully.")
+            except PlaywrightTimeoutError:
+                logger.warning("Playwright: Specific setup page elements not found. Trying generic page load check...")
+                try:
+                    # Fallback: wait for page to reach network idle state
+                    await page.wait_for_load_state("networkidle", timeout=10000)
+                    logger.info("Playwright: Page reached network idle state - proceeding with task.")
+                except PlaywrightTimeoutError:
+                    logger.info("Playwright: Page load validation timed out, but continuing as page may still be functional.")
+            await asyncio.sleep(2)
+            logger.info(f"Playwright: Successfully navigated to and confirmed content on {page.url}")
+            cookies = await context.cookies()
+            logger.info(f"Playwright: Extracted {len(cookies)} cookies after successful login and navigation.")
+            return cookies
+        except PlaywrightTimeoutError as e:
+            logger.error(f"Playwright login/navigation failed (Timeout): {e}. Current URL: {page.url}")
+            raise
+        except Exception as e:
+            logger.error(f"Playwright login/navigation failed (General Error): {e}. Current URL: {page.url}")
+            raise
+        finally:
+            if browser:
+                await browser.close()
+async def initialize_runner_with_cookies(cookies: list, target_url: str):
+    global _runner
+    logger.info("Initializing Proxy-lite Runner with provided cookies...")
+    gemini_api_key = os.environ.get("GEMINI_API_KEY")
+    if not gemini_api_key:
+        logger.error("GEMINI_API_KEY environment variable not set. Cannot initialize Runner.")
+        raise ValueError("GEMINI_API_KEY environment variable not set. Please set it as a Space secret.")
+    config_dict = {
+        "environment": {
+            "name": "webbrowser",
+            "homepage": "about:blank",  # Safe startup, we'll open new tab programmatically
+            "headless": False,
+            "launch_args": ["--no-sandbox", "--disable-setuid-sandbox"],
+            "screenshot_delay": 0.5,
+            "include_html": True,
+            "include_poi_text": True,
+            "record_pois": True,
+            "viewport_width": 1280,
+            "viewport_height": 720,
+            "browserbase_timeout": 7200,
+            "keep_original_image": False,
+            "no_pois_in_image": False,
+            "initial_cookies": cookies
+        },
+        "solver": {
+            "name": "simple",
+            "agent": {
+                "name": "proxy_lite",
+                "client": {
+                    "name": "gemini",
+                    "model_id": "gemini-2.0-flash-001",
+                    "api_key": gemini_api_key,
+                    "http_timeout": 50.0,
+                    "http_concurrent_connections": 50,
+                },
+                "history_messages_limit": {
+                    "screenshot": 1
+                },
+                "history_messages_include": None,
+            }
+        },
+        "environment_timeout": 1800.0,
+        "action_timeout": 1800.0,
+        "task_timeout": 18000.0,
+        "max_steps": 150,
+        "logger_level": "DEBUG",
+        "save_every_step": True,
+        "detailed_logger_name": False
+    }
+    config = RunnerConfig.from_dict(config_dict)
+    logger.info(f"DEBUG: app.py - Initializing Proxy-lite Runner with Gemini Flash 2.0 configuration.")
+    _runner = Runner(config=config)
+    logger.info("Proxy-lite Runner initialized successfully with Gemini Flash 2.0 and injected cookies.")
+    return _runner
+@app.route('/run_proxy_task', methods=['POST'])
+async def run_proxy_task_endpoint():
+    data = request.json
+    request_task_instruction = data.get('task')
+    target_url = data.get('url')
+    if not request_task_instruction:
+        logger.warning("Received request without 'task' field. Returning 400.")
+        return jsonify({"error": "No 'task' provided in request body"}), 400
+    if not target_url:
+        logger.warning("Received request without 'url' field. Returning 400.")
+        return jsonify({"error": "No 'url' provided in request body"}), 400
+    logger.info(f"Received user request task: '{request_task_instruction}'")
+    logger.info(f"Target URL: '{target_url}'")
+    # Check if this is a Salesforce URL
+    is_salesforce_url = "salesforce.com" in target_url or "force.com" in target_url
+    try:
+        if is_salesforce_url:
+            # Salesforce automation - requires login
+            salesforce_username = os.environ.get("SALESFORCE_USERNAME")
+            salesforce_password = os.environ.get("SALESFORCE_PASSWORD")
+            if not salesforce_username or not salesforce_password:
+                logger.error("Salesforce credentials (SALESFORCE_USERNAME, SALESFORCE_PASSWORD) environment variables not set.")
+                return jsonify({"error": "Salesforce credentials not configured. Please set SALESFORCE_USERNAME and SALESFORCE_PASSWORD as Space secrets."}), 500
+            salesforce_login_url = "https://login.salesforce.com/"
+            logger.info("Executing hardcoded login via Playwright to get session cookies...")
+            session_cookies = await perform_hardcoded_salesforce_login_and_get_cookies(
+                salesforce_username, salesforce_password, salesforce_login_url, target_url
+            )
+            logger.info(f"Successfully obtained {len(session_cookies)} cookies. These will be injected into the agent's browser.")
+        else:
+            # General web browsing - no login required
+            logger.info("Non-Salesforce URL detected. Skipping Salesforce login.")
+            session_cookies = []
+        runner = await initialize_runner_with_cookies(session_cookies, target_url)
+        logger.info("Proxy-lite Runner initialized with cookies." if session_cookies else "Proxy-lite Runner initialized for general web browsing.")
+        logger.info("Agent will use mandatory new tab tool to bypass loading issues.")
+        # MANDATORY new tab navigation task - this is critical to avoid loading issues
+        agent_task = f"""
+CRITICAL FIRST STEP - MANDATORY:
+Your VERY FIRST action must be to use the open_new_tab_and_go_to tool to navigate to {target_url}
+DO NOT skip this step. DO NOT use goto. You MUST use: open_new_tab_and_go_to(url='{target_url}')
+This is necessary because direct navigation to this URL gets stuck loading. The new tab approach bypasses this issue.
+STEP 1: Use open_new_tab_and_go_to(url='{target_url}')
+STEP 2: Wait for the page to be fully loaded (no loading spinners visible)
+STEP 3: {request_task_instruction}
+CRITICAL WORKFLOW - FOLLOW THESE EXACT STEPS IN SEQUENCE:
+STEP A: Select Permission Set
+- Use select_option_by_text tool to find and select the target permission set from Available list
+- Wait for "[ACTION COMPLETED]" response before proceeding
+STEP B: Click Add Button
+- After successful selection, immediately click the "Add" button to move permission set to Enabled list
+- Do NOT repeat the selection - proceed directly to Add button
+STEP C: Click Save Button
+- After clicking Add, immediately click "Save" to persist the changes
+- After Save, Salesforce redirects to User page indicating SUCCESS
+CRITICAL: Do NOT repeat actions. Each step should happen exactly once in sequence.
+GENERAL INSTRUCTIONS:
+- You must EXECUTE all actions immediately - do NOT just describe what you plan to do
+- Do NOT wait for user input or ask "what should I do next?"
+- Complete the entire task autonomously using the available tools
+- After completing all steps, use the return_value tool to provide your final response
+- If you make a plan, IMMEDIATELY execute it step by step using the appropriate tools
+        """
+        logger.info("Executing agent task with mandatory new tab navigation...")
+        result = await runner.run(task=agent_task)
+        # Extract the actual result value from the Run object
+        if hasattr(result, 'value') and result.value:
+            task_result = str(result.value)
+        elif hasattr(result, 'result') and result.result:
+            task_result = str(result.result)
+        else:
+            task_result = str(result)
+        logger.info(f"Proxy-lite task completed. Output (truncated for log): {task_result[:500]}...")
+        # Structure response for LWC integration
+        response = {
+            "status": "success",
+            "message": "Task completed successfully",
+            "data": {
+                "task_result": task_result,
+                "steps_completed": [
+                    "Hardcoded Salesforce login completed",
+                    "Browser session initialized with cookies",
+                    "New tab navigation executed",
+                    "Target Salesforce setup page accessed",
+                    "Task execution completed successfully"
+                ],
+                "environment": {
+                    "target_url": target_url,
+                    "cookies_count": len(session_cookies),
+                    "navigation_method": "new_tab_bypass"
+                }
+            },
+            "timestamp": datetime.now().isoformat(),
+            "task_request": request_task_instruction
+        }
+        return jsonify(response)
+    except PlaywrightTimeoutError as e:
+        logger.exception(f"Playwright timeout during login/navigation: {e}")
+        error_response = {
+            "status": "error",
+            "error_type": "navigation_timeout",
+            "message": "Page loading timed out during login or navigation",
+            "data": {
+                "error_details": str(e),
+                "suggested_action": "Retry the request - network issues may be temporary",
+                "steps_completed": ["Login attempted", "Navigation failed due to timeout"]
+            },
+            "timestamp": datetime.now().isoformat(),
+            "task_request": request_task_instruction
+        }
+        return jsonify(error_response), 500
+    except ValueError as e:
+        logger.exception(f"Configuration error: {e}")
+        error_response = {
+            "status": "error",
+            "error_type": "configuration_error",
+            "message": "System configuration issue",
+            "data": {
+                "error_details": str(e),
+                "suggested_action": "Check environment variables and system configuration",
+                "steps_completed": ["Configuration validation failed"]
+            },
+            "timestamp": datetime.now().isoformat(),
+            "task_request": request_task_instruction
+        }
+        return jsonify(error_response), 500
+    except Exception as e:
+        logger.exception(f"Unexpected error processing Salesforce task: {e}")
+        error_response = {
+            "status": "error",
+            "error_type": "unexpected_error",
+            "message": "An unexpected error occurred during task execution",
+            "data": {
+                "error_details": str(e),
+                "error_class": type(e).__name__,
+                "suggested_action": "Check logs for detailed error information and retry",
+                "steps_completed": ["Login attempted", "Error occurred during execution"]
+            },
+            "timestamp": datetime.now().isoformat(),
+            "task_request": request_task_instruction
+        }
+        return jsonify(error_response), 500
+@app.route('/')
+def root():
+    logger.info("Root endpoint accessed.")
+    return "Proxy-lite API is running. Send POST requests to /run_proxy_task with a 'task' in JSON body."
+@app.route('/health', methods=['GET'])
+def health_check():
+    """Health check endpoint for monitoring and debugging"""
+    logger.info("Health check endpoint accessed.")
+    # Check environment variables
+    env_status = {
+        "GEMINI_API_KEY": "✓" if os.environ.get("GEMINI_API_KEY") else "✗",
+        "SALESFORCE_USERNAME": "✓" if os.environ.get("SALESFORCE_USERNAME") else "✗",
+        "SALESFORCE_PASSWORD": "✓" if os.environ.get("SALESFORCE_PASSWORD") else "✗"
+    }
+    health_response = {
+        "status": "healthy",
+        "message": "Proxy-lite API is running",
+        "environment_variables": env_status,
+        "endpoints": {
+            "POST /run_proxy_task": "Execute Salesforce automation tasks (requires 'task' and 'url' parameters)",
+            "GET /health": "Health check and status",
+            "GET /": "API information"
+        },
+        "supported_pages": [
+            "Warranty Lifecycle Management",
+            "Account Forecasting Settings",
+            "Sales Agreements",
+            "Account Manager Targets",
+            "Any Salesforce Setup page"
+        ],
+        "timestamp": datetime.now().isoformat()
+    }
+    return jsonify(health_response)
+if __name__ == '__main__':
+    if not os.environ.get("GEMINI_API_KEY"):
+        logger.error("GEMINI_API_KEY environment variable is not set. Please set it for local testing.")
+    logger.info("Starting Flask development server on 0.0.0.0:6101...")
+    app.run(host='0.0.0.0', port=6101, debug=True)

proxy-lite-demo-v2/pyproject.toml ADDED Viewed

	@@ -0,0 +1,65 @@

+[project]
+name = "proxy-lite"
+version = "0.1.0"
+description = "Proxy Lite - A mini, open-weights, version of the Convergence AI Proxy assistant."
+readme = "README.md"
+requires-python = ">=3.11"
+dependencies = [
+    "omegaconf>=2.3.0",
+    "openai>=1.61.1",
+    "opencv-python>=4.11.0.86",
+    "opencv-python-headless>=4.11.0.86",
+    "playwright-stealth>=1.0.6",
+    "playwright>=1.50.0",
+    "pydantic>=2.10.6",
+    "rich>=13.9.4",
+    "setuptools>=75.8.0",
+    "tenacity>=9.0.0",
+    "torch>=2.5.1",
+    "torchvision>=0.20.1",
+    "streamlit>=1.40.2",
+    "pre-commit>=4.1.0",
+]
+[project.scripts]
+proxy = "proxy_lite.cli:main"
+[project.optional-dependencies]
+serving = [
+    "transformers",
+    "vllm==0.7.2",
+]
+[build-system]
+requires = ["setuptools"]
+build-backend = "setuptools.build_meta"
+[tool.setuptools]
+packages = { find = { where = ["src"] } }
+[tool.setuptools.package-data]
+proxy_lite = ["**/*.json"]
+[tool.ruff]
+line-length = 120
+[tool.ruff.lint]
+select = ["E", "F", "B", "I", "SIM"]
+ignore = [
+    "B028",
+    "E722", # ignore bare except
+    "B904", # ignore raise from requirement
+    "FA102",
+]
+[tool.ruff.lint.flake8-bugbear]
+extend-immutable-calls = [
+    "fastapi.Depends",
+    "fastapi.params.Depends",
+    "fastapi.Query",
+    "fastapi.params.Query",
+]
+[tool.uv.sources]
+transformers = { git = "https://github.com/huggingface/transformers.git", rev = "336dc69d63d56f232a183a3e7f52790429b871ef" }

proxy-lite-demo-v2/requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+Flask[async]
+-e .
+playwright
+playwright-stealth==1.0.6
+gunicorn
+gevent

proxy-lite-demo-v2/src/proxy_lite/__init__.py ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ from .runner import Runner, RunnerConfig
2	+
3	+ __all__ = ["Runner", "RunnerConfig"]

proxy-lite-demo-v2/src/proxy_lite/agents/__init__.py ADDED Viewed

	@@ -0,0 +1,18 @@

+from typing import Union
+from .agent_base import Agents, BaseAgent, BaseAgentConfig
+from .proxy_lite_agent import ProxyLiteAgent, ProxyLiteAgentConfig
+AgentTypes = Union[*list(Agents._agent_registry.values())]
+AgentConfigTypes = Union[*list(Agents._agent_config_registry.values())]
+__all__ = [
+    "AgentConfigTypes",
+    "AgentTypes",
+    "Agents",
+    "BaseAgent",
+    "BaseAgentConfig",
+    "ProxyLiteAgent",
+    "ProxyLiteAgentConfig",
+]

proxy-lite-demo-v2/src/proxy_lite/agents/agent_base.py ADDED Viewed

	@@ -0,0 +1,238 @@

+import json
+import logging
+from abc import ABC, abstractmethod
+from contextlib import AsyncExitStack
+from functools import cached_property
+from typing import Any, Optional, Type, cast
+from pydantic import BaseModel, Field
+from tenacity import before_sleep_log, retry, stop_after_attempt, wait_exponential
+from proxy_lite.client import BaseClient, ClientConfigTypes, OpenAIClientConfig
+from proxy_lite.history import (
+    AssistantMessage,
+    MessageHistory,
+    MessageLabel,
+    SystemMessage,
+    Text,
+    ToolCall,
+    ToolMessage,
+    UserMessage,
+)
+from proxy_lite.logger import logger
+from proxy_lite.tools import Tool
+# if TYPE_CHECKING:
+#     from proxy_lite.tools import Tool
+class BaseAgentConfig(BaseModel):
+    client: ClientConfigTypes = Field(default_factory=OpenAIClientConfig)
+    history_messages_limit: dict[MessageLabel, int] = Field(default_factory=lambda: dict())
+    history_messages_include: Optional[dict[MessageLabel, int]] = Field(
+        default=None,
+        description="If set, overrides history_messages_limit by setting all message types to 0 except those specified",
+    )
+    def model_post_init(self, __context: Any) -> None:
+        if self.history_messages_include is not None:
+            self.history_messages_limit = {label: 0 for label in MessageLabel}
+            self.history_messages_limit.update(self.history_messages_include)
+class BaseAgent(BaseModel, ABC):
+    """Abstract base for agents that keep a message history and query a model client.
+    Subclasses provide ``system_prompt`` and ``tools``. ``generate_output`` also reads
+    ``self.message_label``, which is not declared on this class, so concrete agents have
+    to define it themselves.
+    """
+    config: BaseAgentConfig
+    # Sampling temperature forwarded to the client; validated to lie in [0, 2].
+    temperature: float = Field(default=0.7, ge=0, le=2)
+    history: MessageHistory = Field(default_factory=MessageHistory)
+    # Replaced in model_post_init with a client built from config.client.
+    client: Optional[BaseClient] = None
+    # Tools handed to the agent from outside - presumably by the environment; verify against caller.
+    env_tools: list[Tool] = Field(default_factory=list)
+    task: Optional[str] = Field(default=None)
+    # Forwarded to the client as the completion seed.
+    seed: Optional[int] = Field(default=None)
+    class Config:
+        arbitrary_types_allowed = True
+    def __init__(self, **data) -> None:
+        """Validate the model fields, then attach per-instance private state."""
+        super().__init__(**data)
+        # Neither attribute is read anywhere else in this class.
+        self._exit_stack = AsyncExitStack()
+        self._tools_init_task = None
+    def model_post_init(self, __context: Any) -> None:
+        """Create the client from ``config.client`` once the fields are validated."""
+        super().model_post_init(__context)
+        self.client = BaseClient.create(self.config.client)
+    # System prompt text placed at the start of every history view.
+    @property
+    @abstractmethod
+    def system_prompt(self) -> str: ...
+    # Tools the agent may call; the value is cached after the first access.
+    @cached_property
+    @abstractmethod
+    def tools(self) -> list[Tool]: ...
+    @cached_property
+    def tool_descriptions(self) -> str:
+        """Return a text listing of the tool functions, cached after the first access.
+        Each schema entry of a tool becomes a ``- name: description`` line. When there is
+        more than one tool, each group is prefixed with the tool's class name; groups are
+        separated by a blank line.
+        """
+        tool_descriptions = []
+        for tool in self.tools:
+            func_descriptions = "\n".join("- {name}: {description}".format(**schema) for schema in tool.schema)
+            tool_title = f"{tool.__class__.__name__}:\n" if len(self.tools) > 1 else ""
+            tool_descriptions.append(f"{tool_title}{func_descriptions}")
+        return "\n\n".join(tool_descriptions)
+    async def get_history_view(self) -> MessageHistory:
+        """Return the system prompt message followed by the history view limited by the config."""
+        return MessageHistory(
+            messages=[SystemMessage(content=[Text(text=self.system_prompt)])],
+        ) + self.history.history_view(
+            limits=self.config.history_messages_limit,
+        )
+    # Retried by tenacity: at most 3 attempts, exponential wait bounded to 4-10 seconds,
+    # an ERROR log entry before each sleep, and the last exception re-raised.
+    @retry(
+        wait=wait_exponential(multiplier=1, min=4, max=10),
+        stop=stop_after_attempt(3),
+        reraise=True,
+        before_sleep=before_sleep_log(logger, logging.ERROR),
+    )
+    async def generate_output(
+        self,
+        use_tool: bool = False,
+        response_format: Optional[type[BaseModel]] = None,
+        append_assistant_message: bool = True,
+    ) -> AssistantMessage:
+        """Request a completion for the current history view and wrap it as an AssistantMessage.
+        Args:
+            use_tool: When true, ``self.tools`` is passed to the client; otherwise no tools are sent.
+            response_format: Optional model class forwarded to the client as ``response_format``.
+            append_assistant_message: When true, the reply is appended to ``self.history``
+                under ``self.message_label``.
+        Returns:
+            The assistant message built from the first choice of the completion.
+        """
+        messages: MessageHistory = await self.get_history_view()
+        response_content = (
+            await self.client.create_completion(
+                messages=messages,
+                temperature=self.temperature,
+                seed=self.seed,
+                response_format=response_format,
+                tools=self.tools if use_tool else None,
+            )
+        ).model_dump()
+        # Only the first choice is used.
+        response_content = response_content["choices"][0]["message"]
+        assistant_message = AssistantMessage(
+            role=response_content["role"],
+            # An empty or missing content string becomes an empty content list.
+            content=[Text(text=response_content["content"])] if response_content["content"] else [],
+            tool_calls=response_content["tool_calls"],
+        )
+        if append_assistant_message:
+            # message_label is expected to be defined by the concrete subclass.
+            self.history.append(message=assistant_message, label=self.message_label)
+        return assistant_message
+    def receive_user_message(
+        self,
+        text: Optional[str] = None,
+        image: Optional[list[bytes]] = None,
+        label: Optional[MessageLabel] = None,
+        is_base64: bool = False,
+    ) -> None:
+        """Build a user message from the given text/images and append it to the history under ``label``."""
+        message = UserMessage.from_media(
+            text=text,
+            image=image,
+            is_base64=is_base64,
+        )
+        self.history.append(message=message, label=label)
+    def receive_system_message(
+        self,
+        text: Optional[str] = None,
+        label: Optional[MessageLabel] = None,
+    ) -> None:
+        """Build a system message from ``text`` and append it to the history under ``label``."""
+        message = SystemMessage.from_media(text=text)
+        self.history.append(message=message, label=label)
+    def receive_assistant_message(
+        self,
+        content: Optional[str] = None,
+        tool_calls: Optional[list[ToolCall]] = None,
+        label: Optional[MessageLabel] = None,
+    ) -> None:
+        """Append an assistant message with the given text and tool calls to the history under ``label``."""
+        message = AssistantMessage(
+            content=[Text(text=content)] if content else [],
+            tool_calls=tool_calls,
+        )
+        self.history.append(message=message, label=label)
+    async def use_tool(self, tool_call: ToolCall):
+        """Run the function named in ``tool_call`` on the first tool that has such an attribute.
+        The JSON-encoded ``arguments`` string is decoded and passed as keyword arguments.
+        Raises:
+            ValueError: If no tool has an attribute with the requested name.
+        """
+        function = tool_call.function
+        for tool in self.tools:
+            # NOTE(review): hasattr matches any attribute of the tool, not only the functions
+            # listed in tool.schema - confirm that is intended.
+            if hasattr(tool, function["name"]):
+                return await getattr(tool, function["name"])(
+                    **json.loads(function["arguments"]),
+                )
+        msg = f'No tool function with name "{function["name"]}"'
+        raise ValueError(msg)
+    async def receive_tool_message(
+        self,
+        text: str,
+        tool_id: str,
+        label: Optional[MessageLabel] = None,
+    ) -> None:
+        """Append a tool result message, tied to ``tool_id``, to the history under ``label``."""
+        self.history.append(
+            message=ToolMessage(content=[Text(text=text)], tool_call_id=tool_id),
+            label=label,
+        )
+class Agents:
+    _agent_registry: dict[str, type[BaseAgent]] = {}
+    _agent_config_registry: dict[str, type[BaseAgentConfig]] = {}
+    @classmethod
+    def register_agent(cls, name: str):
+        """
+        Decorator to register an Agent class under a given name.
+        Example:
+            @Agents.register_agent("browser")
+            class BrowserAgent(BaseAgent):
+                ...
+        """
+        def decorator(agent_cls: type[BaseAgent]) -> type[BaseAgent]:
+            cls._agent_registry[name] = agent_cls
+            return agent_cls
+        return decorator
+    @classmethod
+    def register_agent_config(cls, name: str):
+        """
+        Decorator to register a configuration class under a given name.
+        Example:
+            @Agents.register_agent_config("browser")
+            class BrowserAgentConfig(BaseAgentConfig):
+                ...
+        """
+        def decorator(config_cls: type[BaseAgentConfig]) -> type[BaseAgentConfig]:
+            cls._agent_config_registry[name] = config_cls
+            return config_cls
+        return decorator
+    @classmethod
+    def get(cls, name: str) -> type[BaseAgent]:
+        """
+        Retrieve a registered Agent class by its name.
+        Raises:
+            ValueError: If no such agent is found.
+        """
+        try:
+            return cast(Type[BaseAgent], cls._agent_registry[name])
+        except KeyError:
+            raise ValueError(f"Agent '{name}' not found.")
+    @classmethod
+    def get_config(cls, name: str) -> type[BaseAgentConfig]:
+        """
+        Retrieve a registered Agent configuration class by its name.
+        Raises:
+            ValueError: If no such config is found.
+        """
+        try:
+            return cast(type[BaseAgentConfig], cls._agent_config_registry[name])
+        except KeyError:
+            raise ValueError(f"Agent config for '{name}' not found.")

proxy-lite-demo-v2/src/proxy_lite/agents/proxy_lite_agent.py ADDED Viewed

	@@ -0,0 +1,61 @@

+from functools import cached_property
+from typing import Literal
+from pydantic import Field
+from proxy_lite.history import MessageHistory, MessageLabel, SystemMessage, Text
+from proxy_lite.tools import Tool
+from .agent_base import Agents, BaseAgent, BaseAgentConfig
+MODEL_SYSTEM_PROMPT = """You are Proxy-Lite, an AI assistant that can perform actions on a computer screen.
+You were developed by Convergence AI.
+The user will instruct you to perform a task.
+You will be shown a screen as well as relevant interactable elements highlighted by mark_ids and you will be given a set of tools to use to perform the task.
+CRITICAL WORKFLOW INSTRUCTIONS:
+1. Make observations about the screen, putting them in <observation></observation> tags.
+2. Reason about what needs to be done to complete the task, putting your thoughts in <thinking></thinking> tags.
+3. Use the tools to perform actions - DO NOT just describe what you plan to do, EXECUTE the actions immediately.
+4. When you receive "[ACTION COMPLETED]" feedback, analyze the new screen state to determine your next action.
+5. Continue executing actions step by step until the entire task is complete.
+6. Use the return_value tool only when the ENTIRE task is finished.
+IMPORTANT: Do NOT stop after one action. Multi-step tasks require multiple tool calls. When you receive action completion feedback, immediately analyze the screen and continue with the next required action.
+"""  # noqa: E501
+MAX_MESSAGES_FOR_CONTEXT_WINDOW = {
+    MessageLabel.SCREENSHOT: 1,
+}
+@Agents.register_agent_config("proxy_lite")
+class ProxyLiteAgentConfig(BaseAgentConfig):
+    name: Literal["proxy_lite"] = "proxy_lite"
+    history_messages_limit: dict[MessageLabel, int] = Field(
+        default_factory=lambda: MAX_MESSAGES_FOR_CONTEXT_WINDOW,
+    )
+@Agents.register_agent("proxy_lite")
+class ProxyLiteAgent(BaseAgent):
+    config: ProxyLiteAgentConfig
+    message_label: MessageLabel = MessageLabel.AGENT_MODEL_RESPONSE
+    def __init__(self, **data):
+        super().__init__(**data)
+    @property
+    def system_prompt(self) -> str:
+        return MODEL_SYSTEM_PROMPT
+    @cached_property
+    def tools(self) -> list[Tool]:
+        return self.env_tools
+    async def get_history_view(self) -> MessageHistory:
+        return MessageHistory(
+            messages=[SystemMessage(content=[Text(text=self.system_prompt)])],
+        ) + self.history.history_view(
+            limits=self.config.history_messages_limit,
+        )

proxy-lite-demo-v2/src/proxy_lite/app.py ADDED Viewed

	@@ -0,0 +1,239 @@

+import asyncio
+import base64
+from io import BytesIO
+import streamlit as st
+from PIL import Image
+from proxy_lite import Runner, RunnerConfig
+def get_user_config(config_expander):
+    config = {
+        "environment": {
+            "name": "webbrowser",
+            "annotate_image": True,
+            "screenshot_delay": 2.0,
+            "include_html": False,
+            "viewport_width": 1280,
+            "viewport_height": 1920,
+            "include_poi_text": True,
+            "homepage": "https://dwd000006jia1mae.lightning.force.com/lightning/setup/AccountForecastSettings/home",
+            "keep_original_image": False,
+            "headless": False,  # without proxies headless mode often results in getting bot blocked
+        },
+        "solver": {
+            "name": "simple",
+            "agent": {
+                "name": "proxy_lite",
+                "client": {
+                    "name": "convergence",
+                    "model_id": "convergence-ai/proxy-lite-3b",
+                    "api_base": "https://convergence-ai-demo-api.hf.space/v1",
+                },
+            },
+        },
+        "local_view": False,
+        "verbose": True,
+        "task_timeout": 1800,  # 30 minutes
+        "action_timeout": 300,
+        "environment_timeout": 120,
+    }
+    with config_expander:
+        st.subheader("Environment Settings")
+        col1, col2 = st.columns(2)
+        with col1:
+            config["environment"]["include_html"] = st.checkbox(
+                "Include HTML",
+                value=config["environment"]["include_html"],
+                help="Include HTML in observations",
+            )
+            config["environment"]["include_poi_text"] = st.checkbox(
+                "Include POI Text",
+                value=config["environment"]["include_poi_text"],
+                help="Include points of interest text in observations",
+            )
+            config["environment"]["homepage"] = st.text_input(
+                "Homepage",
+                value=config["environment"]["homepage"],
+                help="Homepage to start from",
+            )
+        with col2:
+            config["solver"]["agent"]["client"]["api_base"] = st.text_input(
+                "VLLM Server URL",
+                value=config["solver"]["agent"]["client"]["api_base"],
+                help="URL of a vllm server running proxy-lite",
+            )
+            config["environment"]["screenshot_delay"] = st.slider(
+                "Screenshot Delay (seconds)",
+                min_value=0.5,
+                max_value=10.0,
+                value=config["environment"]["screenshot_delay"],
+                step=0.5,
+                help="Delay before taking screenshots",
+            )
+        st.subheader("Advanced Settings")
+        config["task_timeout"] = st.number_input(
+            "Task Timeout (seconds)",
+            min_value=60,
+            max_value=3600,
+            step=60,
+            value=config["task_timeout"],
+            help="Maximum time allowed for task completion",
+        )
+        config["action_timeout"] = st.number_input(
+            "Action Timeout (seconds)",
+            min_value=10,
+            max_value=300,
+            step=10,
+            value=config["action_timeout"],
+            help="Maximum time allowed for an action to complete",
+        )
+        config["environment_timeout"] = st.number_input(
+            "Environment Timeout (seconds)",
+            min_value=10,
+            max_value=300,
+            step=10,
+            value=config["environment_timeout"],
+            help="Maximum time allowed for environment to respond",
+        )
+    return config
+async def run_task_async(
+    task: str,
+    status_placeholder,
+    action_placeholder,
+    environment_placeholder,
+    image_placeholder,
+    history_placeholder,
+    config: dict,
+):
+    try:
+        config = RunnerConfig.from_dict(config)
+    except Exception as e:
+        st.error(f"Error loading RunnerConfig: {e!s}")
+        return
+    print(config)
+    runner = Runner(config=config)
+    # Add the spinning animation using HTML
+    status_placeholder.markdown(
+        """
+        <style>
+        @keyframes spin {
+            0% { content: "⚡"; }
+            25% { content: "⚡."; }
+            50% { content: "⚡.."; }
+            75% { content: "⚡..."; }
+        }
+        .spinner::before {
+            content: "⚡";
+            animation: spin 2s linear infinite;
+            display: inline-block;
+        }
+        </style>
+        <div><b>Resolving your task  </b><span class="spinner"></span></div>
+        """,
+        unsafe_allow_html=True,
+    )
+    all_steps = []
+    all_screenshots = []
+    all_soms = []
+    async for run in runner.run_generator(task):
+        # Update status with latest step
+        if run.actions:
+            latest_step = run.actions[-1].text
+            latest_step += "".join(
+                [
+                    f'<tool_call>{{"name": {tool_call.function["name"]}, "arguments": {tool_call.function["arguments"]}}}</tool_call>'  # noqa: E501
+                    for tool_call in run.actions[-1].tool_calls
+                ]
+            )
+            action_placeholder.write(f"⚡ **Latest Step:** {latest_step}")
+            all_steps.append(latest_step)
+        # Update image if available
+        if run.observations and run.observations[-1].state.image:
+            environment_placeholder.write("🌐 **Environment:**")
+            image_bytes = base64.b64decode(run.observations[-1].state.image)
+            image = Image.open(BytesIO(image_bytes))
+            image_placeholder.image(image, use_container_width=True)
+            all_screenshots.append(image)
+            som = run.observations[-1].state.text
+            all_soms.append(som)
+        # Update history
+        with history_placeholder, st.expander("🕝 **History**"):
+            for idx, (action, img, som) in enumerate(zip(all_steps, all_screenshots, all_soms, strict=False)):
+                st.write(f"**Step {idx + 1}**")
+                st.image(img, use_container_width=True)
+                st.markdown(som)
+                st.write(action)
+    action_placeholder.write(" ")
+    status_placeholder.write(f"✨ **Result:** {latest_step}")
+def main():
+    st.title("⚡ Proxy-Lite")
+    def img_to_base64(image_path):
+        with open(image_path, "rb") as img_file:
+            return base64.b64encode(img_file.read()).decode("utf-8")
+    st.markdown("Powered by **Proxy-Lite**", unsafe_allow_html=True)
+    if "config_expanded" not in st.session_state:
+        st.session_state.config_expanded = False
+    if "settings_expanded" not in st.session_state:
+        st.session_state.settings_expanded = False
+    config_expander = st.expander("⚙️ Proxy-Lite Configuration", expanded=st.session_state.config_expanded)
+    config = get_user_config(config_expander)
+    with st.form(key="run_task_form"):
+        task = st.text_input(
+            "Submit a task",
+            key="task_input",
+            help="Enter a task to be completed",
+        )
+        submit_button = st.form_submit_button("Submit a task", type="primary", use_container_width=True)
+        if submit_button:
+            st.session_state.config_expanded = False
+            if task:
+                # Create placeholders for dynamic updates
+                status_placeholder = st.empty()
+                st.write(" ")
+                action_placeholder = st.empty()
+                environment_placeholder = st.empty()
+                image_placeholder = st.empty()
+                history_placeholder = st.empty()
+                # Run the async task
+                asyncio.run(
+                    run_task_async(
+                        task,
+                        status_placeholder,
+                        action_placeholder,
+                        environment_placeholder,
+                        image_placeholder,
+                        history_placeholder,
+                        config,
+                    ),
+                )
+                st.success("Task completed!", icon="✨")
+            else:
+                st.error("Please give a task first!")
+if __name__ == "__main__":
+    main()

proxy-lite-demo-v2/src/proxy_lite/browser/__init__.py ADDED Viewed

File without changes

proxy-lite-demo-v2/src/proxy_lite/browser/add_custom_select.js ADDED Viewed

	@@ -0,0 +1,123 @@

+handledSelectElementsConvergence = new WeakSet();
+overwriteDefaultSelectConvergence = (input = null) => {
+    let activeSelectElement = null;
+    // Handle iframe input element
+    let rootElement = input ? input : document.documentElement;
+    function createCustomSelectElement() {
+        // Create the custom select container
+        const customSelect = document.createElement('div');
+        customSelect.id = 'convergence-custom-select-element-X2EmudtLRN';
+        customSelect.style.position = 'absolute'
+        customSelect.style.zIndex = 2147483647 - 1;
+        customSelect.style.display = 'none';
+        document.body.appendChild(customSelect);
+        // Create the select options list
+        const optionsList = document.createElement('div');
+        optionsList.style.border = '1px solid #ccc';
+        optionsList.style.backgroundColor = '#fff';
+        optionsList.style.color = 'black';
+        customSelect.appendChild(optionsList);
+        return customSelect;
+    }
+    function showCustomSelect(select) {
+        activeSelectElement = select;
+        // Clear previous options
+        const customSelect = rootElement.querySelector('#convergence-custom-select-element-X2EmudtLRN');
+        let optionsList = customSelect.firstChild;
+        optionsList.innerHTML = '';
+        // Populate with new options
+        Array.from(select.options).forEach(option => {
+            const customOption = document.createElement('div');
+            customOption.className = 'custom-option';
+            customOption.style.padding = '8px';
+            customOption.style.cursor = 'pointer';
+            customOption.textContent = option.text;
+            customOption.dataset.value = option.value;
+            optionsList.appendChild(customOption);
+            customOption.addEventListener('mouseenter', function () {
+                customOption.style.backgroundColor = '#f0f0f0';
+            });
+            customOption.addEventListener('mouseleave', function () {
+                customOption.style.backgroundColor = '';
+            });
+            customOption.addEventListener('mousedown', (e) => {
+                e.stopPropagation();
+                select.value = customOption.dataset.value;
+                customSelect.style.display = 'none';
+                activeSelectElement = null;
+                // ensure we trigger all potential event listeners
+                select.dispatchEvent(new InputEvent('focus', { bubbles: true, cancelable: true }));
+                select.dispatchEvent(new InputEvent('input', { bubbles: true, cancelable: true }));
+                select.dispatchEvent(new InputEvent('change', { bubbles: true, cancelable: true }));
+                select.dispatchEvent(new InputEvent('blur', { bubbles: true, cancelable: true }));
+            });
+        });
+        // Position and show the custom select
+        const selectRect = select.getBoundingClientRect();
+        customSelect.style.top = `${selectRect.bottom + window.scrollY}px`;
+        customSelect.style.left = `${selectRect.left + window.scrollX}px`;
+        customSelect.style.width = `${selectRect.width}px`;
+        customSelect.style.display = 'block';
+        select.focus();
+        select.addEventListener('blur', function (e) {
+            customSelect.style.display = 'none';
+            activeSelectElement = null;
+        });
+        select.addEventListener('change', function (e) {
+            customSelect.style.display = 'none';
+            activeSelectElement = null;
+        });
+    }
+    // Ensure we have a custom select element
+    let customSelect = rootElement.querySelector(`#convergence-custom-select-element-X2EmudtLRN`);
+    if (!customSelect) {
+        customSelect = createCustomSelectElement();
+    }
+    // Find selects in shadow DOMs
+    function findSelectInShadowRoot(element) {
+        if (element.shadowRoot) {
+            return element.shadowRoot.querySelectorAll('select');
+        }
+        return [];
+    }
+    let shadowSelects = [];
+    rootElement.querySelectorAll('*').forEach(el => {
+        shadowSelects.push(...findSelectInShadowRoot(el));
+    });
+    // Find selects in the regular (light) DOM
+    const lightSelects = Array.from(rootElement.querySelectorAll('select'));
+    // Add event listeners to all select elements
+    const allSelects = [...lightSelects, ...shadowSelects];
+    allSelects.forEach(select => {
+        if (select.hasAttribute('multiple')) {
+            // skip special multiple elements as our POI code already handles them
+            return;
+        }
+        if (!handledSelectElementsConvergence.has(select)) {
+            select.addEventListener('mousedown', (e) => {
+                // only use custom select when the default behaviour is being used
+                if (!e.defaultPrevented) {
+                    showCustomSelect(select);
+                    e.preventDefault();
+                }
+            });
+            handledSelectElementsConvergence.add(select);
+        }
+    });
+}

proxy-lite-demo-v2/src/proxy_lite/browser/bounding_boxes.py ADDED Viewed

	@@ -0,0 +1,210 @@

+import math
+from typing import Any
+import cv2
+import numpy as np
+from pydantic import BaseModel, Field, field_validator
+class Point(BaseModel):
+    x: int
+    y: int
+    def __iter__(self):
+        return iter((self.x, self.y))
+    def __getitem__(self, index) -> int:
+        return (self.x, self.y)[index]
+    def __tuple__(self) -> tuple[int, int]:
+        return (self.x, self.y)
+    def __repr__(self) -> str:
+        return f"Point(x={self.x}, y={self.y})"
+class BoundingBox(BaseModel):
+    """Labelled rectangle with integer edges.
+    Incoming edge values are converted with ``float()`` and snapped outward: ``left`` and
+    ``top`` are rounded down, ``right`` and ``bottom`` are rounded up.
+    """
+    label: str = Field(..., description="The label that's given for this bounding box")
+    left: int = Field(..., description="Left coordinate of the bounding box")
+    right: int = Field(..., description="Right coordinate of the bounding box")
+    top: int = Field(..., description="Top coordinate of the bounding box")
+    bottom: int = Field(..., description="Bottom coordinate of the bounding box")
+    # mode="before": the validator sees the raw input value.
+    @field_validator("left", "top", mode="before")
+    @classmethod
+    def round_down(cls, v) -> int:
+        """Floor the raw value so the left/top edge never moves inward."""
+        return math.floor(float(v))
+    @field_validator("right", "bottom", mode="before")
+    @classmethod
+    def round_up(cls, v) -> int:
+        """Ceil the raw value so the right/bottom edge never moves inward."""
+        return math.ceil(float(v))
+class POI(BaseModel):
+    """A point of interest: free-form element data plus its centroid and bounding box."""
+    # Arbitrary key/value data about the element - presumably its tag, text and attributes;
+    # verify against the code that builds POIs.
+    info: dict[str, Any]
+    element_centroid: Point
+    bounding_box: BoundingBox
+def calculate_dash_points(start, end, dash_length, gap_length):
+    x1, y1 = start
+    x2, y2 = end
+    dx = x2 - x1
+    dy = y2 - y1
+    dist = np.sqrt(dx * dx + dy * dy)
+    if dist == 0:
+        return []
+    unit_x = dx / dist
+    unit_y = dy / dist
+    dash_points = []
+    current_dist = 0
+    while current_dist < dist:
+        dash_end = min(current_dist + dash_length, dist)
+        dash_points.extend(
+            [
+                (int(x1 + unit_x * current_dist), int(y1 + unit_y * current_dist)),
+                (int(x1 + unit_x * dash_end), int(y1 + unit_y * dash_end)),
+            ],
+        )
+        current_dist += dash_length + gap_length
+    return dash_points
+def draw_dashed_rectangle(
+    img,
+    bbox: BoundingBox,
+    color,
+    thickness=1,
+    dash_length=10,
+    gap_length=5,
+):
+    # Calculate dash points for all sides
+    top_points = calculate_dash_points(
+        (bbox.left + 25, bbox.top + 25),
+        (bbox.right + 25, bbox.top + 25),
+        dash_length,
+        gap_length,
+    )
+    right_points = calculate_dash_points(
+        (bbox.right + 25, bbox.top + 25),
+        (bbox.right + 25, bbox.bottom + 25),
+        dash_length,
+        gap_length,
+    )
+    bottom_points = calculate_dash_points(
+        (bbox.right + 25, bbox.bottom + 25),
+        (bbox.left + 25, bbox.bottom + 25),
+        dash_length,
+        gap_length,
+    )
+    left_points = calculate_dash_points(
+        (bbox.left + 25, bbox.bottom + 25),
+        (bbox.left + 25, bbox.top + 25),
+        dash_length,
+        gap_length,
+    )
+    # Combine all points
+    all_points = top_points + right_points + bottom_points + left_points
+    # Draw all lines at once
+    if all_points:
+        all_points = np.array(all_points).reshape((-1, 2, 2))
+        cv2.polylines(img, all_points, False, color, thickness)
+# @time_it(name='Annotate bounding box')
+def annotate_bounding_box(image: np.ndarray, bbox: BoundingBox) -> None:
+    """Draw ``bbox`` and its label onto ``image`` in place.
+    The box is drawn as a dashed outline and the label as a semi-transparent patch placed
+    above the box's top-left corner. All coordinates are shifted by 25 px, matching the
+    25 px border that ``annotate_bounding_boxes`` adds before calling this function.
+    Args:
+        image: Image array indexed as ``[row, column]`` (its ``shape`` is read); modified in place.
+        bbox: Box to draw; ``bbox.label`` is the text of the label.
+    """
+    # Draw dashed bounding box
+    draw_dashed_rectangle(
+        image,
+        bbox,
+        color=(0, 0, 255),
+        thickness=1,
+        dash_length=10,
+        gap_length=5,
+    )
+    # Prepare label
+    font_scale = 0.4 * 4  # Increased by 4x for the larger patch
+    font = cv2.FONT_HERSHEY_SIMPLEX
+    thickness = 3  # Increased thickness for the larger patch
+    # Get text size for the larger patch
+    (label_width, label_height), _ = cv2.getTextSize(
+        bbox.label,
+        font,
+        font_scale,
+        thickness,
+    )
+    # Create the oversized 4-channel patch: text size plus 20 px in each dimension
+    large_label_patch = np.zeros(
+        (label_height + 20, label_width + 20, 4),
+        dtype=np.uint8,
+    )
+    large_label_patch[:, :, 0:3] = (0, 0, 255)  # BGR color format: Red background
+    large_label_patch[:, :, 3] = 128  # Alpha channel: 50% opacity (128/255 = 0.5)
+    # Draw text on the larger patch
+    cv2.putText(
+        large_label_patch,
+        bbox.label,
+        (8, label_height + 8),  # Adjusted position for the larger patch
+        font,
+        font_scale,
+        (255, 255, 255, 128),  # White text, 50% opaque (128/255 = 0.5)
+        thickness,
+    )
+    # Scale down the patch to improve anti-aliasing (target size is given as width, height)
+    label_patch = cv2.resize(
+        large_label_patch,
+        (label_width // 4 + 5, label_height // 4 + 5),
+        interpolation=cv2.INTER_AREA,
+    )
+    # Calculate position for top-left alignment: the patch sits just above the box's top edge
+    offset = 2  # Small offset to prevent touching the bounding box edge
+    x = min(image.shape[1], max(0, int(bbox.left + 25) - offset))
+    y = min(image.shape[0], max(0, int(bbox.top + 25) - label_patch.shape[0] - offset))
+    # Ensure we're not out of bounds: crop the patch to the part that fits inside the image
+    x_end = min(image.shape[1], x + label_patch.shape[1])
+    y_end = min(image.shape[0], y + label_patch.shape[0])
+    label_patch = label_patch[: (y_end - y), : (x_end - x)]
+    # Create a mask for the label patch: alpha scaled to [0, 1] and repeated for the 3 colour channels
+    alpha_mask = label_patch[:, :, 3] / 255.0
+    alpha_mask = np.repeat(alpha_mask[:, :, np.newaxis], 3, axis=2)
+    # Blend the label patch with the image: (1 - alpha) * background + alpha * patch
+    image_section = image[y:y_end, x:x_end]
+    blended = (1 - alpha_mask) * image_section + alpha_mask * label_patch[:, :, 0:3]
+    image[y:y_end, x:x_end] = blended.astype(np.uint8)
+def annotate_bounding_boxes(image: bytes, bounding_boxes: list[BoundingBox]) -> bytes:
+    # Read the image
+    nparr = np.frombuffer(image, np.uint8)
+    # Decode the image
+    img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
+    padded_img = cv2.copyMakeBorder(
+        img,
+        top=25,  # Value chosen based on label size
+        bottom=25,  # Value chosen based on label size
+        left=25,  # Value chosen based on label size
+        right=25,  # Value chosen based on label size
+        borderType=cv2.BORDER_CONSTANT,
+        value=(255, 255, 255),
+    )
+    for bounding_box in bounding_boxes:
+        # Annotate the image in place with the bounding box and the bounding box label
+        annotate_bounding_box(padded_img, bounding_box)
+    _, buffer = cv2.imencode(".jpeg", padded_img)
+    return buffer.tobytes()

proxy-lite-demo-v2/src/proxy_lite/browser/browser.py ADDED Viewed

	@@ -0,0 +1,508 @@

+import asyncio
+import logging
+import platform
+import re
+from contextlib import AsyncExitStack
+from pathlib import Path
+from typing import Literal, Optional, Self
+from playwright.async_api import Browser, BrowserContext, Page, Playwright, async_playwright
+from playwright.async_api import TimeoutError as PlaywrightTimeoutError
+from playwright_stealth import StealthConfig, stealth_async
+from pydantic import Field
+from tenacity import before_sleep_log, retry, stop_after_delay, wait_exponential
+from proxy_lite.browser.bounding_boxes import POI, BoundingBox, Point, annotate_bounding_boxes
+from proxy_lite.logger import logger
+import base64
+# Lower-case tag names that element_as_text renders in self-closing form (`<tag .../>`)
+# when the element carries no text.
+SELF_CONTAINED_TAGS = [
+    # many of these are non-interactive but keeping them anyway
+    "area",
+    "base",
+    "br",
+    "col",
+    "embed",
+    "hr",
+    "img",
+    "input",
+    "link",
+    "meta",
+    "param",
+    "source",
+    "track",
+    "wbr",
+]
+def element_as_text(
+    mark_id: int,
+    tag: Optional[str] = None,
+    text: Optional[str] = None,
+    **raw_attributes,
+) -> str:
+    """Return a text representation of all elements on the page."""
+    attributes = []
+    for k, v in raw_attributes.items():
+        if v is None:
+            continue
+        if isinstance(v, bool):
+            if v:
+                attributes.append(k)
+            # we ignore False bool attributes
+        else:
+            v = str(v)
+            if len(v) > 2500:
+                v = v[: 2500 - 1] + "…"
+            attributes.append(f'{k}="{v}"')
+    attributes = " ".join(attributes)
+    attributes = (" " + attributes).rstrip()
+    tag = tag.lower()
+    if text is None:
+        text = ""
+    if len(text) > 2500:
+        text = text[: 2500 - 1] + "…"
+    # sub-out line breaks so elements are easier to distinguish
+    attributes = re.sub(r"\r\n|\r|\n", "⏎", attributes)
+    text = re.sub(r"\r\n|\r|\n", "⏎", text)
+    if tag in SELF_CONTAINED_TAGS:
+        if text:
+            logger.warning(
+                f"Got self-contained element '{tag}' which contained text '{text}'.",
+            )
+        else:
+            return f"- [{mark_id}] <{tag}{attributes}/>"
+    return f"- [{mark_id}] <{tag}{attributes}>{text}</{tag}>"
+class BrowserSession:
+    def __init__(
+        self,
+        viewport_width: int = 1280,
+        viewport_height: int = 720,
+        headless: bool = True,
+    ):
+        self.viewport_width = viewport_width
+        self.viewport_height = viewport_height
+        self.headless = headless
+        self.playwright: Playwright | None = None
+        self.browser: Browser | None = None
+        self.context: BrowserContext | None = None
+        self._exit_stack: AsyncExitStack | None = None
+        self.poi_elements: list = Field(default_factory=list)
+        self.poi_centroids: list[Point] = Field(default_factory=list)
+        self.bounding_boxes: list[BoundingBox] = Field(default_factory=list)
+        self.pois: list[POI] = Field(default_factory=list)
+    async def __aenter__(self) -> Self:
+        self._exit_stack = AsyncExitStack()
+        self.playwright = await async_playwright().start()
+        self.browser = await self.playwright.chromium.launch(headless=self.headless)
+        self.context = await self.browser.new_context(
+            viewport={"width": self.viewport_width, "height": self.viewport_height},
+        )
+        # Ensure there's at least one page open
+        if not self.context.pages:
+            await self.context.new_page()
+        self.context.set_default_timeout(60_000)
+        self.current_page.set_default_timeout(60_000)
+        await stealth_async(self.current_page, StealthConfig(navigator_user_agent=False))
+        await self.context.add_init_script(
+            path=Path(__file__).with_name("add_custom_select.js"),
+        )
+        await self.context.add_init_script(
+            path=Path(__file__).with_name("find_pois.js"),
+        )
+        return self
+    async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
+        if self.browser:
+            await self.browser.close()
+        if self.playwright:
+            await self.playwright.stop()
+        if self._exit_stack:
+            await self._exit_stack.aclose()
+    @property
+    def current_page(self) -> Optional[Page]:
+        if self.context and self.context.pages:
+            return self.context.pages[-1] # Return the most recently opened page
+        return None
+    @property
+    def current_url(self) -> Optional[str]:
+        if self.current_page:
+            return self.current_page.url
+        return None
+    # re-run for cases of mid-run redirects
+    @retry(
+        wait=wait_exponential(multiplier=1, min=1, max=10),
+        stop=stop_after_delay(5),
+        reraise=True,
+        before_sleep=before_sleep_log(logger, logging.ERROR),
+    )
+    async def process_iframe(self, iframe) -> Optional[tuple[dict, dict]]:
+        try:
+            # Check iframe visibility and size
+            bounding_box = await iframe.bounding_box()
+            if not bounding_box:
+                return None  # Skip if iframe is not visible
+            width, height = bounding_box["width"], bounding_box["height"]
+            if width < 50 or height < 50:
+                return None
+            frame = await iframe.content_frame()
+            if not frame:
+                return None
+            poi = await frame.evaluate(
+                """() => {
+                    overwriteDefaultSelectConvergence();
+                    return findPOIsConvergence();
+                }""",
+            )
+            if not poi:
+                return None
+            iframe_offset = {"x": round(bounding_box["x"]), "y": round(bounding_box["y"])}
+            return poi, iframe_offset
+        except Exception as e:
+            logger.error(f"Error processing iframe: {e}")
+            return None
+    @retry(
+        wait=wait_exponential(multiplier=1, min=1, max=10),
+        stop=stop_after_delay(5),
+        reraise=True,
+        before_sleep=before_sleep_log(logger, logging.ERROR),
+    )
+    async def update_poi(self) -> None:
+        try:
+            # Wait for basic page load states to ensure the DOM is ready.
+            # This is a fundamental wait that should always apply.
+            await self.current_page.wait_for_load_state("domcontentloaded", timeout=60000)
+            logger.debug(f"DEBUG: wait_for_load_state('domcontentloaded') completed for {self.current_page.url}.")
+            current_url = self.current_page.url
+            # Define common Salesforce URL patterns for different states
+            login_url_patterns = [
+                "login.salesforce.com",
+                "identity.force.com",
+                "auth.lightning.force.com",
+                "setup.salesforce.com", # Sometimes a setup login redirects here temporarily
+                "my.salesforce.com" # Your specific custom domain login redirects here
+            ]
+            # This is the main Salesforce Lightning application base URL, typically seen after login.
+            # We treat this as an intermediate loading state before the specific target page.
+            intermediate_app_url_pattern = "/one/one.app"
+            # Check the current state of the page based on its URL
+            is_on_login_page = any(pattern in current_url for pattern in login_url_patterns)
+            is_on_intermediate_app_page = intermediate_app_url_pattern in current_url
+            # Note: is_on_target_forecast_page checks if the specific target path is in the URL
+            is_on_target_forecast_page = "/AccountForecastSettings/home" in current_url
+            # --- CONDITIONAL WAITING LOGIC BASED ON URL ---
+            if is_on_target_forecast_page:
+                logger.info(f"INFO: Detected target Account Forecast Settings page: {current_url}. Waiting for content.")
+                # When on the specific target page, wait for its content and spinners
+                spinner_selectors = [
+                    "div.slds-spinner_container",
+                    "div.auraLoadingBox",
+                    "div.dxp_axb_container", # Main overlay from your inspect screenshot
+                    "div.slds-sprite-astro-x-large" # Specific animated element itself
+                ]
+                for selector in spinner_selectors:
+                    try:
+                        await self.current_page.wait_for_selector(selector, state="hidden", timeout=5000) # Reduced timeout
+                        logger.debug(f"DEBUG: Spinner element '{selector}' became hidden for {self.current_page.url}.")
+                    except PlaywrightTimeoutError:
+                        logger.warning(f"DEBUGGING: Spinner element '{selector}' not detected or did not disappear on {self.current_page.url} within 5s.")
+                # Wait for a known element on the Account Forecast Settings page to ensure content is there.
+                try:
+                    # Added 'h2' for section headers, and a more generic 'div[data-aura-rendered-by]' for Lightning components
+                    await self.current_page.wait_for_selector("h1.slds-page-header__title, h2, .account-forecast-settings-component, div[data-aura-rendered-by]", state="visible", timeout=15000) # Increased timeout slightly for robust content load
+                    logger.debug(f"DEBUG: Confirmed main page element visible for {self.current_page.url}.")
+                except PlaywrightTimeoutError:
+                    logger.warning(f"DEBUGGING: Main page element not visible on {self.current_page.url} within 15s. This might indicate incomplete page load despite no spinner.")
+            elif is_on_login_page:
+                logger.info(f"INFO: Detected Salesforce login page: {current_url}. Waiting for login elements.")
+                # When on a login page, just wait for the login form elements to be visible
+                try:
+                    await self.current_page.wait_for_selector("input[type='email'], input[type='password'], input[type='submit'], #username, #password, #Login", state="visible", timeout=10000)
+                    logger.debug(f"DEBUG: Login page elements visible on {self.current_page.url}.")
+                except PlaywrightTimeoutError:
+                    logger.warning(f"DEBUGGING: Login page elements not visible on {self.current_page.url} within 10s. This may happen if elements are in an iframe or if page is extremely slow.")
+            elif is_on_intermediate_app_page:
+                logger.info(f"INFO: Detected intermediate Salesforce Lightning app loading page: {current_url}. Waiting for network idle and app spinner.")
+                # This is the /one/one.app page or similar. Don't wait for specific content, just general load.
+                try:
+                    await self.current_page.wait_for_load_state("networkidle", timeout=30000) # Give it more time for network to settle
+                    logger.debug(f"DEBUG: Network idle detected on intermediate app page: {current_url}.")
+                except PlaywrightTimeoutError:
+                    logger.warning(f"DEBUGGING: Network idle timeout on intermediate app page: {current_url}. Proceeding anyway.")
+                # Also try to wait for a common full-app spinner to disappear, if present
+                try:
+                    await self.current_page.wait_for_selector('div.app-spinner, div.auraLoadingBox', state='hidden', timeout=15000) # Added auraLoadingBox as it might reappear
+                    logger.debug(f"DEBUG: App spinner on intermediate page became hidden.")
+                except PlaywrightTimeoutError:
+                    logger.warning(f"DEBUGGING: App spinner on intermediate page not found or did not disappear.")
+            else:
+                logger.info(f"INFO: Detected unhandled URL type: {current_url}. Performing generic body wait.")
+                # Fallback for any other page, just wait for body to be visible
+                try:
+                    await self.current_page.wait_for_selector("body", timeout=5000, state="visible")
+                    logger.debug(f"DEBUG: wait_for_selector('body', state='visible') completed for {self.current_page.url}.")
+                except PlaywrightTimeoutError:
+                    logger.warning(f"DEBUGGING: Playwright Timeout (5s) on body selector for {self.current_page.url}. Continuing anyway.")
+                    pass
+        except PlaywrightTimeoutError as e:
+            logger.error(f"ERROR: Timeout waiting for page readiness for {self.current_page.url}: {e}")
+            raise # Re-raise if essential waits fail (e.g., initial domcontentloaded)
+        except Exception as e:
+            logger.error(f"ERROR: An unexpected error occurred during page readiness check for {self.current_page.url}: {e}")
+            raise
+        # Rest of update_poi: Run the bounding box javascript code to highlight the points of interest on the page
+        page_info = await self.current_page.evaluate(
+            """() => {
+                overwriteDefaultSelectConvergence();
+                return findPOIsConvergence();
+            }""",
+        )
+        # Get the points of interest on the page
+        self.poi_elements = page_info["element_descriptions"]
+        element_centroids = page_info["element_centroids"]
+        try:
+            # Select all iframes on the page
+            iframes = await self.current_page.query_selector_all("iframe")
+            max_iframes = 10
+            # Define an asynchronous function to process and filter each iframe
+            tasks = [asyncio.create_task(self.process_iframe(iframe)) for iframe in iframes[:max_iframes]]
+            results = await asyncio.gather(*tasks)
+            filtered_results = [result for result in results if result is not None]
+            iframes_pois = []
+            iframe_offsets = []
+            for poi, offset in filtered_results:
+                iframes_pois.append(poi)
+                iframe_offsets.append(offset)
+            # Combine the points of interest from the iframes with the main page and adjust the centroids
+            for index, iframe_poi in enumerate(iframes_pois):
+                self.poi_elements.extend(iframe_poi["element_descriptions"])
+                for centroid in iframe_poi["element_centroids"]:
+                    centroid["x"] += iframe_offsets[index]["x"]
+                    centroid["y"] += iframe_offsets[index]["y"]
+                    centroid["left"] += iframe_offsets[index]["x"]
+                    centroid["top"] += iframe_offsets[index]["y"]
+                    centroid["right"] += iframe_offsets[index]["x"]
+                    # Fix: Removed duplicate 'centroid["y"] += iframe_offsets[index]["y"]'
+                    centroid["bottom"] += iframe_offsets[index]["y"]
+                element_centroids.extend(iframe_poi["element_centroids"])
+        except Exception as e:
+            logger.error(f"Error in finding iframes: {e}")
+        # Get the centroids of the points of interest
+        self.poi_centroids = [Point(x=xy["x"], y=xy["y"]) for xy in element_centroids]
+        self.bounding_boxes = [BoundingBox(**xy, label=str(i)) for i, xy in enumerate(element_centroids)]
+        self.pois = [
+            POI(info=info, element_centroid=centroid, bounding_box=bbox)
+            for info, centroid, bbox in zip(
+                self.poi_elements,
+                self.poi_centroids,
+                self.bounding_boxes,
+                strict=False,
+            )
+        ]
+    @property
+    def poi_text(self) -> str:
+        # Get all points of interest on the page as text
+        texts = [element_as_text(mark_id=i, **element) for i, element in enumerate(self.poi_elements)]
+        # Return formatted text of points of interest on page
+        return "\n".join([txt for txt in texts if txt])
+    async def screenshot(
+        self,
+        delay: float = 0.0,
+        quality: int = 70,
+        type: str = "jpeg",
+        scale: str = "css",
+    ) -> tuple[bytes, bytes]:
+        if delay > 0.0:
+            await asyncio.sleep(delay)
+        await self.update_poi()
+        # Keep original logic if page is highly dynamic, but for static shots, simpler is faster
+        # old_poi_positions = [tuple(point) for point in self.poi_centroids]
+        img = await self.current_page.screenshot(type=type, quality=quality, scale=scale)
+        annotated_img = annotate_bounding_boxes(image=img, bounding_boxes=self.bounding_boxes)
+        # Re-evaluating this block for performance. Removed redundant update_poi and conditional screenshot.
+        # If precise screenshot timing is needed, the caller should manage delays and updates.
+        return img, annotated_img
+    async def goto(self, url: str) -> None:
+        await self.current_page.goto(url, wait_until="domcontentloaded")
+    async def reload(self) -> None:
+        await self.current_page.reload(wait_until="domcontentloaded")
+    async def click_tab(self, mark_id: int) -> None:
+        point: Point = self.poi_centroids[mark_id]
+        await self.hover(point)
+        await self.current_page.mouse.click(*point, button="middle")
+    async def click(self, mark_id: int) -> None:
+        point: Point = self.poi_centroids[mark_id]
+        await self.hover(point)
+        await self.current_page.mouse.click(*point)
+    async def enter_text(self, mark_id: int, text: str, submit: bool = False) -> None:
+        await self.clear_text_field(mark_id)
+        await self.click(mark_id)
+        await self.current_page.keyboard.type(text)
+        if submit:
+            await self.current_page.keyboard.press("Enter")
+    async def scroll(
+        self,
+        direction: Literal["up", "down", "left", "right"],
+        mark_id: Optional[int] = None,
+    ) -> None:
+        if mark_id is None:
+            point = Point(x=-1, y=-1)
+            max_scroll_x = self.viewport_width
+            max_scroll_y = self.viewport_height
+        else:
+            point: Point = self.poi_centroids[mark_id]
+            bbox: BoundingBox = self.bounding_boxes[mark_id]
+            max_scroll_x = bbox.right - bbox.left
+            max_scroll_y = bbox.bottom - bbox.top
+        await self.hover(point=point)
+        scroll_x = int(max_scroll_x * 0.8)
+        scroll_y = int(max_scroll_y * 0.8)
+        is_vertical = direction in ("up", "down")
+        reverse_scroll = direction in ("up", "left")
+        await self.current_page.mouse.wheel(
+            scroll_x * (-1 if reverse_scroll else 1) * (not is_vertical),
+            scroll_y * (-1 if reverse_scroll else 1) * is_vertical,
+        )
+    async def go_back(self) -> None:
+        # If there is no tab open then return
+        if not self.current_page:
+            return
+        await self.current_page.go_back(wait_until="domcontentloaded")
+        if self.current_page.url == "about:blank":
+            if not len(self.context.pages) > 1:
+                await self.current_page.go_forward(wait_until="domcontentloaded")
+                raise Exception("There is no previous page to go back to.")
+            await self.current_page.close()
+    async def hover(self, point: Point) -> None:
+        await self.current_page.mouse.move(*point)
+    async def focus(self, point: Point) -> None:
+        # Focus on the element on the page at point (x, y)
+        await self.current_page.evaluate(
+            """
+            ([x, y]) => {
+                const element = document.elementFromPoint(x, y);
+                if (element && element.focus) {
+                    element.focus();
+                }
+            }""",
+            tuple(point),
+        )
+    async def get_text(self, mark_id: int) -> str:
+        return await self.current_page.evaluate(
+            """
+            (mark_id) => {
+                const element = marked_elements_convergence[mark_id];
+                if (element && (element.value !== undefined || element.textContent !== undefined)) {
+                    return element.value || element.textContent;
+                }
+                return '';
+            }
+            """,
+            (mark_id,),
+        )
+    async def clear_text_field(self, mark_id: int) -> None:
+        existing_text = await self.get_text(mark_id)
+        if existing_text.strip():
+            # Clear existing text only if it exists
+            await self.click(mark_id)
+            if platform.system() == "Darwin":  # selecting all text is OS-specific
+                await self.click(mark_id)
+                await self.current_page.keyboard.press("Meta+a")
+                await self.current_page.keyboard.press("Backspace")
+            else:
+                await self.current_page.keyboard.press("Control+Home")
+                await self.current_page.keyboard.press("Control+Shift+End")
+            await self.current_page.keyboard.press("Backspace")
+    async def open_new_tab_and_go_to(self, url: str) -> None:
+        """
+        Opens a new browser tab/page and navigates to the specified URL.
+        Closes the old page if it's not the last one remaining.
+        """
+        logger.info(f"Attempting to open a new tab and navigate to: {url}")
+        new_page = await self.context.new_page()
+        # Close the previous page if it's not the only one left in the context
+        if len(self.context.pages) > 1 and self.current_page and self.current_page != new_page:
+            try:
+                await self.current_page.close()
+                logger.debug("Closed previous page.")
+            except Exception as e:
+                logger.warning(f"Could not close previous page (might already be closed or detached): {e}")
+        # After navigation, trigger POI update to reflect the new page's state
+        await new_page.goto(url, wait_until="domcontentloaded")
+        logger.info(f"Successfully navigated to {url} in a new tab.")
+        # Crucial: update_poi uses self.current_page, which is now new_page implicitly
+        await self.update_poi()
+if __name__ == "__main__":
+    async def dummy_test():
+        async with BrowserSession(headless=False) as s:
+            page = await s.context.new_page()
+            await page.goto("http://google.co.uk")
+            await asyncio.sleep(5)
+            await page.screenshot(path="example.png")
+            await s.update_poi()
+            _, annotated_image = await s.screenshot()
+            with open("output.png", "wb") as f:
+                f.write(annotated_image)
+    asyncio.run(dummy_test())

proxy-lite-demo-v2/src/proxy_lite/browser/find_pois.js ADDED Viewed

	@@ -0,0 +1,397 @@

+// Assigned without const/let; the Python BrowserSession.get_text helper reads this array
+// by name (marked_elements_convergence[mark_id]) from a later page.evaluate call.
+marked_elements_convergence = [];
+// Lower-case tag names that hasInteractiveTag() treats as interactive.
+const interactiveTags = new Set([
+    'a', 'button', 'details', 'embed', 'input', 'label',
+    'menu', 'menuitem', 'object', 'select', 'textarea', 'summary',
+    'video', 'audio', 'option', 'iframe'
+]);
+// Values of the `role` / `aria-role` attributes that hasInteractiveAttributes() treats as
+// interactive. 'combobox' is listed twice, which is harmless in a Set.
+const interactiveRoles = new Set([
+    'button', 'menu', 'menuitem', 'link', 'checkbox', 'radio',
+    'slider', 'tab', 'tabpanel', 'textbox', 'combobox', 'grid',
+    'listbox', 'option', 'progressbar', 'scrollbar', 'searchbox',
+    'switch', 'tree', 'treeitem', 'spinbutton', 'tooltip',
+    'a-button-inner', 'a-dropdown-button', 'click',
+    'menuitemcheckbox', 'menuitemradio', 'a-button-text',
+    'button-text', 'button-icon', 'button-icon-only',
+    'button-text-icon-only', 'dropdown', 'combobox'
+]);
+findPOIsConvergence = (input = null) => {
+    let rootElement = input ? input : document.documentElement;
+    function isScrollable(element) {
+        if ((input === null) && (element === document.documentElement)) {
+            // we can always scroll the full page
+            return false;
+        }
+        const style = window.getComputedStyle(element);
+        const hasScrollableYContent = element.scrollHeight > element.clientHeight
+        const overflowYScroll = style.overflowY === 'scroll' || style.overflowY === 'auto';
+        const hasScrollableXContent = element.scrollWidth > element.clientWidth;
+        const overflowXScroll = style.overflowX === 'scroll' || style.overflowX === 'auto';
+        return (hasScrollableYContent && overflowYScroll) || (hasScrollableXContent && overflowXScroll);
+    }
+    function getEventListeners(element) {
+        try {
+            return window.getEventListeners?.(element) || {};
+        } catch (e) {
+            return {};
+        }
+    }
+    function isInteractive(element) {
+        if (!element) return false;
+        return (hasInteractiveTag(element) ||
+            hasInteractiveAttributes(element) ||
+            hasInteractiveEventListeners(element)) ||
+            isScrollable(element);
+    }
+    function hasInteractiveTag(element) {
+        return interactiveTags.has(element.tagName.toLowerCase());
+    }
+    function hasInteractiveAttributes(element) {
+        const role = element.getAttribute('role');
+        const ariaRole = element.getAttribute('aria-role');
+        const tabIndex = element.getAttribute('tabindex');
+        const onAttribute = element.getAttribute('on');
+        if (element.getAttribute('contenteditable') === 'true') return true;
+        if ((role && interactiveRoles.has(role)) ||
+            (ariaRole && interactiveRoles.has(ariaRole))) return true;
+        if (tabIndex !== null && tabIndex !== '-1') return true;
+        // Add check for AMP's 'on' attribute that starts with 'tap:'
+        if (onAttribute && onAttribute.startsWith('tap:')) return true;
+        const hasAriaProps = element.hasAttribute('aria-expanded') ||
+            element.hasAttribute('aria-pressed') ||
+            element.hasAttribute('aria-selected') ||
+            element.hasAttribute('aria-checked');
+        return hasAriaProps;
+    }
+    function hasInteractiveEventListeners(element) {
+        const hasClickHandler = element.onclick !== null ||
+             element.getAttribute('onclick') !== null ||
+             element.hasAttribute('ng-click') ||
+             element.hasAttribute('@click') ||
+             element.hasAttribute('v-on:click');
+        if (hasClickHandler) return true;
+        const listeners = getEventListeners(element);
+        return listeners && (
+            listeners.click?.length > 0 ||
+            listeners.mousedown?.length > 0 ||
+            listeners.mouseup?.length > 0 ||
+            listeners.touchstart?.length > 0 ||
+            listeners.touchend?.length > 0
+        );
+    }
+    function calculateArea(rects) {
+        return rects.reduce((acc, rect) => acc + rect.width * rect.height, 0);
+    }
+    function getElementRects(element, context) {
+        const vw = Math.max(document.documentElement.clientWidth || 0, window.innerWidth || 0);
+        const vh = Math.max(document.documentElement.clientHeight || 0, window.innerHeight || 0);
+        let rects = [...element.getClientRects()];
+        // If rects are empty (likely due to Shadow DOM), try to estimate position
+        if (rects.length === 0 && element.getBoundingClientRect) {
+            rects = [element.getBoundingClientRect()];
+        }
+        // Get iframe offset if element is in an iframe
+        let iframeOffset = { x: 0, y: 0 };
+        if (context !== document && context?.defaultView?.frameElement) {
+            const iframe = context.defaultView.frameElement;
+            if (iframe) {
+                const iframeRect = iframe.getBoundingClientRect();
+                iframeOffset = {
+                    x: iframeRect.left,
+                    y: iframeRect.top
+                };
+            }
+        }
+        return rects.filter(bb => {
+            const center_x = bb.left + bb.width / 2 + iframeOffset.x;
+            const center_y = bb.top + bb.height / 2 + iframeOffset.y;
+            const elAtCenter = context.elementFromPoint(center_x - iframeOffset.x, center_y - iframeOffset.y);
+            return elAtCenter === element || element.contains(elAtCenter);
+        }).map(bb => {
+            const rect = {
+                left: Math.max(0, bb.left + iframeOffset.x),
+                top: Math.max(0, bb.top + iframeOffset.y),
+                right: Math.min(vw, bb.right + iframeOffset.x),
+                bottom: Math.min(vh, bb.bottom + iframeOffset.y)
+            };
+            return {
+                ...rect,
+                width: rect.right - rect.left,
+                height: rect.bottom - rect.top
+            };
+        });
+    }
+    function isElementVisible(element) {
+        const style = window.getComputedStyle(element);
+        return element.offsetWidth > 0 &&
+            element.offsetHeight > 0 &&
+            style.visibility !== 'hidden' &&
+            style.display !== 'none';
+    }
+    function isTopElement(element) {
+        let doc = element.ownerDocument;
+        if (doc !== window.document) {
+            // If in an iframe's document, treat as top
+            return true;
+        }
+        const shadowRoot = element.getRootNode();
+        if (shadowRoot instanceof ShadowRoot) {
+            const rect = element.getBoundingClientRect();
+            const point = { x: rect.left + rect.width / 2, y: rect.top + rect.height / 2 };
+            try {
+                const topEl = shadowRoot.elementFromPoint(point.x, point.y);
+                if (!topEl) return false;
+                let current = topEl;
+                while (current && current !== shadowRoot) {
+                    if (current === element) return true;
+                    current = current.parentElement;
+                }
+                return false;
+            } catch (e) {
+                return true;
+            }
+        }
+        const rect = element.getBoundingClientRect();
+        const point = { x: rect.left + rect.width / 2, y: rect.top + rect.height / 2 };
+        try {
+            const topEl = document.elementFromPoint(point.x, point.y);
+            if (!topEl) return false;
+            let current = topEl;
+            while (current && current !== document.documentElement) {
+                if (current === element) return true;
+                current = current.parentElement;
+            }
+            return false;
+        } catch (e) {
+            return true;
+        }
+    }
+    /**
+     * Return the visible text under `element` as one whitespace-normalised string.
+     *
+     * Text nodes are trimmed and collected in document order. <noscript> elements and
+     * elements whose computed style is display:none or visibility:hidden are skipped,
+     * and each <img> contributes a `[img - alt="..." title="..." aria-label="..."]`
+     * token built from whichever of those attributes are set.
+     *
+     * @param {Element} element - Root of the subtree to read.
+     * @param {Element[]} marked_elements_convergence - Elements regarded as separately
+     *     marked; meeting one of them below `element` ends text collection. This
+     *     parameter shadows any outer variable of the same name and defaults to [].
+     * @returns {string} The collected text, or '' when `element` itself is hidden.
+     */
+    function getVisibleText(element, marked_elements_convergence = []) {
+        // Computed `display` values that get a '\n' marker pushed before and after.
+        const blockLikeDisplays = [
+            // Basic block elements
+            'block', 'flow-root', 'inline-block',
+            // Lists
+            'list-item',
+            // Table elements
+            'table', 'inline-table', 'table-row', 'table-cell',
+            'table-caption', 'table-header-group', 'table-footer-group',
+            'table-row-group',
+            // Modern layouts
+            'flex', 'inline-flex', 'grid', 'inline-grid'
+        ];
+        // Check if element is hidden
+        const style = window.getComputedStyle(element);
+        if (style.display === 'none' || style.visibility === 'hidden') {
+            return '';
+        }
+        let collectedText = [];
+        // True when `el` is one of the elements passed in as already marked.
+        function isMarkedInteractive(el) {
+            return marked_elements_convergence.includes(el);
+        }
+        // Depth-first walk. Returns false to abort the whole traversal (the value is
+        // propagated up through every enclosing call) and true to carry on.
+        function traverse(node) {
+            // A marked descendant (never the root itself) stops collection: text that
+            // was gathered before it is kept, everything after it is not visited.
+            if (
+                node.nodeType === Node.ELEMENT_NODE &&
+                node !== element &&
+                isMarkedInteractive(node)
+            ) {
+                return false;
+            }
+            if (node.nodeType === Node.TEXT_NODE) {
+                const trimmed = node.textContent.trim();
+                if (trimmed) {
+                    collectedText.push(trimmed);
+                }
+            } else if (node.nodeType === Node.ELEMENT_NODE) {
+                // Skip noscript elements
+                if (node.tagName === 'NOSCRIPT') {
+                    return true;
+                }
+                const nodeStyle = window.getComputedStyle(node);
+                // Skip hidden elements
+                if (nodeStyle.display === 'none' || nodeStyle.visibility === 'hidden') {
+                    return true;
+                }
+                // Add newline before block elements if we have text
+                if (blockLikeDisplays.includes(nodeStyle.display) && collectedText.length > 0) {
+                    collectedText.push('\n');
+                }
+                // Images have no text content, so describe them by their attributes.
+                if (node.tagName === 'IMG') {
+                    const textParts = [];
+                    const alt = node.getAttribute('alt');
+                    const title = node.getAttribute('title');
+                    const ariaLabel = node.getAttribute('aria-label');
+                    // Add more as needed (e.g., 'aria-describedby', 'data-caption', etc.)
+                    if (alt) textParts.push(`alt="${alt}"`);
+                    if (title) textParts.push(`title="${title}"`);
+                    if (ariaLabel) textParts.push(`aria-label="${ariaLabel}"`);
+                    if (textParts.length > 0) {
+                        collectedText.push(`[img - ${textParts.join(' ')}]`);
+                    }
+                    // Returning here also skips the trailing block newline below.
+                    return true;
+                }
+                for (const child of node.childNodes) {
+                    const shouldContinue = traverse(child);
+                    if (shouldContinue === false) {
+                        return false;
+                    }
+                }
+                // Add newline after block elements
+                if (blockLikeDisplays.includes(nodeStyle.display)) {
+                    collectedText.push('\n');
+                }
+            }
+            return true;
+        }
+        traverse(element);
+        // Join text and normalize whitespace. The '\n' markers are joined with spaces
+        // on both sides, so each one is part of a whitespace run of two or more
+        // characters and is collapsed to a single space by the replace below.
+        return collectedText.join(' ').trim().replace(/\s{2,}/g, ' ').trim();
+    }
+    /**
+     * Walk the tree under `rootElement` and collect every element node that passes
+     * isInteractive, isElementVisible and isTopElement.
+     *
+     * The walk also descends into shadow roots, slot-assigned nodes and the documents
+     * of iframes whose contents can be accessed.
+     *
+     * @param {Element} rootElement - Where the walk starts.
+     * @returns {Array<{element: Element, area: *, rects: *, is_scrollable: *}>} One
+     *     entry per collected element, in visiting order.
+     */
+    function extractInteractiveItems(rootElement) {
+        const items = [];
+        // `context` is handed to getElementRects; it starts as `document` and becomes
+        // the shadow root or iframe document once the walk steps into one.
+        function processElement(element, context) {
+            if (!element) return;
+            // Recursively process elements
+            if (element.nodeType === Node.ELEMENT_NODE && isInteractive(element) && isElementVisible(element) && isTopElement(element)) {
+                const rects = getElementRects(element, context);
+                const area = calculateArea(rects);
+                items.push({
+                    element: element,
+                    area,
+                    rects,
+                    is_scrollable: isScrollable(element),
+                });
+            }
+            // NOTE(review): this branch is independent of the if / else-if / else chain
+            // below, so a shadow host has both its shadow children and (through the
+            // final else) its light-DOM children processed. Light children that are
+            // also slotted may therefore be visited twice - confirm.
+            if (element.shadowRoot) {
+                // if it's shadow DOM, process elements in the shadow DOM
+                Array.from(element.shadowRoot.childNodes || []).forEach(child => {
+                    processElement(child, element.shadowRoot);
+                });
+            }
+            if (element.tagName === 'SLOT') {
+                // Handle both assigned elements and nodes
+                const assigned = element.assignedNodes ? element.assignedNodes() : element.assignedElements();
+                assigned.forEach(child => {
+                    processElement(child, context);
+                });
+            }
+            else if (element.tagName === 'IFRAME') {
+                // Reading the iframe document can throw; in that case the iframe is
+                // skipped with a console warning.
+                try {
+                    const iframeDoc = element.contentDocument || element.contentWindow?.document;
+                    if (iframeDoc && iframeDoc.body) {
+                        // Process elements inside iframe
+                        processElement(iframeDoc.body, iframeDoc);
+                    }
+                } catch (e) {
+                    console.warn('Unable to access iframe contents:', e);
+                }
+            } else {
+                // if it's regular child elements, process regular child elements
+                Array.from(element.children || []).forEach(child => {
+                    processElement(child, context);
+                });
+            }
+        }
+        processElement(rootElement, document);
+        return items;
+    }
+    if (marked_elements_convergence) {
+        marked_elements_convergence = [];
+    }
+    let mark_centres = [];
+    let marked_element_descriptions = [];
+    var items = extractInteractiveItems(rootElement);
+    // Let's create a floating border on top of these elements that will always be visible
+    let index = 0;
+    items.forEach(function (item) {
+        item.rects.forEach((bbox) => {
+            marked_elements_convergence.push(item.element);
+            mark_centres.push({
+                x: Math.round((bbox.left + bbox.right) / 2),
+                y: Math.round((bbox.top + bbox.bottom) / 2),
+                left: bbox.left,
+                top: bbox.top,
+                right: bbox.right,
+                bottom: bbox.bottom,
+            });
+            marked_element_descriptions.push({
+                tag: item.element.tagName,
+                text: getVisibleText(item.element),
+                // NOTE: all other attributes will be shown to the model when present
+                // TODO: incorporate child attributes, e.g. <img alt="..."> when img is a child of the link element
+                value: item.element.value,
+                placeholder: item.element.getAttribute("placeholder"),
+                element_type: item.element.getAttribute("type"),
+                aria_label: item.element.getAttribute("aria-label"),
+                name: item.element.getAttribute("name"),
+                required: item.element.getAttribute("required"),
+                disabled: item.element.getAttribute("disabled"),
+                pattern: item.element.getAttribute("pattern"),
+                checked: item.element.getAttribute("checked"),
+                minlength: item.element.getAttribute("minlength"),
+                maxlength: item.element.getAttribute("maxlength"),
+                role: item.element.getAttribute("role"),
+                title: item.element.getAttribute("title"),
+                scrollable: item.is_scrollable
+            });
+            index++;
+        });
+    });
+    return {
+        element_descriptions: marked_element_descriptions,
+        element_centroids: mark_centres
+    };
+}

proxy-lite-demo-v2/src/proxy_lite/cli.py ADDED Viewed

	@@ -0,0 +1,112 @@

+import argparse
+import asyncio
+import base64
+import os
+from pathlib import Path
+from proxy_lite import Runner, RunnerConfig
+from proxy_lite.gif_maker import create_run_gif
+from proxy_lite.logger import logger
+def update_config_from_env(config: RunnerConfig) -> RunnerConfig:
+    if os.getenv("PROXY_LITE_API_BASE"):
+        config.solver.agent.client.api_base = os.getenv("PROXY_LITE_API_BASE")
+    if os.getenv("PROXY_LITE_MODEL"):
+        config.solver.agent.client.model_id = os.getenv("PROXY_LITE_MODEL")
+    if os.getenv("PROXY_LITE_VIEWPORT_WIDTH"):
+        config.environment.viewport_width = int(os.getenv("PROXY_LITE_VIEWPORT_WIDTH"))
+    if os.getenv("PROXY_LITE_VIEWPORT_HEIGHT"):
+        config.environment.viewport_height = int(os.getenv("PROXY_LITE_VIEWPORT_HEIGHT"))
+    return config
+def do_command(args):
+    do_text = " ".join(args.task)
+    logger.info("🤖 Let me help you with that...")
+    # Take default config from YAML
+    config = RunnerConfig.from_yaml(args.config)
+    # Update config from environment variables
+    config = update_config_from_env(config)
+    # Update config from command-line arguments
+    if args.api_base:
+        config.solver.agent.client.api_base = args.api_base
+    if args.model:
+        config.solver.agent.client.model_id = args.model
+    if args.homepage:
+        config.environment.homepage = args.homepage
+    if args.viewport_width:
+        config.environment.viewport_width = args.viewport_width
+    if args.viewport_height:
+        config.environment.viewport_height = args.viewport_height
+    o = Runner(config=config)
+    result = asyncio.run(o.run(do_text))
+    final_screenshot = result.observations[-1].info["original_image"]
+    folder_path = Path(__file__).parent.parent.parent / "screenshots"
+    folder_path.mkdir(parents=True, exist_ok=True)
+    path = folder_path / f"{result.run_id}.png"
+    with open(path, "wb") as f:
+        f.write(base64.b64decode(final_screenshot))
+    logger.info(f"🤖 Final screenshot saved to {path}")
+    gif_folder_path = Path(__file__).parent.parent.parent / "gifs"
+    gif_folder_path.mkdir(parents=True, exist_ok=True)
+    gif_path = gif_folder_path / f"{result.run_id}.gif"
+    create_run_gif(result, gif_path, duration=1500)
+    logger.info(f"🤖 GIF saved to {gif_path}")
+def main():
+    parser = argparse.ArgumentParser(description="Proxy-Lite")
+    parser.add_argument(
+        "task",
+        type=str,
+        help="The task you want to accomplish",
+        nargs="*",
+    )
+    parser.add_argument(
+        "--model",
+        type=str,
+        default=None,
+        help="The model to use.",
+    )
+    parser.add_argument(
+        "--api_base",
+        type=str,
+        default=None,
+        help="The API base URL to use.",
+    )
+    # New option for setting a homepage URL:
+    parser.add_argument(
+        "--homepage",
+        type=str,
+        default=None,
+        help="The homepage URL to use.",
+    )
+    # New viewport controls:
+    parser.add_argument(
+        "--viewport-width",
+        type=int,
+        default=None,
+        help="Viewport width in pixels.",
+    )
+    parser.add_argument(
+        "--viewport-height",
+        type=int,
+        default=None,
+        help="Viewport height in pixels.",
+    )
+    parser.add_argument(
+        "--config",
+        type=Path,
+        default=Path(__file__).parent / "configs/default.yaml",
+        help="Path to config file (default: configs/default.yaml)",
+    )
+    args = parser.parse_args()
+    do_command(args)
+# Run the CLI only when this file is executed as a script, not when it is imported.
+if __name__ == "__main__":
+    main()

proxy-lite-demo-v2/src/proxy_lite/client.py ADDED Viewed

	@@ -0,0 +1,405 @@

+import os
+from abc import ABC, abstractmethod
+from functools import cached_property
+from typing import ClassVar, Literal, Optional, Union
+import httpx
+from httpx import Limits, Timeout
+from openai import AsyncOpenAI
+from openai.types.chat.chat_completion import (
+    ChatCompletion,
+)
+from pydantic import BaseModel
+from proxy_lite.history import MessageHistory
+from proxy_lite.logger import logger
+from proxy_lite.serializer import (
+    BaseSerializer,
+    OpenAICompatibleSerializer,
+)
+from proxy_lite.tools import Tool
+class BaseClientConfig(BaseModel):
+    """HTTP settings shared by every model client configuration."""
+    # Passed to httpx.Timeout when BaseClient builds its HTTP client.
+    http_timeout: float = 50
+    # Used for both max_connections and max_keepalive_connections of httpx.Limits.
+    http_concurrent_connections: int = 50
+class BaseClient(BaseModel, ABC):
+    config: BaseClientConfig
+    serializer: ClassVar[BaseSerializer]
+    @abstractmethod
+    async def create_completion(
+        self,
+        messages: MessageHistory,
+        temperature: float = 0.7,
+        seed: Optional[int] = None,
+        tools: Optional[list[Tool]] = None,
+        response_format: Optional[type[BaseModel]] = None,
+    ) -> ChatCompletion: ...
+    """
+    Create completion from model.
+    Expect subclasses to adapt from various endpoints that will handle
+    requests differently, make sure to raise appropriate warnings.
+    Returns:
+        ChatCompletion: OpenAI ChatCompletion format for consistency
+    """
+    @classmethod
+    def create(cls, config: BaseClientConfig) -> "BaseClient":
+        supported_clients = {
+            "openai": OpenAIClient,
+            "openai-azure": OpenAIClient,
+            "convergence": ConvergenceClient,
+            "gemini": GeminiClient,
+        }
+        if config.name not in supported_clients:
+            error_message = f"Unsupported model: {config.name}."
+            raise ValueError(error_message)
+        return supported_clients[config.name](config=config)
+    @property
+    def http_client(self) -> httpx.AsyncClient:
+        return httpx.AsyncClient(
+            timeout=Timeout(self.config.http_timeout),
+            limits=Limits(
+                max_connections=self.config.http_concurrent_connections,
+                max_keepalive_connections=self.config.http_concurrent_connections,
+            ),
+        )
+class OpenAIClientConfig(BaseClientConfig):
+    name: Literal["openai"] = "openai"
+    model_id: str = "gpt-4o"
+    api_key: str = os.environ.get("OPENAI_API_KEY")
+    api_base: Optional[str] = None
+class OpenAIClient(BaseClient):
+    config: OpenAIClientConfig
+    serializer: ClassVar[OpenAICompatibleSerializer] = OpenAICompatibleSerializer()
+    @cached_property
+    def external_client(self) -> AsyncOpenAI:
+        client_params = {
+            "api_key": self.config.api_key,
+            "http_client": self.http_client,
+        }
+        if self.config.api_base:
+            client_params["base_url"] = self.config.api_base
+        return AsyncOpenAI(**client_params)
+    async def create_completion(
+        self,
+        messages: MessageHistory,
+        temperature: float = 0.7,
+        seed: Optional[int] = None,
+        tools: Optional[list[Tool]] = None,
+        response_format: Optional[type[BaseModel]] = None,
+    ) -> ChatCompletion:
+        base_params = {
+            "model": self.config.model_id,
+            "messages": self.serializer.serialize_messages(messages),
+            "temperature": temperature,
+        }
+        optional_params = {
+            "seed": seed,
+            "tools": self.serializer.serialize_tools(tools) if tools else None,
+            "tool_choice": "required" if tools else None,
+            "response_format": {"type": "json_object"} if response_format else {"type": "text"},
+        }
+        base_params.update({k: v for k, v in optional_params.items() if v is not None})
+        return await self.external_client.chat.completions.create(**base_params)
+class ConvergenceClientConfig(BaseClientConfig):
+    """Configuration for ``ConvergenceClient``; the defaults point at a local endpoint."""
+    name: Literal["convergence"] = "convergence"
+    model_id: str = "convergence-ai/proxy-lite-7b"
+    api_base: str = "http://localhost:8000/v1"
+    # Placeholder value handed to the SDK client as its API key.
+    api_key: str = "none"
+class ConvergenceClient(OpenAIClient):
+    config: ConvergenceClientConfig
+    serializer: ClassVar[OpenAICompatibleSerializer] = OpenAICompatibleSerializer()
+    _model_validated: bool = False
+    async def _validate_model(self) -> None:
+        try:
+            response = await self.external_client.models.list()
+            assert self.config.model_id in [model.id for model in response.data], (
+                f"Model {self.config.model_id} not found in {response.data}"
+            )
+            self._model_validated = True
+            logger.debug(f"Model {self.config.model_id} validated and connected to cluster")
+        except Exception as e:
+            logger.error(f"Error retrieving model: {e}")
+            raise e
+    @cached_property
+    def external_client(self) -> AsyncOpenAI:
+        return AsyncOpenAI(
+            api_key=self.config.api_key,
+            base_url=self.config.api_base,
+            http_client=self.http_client,
+        )
+    async def create_completion(
+        self,
+        messages: MessageHistory,
+        temperature: float = 0.7,
+        seed: Optional[int] = None,
+        tools: Optional[list[Tool]] = None,
+        response_format: Optional[type[BaseModel]] = None,
+    ) -> ChatCompletion:
+        if not self._model_validated:
+            await self._validate_model()
+        base_params = {
+            "model": self.config.model_id,
+            "messages": self.serializer.serialize_messages(messages),
+            "temperature": temperature,
+        }
+        optional_params = {
+            "seed": seed,
+            "tools": self.serializer.serialize_tools(tools) if tools else None,
+            "tool_choice": "auto" if tools else None,  # vLLM does not support "required"
+            "response_format": response_format if response_format else {"type": "text"},
+        }
+        base_params.update({k: v for k, v in optional_params.items() if v is not None})
+        return await self.external_client.chat.completions.create(**base_params)
+class GeminiClientConfig(BaseClientConfig):
+    """Configuration for ``GeminiClient``."""
+    name: Literal["gemini"] = "gemini"
+    model_id: str = "gemini-2.0-flash-001"
+    # Empty by default; GeminiClient sends this value with every request.
+    api_key: str = ""
+class GeminiClient(BaseClient):
+    config: GeminiClientConfig
+    serializer: ClassVar[OpenAICompatibleSerializer] = OpenAICompatibleSerializer()
+    def _convert_messages_to_gemini_format(self, messages):
+        """Convert OpenAI format messages to Gemini format"""
+        gemini_parts = []
+        for msg in messages:
+            if msg["role"] == "user":
+                gemini_parts.append({"text": msg["content"]})
+            elif msg["role"] == "assistant":
+                gemini_parts.append({"text": msg["content"]})
+            # Skip system messages or add them to the first user message
+        return gemini_parts
+    def _clean_schema_for_gemini(self, schema):
+        """Clean up JSON schema for Gemini function calling - remove $defs and $ref"""
+        if not isinstance(schema, dict):
+            return schema
+        cleaned = {}
+        for key, value in schema.items():
+            if key == "$defs":
+                # Skip $defs - we'll inline the definitions
+                continue
+            elif key == "$ref":
+                # Skip $ref - we'll inline the referenced schema
+                continue
+            elif isinstance(value, dict):
+                cleaned[key] = self._clean_schema_for_gemini(value)
+            elif isinstance(value, list):
+                cleaned[key] = [self._clean_schema_for_gemini(item) for item in value]
+            else:
+                cleaned[key] = value
+        # If we have $defs, we need to inline them
+        if "$defs" in schema:
+            cleaned = self._inline_definitions(cleaned, schema["$defs"])
+        return cleaned
+    def _inline_definitions(self, schema, definitions):
+        """Inline $ref definitions into the schema"""
+        if not isinstance(schema, dict):
+            return schema
+        if "$ref" in schema:
+            # Extract the reference name (e.g., "#/$defs/TypeEntry" -> "TypeEntry")
+            ref_name = schema["$ref"].split("/")[-1]
+            if ref_name in definitions:
+                # Replace the $ref with the actual definition
+                return self._inline_definitions(definitions[ref_name], definitions)
+            else:
+                # If we can't find the definition, remove the $ref
+                return {k: v for k, v in schema.items() if k != "$ref"}
+        # Recursively process nested objects
+        inlined = {}
+        for key, value in schema.items():
+            if isinstance(value, dict):
+                inlined[key] = self._inline_definitions(value, definitions)
+            elif isinstance(value, list):
+                inlined[key] = [self._inline_definitions(item, definitions) for item in value]
+            else:
+                inlined[key] = value
+        return inlined
+    async def create_completion(
+        self,
+        messages: MessageHistory,
+        temperature: float = 0.7,
+        seed: Optional[int] = None,
+        tools: Optional[list[Tool]] = None,
+        response_format: Optional[type[BaseModel]] = None,
+    ) -> ChatCompletion:
+        import json
+        from openai.types.chat.chat_completion import ChatCompletion, Choice
+        from openai.types.chat.chat_completion_message import ChatCompletionMessage
+        from openai.types.completion_usage import CompletionUsage
+        from openai.types.chat.chat_completion_message_tool_call import ChatCompletionMessageToolCall
+        # Convert messages to format expected by Gemini
+        serialized_messages = self.serializer.serialize_messages(messages)
+        # For Gemini API, we need to format contents correctly with proper roles
+        contents = []
+        current_user_text = ""
+        for msg in serialized_messages:
+            # Extract the actual text content from the serialized message
+            content_text = ""
+            if isinstance(msg["content"], list):
+                # Handle complex content format
+                for item in msg["content"]:
+                    if isinstance(item, dict) and "text" in item:
+                        content_text += item["text"]
+                    elif isinstance(item, str):
+                        content_text += item
+            elif isinstance(msg["content"], str):
+                content_text = msg["content"]
+            if msg["role"] == "user":
+                # Accumulate user messages
+                current_user_text += content_text + "\n"
+            elif msg["role"] == "assistant":
+                # If we have accumulated user text, add it first
+                if current_user_text.strip():
+                    contents.append({
+                        "role": "user",
+                        "parts": [{"text": current_user_text.strip()}]
+                    })
+                    current_user_text = ""
+                # Add assistant message with role "model"
+                contents.append({
+                    "role": "model",
+                    "parts": [{"text": content_text}]
+                })
+            elif msg["role"] == "tool":
+                # Add tool messages as user messages so they're included in context
+                # Format tool message more clearly for the agent to understand
+                current_user_text += f"[ACTION COMPLETED] {content_text}\n"
+        # Add any remaining user text
+        if current_user_text.strip():
+            contents.append({
+                "role": "user",
+                "parts": [{"text": current_user_text.strip()}]
+            })
+        payload = {
+            "contents": contents,
+            "generationConfig": {
+                "temperature": temperature,
+            }
+        }
+        # Add function calling support if tools are provided
+        if tools:
+            # Convert tools to Gemini function declaration format
+            function_declarations = []
+            for tool in tools:
+                for tool_schema in tool.schema:
+                    # Clean up the schema for Gemini - remove $defs and $ref
+                    cleaned_parameters = self._clean_schema_for_gemini(tool_schema["parameters"])
+                    function_declarations.append({
+                        "name": tool_schema["name"],
+                        "description": tool_schema["description"],
+                        "parameters": cleaned_parameters
+                    })
+            payload["tools"] = [{
+                "function_declarations": function_declarations
+            }]
+        # Make direct HTTP request to native Gemini API
+        url = f"https://generativelanguage.googleapis.com/v1beta/models/{self.config.model_id}:generateContent?key={self.config.api_key}"
+        response = await self.http_client.post(
+            url,
+            json=payload,
+            headers={"Content-Type": "application/json"}
+        )
+        response.raise_for_status()
+        response_data = response.json()
+        # Convert Gemini response to OpenAI ChatCompletion format
+        if "candidates" in response_data and len(response_data["candidates"]) > 0:
+            candidate = response_data["candidates"][0]
+            # Extract text from response
+            content = ""
+            tool_calls = []
+            if "content" in candidate and "parts" in candidate["content"]:
+                for part in candidate["content"]["parts"]:
+                    if "text" in part:
+                        content += part["text"]
+                    elif "functionCall" in part:
+                        # Handle function call
+                        func_call = part["functionCall"]
+                        tool_call = ChatCompletionMessageToolCall(
+                            id=f"call_{hash(str(func_call))}"[:16],
+                            type="function",
+                            function={
+                                "name": func_call["name"],
+                                "arguments": json.dumps(func_call.get("args", {}))
+                            }
+                        )
+                        tool_calls.append(tool_call)
+            choice = Choice(
+                index=0,
+                message=ChatCompletionMessage(
+                    role="assistant",
+                    content=content if content else None,
+                    tool_calls=tool_calls if tool_calls else None
+                ),
+                finish_reason="stop"
+            )
+            # Create a mock ChatCompletion response
+            completion = ChatCompletion(
+                id="gemini-" + str(hash(content))[:8],
+                choices=[choice],
+                created=int(__import__('time').time()),
+                model=self.config.model_id,
+                object="chat.completion",
+                usage=CompletionUsage(
+                    completion_tokens=len(content.split()),
+                    prompt_tokens=sum(len(str(msg.get("content", "")).split()) for msg in serialized_messages),
+                    total_tokens=len(content.split()) + sum(len(str(msg.get("content", "")).split()) for msg in serialized_messages)
+                )
+            )
+            return completion
+        else:
+            raise Exception(f"No valid response from Gemini API: {response_data}")
+# Unions of every concrete client config class and client class defined in this module.
+ClientConfigTypes = Union[OpenAIClientConfig, ConvergenceClientConfig, GeminiClientConfig]
+ClientTypes = Union[OpenAIClient, ConvergenceClient, GeminiClient]

proxy-lite-demo-v2/src/proxy_lite/configs/default.yaml ADDED Viewed

	@@ -0,0 +1,23 @@

+environment:
+  name: webbrowser
+  annotate_image: true
+  screenshot_delay: 2.0
+  viewport_width: 1280
+  viewport_height: 1920
+  include_poi_text: true
+  headless: false
+  homepage: https://www.google.co.uk
+  keep_original_image: true
+solver:
+  name: simple
+  agent:
+    name: proxy_lite
+    client:
+      name: convergence
+      model_id: convergence-ai/proxy-lite-3b
+      api_base: https://convergence-ai-demo-api.hf.space/v1
+local_view: true
+task_timeout: 1800
+environment_timeout: 1800
+action_timeout: 1800
+verbose: true

proxy-lite-demo-v2/src/proxy_lite/environments/__init__.py ADDED Viewed

	@@ -0,0 +1,32 @@

+from typing import Union
+from .environment_base import (
+    Action,
+    BaseEnvironment,
+    BaseEnvironmentConfig,
+    Environments,
+    Event,
+    EventType,
+    Observation,
+)
+from .webbrowser import (
+    WebBrowserEnvironment,
+    WebBrowserEnvironmentConfig,
+)
+EnvironmentConfigTypes = Union[*list(Environments._environment_config_registry.values())]
+EnvironmentTypes = Union[*list(Environments._environment_registry.values())]
+__all__ = [
+    "Action",
+    "BaseEnvironment",
+    "BaseEnvironmentConfig",
+    "EnvironmentConfigTypes",
+    "Environments",
+    "Event",
+    "EventType",
+    "Observation",
+    "WebBrowserEnvironment",
+    "WebBrowserEnvironmentConfig",
+]

proxy-lite-demo-v2/src/proxy_lite/environments/environment_base.py ADDED Viewed

	@@ -0,0 +1,161 @@

+import json
+import logging
+from abc import ABC, abstractmethod
+from enum import Enum
+from functools import cached_property
+from typing import Any, Literal, Optional, Self
+from pydantic import BaseModel
+from proxy_lite.history import ToolCall
+from proxy_lite.tools import Tool, ToolExecutionResponse
+class EventType(str, Enum):
+    """String-valued tags identifying the kind of an ``Event``."""
+    OBSERVATION = "observation"
+    ACTION = "action"
+    MESSAGE = "message"
+class Event(BaseModel):
+    """Base model for events; ``type`` records which kind of event this is."""
+    type: EventType
+class State(BaseModel):
+    """Environment state attached to an ``Observation``; every field is optional."""
+    text: Optional[str] = None
+    image: Optional[str] = None  # base64 encoded image
+    html: Optional[str] = None
+    tool_responses: Optional[list[ToolExecutionResponse]] = None
+class Observation(Event):
+    """Event carrying the environment ``State`` plus termination, reward and extra info."""
+    type: Literal[EventType.OBSERVATION] = EventType.OBSERVATION
+    state: State
+    terminated: bool
+    reward: Optional[float] = None
+    # Free-form extras; the CLI reads the final screenshot from info["original_image"].
+    info: Optional[dict[str, Any]] = None
+class Action(Event):
+    """Event carrying optional text, optional tool calls and optional extra info."""
+    type: Literal[EventType.ACTION] = EventType.ACTION
+    text: Optional[str] = None
+    tool_calls: Optional[list[ToolCall]] = None
+    info: Optional[dict[str, Any]] = None
+# Base class for environment configuration models; it declares no fields of its own.
+class BaseEnvironmentConfig(BaseModel): ...
+class BaseEnvironment(BaseModel, ABC):
+    config: BaseEnvironmentConfig
+    logger: logging.Logger | None = None
+    class Config:
+        arbitrary_types_allowed = True
+    async def __aenter__(self) -> Self:
+        return self
+    async def __aexit__(self, exc_type, exc_value, traceback):
+        pass
+    @property
+    @abstractmethod
+    def info_for_user(self) -> str: ...
+    @cached_property
+    @abstractmethod
+    def tools(self) -> list[Tool]: ...
+    @abstractmethod
+    async def initialise(self) -> Observation: ...
+    @abstractmethod
+    async def execute_action(self, action: Action) -> Observation: ...
+    @abstractmethod
+    async def observe(self) -> Observation: ...
+    @abstractmethod
+    async def evaluate(self, **kwargs: dict[str, Any]) -> dict[str, Any]: ...
+    async def execute_tool(self, tool_call: ToolCall) -> None:
+        function = tool_call.function
+        for tool in self.tools:
+            if hasattr(tool, function["name"]):
+                arguments = json.loads(function["arguments"])
+                if isinstance(arguments, str):
+                    arguments = json.loads(arguments)
+                return await getattr(tool, function["name"])(
+                    **arguments,
+                )
+        msg = f'No tool function with name "{function["name"]}"'
+        raise ValueError(msg)
+    async def get_info(self) -> dict[str, Any]:
+        return {}
+class Environments:
+    _environment_registry: dict[str, type[BaseEnvironment]] = {}
+    _environment_config_registry: dict[str, type[BaseEnvironmentConfig]] = {}
+    @classmethod
+    def register_environment(cls, name: str):
+        """
+        Decorator to register an Environment class under a given name.
+        Example:
+            @Environments.register_environment("my_environment")
+            class MyEnvironment(BaseEnvironment):
+                ...
+        """
+        def decorator(env_cls: type[BaseEnvironment]) -> type[BaseEnvironment]:
+            cls._environment_registry[name] = env_cls
+            return env_cls
+        return decorator
+    @classmethod
+    def register_environment_config(cls, name: str):
+        """
+        Decorator to register an Environment configuration class under a given name.
+        Example:
+            @Environments.register_environment_config("my_environment")
+            class MyEnvironmentConfig(BaseEnvironmentConfig):
+                ...
+        """
+        def decorator(config_cls: type[BaseEnvironmentConfig]) -> type[BaseEnvironmentConfig]:
+            cls._environment_config_registry[name] = config_cls
+            return config_cls
+        return decorator
+    @classmethod
+    def get(cls, name: str) -> type[BaseEnvironment]:
+        """
+        Retrieve a registered Environment class by its name.
+        Raises:
+            ValueError: If no such environment is found.
+        """
+        try:
+            return cls._environment_registry[name]
+        except KeyError:
+            raise ValueError(f"Environment '{name}' not found.")
+    @classmethod
+    def get_config(cls, name: str) -> type[BaseEnvironmentConfig]:
+        """
+        Retrieve a registered Environment configuration class by its name.
+        Raises:
+            ValueError: If no such configuration is found.
+        """
+        try:
+            return cls._environment_config_registry[name]
+        except KeyError:
+            raise ValueError(f"Environment config for '{name}' not found.")

proxy-lite-demo-v2/src/proxy_lite/environments/webbrowser.py ADDED Viewed

	@@ -0,0 +1,205 @@

+import base64
+from functools import cached_property
+from typing import Any, Literal, Optional, Self, List # Added List import
+from proxy_lite.browser.browser import BrowserSession
+from proxy_lite.environments.environment_base import (
+    Action,
+    BaseEnvironment,
+    BaseEnvironmentConfig,
+    Environments,
+    Observation,
+    State,
+)
+from proxy_lite.tools import BrowserTool, Tool, ToolExecutionResponse
+from proxy_lite.logger import logger
+@Environments.register_environment_config("webbrowser")
+class WebBrowserEnvironmentConfig(BaseEnvironmentConfig):
+    """Settings for the "webbrowser" environment, registered under that name in ``Environments``."""
+    # Tag identifying this config type; fixed to "webbrowser".
+    name: Literal["webbrowser"] = "webbrowser"
+    # URL the browser navigates to when the environment is initialised.
+    homepage: str = "https://google.com"
+    # NOTE(review): not read anywhere in this module; presumably consumed elsewhere - confirm.
+    annotate_image: bool = True
+    # Passed as ``delay`` to the browser's screenshot call.
+    screenshot_delay: float = 1.0  # seconds
+    # When True, the current page's HTML content is attached to each observation state.
+    include_html: bool = True
+    # When True, the browser's POI text is appended to the observation text after the URL line.
+    include_poi_text: bool = True
+    # When True, the browser's POIs are stored under "pois" in the observation info.
+    record_pois: bool = True
+    # Viewport size handed to the browser session on start-up.
+    viewport_width: int = 1280
+    viewport_height: int = 720
+    # NOTE(review): not read anywhere in this module; units are presumably seconds - confirm.
+    browserbase_timeout: int = 7200
+    # Handed to the browser session on start-up.
+    headless: bool = True
+    # When True, the base64 un-annotated screenshot is stored under "original_image" in the info.
+    keep_original_image: bool = False
+    # When True, the un-annotated screenshot is used as the observation image
+    # (read by WebBrowserEnvironment.initialise).
+    no_pois_in_image: bool = False
+    # Cookies added to the browser context right after the session starts;
+    # None means no cookies are added.
+    initial_cookies: Optional[List[dict]] = None
+@Environments.register_environment("webbrowser")
+class WebBrowserEnvironment(BaseEnvironment):
+    config: WebBrowserEnvironmentConfig
+    browser: Optional[BrowserSession] = None
+    cancelled_last_action: bool = False
+    class Config:
+        arbitrary_types_allowed = True
+    async def __aenter__(self) -> Self:
+        # Initialize the BrowserSession
+        self.browser = self.browser_session(
+            viewport_width=self.config.viewport_width,
+            viewport_height=self.config.viewport_height,
+            headless=self.config.headless,
+        )
+        await self.browser.__aenter__()
+        # Initialize other resources if necessary
+        # --- MODIFICATION START ---
+        # Changed to use self.config.initial_cookies
+        if self.config.initial_cookies:
+            self.logger.info(f"🌐 [bold blue]Adding {len(self.config.initial_cookies)} initial cookies to browser context.[/]")
+            await self.browser.context.add_cookies(self.config.initial_cookies)
+        # --- MODIFICATION END ---
+        self.logger.info("🌐 [bold blue]Browser session started.[/]")
+        return self
+    async def __aexit__(self, exc_type, exc_value, traceback):
+        # Clean up the BrowserSession
+        await self.browser.__aexit__(exc_type, exc_value, traceback)
+    @property
+    def info_for_user(self) -> str:
+        return "This is a web browser environment. You can navigate the web, search the web, and perform actions on the web."  # noqa: E501
+    @cached_property
+    def tools(self) -> list[Tool]:
+        return [BrowserTool(session=self.browser)]
+    @cached_property
+    def browser_session(self) -> type[BrowserSession]:
+        return BrowserSession
+    # --- MODIFICATION START ---
+    # Modified this property to return cookies from the config.
+    # It was previously hardcoded to return an empty list.
+    @property
+    def cookies(self) -> list[dict]:
+        return self.config.initial_cookies if self.config.initial_cookies is not None else []
+    # --- MODIFICATION END ---
+    async def initialise(self) -> Observation:
+        self.logger.debug(f"DEBUG: Initialising WebBrowserEnvironment. Homepage: {self.config.homepage}")
+        try:
+            await self.browser.goto(self.config.homepage)
+            self.logger.debug(f"DEBUG: Browser navigated to homepage. Current URL: {self.browser.current_url}")
+        except Exception as e:
+            self.logger.error(f"ERROR: Failed to navigate to homepage {self.config.homepage}: {e}")
+            raise # Re-raise to propagate the error
+        original_img, annotated_img = await self.browser.screenshot(
+            delay=self.config.screenshot_delay,
+        )
+        if self.config.no_pois_in_image:
+            base64_image = base64.b64encode(original_img).decode("utf-8")
+        else:
+            base64_image = base64.b64encode(annotated_img).decode("utf-8")
+        html_content = await self.browser.current_page.content() if self.config.include_html else None
+        info = {"url": self.browser.current_url}
+        if self.config.record_pois:
+            info["pois"] = self.browser.pois
+        if self.config.keep_original_image:
+            info["original_image"] = base64.b64encode(original_img).decode("utf-8")
+        self.logger.debug(f"DEBUG: Initial observation captured. URL: {self.browser.current_url}")
+        return Observation(
+            state=State(
+                text=f"URL: {self.browser.current_url}"
+                + (f"\n{self.browser.poi_text}" if self.config.include_poi_text else ""),
+                image=base64_image,
+                html=html_content,
+            ),
+            terminated=False,
+            reward=None,
+            info=info,
+        )
+    async def should_perform_action(self) -> bool:
+        # if cancelled last action, run the action without updating POIs
+        if self.cancelled_last_action:
+            self.cancelled_last_action = False
+            return True
+        # check for page changes
+        old_points = [tuple(point) for point in self.browser.poi_centroids]
+        await self.browser.update_poi()
+        new_points = [tuple(point) for point in self.browser.poi_centroids]
+        page_changed_mid_action = old_points != new_points
+        # record if the last action was cancelled
+        if page_changed_mid_action:
+            self.cancelled_last_action = True
+            return False
+        return True
+    async def execute_action(self, action: Action) -> Observation:
+        responses = []
+        cancelled_tools_flag = False
+        if await self.should_perform_action():
+            for tool_call in action.tool_calls:
+                # Perform the chosen action
+                try:
+                    tool_response: ToolExecutionResponse = await self.execute_tool(
+                        tool_call,
+                    )
+                    tool_response.id = tool_call.id
+                    responses.append(tool_response)
+                except Exception as e:  # noqa: PERF203
+                    self.logger.warning("🌐 An error occurred taking action: %s", str(e), exc_info=False)
+                    tool_response = ToolExecutionResponse(content=str(e), id=tool_call.id)
+                    responses.append(tool_response)
+        else:
+            self.logger.warning("🌐 Page changed since last observation, cancelling action.")
+            self.cancelled_last_action = True
+            for tool_call in action.tool_calls:
+                tool_response = ToolExecutionResponse(
+                    content="The page changed before the action could be executed, instead of being ran it was cancelled.",  # noqa: E501
+                    id=tool_call.id,
+                )
+                responses.append(tool_response)
+                cancelled_tools_flag = True
+        original_img, annotated_img = await self.browser.screenshot(
+            delay=self.config.screenshot_delay,
+        )
+        base64_image = base64.b64encode(annotated_img).decode("utf-8")
+        info = {"url": self.browser.current_url, "cancelled_tools": cancelled_tools_flag}
+        if self.config.record_pois:
+            info["pois"] = self.browser.pois
+        if self.config.keep_original_image:
+            info["original_image"] = base64.b64encode(original_img).decode("utf-8")
+        html_content = await self.browser.current_page.content() if self.config.include_html else None
+        return Observation(
+            state=State(
+                text=f"URL: {self.browser.current_url}"
+                + (f"\n{self.browser.poi_text}" if self.config.include_poi_text else ""),
+                image=base64_image,
+                html=html_content,
+                tool_responses=responses,
+            ),
+            terminated=False,
+            reward=None,
+            info=info,
+        )
+    async def observe(self) -> Observation:
+        return await self.browser.observe()
+    async def evaluate(self, **kwargs: dict[str, Any]) -> dict[str, Any]:
+        return {}
+    async def get_info(self) -> dict[str, Any]:
+        info = {}
+        return info

proxy-lite-demo-v2/src/proxy_lite/gif_maker.py ADDED Viewed

	@@ -0,0 +1,122 @@

+import base64
+import re
+import textwrap
+from io import BytesIO
+from PIL import Image, ImageDraw, ImageFont
+from proxy_lite.environments.environment_base import Action, Observation
+from proxy_lite.recorder import Run
+def create_run_gif(
+    run: Run, output_path: str, white_panel_width: int = 300, duration: int = 1500, resize_factor: int = 4
+) -> None:
+    """
+    Generate a gif from the Run object's history.
+    For each Observation record, the observation image is decoded from its base64
+    encoded string. If the next record is an Action, its text is drawn onto a
+    white panel. The observation image and the white panel are then concatenated
+    horizontally to produce a frame.
+    Parameters:
+        run (Run): A Run object with its history containing Observation and Action records.
+        output_path (str): The path where the GIF will be saved.
+        white_panel_width (int): The width of the white panel for displaying text.
+                                 Default increased to 400 for larger images.
+        duration (int): Duration between frames in milliseconds.
+                        Increased here to slow the FPS (default is 1000ms).
+        resize_factor (int): The factor to resize the image down by.
+    """
+    frames = []
+    history = run.history
+    i = 0
+    while i < len(history):
+        if isinstance(history[i], Observation):
+            observation = history[i]
+            image_data = observation.state.image
+            if not image_data:
+                i += 1
+                continue
+            # Decode the base64 image
+            image_bytes = base64.b64decode(image_data)
+            obs_img = Image.open(BytesIO(image_bytes)).convert("RGB")
+            # scale the image down
+            obs_img = obs_img.resize((obs_img.width // resize_factor, obs_img.height // resize_factor))
+            # Check if the next record is an Action and extract its text if available
+            action_text = ""
+            if i + 1 < len(history) and isinstance(history[i + 1], Action):
+                action = history[i + 1]
+                if action.text:
+                    action_text = action.text
+            # extract observation and thinking from tags in the action text
+            observation_match = re.search(r"<observation>(.*?)</observation>", action_text, re.DOTALL)
+            observation_content = observation_match.group(1).strip() if observation_match else None
+            # Extract text between thinking tags if present
+            thinking_match = re.search(r"<thinking>(.*?)</thinking>", action_text, re.DOTALL)
+            thinking_content = thinking_match.group(1).strip() if thinking_match else None
+            if observation_content and thinking_content:
+                action_text = f"**OBSERVATION**\n{observation_content}\n\n**THINKING**\n{thinking_content}"
+            # Create a white panel (same height as the observation image)
+            panel = Image.new("RGB", (white_panel_width, obs_img.height), "white")
+            draw = ImageDraw.Draw(panel)
+            font = ImageFont.load_default()
+            # Wrap the action text if it is too long
+            max_chars_per_line = 40  # Adjusted for larger font size
+            wrapped_text = textwrap.fill(action_text, width=max_chars_per_line)
+            # Calculate text block size and center it on the panel
+            try:
+                # Use multiline_textbbox if available (returns bounding box tuple)
+                bbox = draw.multiline_textbbox((0, 0), wrapped_text, font=font)
+                text_width, text_height = bbox[2] - bbox[0], bbox[3] - bbox[1]
+            except AttributeError:
+                # Fallback for older Pillow versions: compute size for each line
+                lines = wrapped_text.splitlines() or [wrapped_text]
+                line_sizes = [draw.textsize(line, font=font) for line in lines]
+                text_width = max(width for width, _ in line_sizes)
+                text_height = sum(height for _, height in line_sizes)
+            text_x = (white_panel_width - text_width) // 2
+            text_y = (obs_img.height - text_height) // 2
+            draw.multiline_text((text_x, text_y), wrapped_text, fill="black", font=font, align="center")
+            # Create the combined frame by concatenating the observation image and the panel
+            total_width = obs_img.width + white_panel_width
+            combined_frame = Image.new("RGB", (total_width, obs_img.height))
+            combined_frame.paste(obs_img, (0, 0))
+            combined_frame.paste(panel, (obs_img.width, 0))
+            frames.append(combined_frame)
+            # Skip the Action record since it has been processed with this Observation
+            if i + 1 < len(history) and isinstance(history[i + 1], Action):
+                i += 2
+            else:
+                i += 1
+        else:
+            i += 1
+    if frames:
+        frames[0].save(output_path, save_all=True, append_images=frames[1:], duration=duration, loop=0)
+    else:
+        raise ValueError("No frames were generated from the Run object's history.")
+# Example usage:
+if __name__ == "__main__":
+    from proxy_lite.recorder import Run
+    dummy_run = Run.load("0abdb4cb-f289-48b0-ba13-35ed1210f7c1")
+    num_steps = int(len(dummy_run.history) / 2)
+    print(f"Number of steps: {num_steps}")
+    output_gif_path = "trajectory.gif"
+    create_run_gif(dummy_run, output_gif_path, duration=1000)
+    print(f"Trajectory GIF saved to {output_gif_path}")

proxy-lite-demo-v2/src/proxy_lite/history.py ADDED Viewed

	@@ -0,0 +1,183 @@

+from __future__ import annotations
+import base64
+from collections.abc import Iterator
+from enum import Enum
+from typing import Any, Literal, Optional, Set, Union
+from pydantic import BaseModel, Field, TypeAdapter, field_validator
+class MessageLabel(str, Enum):
+    """String-valued labels that can be attached to a message via its ``label`` field."""
+    SYSTEM = "system"
+    USER_INPUT = "user_input"
+    SCREENSHOT = "screenshot"
+    AGENT_MODEL_RESPONSE = "agent_model_response"
+# Default per-label caps used by MessageHistory.history_view: at most this many
+# of the most recent messages with the given label are kept. Labels not listed
+# here are not limited.
+MAX_MESSAGES_FOR_CONTEXT_WINDOW = {
+    MessageLabel.SCREENSHOT: 1,
+}
+class MessageContent(BaseModel):
+    """Common base class for the content parts of a message (``Text`` and ``Image``)."""
+    pass
+class Text(MessageContent):
+    """Text content part; ``type`` defaults to "text"."""
+    type: Literal["text"] = Field(default="text", init=False)
+    text: str
+class ImageUrl(BaseModel):
+    """Wrapper holding the URL of an image (``Message.from_media`` stores a base64 data URL here)."""
+    url: str
+class Image(MessageContent):
+    """Image content part; ``type`` defaults to "image_url"."""
+    type: Literal["image_url"] = Field(default="image_url", init=False)
+    image_url: ImageUrl
+class Message(BaseModel):
+    label: Optional[MessageLabel] = None
+    content: list[Union[Text, Image]] = Field(default_factory=list)
+    class Config:
+        use_enum_values = True
+    @property
+    def images(self) -> list[Image]:
+        return [content for content in self.content if isinstance(content, Image)]
+    @property
+    def texts(self) -> list[Text]:
+        return [content for content in self.content if isinstance(content, Text)]
+    @property
+    def first_image(self) -> Optional[Image]:
+        return self.images[0] if self.images else None
+    @property
+    def first_text(self) -> Optional[Text]:
+        return self.texts[0] if self.texts else None
+    def __len__(self):
+        return len(self.content)
+    @classmethod
+    def from_media(
+        cls,
+        text: Optional[str] = None,
+        image: Optional[bytes | str] = None,
+        is_base64: bool = False,
+    ) -> Message:
+        if text is not None:
+            text = Text(text=text)
+        if image is not None:
+            base64_image = image if is_base64 else base64.b64encode(image).decode("utf-8")
+            data_url = f"data:image/jpeg;base64,{base64_image}"
+            image = Image(image_url=ImageUrl(url=data_url))
+            content = [text, image] if text is not None else [image]
+        else:
+            content = [text]
+        return cls(content=content)
+class SystemMessage(Message):
+    """Message whose ``role`` defaults to "system"."""
+    role: Literal["system"] = Field(default="system", init=False)
+class UserMessage(Message):
+    """Message whose ``role`` defaults to "user"."""
+    role: Literal["user"] = Field(default="user", init=False)
+class ToolCall(BaseModel):
+    """A tool invocation attached to an assistant message."""
+    # Identifier of the call.
+    id: str
+    type: str
+    # NOTE(review): the schema of this dict is not visible here - confirm against the client code.
+    function: dict[str, Any]
+class AssistantMessage(Message):
+    role: Literal["assistant"] = Field(default="assistant", init=False)
+    tool_calls: list[ToolCall] = Field(default_factory=list)
+    def model_dump(self, **kwargs):
+        data = super().model_dump(**kwargs)
+        if not self.tool_calls:
+            data.pop("tool_calls")
+        return data
+    @field_validator("tool_calls", mode="before")
+    @classmethod
+    def ensure_list(cls, v):
+        return [] if v is None else v
+class ToolMessage(Message):
+    """Message whose ``role`` defaults to "tool"; ``tool_call_id`` is required."""
+    role: Literal["tool"] = Field(default="tool", init=False)
+    tool_call_id: str
+# Union of the concrete message classes a history can hold.
+MessageTypes = Union[SystemMessage, UserMessage, AssistantMessage, ToolMessage]
+# Pydantic adapter for validating data against the MessageTypes union.
+MessageAdapter = TypeAdapter(MessageTypes)
+class MessageHistory(BaseModel):
+    messages: list[MessageTypes] = Field(default_factory=list)
+    def append(self, message: MessageTypes, label: Optional[str] = None):
+        if label is not None:
+            message.label = label
+        self.messages.append(message)
+    def pop(self) -> MessageTypes:
+        return self.messages.pop()
+    def extend(self, history: MessageHistory):
+        self.messages.extend(history.messages)
+    def __reversed__(self):
+        return MessageHistory(messages=self.messages[::-1])
+    def __getitem__(self, index):
+        return self.messages[index]
+    def __len__(self):
+        return len(self.messages)
+    def __iter__(self) -> Iterator[MessageTypes]:
+        return iter(self.messages)
+    def to_dict(self, exclude: Set[str] | None = None) -> list[dict]:
+        exclude = exclude or set()
+        return [message.model_dump(exclude=exclude) for message in self.messages]
+    def history_view(
+        self,
+        limits: dict = MAX_MESSAGES_FOR_CONTEXT_WINDOW,
+    ) -> MessageHistory:
+        """Context window management.
+        Filters messages in reverse order, retaining a limited number of recent screenshots and prompts.
+        """
+        label_counts = {label: 0 for label in limits}
+        filtered_messages = []
+        for message in reversed(self.messages):
+            if message.label in limits:
+                maximum_count = limits[message.label]
+                if label_counts[message.label] < maximum_count:
+                    filtered_messages.append(message)
+                    label_counts[message.label] += 1
+            else:
+                filtered_messages.append(message)
+        return MessageHistory(messages=reversed(filtered_messages))
+    def __add__(self, other: MessageHistory) -> MessageHistory:
+        new_history = MessageHistory()
+        new_history.extend(self)
+        new_history.extend(other)
+        return new_history
+    def __iadd__(self, other: MessageHistory) -> MessageHistory:
+        self.extend(other)
+        return self

proxy-lite-demo-v2/src/proxy_lite/logger.py ADDED Viewed

	@@ -0,0 +1,92 @@

+import asyncio
+import logging
+import sys
+from typing import Literal
+from uuid import uuid4
+from rich.logging import RichHandler
+class StructuredLogger(logging.Logger):
+    async def stream_message(self, message: str) -> None:
+        """Streams the message character by character asynchronously."""
+        try:
+            sys.stdout.write("\r")  # Overwrite current line
+            for char in message:
+                sys.stdout.write(char)
+                sys.stdout.flush()
+                await asyncio.sleep(0.002)
+            sys.stdout.write("\n")
+        except Exception:
+            pass
+    def _log(
+        self,
+        level,
+        msg,
+        args,
+        exc_info=None,
+        extra=None,
+        stack_info=False,
+        stacklevel=1,
+    ):
+        if extra is None:
+            extra = {}
+        json_fields = {
+            "logger_name": self.name,
+            "message": msg % args if args else msg,
+        }
+        exc_type, exc_value, exc_traceback = sys.exc_info()
+        if exc_type is not None:
+            json_fields["exception_class"] = exc_type.__name__
+            json_fields["exception_message"] = str(exc_value)
+        json_fields.update(extra)
+        super()._log(
+            level,
+            msg,
+            args,
+            exc_info,
+            {"json_fields": json_fields},
+            stack_info,
+            stacklevel + 1,
+        )
+def create_logger(
+    name: str,
+    level: Literal["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"] = "INFO",
+    detailed_name: bool = False,
+) -> logging.Logger:
+    unique_name = f"{name}-{str(uuid4())[:8]}"
+    logger = logging.getLogger(unique_name)
+    logger.setLevel(level)
+    # Standard RichHandler for structured logs
+    rich_handler = RichHandler(
+        rich_tracebacks=True,
+        markup=True,
+        show_path=False,
+        show_time=False,
+        log_time_format="[%s]",
+    )
+    if detailed_name:
+        rich_handler.setFormatter(logging.Formatter("%(name)s:\n%(message)s"))
+    else:
+        rich_handler.setFormatter(logging.Formatter("-----\n%(message)s"))
+    logger.addHandler(rich_handler)
+    logger.propagate = False
+    return logger
+# Set StructuredLogger as the default logger class. This runs before the
+# create_logger call below, so that logger is created with this class.
+logging.setLoggerClass(StructuredLogger)
+# Initialize the module-level logger that other modules import.
+logger = create_logger(__name__, level="INFO")

proxy-lite-demo-v2/src/proxy_lite/recorder.py ADDED Viewed

	@@ -0,0 +1,103 @@

+from __future__ import annotations
+import datetime
+import json
+import os
+import uuid
+from pathlib import Path
+from typing import Any, Optional, Self
+from pydantic import BaseModel, Field
+from proxy_lite.environments import EnvironmentConfigTypes
+from proxy_lite.environments.environment_base import Action, Observation
+from proxy_lite.history import MessageHistory
+from proxy_lite.solvers import SolverConfigTypes
+class Run(BaseModel):
+    run_id: str  # uuid.UUID
+    task: str
+    created_at: str  # datetime.datetime
+    complete: bool = False
+    terminated_at: str | None = None  # datetime.datetime
+    evaluation: dict[str, Any] | None = None
+    history: list[Observation | Action] = Field(default_factory=list)
+    solver_history: MessageHistory | None = None
+    result: str | None = None
+    env_info: dict[str, Any] = Field(default_factory=dict)
+    environment: Optional[EnvironmentConfigTypes] = None
+    solver: Optional[SolverConfigTypes] = None
+    @classmethod
+    def initialise(cls, task: str) -> Self:
+        run_id = str(uuid.uuid4())
+        return cls(
+            run_id=run_id,
+            task=task,
+            created_at=str(datetime.datetime.now(datetime.UTC)),
+        )
+    @classmethod
+    def load(cls, run_id: str) -> Self:
+        with open(Path(__file__).parent.parent.parent / "local_trajectories" / f"{run_id}.json", "r") as f:
+            return cls(**json.load(f))
+    @property
+    def observations(self) -> list[Observation]:
+        return [h for h in self.history if isinstance(h, Observation)]
+    @property
+    def actions(self) -> list[Action]:
+        return [h for h in self.history if isinstance(h, Action)]
+    @property
+    def last_action(self) -> Action | None:
+        return self.actions[-1] if self.actions else None
+    @property
+    def last_observation(self) -> Observation | None:
+        return self.observations[-1] if self.observations else None
+    def record(
+        self,
+        observation: Optional[Observation] = None,
+        action: Optional[Action] = None,
+        solver_history: Optional[MessageHistory] = None,
+    ) -> None:
+        # expect only one of observation and action to be provided in order to handle ordering
+        if observation and action:
+            raise ValueError("Only one of observation and action can be provided")
+        if observation:
+            self.history.append(observation)
+        if action:
+            self.history.append(action)
+        if solver_history:
+            self.solver_history = solver_history
+    def terminate(self) -> None:
+        self.terminated_at = str(datetime.datetime.now(datetime.UTC))
+class DataRecorder:
+    def __init__(self, local_folder: str | None = None):
+        self.local_folder = local_folder
+    def initialise_run(self, task: str) -> Run:
+        self.local_folder = Path(__file__).parent.parent.parent / "local_trajectories"
+        os.makedirs(self.local_folder, exist_ok=True)
+        return Run.initialise(task)
+    async def terminate(
+        self,
+        run: Run,
+        save: bool = True,
+    ) -> None:
+        run.terminate()
+        if save:
+            await self.save(run)
+    async def save(self, run: Run) -> None:
+        json_payload = run.model_dump()
+        with open(self.local_folder / f"{run.run_id}.json", "w") as f:
+            json.dump(json_payload, f)

proxy-lite-demo-v2/src/proxy_lite/runner.py ADDED Viewed

	@@ -0,0 +1,240 @@

+import asyncio
+import logging
+from collections.abc import AsyncIterator
+from contextlib import asynccontextmanager
+from typing import Any, Literal, Self
+from omegaconf import OmegaConf
+from pydantic import BaseModel
+from proxy_lite.environments import (
+    Action,
+    BaseEnvironment,
+    EnvironmentConfigTypes,
+    Environments,
+    EventType,
+    Observation,
+)
+from proxy_lite.logger import create_logger
+from proxy_lite.recorder import DataRecorder, Run
+from proxy_lite.solvers import (
+    BaseSolver,
+    SolverConfigTypes,
+    Solvers,
+)
+@asynccontextmanager
+async def async_timeout(timeout: float, task_name: str = "timeout"):
+    """
+    Async context manager that runs its body alongside a watchdog task.
+    The watchdog sleeps for ``timeout`` seconds and then raises ``TimeoutError``
+    naming ``task_name``. It is cancelled as soon as the body finishes.
+    Args:
+        timeout: Seconds the watchdog sleeps before raising.
+        task_name: Label used in the timeout error message.
+    """
+    try:
+        async with asyncio.TaskGroup() as tg:
+            async def timeout_task():
+                await asyncio.sleep(timeout)
+                raise TimeoutError(
+                    f"Operation {task_name} timed out after {timeout} seconds",
+                )
+            # Create the timeout task
+            # NOTE(review): presumably relies on TaskGroup cancelling the enclosing
+            # task when this child fails, which is what interrupts the body - confirm.
+            timeout_handle = tg.create_task(timeout_task())
+            try:
+                yield
+            finally:
+                # Stop the watchdog whether the body succeeded or raised.
+                timeout_handle.cancel()
+    # Re-raise the first sub-exception of the group: timeouts first, then anything else.
+    # NOTE(review): an exception raised from inside an ``except*`` clause may itself
+    # surface wrapped in an ExceptionGroup (PEP 654) - confirm what callers receive.
+    except* asyncio.TimeoutError as eg:
+        for e in eg.exceptions:
+            raise e
+    except* Exception as eg:
+        for e in eg.exceptions:
+            raise e
+class RunnerConfig(BaseModel):
+    environment: EnvironmentConfigTypes
+    solver: SolverConfigTypes
+    save_every_step: bool = True
+    max_steps: int = 50
+    action_timeout: float = 600.0
+    environment_timeout: float = 300.0
+    task_timeout: float = 1800.0
+    logger_level: Literal["DEBUG", "INFO", "WARNING", "ERROR"] = "INFO"
+    detailed_logger_name: bool = False
+    @classmethod
+    def from_dict(cls, config_dict: dict) -> Self:
+        conf = OmegaConf.create(config_dict)
+        config_dict = OmegaConf.to_container(conf, resolve=True)
+        return cls(**config_dict)
+    @classmethod
+    def from_yaml(cls, yaml_path: str) -> Self:
+        conf = OmegaConf.load(yaml_path)
+        config_dict = OmegaConf.to_container(conf, resolve=True)
+        return cls(**config_dict)
+class Runner(BaseModel):
+    """
+    Runs a task by alternating between a solver (chooses actions) and an
+    environment (executes them), recording every step in a ``Run``.
+    Only ``config`` needs to be supplied; the remaining fields are filled in by
+    ``model_post_init``.
+    """
+    config: RunnerConfig
+    recorder: DataRecorder | None = None
+    # Environment and solver are stored as classes; they are instantiated per run.
+    environment: type[BaseEnvironment] | None = None
+    solver: type[BaseSolver] | None = None
+    logger: logging.Logger | None = None
+    # Most recent Run yielded to ``run``; backs the ``complete``/``run_id``/``run_result`` properties.
+    _run: Run | None = None
+    class Config:
+        arbitrary_types_allowed = True
+    def model_post_init(self, __context: Any) -> None:
+        """Resolve the environment and solver classes by name and create the recorder and logger."""
+        super().model_post_init(__context)
+        self.environment = Environments.get(self.config.environment.name)
+        self.solver = Solvers.get(self.config.solver.name)
+        self.recorder = DataRecorder()
+        self.logger = create_logger(
+            name=f"([bold purple]{self.config.solver.name}[/]-[bold blue]{self.config.environment.name}[/])",
+            level=self.config.logger_level,
+            detailed_name=self.config.detailed_logger_name,
+        )
+    async def run_generator(self, task: str) -> AsyncIterator[Run]:
+        """
+        Execute ``task`` and yield the Run after every processed event.
+        Events alternate on a queue: an Observation is handed to the solver, which
+        produces an Action; an Action is executed by the environment, which
+        produces the next Observation. The loop ends when the solver reports
+        completion or ``max_steps`` actions have been executed. The Run is
+        yielded once more after the environment and solver have been closed.
+        """
+        # The whole task, including setup and teardown, runs under task_timeout.
+        async with (
+            async_timeout(self.config.task_timeout, "Task"),
+        ):
+            if self.config.logger_level is not None:
+                self.logger.setLevel(self.config.logger_level)
+            run = self.recorder.initialise_run(task)
+            run.environment = self.config.environment
+            run.solver = self.config.solver
+            self.logger.debug(f"Run intialised: {run.run_id}")
+            event_queue = asyncio.Queue()
+            async with (
+                self.environment(
+                    config=self.config.environment,
+                    logger=self.logger,
+                ) as environment,
+                self.solver(config=self.config.solver, logger=self.logger) as solver,
+            ):
+                run.env_info = await environment.get_info()
+                await solver.initialise(
+                    task,
+                    environment.tools,
+                    environment.info_for_user,
+                )
+                self.logger.debug("Solver initialised.")
+                run.solver_history = solver.history
+                # Seed the queue with the first observation.
+                observation: Observation = await environment.initialise()
+                await event_queue.put(observation)
+                self.logger.debug("Environment initialised.")
+                # Counts executed actions only; observation events do not advance it.
+                step_count = 0
+                while step_count < self.config.max_steps:
+                    event = await event_queue.get()
+                    self.logger.debug(f"🤖 [bold purple]Processing event:[/] {event.type}")
+                    match event.type:
+                        case EventType.OBSERVATION:
+                            observation: Observation = event
+                            run.record(
+                                observation=observation,
+                                solver_history=solver.history,
+                            )
+                            # Ask the solver for the next action, bounded by action_timeout.
+                            async with async_timeout(
+                                self.config.action_timeout,
+                                "Action decision",
+                            ):
+                                action: Action = await solver.act(observation)
+                            await event_queue.put(action)
+                        case EventType.ACTION:
+                            action: Action = event
+                            self.logger.debug(f"Tool calls: {action.tool_calls}")
+                            run.record(action=action, solver_history=solver.history)
+                            # Completion is checked against the latest observation,
+                            # before the action is executed.
+                            run.complete = await solver.is_complete(observation)
+                            if self.config.save_every_step:
+                                await self.recorder.save(run)
+                            if run.complete:
+                                run.result = action.text
+                                self.logger.info(f"🤖 [bold purple]Task complete.[/] ✨ \n{run.result}")
+                                break
+                            self.logger.debug(f"DEBUG: Using environment_timeout: {self.config.environment_timeout} seconds")
+                            # Execute the action, bounded by environment_timeout.
+                            async with async_timeout(
+                                self.config.environment_timeout,
+                                "Environment response",
+                            ):
+                                observation: Observation = await environment.execute_action(action)
+                                step_count += 1
+                            await event_queue.put(observation)
+                    yield run
+                if not run.complete:
+                    self.logger.warning("🤖 [bold purple]Ran out of steps!")
+                await self.recorder.terminate(run, save=True)
+        # Final yield, after the environment and solver contexts have exited.
+        yield run
+    async def run(self, task: str) -> Run:
+        """Drive ``run_generator`` to the end and return the last Run it yielded."""
+        async for run in self.run_generator(task):
+            self._run = run
+        return run
+    def run_concurrent(self, tasks: list[str]) -> list[Run | BaseException]:
+        """
+        Run all ``tasks`` concurrently in a new event loop.
+        Because ``return_exceptions=True`` is used, a failed task contributes its
+        exception object to the result list instead of raising.
+        """
+        async def gather_runs():
+            return await asyncio.gather(
+                *[self.run(task) for task in tasks],
+                return_exceptions=True,
+            )
+        return asyncio.run(gather_runs())
+    @property
+    def complete(self) -> bool:
+        """Whether the tracked run is complete; raises RuntimeError before any run has started."""
+        if self._run is None:
+            raise RuntimeError("Run not initialised")
+        return self._run.complete
+    @property
+    def run_id(self) -> str:
+        """Id of the tracked run; raises RuntimeError before any run has started."""
+        if self._run is None:
+            raise RuntimeError("Run not initialised")
+        return self._run.run_id
+    @property
+    def run_result(self) -> str | None:
+        """Result text of the tracked run (None until set); raises RuntimeError before any run has started."""
+        if self._run is None:
+            raise RuntimeError("Run not initialised")
+        return self._run.result
+if __name__ == "__main__":
+    from proxy_lite.logger import logger
+    config = RunnerConfig.from_dict(
+        {
+            "environment": {
+                "name": "webbrowser",
+                "homepage": "https://www.google.com",
+                "viewport_width": 1280,
+                "viewport_height": 1920,
+                "screenshot_delay": 1,
+                "headless": False,
+            },
+            "solver": {
+                "name": "simple",
+                "agent": {
+                    "name": "proxy_lite",
+                    "client": {
+                        "name": "convergence",
+                        "model_id": "convergence-ai/proxy-lite",
+                        "api_base": "https://convergence-ai-demo-api.hf.space/v1",
+                    },
+                },
+            },
+            "max_steps": 150,
+            "action_timeout": 1800,
+            "environment_timeout": 1800,
+            "task_timeout": 18000,
+            "logger_level": "DEBUG",
+        },
+    )
+    logger.info(f"🤖 [bold purple]Config:[/] {config}")
+    runner = Runner(config=config)
+    result = asyncio.run(runner.run("Tell me the tesla stock price."))
+    print(runner.run_result)
+    print(runner.complete)

proxy-lite-demo-v2/src/proxy_lite/serializer.py ADDED Viewed

	@@ -0,0 +1,39 @@

+import itertools
+from abc import ABC, abstractmethod
+from pydantic import BaseModel
+from proxy_lite.history import MessageAdapter, MessageHistory
+from proxy_lite.tools import Tool
+class BaseSerializer(BaseModel, ABC):
+    """Base class for serializers.
+    Serializers are responsible for converting between the internal MessageHistory/Tool
+    objects and the external API format. Deserialise is not always possible, so raise
+    appropriate warnings.
+    """
+    @abstractmethod
+    def serialize_messages(self, message_history: MessageHistory) -> list[dict]: ...
+    @abstractmethod
+    def deserialize_messages(self, data: list[dict]) -> MessageHistory: ...
+    @abstractmethod
+    def serialize_tools(self, tools: list[Tool]) -> list[dict]: ...
+class OpenAICompatibleSerializer(BaseSerializer):
+    def serialize_messages(self, message_history: MessageHistory) -> list[dict]:
+        return message_history.to_dict(exclude={"label"})
+    def deserialize_messages(self, data: list[dict]) -> MessageHistory:
+        return MessageHistory(
+            messages=[MessageAdapter.validate_python(message) for message in data],
+        )
+    def serialize_tools(self, tools: list[Tool]) -> list[dict]:
+        tool_schemas = [[{"type": "function", "function": schema} for schema in tool.schema] for tool in tools]
+        return list(itertools.chain.from_iterable(tool_schemas))

proxy-lite-demo-v2/src/proxy_lite/solvers/__init__.py ADDED Viewed

	@@ -0,0 +1,20 @@

+from __future__ import annotations
+from typing import Union
+from .simple_solver import SimpleSolver, SimpleSolverConfig
+from .solver_base import BaseSolver, BaseSolverConfig, Solvers
+# Union aliases built from whatever is registered on ``Solvers`` at import time.
+# The ``simple_solver`` import above registers the "simple" solver and its config
+# via the ``Solvers.register_*`` decorators, so it must run before these lines.
+# NOTE(review): star-unpacking inside a subscript (``Union[*...]``) needs Python 3.11+.
+SolverConfigTypes = Union[*Solvers._solver_config_registry.values()]
+SolverTypes = Union[*Solvers._solver_registry.values()]
+# Public names re-exported by the ``solvers`` package.
+__all__ = [
+    "BaseSolver",
+    "BaseSolverConfig",
+    "SimpleSolver",
+    "SimpleSolverConfig",
+    "SolverConfigTypes",
+    "SolverTypes",
+    "Solvers",
+]

proxy-lite-demo-v2/src/proxy_lite/solvers/simple_solver.py ADDED Viewed

	@@ -0,0 +1,117 @@

+# ruff: noqa: E501
+import json
+import re
+from functools import cached_property
+from typing import Literal, Optional
+from proxy_lite.agents import AgentConfigTypes, Agents, BaseAgent
+from proxy_lite.environments.environment_base import Action, Observation
+from proxy_lite.history import (
+    MessageHistory,
+    MessageLabel,
+    SystemMessage,
+)
+from proxy_lite.solvers.solver_base import BaseSolver, BaseSolverConfig, Solvers
+from proxy_lite.tools import ReturnValueTool, Tool
+# Canned text describing an attempted tool action.
+# NOTE(review): not referenced by the code visible in this module - confirm external use.
+WEB_TOOL_TURN = """The action has been attempted in the computer."""
+# Configuration for SimpleSolver, registered under the name "simple".
+@Solvers.register_solver_config("simple")
+class SimpleSolverConfig(BaseSolverConfig):
+    # Fixed to "simple" so the config can be told apart by its name field.
+    name: Literal["simple"] = "simple"
+    # Config of the agent the solver creates (looked up via ``Agents.get(agent.name)``).
+    agent: AgentConfigTypes
+@Solvers.register_solver("simple")
+class SimpleSolver(BaseSolver):
+    task: Optional[str] = None
+    complete: bool = False
+    @cached_property
+    def tools(self) -> list[Tool]:
+        return [ReturnValueTool()] + self.env_tools
+    @cached_property
+    def agent(self) -> BaseAgent:
+        if self.logger:
+            self.logger.debug(f"Tools: {self.tools}")
+        return Agents.get(self.config.agent.name)(
+            config=self.config.agent,
+            env_tools=self.tools,
+        )
+    @property
+    def history(self) -> MessageHistory:
+        return MessageHistory(
+            messages=[SystemMessage.from_media(text=self.agent.system_prompt)] + self.agent.history.messages,
+        )
+    async def initialise(self, task: str, env_tools: list[Tool], env_info: str) -> None:
+        self.env_tools = env_tools
+        self.task = task
+        self.agent.receive_user_message(
+            text=f"Task: {task}",
+            label=MessageLabel.USER_INPUT,
+        )
+        self.logger.debug(f"Initialised with task: {task}")
+    async def act(self, observation: Observation) -> Action:
+        # Send tool responses to agent as tool messages if they exist
+        if observation.state.tool_responses:
+            for tool_response in observation.state.tool_responses:
+                if tool_response.content and tool_response.id:
+                    await self.agent.receive_tool_message(
+                        text=tool_response.content,
+                        tool_id=tool_response.id,
+                    )
+                else:
+                    print(f"🔧 DEBUG: Skipping tool response - content exists: {bool(tool_response.content)}, id exists: {bool(tool_response.id)}")
+        else:
+            print("🔧 DEBUG: No tool responses to process")
+        self.agent.receive_user_message(
+            image=observation.state.image,
+            text=observation.state.text,
+            label=MessageLabel.SCREENSHOT,
+            is_base64=True,
+        )
+        message = await self.agent.generate_output(use_tool=True)
+        self.logger.debug(f"Assistant message generated: {message}")
+        # check tool calls for return_value
+        if any(tool_call.function["name"] == "return_value" for tool_call in message.tool_calls):
+            self.complete = True
+            arguments = json.loads(message.tool_calls[0].function["arguments"])
+            if isinstance(arguments, str):
+                arguments = json.loads(arguments)
+            return_value = arguments["value"]
+            return Action(tool_calls=[], text=return_value)
+        # Handle empty content array from API response
+        if not message.content or len(message.content) == 0:
+            self.logger.warning("Message content is empty, using empty string as fallback")
+            text_content = ""
+        else:
+            text_content = message.content[0].text
+        observation_match = re.search(r"<observation>(.*?)</observation>", text_content, re.DOTALL)
+        observation_content = observation_match.group(1).strip() if observation_match else ""
+        self.logger.info("🌐 [bold blue]Observation:[/]")
+        await self.logger.stream_message(observation_content)
+        # Extract text between thinking tags if present
+        thinking_match = re.search(r"<thinking>(.*?)</thinking>", text_content, re.DOTALL)
+        thinking_content = thinking_match.group(1).strip() if thinking_match else text_content
+        self.logger.info("🧠 [bold purple]Thinking:[/]")
+        await self.logger.stream_message(thinking_content)
+        return Action(tool_calls=message.tool_calls, text=text_content)
+    async def is_complete(self, observation: Observation) -> bool:
+        env_terminated = observation.terminated
+        return self.complete or env_terminated

proxy-lite-demo-v2/src/proxy_lite/solvers/solver_base.py ADDED Viewed

	@@ -0,0 +1,123 @@

+import logging
+from abc import ABC, abstractmethod
+from functools import cached_property
+from typing import Optional, Self, Type, cast
+from pydantic import BaseModel, Field
+from proxy_lite.environments.environment_base import Action, Observation
+from proxy_lite.tools import Tool
+# Empty base model that concrete solver configs (e.g. SimpleSolverConfig) extend.
+class BaseSolverConfig(BaseModel):
+    pass
+class BaseSolver(BaseModel, ABC):
+    task: Optional[str] = None
+    env_tools: list[Tool] = Field(default_factory=list)
+    config: BaseSolverConfig
+    logger: logging.Logger | None = None
+    class Config:
+        arbitrary_types_allowed = True
+    async def __aenter__(self) -> Self:
+        return self
+    async def __aexit__(self, exc_type, exc_value, traceback) -> None:
+        pass
+    @cached_property
+    @abstractmethod
+    def tools(self) -> list[Tool]: ...
+    @abstractmethod
+    async def initialise(
+        self,
+        task: str,
+        env_tools: list[Tool],
+        env_info: str,
+    ) -> None:
+        """
+        Initialise the solution with the given task.
+        """
+        ...
+    @abstractmethod
+    async def act(self, observation: Observation) -> Action:
+        """
+        Return an action for interacting with the environment.
+        """
+        ...
+    async def is_complete(self, observation: Observation) -> bool:
+        """
+        Return a boolean indicating if the task is complete.
+        """
+        return observation.terminated
+class Solvers:
+    _solver_registry: dict[str, type[BaseSolver]] = {}
+    _solver_config_registry: dict[str, type[BaseSolverConfig]] = {}
+    @classmethod
+    def register_solver(cls, name: str):
+        """
+        Decorator to register a Solver class under a given name.
+        Example:
+            @Solvers.register_solver("my_solver")
+            class MySolver(BaseSolver):
+                ...
+        """
+        def decorator(solver_cls: type[BaseSolver]) -> type[BaseSolver]:
+            cls._solver_registry[name] = solver_cls
+            return solver_cls
+        return decorator
+    @classmethod
+    def register_solver_config(cls, name: str):
+        """
+        Decorator to register a Solver configuration class under a given name.
+        Example:
+            @Solvers.register_solver_config("my_solver")
+            class MySolverConfig(BaseSolverConfig):
+                ...
+        """
+        def decorator(config_cls: type[BaseSolverConfig]) -> type[BaseSolverConfig]:
+            cls._solver_config_registry[name] = config_cls
+            return config_cls
+        return decorator
+    @classmethod
+    def get(cls, name: str) -> type[BaseSolver]:
+        """
+        Retrieve a registered Solver class by its name.
+        Raises:
+            ValueError: If no such solver is found.
+        """
+        try:
+            return cast(Type[BaseSolver], cls._solver_registry[name])
+        except KeyError:
+            raise ValueError(f"Solver '{name}' not found.")
+    @classmethod
+    def get_config(cls, name: str) -> type[BaseSolverConfig]:
+        """
+        Retrieve a registered Solver configuration class by its name.
+        Raises:
+            ValueError: If no such config is found.
+        """
+        try:
+            return cast(Type[BaseSolverConfig], cls._solver_config_registry[name])
+        except KeyError:
+            raise ValueError(f"Solver config for '{name}' not found.")

proxy-lite-demo-v2/src/proxy_lite/tools/__init__.py ADDED Viewed

	@@ -0,0 +1,5 @@

+from .browser_tool import BrowserTool
+from .return_tool import ReturnValueTool
+from .tool_base import Tool, ToolExecutionResponse, attach_param_schema
+__all__ = ["BrowserTool", "ReturnValueTool", "Tool", "ToolExecutionResponse", "attach_param_schema"]

proxy-lite-demo-v2/src/proxy_lite/tools/browser_tool.py ADDED Viewed

	@@ -0,0 +1,374 @@

+import asyncio
+from contextlib import AsyncExitStack
+from typing import List, Literal, Optional, Any
+from pydantic import BaseModel, Field
+from proxy_lite.browser.browser import BrowserSession
+from proxy_lite.logger import logger
+from .tool_base import Tool, ToolExecutionResponse, attach_param_schema
+# Tags that ``element_as_text`` renders in self-closing form (``<tag .../>``)
+# when the element carries no text.
+SELF_CONTAINED_TAGS = [
+    # many of these are non-interactive but keeping them anyway
+    "area",
+    "base",
+    "br",
+    "col",
+    "embed",
+    "hr",
+    "img",
+    "input",
+    "link",
+    "meta",
+    "param",
+    "source",
+    "track",
+    "wbr",
+]
+def element_as_text(
+    mark_id: int,
+    tag: Optional[str] = None,
+    text: Optional[str] = None,
+    **raw_attributes,
+) -> str:
+    """Return a text representation of all elements on the page"""
+    attributes = []
+    for k, v in raw_attributes.items():
+        if v is None:
+            continue
+        if isinstance(v, bool):
+            if v:
+                attributes.append(k)
+            # we ignore False bool attributes
+        else:
+            v = str(v)
+            if len(v) > 2500:
+                v = v[: 2500 - 1] + "…"
+            attributes.append(f'{k}="{v}"')
+    attributes = " ".join(attributes)
+    attributes = (" " + attributes).rstrip()
+    tag = tag.lower()
+    if text is None:
+        text = ""
+    if len(text) > 2500:
+        text = text[: 2500 - 1] + "…"
+    if tag in SELF_CONTAINED_TAGS:
+        if text:
+            logger.warning(
+                f"Got self-contained element '{tag}' which contained text '{text}'.",
+            )
+        else:
+            return f"<{tag} id={mark_id}{attributes}/>"
+    return f"<{tag} id={mark_id}{attributes}>{text}</{tag}>"
+# Parameter models for the BrowserTool methods below. Each is attached to its
+# method with ``attach_param_schema``; the Field descriptions end up in the tool
+# schema, so they are part of what is sent to the model.
+# Parameters for ``goto``.
+class GotoParams(BaseModel):
+    url: str = Field(..., description="The web address to visit. Must be a valid URL.")
+# Parameters for ``google_search``.
+class GoogleSearchParams(BaseModel):
+    query_plan: str = Field(
+        ...,
+        description="Plan out the query you will make. Re-write queries in a way that will yield the best results.",
+    )
+    query: str = Field(..., description="The Google search to perform.")
+# Parameters for ``click``.
+class ClickParams(BaseModel):
+    mark_id: int = Field(..., description="Element Mark ID.")
+# One (element, text) pair inside TypeParams.entries.
+class TypeEntry(BaseModel):
+    mark_id: int = Field(..., description="Element Mark ID.")
+    content: str = Field(..., description="The text to type into the element.")
+# Parameters for ``type``.
+class TypeParams(BaseModel):
+    entries: List[TypeEntry] = Field(
+        ...,
+        description="A list of elements and contents to type.",
+    )
+    submit: bool = Field(
+        ...,
+        description='Whether to press the "Enter" key after typing in the last entry.',
+    )
+# Parameters for ``scroll``.
+class ScrollParams(BaseModel):
+    direction: Literal["up", "down", "left", "right"] = Field(
+        ...,
+        description='Direction to scroll. Must be one of "up", "down", "left" or "right".',
+    )
+    mark_id: int = Field(
+        ...,
+        description="What to scroll. Use -1 to scroll the whole page otherwise give the mark ID of an element that is `scrollable`.",  # noqa: E501
+    )
+# The next four tools take no arguments; empty models still yield a schema.
+class BackParams(BaseModel):
+    pass
+class WaitParams(BaseModel):
+    pass
+class ReloadParams(BaseModel):
+    pass
+class DoNothingParams(BaseModel):
+    pass
+# --- NEW: Parameters for open_new_tab_and_go_to tool ---
+class OpenNewTabAndGoToParams(BaseModel):
+    url: str = Field(..., description="The URL to navigate to in the new tab.")
+# --- NEW: Parameters for select_option_by_text tool ---
+class SelectOptionByTextParams(BaseModel):
+    mark_id: int = Field(..., description="The mark ID of the select element.")
+    option_text: str = Field(..., description="The text content of the option to select.")
+class BrowserTool(Tool):
+    def __init__(self, session: BrowserSession) -> None:
+        super().__init__()
+        self.browser = session
+    async def __aenter__(self):
+        self._exit_stack = AsyncExitStack()
+        await self._exit_stack.enter_async_context(self.browser)
+        return self
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        await self._exit_stack.aclose()
+    @property
+    def poi_text(self) -> str:
+        # Get all points of interest on the page as text
+        texts = [element_as_text(mark_id=i, **element) for i, element in enumerate(self.browser.poi_elements)]
+        # Return formatted text of points of interest on page
+        return "\n".join([txt for txt in texts if txt])
+    @attach_param_schema(GotoParams)
+    async def goto(self, url: str) -> ToolExecutionResponse:
+        """Go directly to a specific web url. Specify the exact URL."""
+        await self.browser.goto(url)
+        return ToolExecutionResponse(content=f"Successfully navigated to URL: {url}")
+    @attach_param_schema(GoogleSearchParams)
+    async def google_search(self, query_plan: str, query: str) -> ToolExecutionResponse:
+        """Perform a generic web search using Google.
+        Results may not be relevant. If you see poor results, you can try another query.
+        """
+        url = f"https://www.google.com/search?q={query}"
+        await self.browser.goto(url)
+        return ToolExecutionResponse(content=f"Performed Google search for: {query}")
+    @attach_param_schema(ClickParams)
+    async def click(self, mark_id: int) -> ToolExecutionResponse:
+        """Click on an element of the page."""
+        try:
+            await self.browser.click(mark_id=mark_id)
+            return ToolExecutionResponse(content=f"Clicked element with mark ID: {mark_id}")
+        except IndexError as e:
+            # This happens if mark_id is out of bounds for browser.poi_centroids
+            logger.error(f"Click failed: Mark ID {mark_id} not found or POI list empty. Error: {e}")
+            return ToolExecutionResponse(content=f"Failed to click element with mark ID {mark_id}. Element not found or POI list invalid.")
+        except Exception as e:
+            logger.error(f"Click failed with unexpected error for mark ID {mark_id}: {e}")
+            return ToolExecutionResponse(content=f"An unexpected error occurred while trying to click element {mark_id}: {e}")
+    @attach_param_schema(TypeParams)
+    async def type(self, entries: List[dict], submit: bool) -> ToolExecutionResponse:
+        """Type text.
+        You can type into one or more elements.
+        Note that the text inside an element is cleared before typing.
+        """
+        typed_ids = []
+        for i, entry_dict in enumerate(entries):
+            try:
+                entry = TypeEntry(**entry_dict)
+                last_entry = i == len(entries) - 1
+                old_poi_positions = [tuple(point) for point in self.browser.poi_centroids]
+                await self.browser.enter_text(
+                    mark_id=entry.mark_id,
+                    text=entry.content,
+                    submit=submit and last_entry,
+                )
+                typed_ids.append(entry.mark_id)
+                await self.browser.update_poi()
+                new_poi_positions = [tuple(point) for point in self.browser.poi_centroids]
+                if not last_entry and old_poi_positions != new_poi_positions:
+                    logger.error(
+                        "POI positions changed mid-typing, cancelling future type entries.",
+                    )
+                    break
+            except IndexError as e:
+                logger.error(f"Type failed: Mark ID {entry.mark_id} not found or POI list empty. Error: {e}")
+                return ToolExecutionResponse(content=f"Failed to type into element with mark ID {entry.mark_id}. Element not found or POI list invalid. Typed into: {typed_ids if typed_ids else 'none'}.")
+            except Exception as e:
+                logger.error(f"Type failed with unexpected error for mark ID {entry.mark_id}: {e}")
+                return ToolExecutionResponse(content=f"An unexpected error occurred while trying to type into element {entry.mark_id}: {e}. Typed into: {typed_ids if typed_ids else 'none'}.")
+        return ToolExecutionResponse(
+            content=f"Typed text into elements with mark IDs: {typed_ids}",
+        )
+    @attach_param_schema(ScrollParams)
+    async def scroll(self, direction: str, mark_id: int) -> ToolExecutionResponse:
+        """Scroll the page (or a scrollable element) up, down, left or right."""
+        try:
+            if mark_id == -1:
+                mark_id_for_browser = None # Pass None to browser.scroll for page scroll
+            else:
+                mark_id_for_browser = mark_id
+            await self.browser.scroll(direction=direction, mark_id=mark_id_for_browser)
+            return ToolExecutionResponse(content=f"Scrolled {direction} on element with mark ID: {mark_id if mark_id != -1 else 'page'}")
+        except IndexError as e:
+            logger.error(f"Scroll failed: Mark ID {mark_id} not found or POI list empty. Error: {e}")
+            return ToolExecutionResponse(content=f"Failed to scroll element with mark ID {mark_id}. Element not found or POI list invalid.")
+        except Exception as e:
+            logger.error(f"Scroll failed with unexpected error for mark ID {mark_id}: {e}")
+            return ToolExecutionResponse(content=f"An unexpected error occurred while trying to scroll element {mark_id}: {e}")
+    @attach_param_schema(BackParams)
+    async def back(self) -> ToolExecutionResponse:
+        """Go back to the previous page."""
+        try:
+            await self.browser.go_back()
+            return ToolExecutionResponse(content="Went back to the previous page.")
+        except Exception as e:
+            logger.error(f"Go back failed: {e}")
+            return ToolExecutionResponse(content=f"Failed to go back: {e}")
+    @attach_param_schema(WaitParams)
+    async def wait(self) -> ToolExecutionResponse:
+        """Wait three seconds. Useful when the page appears to still be loading, or if there are any unfinished webpage processes."""  # noqa: E501
+        await asyncio.sleep(3)
+        return ToolExecutionResponse(content="Waited for a few seconds.")
+    @attach_param_schema(ReloadParams)
+    async def reload(self) -> ToolExecutionResponse:
+        """Reload the current page. Useful when the page seems unresponsive, broken, outdated, or if you want to reset the page to its initial state."""  # noqa: E501
+        try:
+            await self.browser.reload()
+            return ToolExecutionResponse(content="Reloaded the current page.")
+        except Exception as e:
+            logger.error(f"Reload failed: {e}")
+            return ToolExecutionResponse(content=f"Failed to reload the page: {e}")
+    @attach_param_schema(DoNothingParams)
+    async def do_nothing_tool(self) -> ToolExecutionResponse:
+        """Do nothing. Use this if you have no need for the browser at this time."""
+        return ToolExecutionResponse(content="Did nothing in the browser.")
+    # --- NEW: Expose the open_new_tab_and_go_to method as a tool ---
+    @attach_param_schema(OpenNewTabAndGoToParams)
+    async def open_new_tab_and_go_to(self, url: str) -> ToolExecutionResponse:
+        """
+        Opens a new browser tab/page and navigates to the specified URL.
+        Closes the old page if it's not the last one remaining.
+        Use this to bypass loading issues by forcing a new navigation.
+        """
+        try:
+            await self.browser.open_new_tab_and_go_to(url)
+            return ToolExecutionResponse(
+                content=f"Successfully opened new tab and navigated to: {url}",
+            )
+        except Exception as e:
+            logger.error(f"Error opening new tab and navigating to {url}: {e}")
+            return ToolExecutionResponse(content=f"Failed to open new tab and navigate to {url}: {e}")
+    # --- NEW: Select option by text from select element ---
+    @attach_param_schema(SelectOptionByTextParams)
+    async def select_option_by_text(self, mark_id: int, option_text: str) -> ToolExecutionResponse:
+        """
+        Selects an option from a select element (including dual select picklists) by finding the option with matching text.
+        This is especially useful for Salesforce dual select picklists where you need to find and select a specific option.
+        Uses Playwright's native iframe handling to bypass CORS restrictions.
+        """
+        try:
+            logger.info(f"Attempting to select option '{option_text}' from element {mark_id}")
+            # First, try to click the select element to ensure it's focused
+            await self.browser.click(mark_id=mark_id)
+            await asyncio.sleep(0.5)  # Wait for click to register
+            # Use Playwright's native frame handling instead of JavaScript evaluation
+            # This bypasses CORS restrictions that prevent JavaScript access
+            # Find all frames on the page
+            main_frame = self.browser.current_page.main_frame
+            all_frames = [main_frame] + main_frame.child_frames
+            logger.info(f"Searching for element {mark_id} across {len(all_frames)} frames")
+            for frame_idx, frame in enumerate(all_frames):
+                try:
+                    # Look for select elements in this frame
+                    select_elements = await frame.query_selector_all('select')
+                    logger.info(f"Frame {frame_idx}: Found {len(select_elements)} select elements")
+                    for select_elem in select_elements:
+                        # Get all options for this select
+                        options = await select_elem.query_selector_all('option')
+                        # Check if any option contains our target text
+                        for opt_idx, option in enumerate(options):
+                            option_text_content = await option.text_content()
+                            option_value = await option.get_attribute('value')
+                            logger.info(f"Frame {frame_idx}, Select {select_elem}, Option {opt_idx}: text='{option_text_content}', value='{option_value}'")
+                            if option_text_content and option_text.lower().strip() == option_text_content.lower().strip():
+                                # Found the option! Click it directly instead of using select_option
+                                try:
+                                    # Direct click with force=True to bypass visibility checks and short timeout
+                                    await option.click(force=True, timeout=5000)
+                                    logger.info(f"Successfully clicked option '{option_text_content.strip()}' in frame {frame_idx}")
+                                    return ToolExecutionResponse(
+                                        content=f"[ACTION COMPLETED] Successfully selected '{option_text_content.strip()}' from dual select picklist"
+                                    )
+                                except Exception as select_error:
+                                    logger.info(f"Click timed out in frame {frame_idx}, but option may have been selected: {select_error}")
+                                    # Continue to next frame/option instead of failing completely
+                                    continue
+                except Exception as frame_error:
+                    logger.info(f"Could not access frame {frame_idx}: {frame_error}")
+                    continue
+            # If we get here, the option wasn't found in any frame
+            # Try to get available options for debugging
+            all_options = []
+            for frame in all_frames:
+                try:
+                    select_elements = await frame.query_selector_all('select')
+                    for select_elem in select_elements:
+                        options = await select_elem.query_selector_all('option')
+                        for option in options[:5]:  # Limit to first 5 options per select
+                            text = await option.text_content()
+                            if text:
+                                all_options.append(text.strip())
+                except:
+                    continue
+            available_options_str = ', '.join(all_options[:10]) if all_options else 'None found'
+            return ToolExecutionResponse(
+                content=f"Failed to find option '{option_text}' in any select element. Available options (first 10): {available_options_str}"
+            )
+        except Exception as e:
+            logger.error(f"Error selecting option '{option_text}' from element {mark_id}: {e}")
+            return ToolExecutionResponse(content=f"An unexpected error occurred while selecting option '{option_text}': {e}")

proxy-lite-demo-v2/src/proxy_lite/tools/return_tool.py ADDED Viewed

	@@ -0,0 +1,17 @@

+from pydantic import BaseModel, Field
+from proxy_lite.tools.tool_base import Tool, attach_param_schema
+# Parameters for the ``return_value`` tool; the Field description is part of the tool schema.
+class ReturnValueParams(BaseModel):
+    value: str = Field(description="The value to return to the user.")
+class ReturnValueTool(Tool):
+    def __init__(self):
+        pass
+    @attach_param_schema(ReturnValueParams)
+    def return_value(self, value: str):
+        """Return a value to the user. Use this tool when you have finished the task in order to provide any information the user has requested."""  # noqa: E501
+        print(value)

proxy-lite-demo-v2/src/proxy_lite/tools/tool_base.py ADDED Viewed

	@@ -0,0 +1,54 @@

+import inspect
+from functools import cached_property, wraps
+from typing import Any, Callable, Optional
+from pydantic import BaseModel
+class Tool:
+    async def __aenter__(self):
+        pass
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        pass
+    @cached_property
+    def schema(self) -> list[dict[str, Any]]:
+        schema = []
+        for name, method in self.__class__.__dict__.items():
+            # If function is not callable and isn't decorated using attach_param_schema
+            if not isinstance(method, Callable) or not hasattr(method, "param_model"):
+                continue
+            docstring = inspect.getdoc(method)
+            if not docstring:
+                raise ValueError(f"The tool function '{name}' is missing a docstring.")
+            # Handle multi-line docstirngs
+            description = " ".join(line.strip() for line in docstring.split("\n"))
+            tool_json = {
+                "name": name,
+                "description": description,
+                "parameters": method.param_model.model_json_schema(),
+            }
+            schema.append(tool_json)
+        return schema
def attach_param_schema(param_model: type[BaseModel]):
    """Build a decorator that validates a method's keyword arguments.

    The decorated method can only be called with keyword arguments. They are
    used to instantiate ``param_model``; whatever that constructor raises on a
    mismatch propagates to the caller. The original method then receives the
    model's ``model_dump()`` output as its keyword arguments.

    The returned wrapper exposes ``param_model`` as an attribute, which is the
    marker ``Tool.schema`` looks for.
    """

    def decorator(func: Callable) -> Callable:
        @wraps(func)
        def validated_call(self, **raw_kwargs):
            # Instantiating the model is the validation step.
            params = param_model(**raw_kwargs)
            clean_kwargs = params.model_dump()
            return func(self, **clean_kwargs)

        validated_call.param_model = param_model
        return validated_call

    return decorator
# Pydantic model for the outcome of running a tool; both fields are optional
# and default to None.
class ToolExecutionResponse(BaseModel):
    # Presumably the textual output produced by the tool — confirm against callers.
    content: Optional[str] = None
    # Presumably the id of the tool call being answered — confirm against callers.
    id: Optional[str] = None

proxy-lite-demo-v2/test_tool_calling.py ADDED Viewed

	@@ -0,0 +1,65 @@

+#!/usr/bin/env python3
+import asyncio
+import os
+import sys
+sys.path.insert(0, 'src')
+from proxy_lite.client import GeminiClient, GeminiClientConfig
+from proxy_lite.history import MessageHistory, UserMessage, Text
+from proxy_lite.tools.browser_tool import BrowserTool
+from proxy_lite.browser.browser import BrowserSession
async def test_tool_calling():
    """Manually check that the Gemini client answers with a tool call.

    Reads the API key from the ``GEMINI_API_KEY`` environment variable and
    returns early, after printing a notice, when it is unset. Otherwise a
    single completion is requested with a ``BrowserTool`` built on a stand-in
    browser session, and the response and any tool calls are printed. Errors
    raised while requesting or inspecting the completion are printed with
    their traceback instead of propagating. Nothing is asserted or returned.
    """
    api_key = os.environ.get("GEMINI_API_KEY")
    if not api_key:
        print("❌ GEMINI_API_KEY not set")
        return

    # Setup client
    client = GeminiClient(config=GeminiClientConfig(api_key=api_key))

    # Stand-in session handed to BrowserTool in place of a real browser session.
    class DummyBrowserSession:
        async def __aenter__(self):
            return self

        async def __aexit__(self, *args):
            pass

        async def open_new_tab_and_go_to(self, url):
            print(f"✅ Would open new tab and go to: {url}")
            return True

    browser_tool = BrowserTool(DummyBrowserSession())

    # Single-message history asking the model to call the navigation tool.
    prompt = Text(text="Please use the open_new_tab_and_go_to tool to navigate to https://google.com")
    messages = MessageHistory()
    messages.append(UserMessage(content=[prompt]))

    print("🚀 Testing Gemini tool calling...")
    try:
        response = await client.create_completion(
            messages=messages,
            tools=[browser_tool],
            temperature=0.7,
        )
        print(f"✅ Response received: {response}")

        message = response.choices[0].message
        tool_calls = message.tool_calls
        if not tool_calls:
            print("❌ No tool calls found")
            print(f"Content: {message.content}")
        else:
            print(f"✅ Tool calls found: {len(tool_calls)}")
            for call in tool_calls:
                print(f"  - Tool: {call.function.name}")
                print(f"  - Args: {call.function.arguments}")
    except Exception as e:
        # Outermost boundary of a manual script: report the failure and its
        # traceback rather than letting it escape.
        print(f"❌ Error: {e}")
        import traceback

        traceback.print_exc()
# Script entry point: drive the async check to completion with asyncio.run
# when this file is executed directly.
if __name__ == "__main__":
    asyncio.run(test_tool_calling())

proxy-lite-demo-v2/uv.lock ADDED Viewed

The diff for this file is too large to render. See raw diff

proxy-lite-work/.forceignore ADDED Viewed

	@@ -0,0 +1,12 @@

+# List files or directories below to ignore them when running force:source:push, force:source:pull, and force:source:status
+# More information: https://developer.salesforce.com/docs/atlas.en-us.sfdx_dev.meta/sfdx_dev/sfdx_dev_exclude_source.htm
+#
+package.xml
+# LWC configuration files
+**/jsconfig.json
+**/.eslintrc.json
+# LWC Jest
+**/__tests__/**