Trisha Tomy committed on
Commit
6a0e448
·
1 Parent(s): 40f15d7
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. CODEOWNERS +1 -0
  2. Dockerfile +59 -0
  3. LICENSE +3 -0
  4. Makefile +11 -0
  5. Procfile +1 -0
  6. README.md +3 -3
  7. app.py +133 -0
  8. pyproject.toml +65 -0
  9. requirements.txt +6 -0
  10. src/proxy_lite.egg-info/PKG-INFO +36 -0
  11. src/proxy_lite.egg-info/SOURCES.txt +35 -0
  12. src/proxy_lite.egg-info/dependency_links.txt +1 -0
  13. src/proxy_lite.egg-info/entry_points.txt +2 -0
  14. src/proxy_lite.egg-info/requires.txt +18 -0
  15. src/proxy_lite.egg-info/top_level.txt +1 -0
  16. src/proxy_lite/__init__.py +3 -0
  17. src/proxy_lite/__pycache__/__init__.cpython-313.pyc +0 -0
  18. src/proxy_lite/__pycache__/cli.cpython-313.pyc +0 -0
  19. src/proxy_lite/__pycache__/client.cpython-313.pyc +0 -0
  20. src/proxy_lite/__pycache__/gif_maker.cpython-313.pyc +0 -0
  21. src/proxy_lite/__pycache__/history.cpython-313.pyc +0 -0
  22. src/proxy_lite/__pycache__/logger.cpython-313.pyc +0 -0
  23. src/proxy_lite/__pycache__/recorder.cpython-313.pyc +0 -0
  24. src/proxy_lite/__pycache__/runner.cpython-313.pyc +0 -0
  25. src/proxy_lite/__pycache__/serializer.cpython-313.pyc +0 -0
  26. src/proxy_lite/agents/__init__.py +18 -0
  27. src/proxy_lite/agents/__pycache__/__init__.cpython-313.pyc +0 -0
  28. src/proxy_lite/agents/__pycache__/agent_base.cpython-313.pyc +0 -0
  29. src/proxy_lite/agents/__pycache__/proxy_lite_agent.cpython-313.pyc +0 -0
  30. src/proxy_lite/agents/agent_base.py +238 -0
  31. src/proxy_lite/agents/proxy_lite_agent.py +54 -0
  32. src/proxy_lite/app.py +239 -0
  33. src/proxy_lite/browser/__init__.py +0 -0
  34. src/proxy_lite/browser/__pycache__/__init__.cpython-313.pyc +0 -0
  35. src/proxy_lite/browser/__pycache__/bounding_boxes.cpython-313.pyc +0 -0
  36. src/proxy_lite/browser/__pycache__/browser.cpython-313.pyc +0 -0
  37. src/proxy_lite/browser/add_custom_select.js +123 -0
  38. src/proxy_lite/browser/bounding_boxes.py +210 -0
  39. src/proxy_lite/browser/browser.py +508 -0
  40. src/proxy_lite/browser/find_pois.js +397 -0
  41. src/proxy_lite/cli.py +112 -0
  42. src/proxy_lite/client.py +171 -0
  43. src/proxy_lite/configs/default.yaml +23 -0
  44. src/proxy_lite/environments/__init__.py +32 -0
  45. src/proxy_lite/environments/__pycache__/__init__.cpython-313.pyc +0 -0
  46. src/proxy_lite/environments/__pycache__/environment_base.cpython-313.pyc +0 -0
  47. src/proxy_lite/environments/__pycache__/webbrowser.cpython-313.pyc +0 -0
  48. src/proxy_lite/environments/environment_base.py +161 -0
  49. src/proxy_lite/environments/webbrowser.py +194 -0
  50. src/proxy_lite/gif_maker.py +122 -0
CODEOWNERS ADDED
@@ -0,0 +1 @@
 
 
1
+ * @aptoul @Fraser-Greenlee @XanderJC
Dockerfile ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Use the official Playwright image for Python (Ubuntu "noble" base). It ships
# the browsers under /ms-playwright together with most system libraries.
FROM mcr.microsoft.com/playwright/python:v1.53.0-noble

# Set the working directory inside the container
WORKDIR /app

# The base image already carries the browser dependencies, so only git
# (needed for proxy-lite) and xvfb are added on top.
# The apt lists are removed in the same layer to keep the image small.
RUN apt-get update && apt-get install -y \
    git \
    xvfb \
    && rm -rf /var/lib/apt/lists/*

# Copy the whole project (app.py, requirements.txt, pyproject.toml, src/ ...).
# requirements.txt contains "-e .", so the sources must be present before any
# pip install; a separate earlier COPY of requirements.txt would add a layer
# without enabling any caching.
COPY . .

# --- START: Directory permission workaround ---
# Create the directory proxy-lite's recorder writes to and grant full
# permissions. This addresses the PermissionError seen at runtime.
RUN mkdir -p /app/local_trajectories \
    && chmod -R 777 /app/local_trajectories
# --- END: Directory permission workaround ---

# Upgrade pip, setuptools, and wheel for a robust Python build environment.
RUN pip install --no-cache-dir --upgrade pip setuptools wheel

# Install the local proxy-lite package in editable mode.
RUN pip install --no-cache-dir --no-input -e .

# Install the rest of the Python dependencies from requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Environment variables used at runtime.
# PLAYWRIGHT_BROWSERS_PATH / PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD are intentionally
# not set: the official image manages them and defaults to /ms-playwright.
ENV DISPLAY=:99
ENV XDG_RUNTIME_DIR=/tmp

# --- Debugging: report the Playwright version and installed browsers ---
# Kept in a single RUN so the diagnostics cost one layer instead of five.
RUN echo "--- Checking Playwright Version (from base image) ---" \
    && python -m playwright --version \
    && echo "--- Listing Playwright Browser Cache (Recursive, from base image) ---" \
    && ls -alR /ms-playwright/ \
    && echo "-----------------------------------"
# --- End Debugging ---

# Expose the port the Flask app listens on. Hugging Face Spaces requires 7860.
EXPOSE 7860

# Run the Flask application under Gunicorn. Exec form makes gunicorn the
# container's main process without going through a shell.
CMD ["gunicorn", "--bind", "0.0.0.0:7860", "--workers", "2", "--worker-class", "gevent", "app:app", "--timeout", "300"]
LICENSE ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ Creative Commons Attribution-NonCommercial 4.0 International
2
+
3
+ This work is licensed under the Creative Commons Attribution-NonCommercial 4.0 International License. To view a copy of this license, visit https://creativecommons.org/licenses/by-nc/4.0/ or send a letter to Creative Commons, PO Box 1866, Mountain View, CA 94042, USA.
Makefile ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
# Neither target produces a file of the same name, so both are declared phony;
# otherwise a stray file called "app" or "proxy" would make the target a no-op.
.PHONY: proxy app

# Set up a managed Python 3.11 virtualenv with uv, install the project in
# editable mode and download the Playwright browsers.
proxy:
	pip install uv
	uv venv --python 3.11 --python-preference managed
	uv sync
	uv pip install -e .
	playwright install

# Launch the Streamlit demo UI.
app:
	streamlit run src/proxy_lite/app.py
Procfile ADDED
@@ -0,0 +1 @@
 
 
1
+ web: gunicorn --bind 0.0.0.0:7860 --workers 2 --worker-class gevent app:app --timeout 300
README.md CHANGED
@@ -1,8 +1,8 @@
1
  ---
2
- title: Proxy Lite Demo V2
3
- emoji: 📉
4
  colorFrom: indigo
5
- colorTo: indigo
6
  sdk: docker
7
  pinned: false
8
  ---
 
1
  ---
2
+ title: Proxy Lite Demo For Setup
3
+ emoji: 😻
4
  colorFrom: indigo
5
+ colorTo: gray
6
  sdk: docker
7
  pinned: false
8
  ---
app.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gevent.monkey
2
+ gevent.monkey.patch_all(asyncio=True) # Keep this at the very top
3
+
4
+ import asyncio # Keep this
5
+ from flask import Flask, request, jsonify
6
+ from proxy_lite import Runner, RunnerConfig
7
+ import os
8
+ import logging
9
+
10
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
11
+ logger = logging.getLogger(__name__)
12
+
13
+ app = Flask(__name__)
14
+
15
+ _runner = None
16
+
17
async def initialize_runner():
    """Create the shared proxy-lite ``Runner`` on first use and return it.

    The instance is cached in the module-level ``_runner`` so later calls
    reuse it instead of rebuilding the configuration.

    Returns:
        The module-wide ``Runner`` instance.

    Raises:
        ValueError: If the ``HF_API_TOKEN`` environment variable is unset or empty.
    """
    global _runner
    if _runner is not None:
        return _runner

    logger.info("Initializing Proxy-lite Runner...")

    hf_api_token = os.environ.get("HF_API_TOKEN")
    if not hf_api_token:
        logger.error("HF_API_TOKEN environment variable not set. Cannot initialize Runner.")
        raise ValueError("HF_API_TOKEN environment variable not set. Please set it as a Space secret.")

    config = RunnerConfig.from_dict({
        "environment": {
            "name": "webbrowser",
            # Set homepage to Salesforce's generic login URL to avoid premature waits for target page elements.
            "homepage": "https://login.salesforce.com/",
            "headless": False,  # Keep this False for local testing
            "launch_args": ["--no-sandbox", "--disable-setuid-sandbox"],
            "screenshot_delay": 0.5,  # Reduced for faster debugging cycles
            "include_html": True,
            "include_poi_text": True,
        },
        "solver": {
            "name": "simple",
            "agent": {
                "name": "proxy_lite",
                "client": {
                    "name": "convergence",
                    "model_id": "convergence-ai/proxy-lite-3b",
                    "api_base": "https://convergence-ai-demo-api.hf.space/v1",
                    "api_key": hf_api_token,
                },
            },
        },
        "environment_timeout": 1800.0,
        "action_timeout": 1800.0,
        "task_timeout": 18000.0,
        "max_steps": 150,
        "logger_level": "DEBUG",
    })

    logger.info(f"DEBUG: app.py - Initializing Runner with environment_timeout: {config.environment_timeout} seconds")
    # The dumped config embeds the API token; mask it so the secret never
    # reaches the logs.
    config_for_log = config.model_dump_json(indent=2).replace(hf_api_token, "***REDACTED***")
    logger.info(f"DEBUG: app.py - Full config used: {config_for_log}")

    _runner = Runner(config=config)
    logger.info("Proxy-lite Runner initialized successfully.")
    return _runner
63
+
64
+
65
@app.route('/run_proxy_task', methods=['POST'])
async def run_proxy_task_endpoint():
    """Run a user-supplied task on the Salesforce Account Forecast Settings page.

    Expects a JSON object body with a non-empty ``task`` field. The task is
    wrapped in a step-by-step prompt (login, navigation, page-load check) and
    handed to the shared Runner.

    Returns:
        ``{"output": ...}`` on success;
        ``{"error": ...}`` with status 400 when ``task`` is missing, or 500
        when the Salesforce credentials are not configured or the run fails.
    """
    # silent=True yields None for a missing/invalid JSON body instead of
    # raising, so every malformed request gets the same 400 answer below.
    data = request.get_json(silent=True)
    request_task_instruction = data.get('task') if isinstance(data, dict) else None

    if not request_task_instruction:
        logger.warning("Received request without 'task' field. Returning 400.")
        return jsonify({"error": "No 'task' provided in request body"}), 400

    logger.info(f"Received user request task: '{request_task_instruction}'")

    salesforce_username = os.environ.get("SALESFORCE_USERNAME")
    salesforce_password = os.environ.get("SALESFORCE_PASSWORD")

    if not salesforce_username or not salesforce_password:
        logger.error("Salesforce credentials (SALESFORCE_USERNAME, SALESFORCE_PASSWORD) environment variables not set.")
        return jsonify({"error": "Salesforce credentials not configured. Please set SALESFORCE_USERNAME and SALESFORCE_PASSWORD as Space secrets."}), 500

    # Define the specific Account Forecast Settings URL
    account_forecast_url = "https://dwd000006jia1mae.lightning.force.com/lightning/setup/AccountForecastSettings/home"

    # Tool code block the agent is told to execute to open a new tab and
    # navigate after login.
    tool_code_block_new_tab = fr"""
<tool_code>
await browser.open_new_tab_and_go_to(url='{account_forecast_url}')
</tool_code>
"""

    # Sequential instructions, written to be robust to Salesforce redirects.
    agent_task = f"""
**Task Instructions for Proxy Lite Agent:**
1. **Start on Login Page:** Navigate to the Salesforce login page.
2. **Perform Login:** Log in to Salesforce using the provided username '{salesforce_username}' and password '{salesforce_password}'. Ensure all login fields are filled and the 'Log In' button is clicked.
3. **Handle Post-Login Redirect:** After clicking the 'Log In' button:
* Observe the current URL. If the URL has changed from the initial login domain (e.g., from `login.salesforce.com` or `my.salesforce.com`) **immediately execute the following tool code block to open a new tab and navigate directly to the Account Forecast Settings page (`{account_forecast_url}`) to bypass any persistent loading issues or internal redirects:**
{tool_code_block_new_tab.strip()}
4. **Confirm Target Page Load:** After successfully navigating to '{account_forecast_url}' (either directly after login or via the new tab strategy), ensure the page is fully loaded and stable. This means no loading spinners should be visible, and the main content for 'Account Forecast Settings' (like a clear heading, relevant toggles, or data tables) should be present and interactive.
5. **Execute Main Task:** Once the Account Forecast Settings page is confirmed loaded and stable, proceed with the original user request: {request_task_instruction}.
6. **Report Final Status:** Report the final status of the requested action, confirming both successful login and complete page load of the Account Forecast Settings.
"""

    # The password sits within the first 500 characters of the prompt, so it
    # is masked before the truncated prompt is written to the log.
    loggable_task = agent_task.replace(salesforce_password, "********")
    logger.info(f"Executing agent task (truncated for log): '{loggable_task[:500]}...'")

    try:
        runner = await initialize_runner()
        result = await runner.run(agent_task)

        # str() keeps the log line working even if the result is not a
        # plain string (for a str it is a no-op).
        logger.info(f"Proxy-lite task completed. Output (truncated for log): {str(result)[:500]}...")
        return jsonify({"output": result})
    except Exception as e:
        logger.exception(f"Error processing Salesforce task: {e}")
        return jsonify({"error": f"An error occurred: {str(e)}. Check logs for details."}), 500
118
+
119
@app.route('/')
def root():
    """Log the access and return a plain-text usage hint for the API."""
    usage_hint = "Proxy-lite API is running. Send POST requests to /run_proxy_task with a 'task' in JSON body."
    logger.info("Root endpoint accessed.")
    return usage_hint
123
+
124
if __name__ == '__main__':
    # HF_API_TOKEN must also be set for local testing (e.g. via a .env file or
    # directly in the environment); otherwise initialize_runner will fail.
    if not os.environ.get("HF_API_TOKEN"):
        logger.error("HF_API_TOKEN environment variable is not set. Please set it for local testing.")
        # The server is still started so basic connectivity can be checked,
        # but runner initialization will fail while the token is missing.

    # The server binds to every interface, and Flask's debug mode exposes an
    # interactive debugger that can execute code. Debug mode is therefore
    # opt-in through FLASK_DEBUG instead of being always on.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in ("1", "true", "yes", "on")

    logger.info("Starting Flask development server on 0.0.0.0:7860...")
    app.run(host='0.0.0.0', port=7860, debug=debug_enabled)
pyproject.toml ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "proxy-lite"
3
+ version = "0.1.0"
4
+ description = "Proxy Lite - A mini, open-weights, version of the Convergence AI Proxy assistant."
5
+ readme = "README.md"
6
+ requires-python = ">=3.11"
7
+ dependencies = [
8
+ "omegaconf>=2.3.0",
9
+ "openai>=1.61.1",
10
+ "opencv-python>=4.11.0.86",
11
+ "opencv-python-headless>=4.11.0.86",
12
+ "playwright-stealth>=1.0.6",
13
+ "playwright>=1.50.0",
14
+ "pydantic>=2.10.6",
15
+ "rich>=13.9.4",
16
+ "setuptools>=75.8.0",
17
+ "tenacity>=9.0.0",
18
+ "torch>=2.5.1",
19
+ "torchvision>=0.20.1",
20
+ "streamlit>=1.40.2",
21
+ "pre-commit>=4.1.0",
22
+ ]
23
+
24
+ [project.scripts]
25
+ proxy = "proxy_lite.cli:main"
26
+
27
+ [project.optional-dependencies]
28
+ serving = [
29
+ "transformers",
30
+ "vllm==0.7.2",
31
+ ]
32
+
33
+ [build-system]
34
+ requires = ["setuptools"]
35
+ build-backend = "setuptools.build_meta"
36
+
37
+ [tool.setuptools]
38
+ packages = { find = { where = ["src"] } }
39
+
40
[tool.setuptools.package-data]
# Ship the non-Python files that live inside the package: besides JSON, the
# browser helper scripts (browser/*.js) and the default config
# (configs/default.yaml). Without these patterns a regular (non-editable)
# install would omit them.
proxy_lite = ["**/*.json", "**/*.js", "**/*.yaml"]
42
+
43
+ [tool.ruff]
44
+ line-length = 120
45
+
46
+ [tool.ruff.lint]
47
+ select = ["E", "F", "B", "I", "SIM"]
48
+ ignore = [
49
+ "B028",
50
+ "E722", # ignore bare except
51
+ "B904", # ignore raise from requirement
52
+ "FA102",
53
+ ]
54
+ [tool.ruff.lint.flake8-bugbear]
55
+
56
+ extend-immutable-calls = [
57
+ "fastapi.Depends",
58
+ "fastapi.params.Depends",
59
+ "fastapi.Query",
60
+ "fastapi.params.Query",
61
+ ]
62
+
63
+ [tool.uv.sources]
64
+ transformers = { git = "https://github.com/huggingface/transformers.git", rev = "336dc69d63d56f232a183a3e7f52790429b871ef" }
65
+
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ Flask[async]
2
+ -e .
3
+ playwright
4
+ playwright-stealth==1.0.6
5
+ gunicorn
6
+ gevent
src/proxy_lite.egg-info/PKG-INFO ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Metadata-Version: 2.4
2
+ Name: proxy-lite
3
+ Version: 0.1.0
4
+ Summary: Proxy Lite - A mini, open-weights, version of the Convergence AI Proxy assistant.
5
+ Requires-Python: >=3.11
6
+ Description-Content-Type: text/markdown
7
+ License-File: LICENSE
8
+ Requires-Dist: omegaconf>=2.3.0
9
+ Requires-Dist: openai>=1.61.1
10
+ Requires-Dist: opencv-python>=4.11.0.86
11
+ Requires-Dist: opencv-python-headless>=4.11.0.86
12
+ Requires-Dist: playwright-stealth>=1.0.6
13
+ Requires-Dist: playwright>=1.50.0
14
+ Requires-Dist: pydantic>=2.10.6
15
+ Requires-Dist: rich>=13.9.4
16
+ Requires-Dist: setuptools>=75.8.0
17
+ Requires-Dist: tenacity>=9.0.0
18
+ Requires-Dist: torch>=2.5.1
19
+ Requires-Dist: torchvision>=0.20.1
20
+ Requires-Dist: streamlit>=1.40.2
21
+ Requires-Dist: pre-commit>=4.1.0
22
+ Provides-Extra: serving
23
+ Requires-Dist: transformers; extra == "serving"
24
+ Requires-Dist: vllm==0.7.2; extra == "serving"
25
+ Dynamic: license-file
26
+
27
+ ---
28
+ title: Proxy Lite Demo For Setup
29
+ emoji: 😻
30
+ colorFrom: indigo
31
+ colorTo: gray
32
+ sdk: docker
33
+ pinned: false
34
+ ---
35
+
36
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
src/proxy_lite.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ src/proxy_lite/__init__.py
5
+ src/proxy_lite/app.py
6
+ src/proxy_lite/cli.py
7
+ src/proxy_lite/client.py
8
+ src/proxy_lite/gif_maker.py
9
+ src/proxy_lite/history.py
10
+ src/proxy_lite/logger.py
11
+ src/proxy_lite/recorder.py
12
+ src/proxy_lite/runner.py
13
+ src/proxy_lite/serializer.py
14
+ src/proxy_lite.egg-info/PKG-INFO
15
+ src/proxy_lite.egg-info/SOURCES.txt
16
+ src/proxy_lite.egg-info/dependency_links.txt
17
+ src/proxy_lite.egg-info/entry_points.txt
18
+ src/proxy_lite.egg-info/requires.txt
19
+ src/proxy_lite.egg-info/top_level.txt
20
+ src/proxy_lite/agents/__init__.py
21
+ src/proxy_lite/agents/agent_base.py
22
+ src/proxy_lite/agents/proxy_lite_agent.py
23
+ src/proxy_lite/browser/__init__.py
24
+ src/proxy_lite/browser/bounding_boxes.py
25
+ src/proxy_lite/browser/browser.py
26
+ src/proxy_lite/environments/__init__.py
27
+ src/proxy_lite/environments/environment_base.py
28
+ src/proxy_lite/environments/webbrowser.py
29
+ src/proxy_lite/solvers/__init__.py
30
+ src/proxy_lite/solvers/simple_solver.py
31
+ src/proxy_lite/solvers/solver_base.py
32
+ src/proxy_lite/tools/__init__.py
33
+ src/proxy_lite/tools/browser_tool.py
34
+ src/proxy_lite/tools/return_tool.py
35
+ src/proxy_lite/tools/tool_base.py
src/proxy_lite.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
 
 
1
+
src/proxy_lite.egg-info/entry_points.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ [console_scripts]
2
+ proxy = proxy_lite.cli:main
src/proxy_lite.egg-info/requires.txt ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ omegaconf>=2.3.0
2
+ openai>=1.61.1
3
+ opencv-python>=4.11.0.86
4
+ opencv-python-headless>=4.11.0.86
5
+ playwright-stealth>=1.0.6
6
+ playwright>=1.50.0
7
+ pydantic>=2.10.6
8
+ rich>=13.9.4
9
+ setuptools>=75.8.0
10
+ tenacity>=9.0.0
11
+ torch>=2.5.1
12
+ torchvision>=0.20.1
13
+ streamlit>=1.40.2
14
+ pre-commit>=4.1.0
15
+
16
+ [serving]
17
+ transformers
18
+ vllm==0.7.2
src/proxy_lite.egg-info/top_level.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ proxy_lite
src/proxy_lite/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from .runner import Runner, RunnerConfig
2
+
3
+ __all__ = ["Runner", "RunnerConfig"]
src/proxy_lite/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (258 Bytes). View file
 
src/proxy_lite/__pycache__/cli.cpython-313.pyc ADDED
Binary file (5.59 kB). View file
 
src/proxy_lite/__pycache__/client.cpython-313.pyc ADDED
Binary file (9.28 kB). View file
 
src/proxy_lite/__pycache__/gif_maker.cpython-313.pyc ADDED
Binary file (6.39 kB). View file
 
src/proxy_lite/__pycache__/history.cpython-313.pyc ADDED
Binary file (10.5 kB). View file
 
src/proxy_lite/__pycache__/logger.cpython-313.pyc ADDED
Binary file (3.56 kB). View file
 
src/proxy_lite/__pycache__/recorder.cpython-313.pyc ADDED
Binary file (6.73 kB). View file
 
src/proxy_lite/__pycache__/runner.cpython-313.pyc ADDED
Binary file (14.7 kB). View file
 
src/proxy_lite/__pycache__/serializer.cpython-313.pyc ADDED
Binary file (3.04 kB). View file
 
src/proxy_lite/agents/__init__.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Union
2
+
3
+ from .agent_base import Agents, BaseAgent, BaseAgentConfig
4
+ from .proxy_lite_agent import ProxyLiteAgent, ProxyLiteAgentConfig
5
+
6
# Union aliases over every registered agent class / agent config class.
# The registries are populated by the Agents.register_* decorators, which run
# when the modules imported above are loaded.
AgentTypes = Union[tuple(Agents._agent_registry.values())]
AgentConfigTypes = Union[tuple(Agents._agent_config_registry.values())]
8
+
9
+
10
+ __all__ = [
11
+ "AgentConfigTypes",
12
+ "AgentTypes",
13
+ "Agents",
14
+ "BaseAgent",
15
+ "BaseAgentConfig",
16
+ "ProxyLiteAgent",
17
+ "ProxyLiteAgentConfig",
18
+ ]
src/proxy_lite/agents/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (758 Bytes). View file
 
src/proxy_lite/agents/__pycache__/agent_base.cpython-313.pyc ADDED
Binary file (12.8 kB). View file
 
src/proxy_lite/agents/__pycache__/proxy_lite_agent.cpython-313.pyc ADDED
Binary file (3.63 kB). View file
 
src/proxy_lite/agents/agent_base.py ADDED
@@ -0,0 +1,238 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ from abc import ABC, abstractmethod
4
+ from contextlib import AsyncExitStack
5
+ from functools import cached_property
6
+ from typing import Any, Optional, Type, cast
7
+
8
+ from pydantic import BaseModel, Field
9
+ from tenacity import before_sleep_log, retry, stop_after_attempt, wait_exponential
10
+
11
+ from proxy_lite.client import BaseClient, ClientConfigTypes, OpenAIClientConfig
12
+ from proxy_lite.history import (
13
+ AssistantMessage,
14
+ MessageHistory,
15
+ MessageLabel,
16
+ SystemMessage,
17
+ Text,
18
+ ToolCall,
19
+ ToolMessage,
20
+ UserMessage,
21
+ )
22
+ from proxy_lite.logger import logger
23
+ from proxy_lite.tools import Tool
24
+
25
+ # if TYPE_CHECKING:
26
+ # from proxy_lite.tools import Tool
27
+
28
+
29
class BaseAgentConfig(BaseModel):
    """Configuration shared by all agents: model client plus history limits."""

    # Client configuration; defaults to an OpenAIClientConfig.
    client: ClientConfigTypes = Field(default_factory=OpenAIClientConfig)
    # Per-label limits passed to the history view.
    history_messages_limit: dict[MessageLabel, int] = Field(default_factory=dict)
    history_messages_include: Optional[dict[MessageLabel, int]] = Field(
        default=None,
        description="If set, overrides history_messages_limit by setting all message types to 0 except those specified",
    )

    def model_post_init(self, __context: Any) -> None:
        """Rebuild ``history_messages_limit`` when an include-list is given."""
        if self.history_messages_include is None:
            return
        # Start with every label at 0, then apply the explicitly included ones.
        limits = dict.fromkeys(MessageLabel, 0)
        limits.update(self.history_messages_include)
        self.history_messages_limit = limits
41
+
42
+
43
class BaseAgent(BaseModel, ABC):
    """Abstract base for agents that query a model client and keep a message history.

    Subclasses must implement ``system_prompt`` and ``tools``.

    NOTE(review): ``generate_output`` reads ``self.message_label``, which is
    not declared on this base class; subclasses are expected to define it.
    """

    config: BaseAgentConfig
    temperature: float = Field(default=0.7, ge=0, le=2)
    history: MessageHistory = Field(default_factory=MessageHistory)
    # Replaced in model_post_init by a client built from config.client.
    client: Optional[BaseClient] = None
    env_tools: list[Tool] = Field(default_factory=list)
    task: Optional[str] = Field(default=None)
    seed: Optional[int] = Field(default=None)

    class Config:
        arbitrary_types_allowed = True

    def __init__(self, **data) -> None:
        super().__init__(**data)
        self._exit_stack = AsyncExitStack()
        self._tools_init_task = None

    def model_post_init(self, __context: Any) -> None:
        """Build the model client from ``config.client`` after validation."""
        super().model_post_init(__context)
        self.client = BaseClient.create(self.config.client)

    @property
    @abstractmethod
    def system_prompt(self) -> str: ...

    @cached_property
    @abstractmethod
    def tools(self) -> list[Tool]: ...

    @cached_property
    def tool_descriptions(self) -> str:
        """Return a text listing of every tool function as ``- name: description``.

        Each tool's listing is prefixed with the tool's class name only when
        more than one tool is present; tools are separated by a blank line.
        """
        show_titles = len(self.tools) > 1
        sections = []
        for tool in self.tools:
            func_descriptions = "\n".join("- {name}: {description}".format(**schema) for schema in tool.schema)
            tool_title = f"{tool.__class__.__name__}:\n" if show_titles else ""
            sections.append(f"{tool_title}{func_descriptions}")
        return "\n\n".join(sections)

    async def get_history_view(self) -> MessageHistory:
        """Return the system prompt followed by the limited view of the history."""
        return MessageHistory(
            messages=[SystemMessage(content=[Text(text=self.system_prompt)])],
        ) + self.history.history_view(
            limits=self.config.history_messages_limit,
        )

    @retry(
        wait=wait_exponential(multiplier=1, min=4, max=10),
        stop=stop_after_attempt(3),
        reraise=True,
        before_sleep=before_sleep_log(logger, logging.ERROR),
    )
    async def generate_output(
        self,
        use_tool: bool = False,
        response_format: Optional[type[BaseModel]] = None,
        append_assistant_message: bool = True,
    ) -> AssistantMessage:
        """Request a completion for the current history view.

        Up to three attempts are made (exponential wait between them); the
        last error is re-raised.

        Args:
            use_tool: When true, ``self.tools`` are passed to the client.
            response_format: Optional model class forwarded to the client.
            append_assistant_message: When true, the reply is appended to the
                history under ``self.message_label``.

        Returns:
            The assistant message built from the first returned choice.
        """
        messages: MessageHistory = await self.get_history_view()
        completion = await self.client.create_completion(
            messages=messages,
            temperature=self.temperature,
            seed=self.seed,
            response_format=response_format,
            tools=self.tools if use_tool else None,
        )
        response_content = completion.model_dump()["choices"][0]["message"]
        assistant_message = AssistantMessage(
            role=response_content["role"],
            # Empty/None content becomes an empty content list.
            content=[Text(text=response_content["content"])] if response_content["content"] else [],
            tool_calls=response_content["tool_calls"],
        )
        if append_assistant_message:
            self.history.append(message=assistant_message, label=self.message_label)
        return assistant_message

    def receive_user_message(
        self,
        text: Optional[str] = None,
        image: Optional[list[bytes]] = None,
        label: Optional[MessageLabel] = None,
        is_base64: bool = False,
    ) -> None:
        """Append a user message built from ``text`` and/or ``image`` to the history."""
        message = UserMessage.from_media(
            text=text,
            image=image,
            is_base64=is_base64,
        )
        self.history.append(message=message, label=label)

    def receive_system_message(
        self,
        text: Optional[str] = None,
        label: Optional[MessageLabel] = None,
    ) -> None:
        """Append a system message with the given text to the history."""
        message = SystemMessage.from_media(text=text)
        self.history.append(message=message, label=label)

    def receive_assistant_message(
        self,
        content: Optional[str] = None,
        tool_calls: Optional[list[ToolCall]] = None,
        label: Optional[MessageLabel] = None,
    ) -> None:
        """Append an assistant message (text and/or tool calls) to the history."""
        message = AssistantMessage(
            content=[Text(text=content)] if content else [],
            tool_calls=tool_calls,
        )
        self.history.append(message=message, label=label)

    async def use_tool(self, tool_call: ToolCall):
        """Call the tool function named in ``tool_call`` with its JSON arguments.

        The first tool in ``self.tools`` that has an attribute with the
        requested name handles the call.

        Raises:
            ValueError: If no tool offers a function with that name.
        """
        function = tool_call.function
        name = function["name"]
        # The requested name is data, not code (presumably produced by the
        # model): never dispatch to private or dunder attributes of a tool.
        if not name.startswith("_"):
            for tool in self.tools:
                if hasattr(tool, name):
                    return await getattr(tool, name)(
                        **json.loads(function["arguments"]),
                    )
        msg = f'No tool function with name "{name}"'
        raise ValueError(msg)

    async def receive_tool_message(
        self,
        text: str,
        tool_id: str,
        label: Optional[MessageLabel] = None,
    ) -> None:
        """Append a tool result message for the call ``tool_id`` to the history."""
        self.history.append(
            message=ToolMessage(content=[Text(text=text)], tool_call_id=tool_id),
            label=label,
        )
174
+
175
+
176
class Agents:
    """Name-keyed registries of agent classes and their configuration classes."""

    _agent_registry: dict[str, type[BaseAgent]] = {}
    _agent_config_registry: dict[str, type[BaseAgentConfig]] = {}

    @classmethod
    def register_agent(cls, name: str):
        """
        Return a class decorator that stores an Agent class under ``name``.

        Example:
            @Agents.register_agent("browser")
            class BrowserAgent(BaseAgent):
                ...
        """

        def _register(agent_cls: type[BaseAgent]) -> type[BaseAgent]:
            cls._agent_registry[name] = agent_cls
            return agent_cls

        return _register

    @classmethod
    def register_agent_config(cls, name: str):
        """
        Return a class decorator that stores a config class under ``name``.

        Example:
            @Agents.register_agent_config("browser")
            class BrowserAgentConfig(BaseAgentConfig):
                ...
        """

        def _register(config_cls: type[BaseAgentConfig]) -> type[BaseAgentConfig]:
            cls._agent_config_registry[name] = config_cls
            return config_cls

        return _register

    @classmethod
    def get(cls, name: str) -> type[BaseAgent]:
        """
        Look up the Agent class registered under ``name``.

        Raises:
            ValueError: If nothing is registered under that name.
        """
        try:
            agent_cls = cls._agent_registry[name]
        except KeyError:
            raise ValueError(f"Agent '{name}' not found.")
        return agent_cls

    @classmethod
    def get_config(cls, name: str) -> type[BaseAgentConfig]:
        """
        Look up the Agent configuration class registered under ``name``.

        Raises:
            ValueError: If nothing is registered under that name.
        """
        try:
            config_cls = cls._agent_config_registry[name]
        except KeyError:
            raise ValueError(f"Agent config for '{name}' not found.")
        return config_cls
src/proxy_lite/agents/proxy_lite_agent.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from functools import cached_property
2
+ from typing import Literal
3
+
4
+ from pydantic import Field
5
+
6
+ from proxy_lite.history import MessageHistory, MessageLabel, SystemMessage, Text
7
+ from proxy_lite.tools import Tool
8
+
9
+ from .agent_base import Agents, BaseAgent, BaseAgentConfig
10
+
11
+ MODEL_SYSTEM_PROMPT = """You are Proxy-Lite, an AI assistant that can perform actions on a computer screen.
12
+ You were developed by Convergence AI.
13
+ The user will instuct you to perform a task.
14
+ You will be shown a screen as well as relevant interactable elements highlighted by mark_ids and you will be given a set of tools to use to perform the task.
15
+ You should make observations about the screen, putting them in <observation></observation> tags.
16
+ You should then reason about what needs to be done to complete the task, putting your thoughts in <thinking></thinking> tags.
17
+ You should then use the tools to perform the task, putting the tool calls in <tool_call></tool_call> tags.
18
+ """ # noqa: E501
19
+
20
+ MAX_MESSAGES_FOR_CONTEXT_WINDOW = {
21
+ MessageLabel.SCREENSHOT: 1,
22
+ }
23
+
24
+
25
@Agents.register_agent_config("proxy_lite")
class ProxyLiteAgentConfig(BaseAgentConfig):
    """Configuration for the ``proxy_lite`` agent.

    Defaults the history limits to ``MAX_MESSAGES_FOR_CONTEXT_WINDOW``.
    """

    name: Literal["proxy_lite"] = "proxy_lite"
    history_messages_limit: dict[MessageLabel, int] = Field(
        # Hand out a copy: returning the module-level dict itself would make
        # every config share (and be able to mutate) the same object.
        default_factory=lambda: dict(MAX_MESSAGES_FOR_CONTEXT_WINDOW),
    )
31
+
32
+
33
@Agents.register_agent("proxy_lite")
class ProxyLiteAgent(BaseAgent):
    """Agent registered as ``proxy_lite``; uses the environment's tools directly."""

    config: ProxyLiteAgentConfig
    # Label under which this agent's model responses are stored in the history.
    message_label: MessageLabel = MessageLabel.AGENT_MODEL_RESPONSE

    def __init__(self, **data):
        super().__init__(**data)

    @property
    def system_prompt(self) -> str:
        """The fixed Proxy-Lite system prompt."""
        return MODEL_SYSTEM_PROMPT

    @cached_property
    def tools(self) -> list[Tool]:
        """The tools supplied by the environment (``env_tools``)."""
        return self.env_tools

    async def get_history_view(self) -> MessageHistory:
        """Return the system prompt followed by the limited view of the history."""
        system_message = SystemMessage(content=[Text(text=self.system_prompt)])
        prompt_history = MessageHistory(messages=[system_message])
        limited_history = self.history.history_view(
            limits=self.config.history_messages_limit,
        )
        return prompt_history + limited_history
src/proxy_lite/app.py ADDED
@@ -0,0 +1,239 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import base64
3
+ from io import BytesIO
4
+
5
+ import streamlit as st
6
+ from PIL import Image
7
+
8
+ from proxy_lite import Runner, RunnerConfig
9
+
10
+
11
def get_user_config(config_expander):
    """Build the runner configuration dict, applying the user's UI overrides.

    The defaults are declared first; widgets rendered inside ``config_expander``
    then overwrite the corresponding entries with whatever the user selected.

    Args:
        config_expander: A Streamlit container used as a context manager to
            host the configuration widgets.

    Returns:
        A nested ``dict`` with ``environment``, ``solver`` and top-level timeout
        settings.
    """
    environment = {
        "name": "webbrowser",
        "annotate_image": True,
        "screenshot_delay": 2.0,
        "include_html": False,
        "viewport_width": 1280,
        "viewport_height": 1920,
        "include_poi_text": True,
        "homepage": "https://dwd000006jia1mae.lightning.force.com/lightning/setup/AccountForecastSettings/home",
        "keep_original_image": False,
        "headless": False,  # without proxies headless mode often results in getting bot blocked
    }
    client = {
        "name": "convergence",
        "model_id": "convergence-ai/proxy-lite-3b",
        "api_base": "https://convergence-ai-demo-api.hf.space/v1",
    }
    config = {
        "environment": environment,
        "solver": {
            "name": "simple",
            "agent": {
                "name": "proxy_lite",
                "client": client,
            },
        },
        "local_view": False,
        "verbose": True,
        "task_timeout": 1800,  # 30 minutes
        "action_timeout": 300,
        "environment_timeout": 120,
    }

    with config_expander:
        st.subheader("Environment Settings")
        left_column, right_column = st.columns(2)

        with left_column:
            environment["include_html"] = st.checkbox(
                "Include HTML",
                value=environment["include_html"],
                help="Include HTML in observations",
            )
            environment["include_poi_text"] = st.checkbox(
                "Include POI Text",
                value=environment["include_poi_text"],
                help="Include points of interest text in observations",
            )
            environment["homepage"] = st.text_input(
                "Homepage",
                value=environment["homepage"],
                help="Homepage to start from",
            )

        with right_column:
            client["api_base"] = st.text_input(
                "VLLM Server URL",
                value=client["api_base"],
                help="URL of a vllm server running proxy-lite",
            )
            environment["screenshot_delay"] = st.slider(
                "Screenshot Delay (seconds)",
                min_value=0.5,
                max_value=10.0,
                value=environment["screenshot_delay"],
                step=0.5,
                help="Delay before taking screenshots",
            )

        st.subheader("Advanced Settings")
        config["task_timeout"] = st.number_input(
            "Task Timeout (seconds)",
            min_value=60,
            max_value=3600,
            step=60,
            value=config["task_timeout"],
            help="Maximum time allowed for task completion",
        )
        config["action_timeout"] = st.number_input(
            "Action Timeout (seconds)",
            min_value=10,
            max_value=300,
            step=10,
            value=config["action_timeout"],
            help="Maximum time allowed for an action to complete",
        )
        config["environment_timeout"] = st.number_input(
            "Environment Timeout (seconds)",
            min_value=10,
            max_value=300,
            step=10,
            value=config["environment_timeout"],
            help="Maximum time allowed for environment to respond",
        )

    return config
106
+
107
+
108
async def run_task_async(
    task: str,
    status_placeholder,
    action_placeholder,
    environment_placeholder,
    image_placeholder,
    history_placeholder,
    config: dict,
):
    """Run ``task`` through a ``Runner`` and stream its progress into the UI.

    Args:
        task: The task text entered by the user.
        status_placeholder: Placeholder for the spinner and the final result.
        action_placeholder: Placeholder for the latest agent step.
        environment_placeholder: Placeholder for the environment heading.
        image_placeholder: Placeholder for the latest screenshot.
        history_placeholder: Placeholder for the step-by-step history.
        config: Plain configuration dict, converted with
            ``RunnerConfig.from_dict``.

    Returns:
        ``None``. If the configuration cannot be loaded an error is shown and
        the function returns early.
    """
    try:
        runner_config = RunnerConfig.from_dict(config)
    except Exception as e:
        st.error(f"Error loading RunnerConfig: {e!s}")
        return
    print(runner_config)
    runner = Runner(config=runner_config)

    # Add the spinning animation using HTML
    status_placeholder.markdown(
        """
        <style>
        @keyframes spin {
            0% { content: "⚡"; }
            25% { content: "⚡."; }
            50% { content: "⚡.."; }
            75% { content: "⚡..."; }
        }
        .spinner::before {
            content: "⚡";
            animation: spin 2s linear infinite;
            display: inline-block;
        }
        </style>
        <div><b>Resolving your task </b><span class="spinner"></span></div>
        """,
        unsafe_allow_html=True,
    )

    all_steps = []
    all_screenshots = []
    all_soms = []
    # Bound up front: the generator may finish without ever yielding an action,
    # and the final status line below must not fail with UnboundLocalError.
    latest_step = None

    async for run in runner.run_generator(task):
        # Update status with latest step
        if run.actions:
            latest_action = run.actions[-1]
            latest_step = latest_action.text
            latest_step += "".join(
                f'<tool_call>{{"name": {tool_call.function["name"]}, "arguments": {tool_call.function["arguments"]}}}</tool_call>'  # noqa: E501
                for tool_call in latest_action.tool_calls
            )
            action_placeholder.write(f"⚡ **Latest Step:** {latest_step}")
            all_steps.append(latest_step)

        # Update image if available
        if run.observations and run.observations[-1].state.image:
            latest_state = run.observations[-1].state
            environment_placeholder.write("🌐 **Environment:**")
            image_bytes = base64.b64decode(latest_state.image)
            image = Image.open(BytesIO(image_bytes))
            image_placeholder.image(image, use_container_width=True)
            all_screenshots.append(image)
            all_soms.append(latest_state.text)

        # Update history. The three lists are appended to independently, so
        # zip() deliberately stops at the shortest one.
        with history_placeholder, st.expander("🕝 **History**"):
            for idx, (action, img, som) in enumerate(zip(all_steps, all_screenshots, all_soms, strict=False)):
                st.write(f"**Step {idx + 1}**")
                st.image(img, use_container_width=True)
                st.markdown(som)
                st.write(action)

    action_placeholder.write(" ")
    if latest_step is None:
        status_placeholder.write("✨ **Result:** No actions were produced for this task.")
    else:
        status_placeholder.write(f"✨ **Result:** {latest_step}")
182
+
183
+
184
def main():
    """Render the Proxy-Lite page: configuration expander, task form, and run output.

    When the form is submitted with a non-empty task, placeholders are created
    and ``run_task_async`` is executed to completion with ``asyncio.run``.
    """
    st.title("⚡ Proxy-Lite")

    st.markdown("Powered by **Proxy-Lite**", unsafe_allow_html=True)

    if "config_expanded" not in st.session_state:
        st.session_state.config_expanded = False
    if "settings_expanded" not in st.session_state:
        st.session_state.settings_expanded = False

    config_expander = st.expander("⚙️ Proxy-Lite Configuration", expanded=st.session_state.config_expanded)
    config = get_user_config(config_expander)

    with st.form(key="run_task_form"):
        task = st.text_input(
            "Submit a task",
            key="task_input",
            help="Enter a task to be completed",
        )
        submit_button = st.form_submit_button("Submit a task", type="primary", use_container_width=True)

    if submit_button:
        st.session_state.config_expanded = False
        if task:
            # Create placeholders for dynamic updates
            status_placeholder = st.empty()
            st.write(" ")
            action_placeholder = st.empty()
            environment_placeholder = st.empty()
            image_placeholder = st.empty()
            history_placeholder = st.empty()

            # Run the async task
            asyncio.run(
                run_task_async(
                    task,
                    status_placeholder,
                    action_placeholder,
                    environment_placeholder,
                    image_placeholder,
                    history_placeholder,
                    config,
                ),
            )

            st.success("Task completed!", icon="✨")
        else:
            st.error("Please give a task first!")
237
+
238
+ if __name__ == "__main__":
239
+ main()
src/proxy_lite/browser/__init__.py ADDED
File without changes
src/proxy_lite/browser/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (166 Bytes). View file
 
src/proxy_lite/browser/__pycache__/bounding_boxes.cpython-313.pyc ADDED
Binary file (8.86 kB). View file
 
src/proxy_lite/browser/__pycache__/browser.cpython-313.pyc ADDED
Binary file (30.3 kB). View file
 
src/proxy_lite/browser/add_custom_select.js ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ handledSelectElementsConvergence = new WeakSet();
2
+
3
// Replaces the native dropdown of single-choice <select> elements with a plain
// <div>-based option list that is part of the page DOM.
// Intentionally assigned without a declaration keyword so it becomes a global
// that callers can invoke by name.
//
// input: optional root element to search; defaults to document.documentElement.
overwriteDefaultSelectConvergence = (input = null) => {
    let activeSelectElement = null;

    // Handle iframe input element
    let rootElement = input ? input : document.documentElement;

    // Builds the (initially hidden) container plus its inner options list and
    // appends it to document.body.
    function createCustomSelectElement() {
        // Create the custom select container
        const customSelect = document.createElement('div');
        customSelect.id = 'convergence-custom-select-element-X2EmudtLRN';
        customSelect.style.position = 'absolute';
        customSelect.style.zIndex = 2147483647 - 1;
        customSelect.style.display = 'none';
        document.body.appendChild(customSelect);

        // Create the select options list
        const optionsList = document.createElement('div');
        optionsList.style.border = '1px solid #ccc';
        optionsList.style.backgroundColor = '#fff';
        optionsList.style.color = 'black';
        customSelect.appendChild(optionsList);

        return customSelect;
    }

    // Fills the custom list with the options of `select` and shows it directly
    // below that element.
    function showCustomSelect(select) {
        activeSelectElement = select;

        // Clear previous options
        // NOTE(review): the container is appended to document.body but looked up
        // under rootElement; when `input` is not an ancestor of document.body this
        // lookup returns null — confirm how `input` is used by callers.
        const customSelect = rootElement.querySelector('#convergence-custom-select-element-X2EmudtLRN');
        let optionsList = customSelect.firstChild;
        optionsList.innerHTML = '';

        // Populate with new options
        Array.from(select.options).forEach(option => {
            const customOption = document.createElement('div');
            customOption.className = 'custom-option';
            customOption.style.padding = '8px';
            customOption.style.cursor = 'pointer';
            customOption.textContent = option.text;
            customOption.dataset.value = option.value;
            optionsList.appendChild(customOption);

            customOption.addEventListener('mouseenter', function () {
                customOption.style.backgroundColor = '#f0f0f0';
            });

            customOption.addEventListener('mouseleave', function () {
                customOption.style.backgroundColor = '';
            });

            customOption.addEventListener('mousedown', (e) => {
                e.stopPropagation();
                select.value = customOption.dataset.value;
                customSelect.style.display = 'none';
                activeSelectElement = null;
                // ensure we trigger all potential event listeners
                select.dispatchEvent(new InputEvent('focus', { bubbles: true, cancelable: true }));
                select.dispatchEvent(new InputEvent('input', { bubbles: true, cancelable: true }));
                select.dispatchEvent(new InputEvent('change', { bubbles: true, cancelable: true }));
                select.dispatchEvent(new InputEvent('blur', { bubbles: true, cancelable: true }));
            });
        });

        // Position and show the custom select
        const selectRect = select.getBoundingClientRect();
        customSelect.style.top = `${selectRect.bottom + window.scrollY}px`;
        customSelect.style.left = `${selectRect.left + window.scrollX}px`;
        customSelect.style.width = `${selectRect.width}px`;
        customSelect.style.display = 'block';
        select.focus();

        const hideCustomSelect = () => {
            customSelect.style.display = 'none';
            activeSelectElement = null;
        };
        // `once: true` removes each handler after it fires. Without it, every
        // call to showCustomSelect would leave two more permanent listeners on
        // the same <select>.
        select.addEventListener('blur', hideCustomSelect, { once: true });
        select.addEventListener('change', hideCustomSelect, { once: true });
    }

    // Ensure we have a custom select element
    let customSelect = rootElement.querySelector(`#convergence-custom-select-element-X2EmudtLRN`);
    if (!customSelect) {
        customSelect = createCustomSelectElement();
    }

    // Find selects in shadow DOMs (only shadow roots attached directly to
    // elements under rootElement are inspected; nested shadow roots are not).
    function findSelectInShadowRoot(element) {
        if (element.shadowRoot) {
            return element.shadowRoot.querySelectorAll('select');
        }
        return [];
    }
    let shadowSelects = [];
    rootElement.querySelectorAll('*').forEach(el => {
        shadowSelects.push(...findSelectInShadowRoot(el));
    });

    // Find selects in the regular (light) DOM
    const lightSelects = Array.from(rootElement.querySelectorAll('select'));

    // Add event listeners to all select elements
    const allSelects = [...lightSelects, ...shadowSelects];
    allSelects.forEach(select => {
        if (select.hasAttribute('multiple')) {
            // skip special multiple elements as our POI code already handles them
            return;
        }
        // The global WeakSet guarantees the mousedown hook is installed at most
        // once per element, however often this function is re-run.
        if (!handledSelectElementsConvergence.has(select)) {
            select.addEventListener('mousedown', (e) => {
                // only use custom select when the default behaviour is being used
                if (!e.defaultPrevented) {
                    showCustomSelect(select);
                    e.preventDefault();
                }
            });
            handledSelectElementsConvergence.add(select);
        }
    });
}
src/proxy_lite/browser/bounding_boxes.py ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ from typing import Any
3
+
4
+ import cv2
5
+ import numpy as np
6
+ from pydantic import BaseModel, Field, field_validator
7
+
8
+
9
class Point(BaseModel):
    """Integer 2-D point that can be unpacked and indexed like an ``(x, y)`` tuple."""

    x: int
    y: int

    def __tuple__(self) -> tuple[int, int]:
        """Return the coordinates as an ``(x, y)`` tuple."""
        return (self.x, self.y)

    def __iter__(self):
        """Iterate over ``x`` then ``y`` so that ``x, y = point`` works."""
        coordinates = (self.x, self.y)
        return iter(coordinates)

    def __getitem__(self, index) -> int:
        """Return ``x`` for index 0 and ``y`` for index 1 (tuple indexing rules apply)."""
        coordinates = (self.x, self.y)
        return coordinates[index]

    def __repr__(self) -> str:
        return f"Point(x={self.x}, y={self.y})"
24
+
25
+
26
class BoundingBox(BaseModel):
    """Labelled axis-aligned box with integer edges.

    Incoming edge values are converted with ``float()`` and then snapped
    outwards: ``left``/``top`` are floored, ``right``/``bottom`` are ceiled.
    """

    label: str = Field(..., description="The label that's given for this bounding box")
    left: int = Field(..., description="Left coordinate of the bounding box")
    right: int = Field(..., description="Right coordinate of the bounding box")
    top: int = Field(..., description="Top coordinate of the bounding box")
    bottom: int = Field(..., description="Bottom coordinate of the bounding box")

    @field_validator("left", "top", mode="before")
    @classmethod
    def round_down(cls, value):
        """Floor the raw ``left``/``top`` value before pydantic validates it."""
        as_float = float(value)
        return math.floor(as_float)

    @field_validator("right", "bottom", mode="before")
    @classmethod
    def round_up(cls, value):
        """Ceil the raw ``right``/``bottom`` value before pydantic validates it."""
        as_float = float(value)
        return math.ceil(as_float)
42
+
43
+
44
class POI(BaseModel):
    """A point of interest: free-form info plus a centroid and a bounding box."""

    # Arbitrary key/value description of the element (schema not fixed here).
    info: dict[str, Any]
    # Centre point of the element.
    element_centroid: Point
    # Box around the element.
    bounding_box: BoundingBox
48
+
49
+
50
def calculate_dash_points(start, end, dash_length, gap_length):
    """Return the endpoints of the dashes that make up a dashed line.

    Args:
        start: ``(x, y)`` of the line start.
        end: ``(x, y)`` of the line end.
        dash_length: Length of each drawn dash.
        gap_length: Length of the gap between consecutive dashes.

    Returns:
        A flat list ``[dash0_start, dash0_end, dash1_start, dash1_end, ...]`` of
        integer ``(x, y)`` tuples (coordinates truncated with ``int``). The last
        dash is clipped to the end of the line. Empty when ``start == end``.

    Raises:
        ValueError: If ``dash_length + gap_length`` is not positive for a line
            of non-zero length; the walk along the line could never advance.
    """
    x1, y1 = start
    x2, y2 = end
    dx = x2 - x1
    dy = y2 - y1
    dist = np.sqrt(dx * dx + dy * dy)

    if dist == 0:
        return []

    period = dash_length + gap_length
    if period <= 0:
        # Without this guard the loop below would never terminate.
        raise ValueError("dash_length + gap_length must be positive")

    unit_x = dx / dist
    unit_y = dy / dist

    dash_points = []
    current_dist = 0
    while current_dist < dist:
        dash_end = min(current_dist + dash_length, dist)
        dash_points.extend(
            [
                (int(x1 + unit_x * current_dist), int(y1 + unit_y * current_dist)),
                (int(x1 + unit_x * dash_end), int(y1 + unit_y * dash_end)),
            ],
        )
        current_dist += period

    return dash_points
76
+
77
+
78
def draw_dashed_rectangle(
    img,
    bbox: BoundingBox,
    color,
    thickness=1,
    dash_length=10,
    gap_length=5,
):
    """Draw ``bbox`` on ``img`` as a dashed outline, shifted by 25 px on both axes.

    The dashes of the four sides (top, right, bottom, left, walked clockwise)
    are collected and drawn with a single ``cv2.polylines`` call. ``img`` is
    modified in place.
    """
    shift = 25
    left, right = bbox.left + shift, bbox.right + shift
    top, bottom = bbox.top + shift, bbox.bottom + shift

    # Clockwise corner order: top-left, top-right, bottom-right, bottom-left.
    corners = [(left, top), (right, top), (right, bottom), (left, bottom)]

    dash_endpoints = []
    for index, corner in enumerate(corners):
        next_corner = corners[(index + 1) % len(corners)]
        dash_endpoints += calculate_dash_points(corner, next_corner, dash_length, gap_length)

    if not dash_endpoints:
        return

    # One (2, 2) entry per dash: [[x_start, y_start], [x_end, y_end]].
    segments = np.array(dash_endpoints).reshape((-1, 2, 2))
    cv2.polylines(img, segments, False, color, thickness)
119
+
120
+
121
+ # @time_it(name='Annotate bounding box')
122
def annotate_bounding_box(image: np.ndarray, bbox: BoundingBox) -> None:
    """Draw ``bbox`` and its label onto ``image`` in place.

    ``image`` is indexed as ``image[y, x]`` and its first three channels are
    blended with a BGR label patch. All box coordinates are shifted by 25 px.
    NOTE(review): the 25 px shift presumably matches the border added by the
    caller — confirm it stays in sync with annotate_bounding_boxes.
    """
    # Draw dashed bounding box
    draw_dashed_rectangle(
        image,
        bbox,
        color=(0, 0, 255),
        thickness=1,
        dash_length=10,
        gap_length=5,
    )

    # Prepare label: the text is rendered at 4x font scale and shrunk afterwards.
    font_scale = 0.4 * 4  # Increased by 4x for the larger patch
    font = cv2.FONT_HERSHEY_SIMPLEX
    thickness = 3  # Increased thickness for the larger patch

    # Get text size for the larger patch
    (label_width, label_height), _ = cv2.getTextSize(
        bbox.label,
        font,
        font_scale,
        thickness,
    )

    # Create the oversized 4-channel patch (text size plus 20 px of padding)
    large_label_patch = np.zeros(
        (label_height + 20, label_width + 20, 4),
        dtype=np.uint8,
    )
    large_label_patch[:, :, 0:3] = (0, 0, 255)  # BGR color format: Red background
    large_label_patch[:, :, 3] = 128  # Alpha channel: 50% opacity (128/255 = 0.5)

    # Draw text on the larger patch
    cv2.putText(
        large_label_patch,
        bbox.label,
        (8, label_height + 8),  # Adjusted position for the larger patch
        font,
        font_scale,
        (255, 255, 255, 128),  # White text, 50% opaque (128/255 = 0.5)
        thickness,
    )

    # Scale down the patch to improve anti-aliasing (about a quarter size, +5 px)
    label_patch = cv2.resize(
        large_label_patch,
        (label_width // 4 + 5, label_height // 4 + 5),
        interpolation=cv2.INTER_AREA,
    )

    # Calculate position for top-left alignment: the label sits just above the
    # box's top-left corner, clamped into the image.
    offset = 2  # Small offset to prevent touching the bounding box edge
    x = min(image.shape[1], max(0, int(bbox.left + 25) - offset))
    y = min(image.shape[0], max(0, int(bbox.top + 25) - label_patch.shape[0] - offset))

    # Ensure we're not out of bounds: crop the patch to the part that fits.
    x_end = min(image.shape[1], x + label_patch.shape[1])
    y_end = min(image.shape[0], y + label_patch.shape[0])
    label_patch = label_patch[: (y_end - y), : (x_end - x)]

    # Create a mask for the label patch: alpha in [0, 1], repeated per colour channel.
    alpha_mask = label_patch[:, :, 3] / 255.0
    alpha_mask = np.repeat(alpha_mask[:, :, np.newaxis], 3, axis=2)

    # Blend the label patch with the image
    image_section = image[y:y_end, x:x_end]
    blended = (1 - alpha_mask) * image_section + alpha_mask * label_patch[:, :, 0:3]
    image[y:y_end, x:x_end] = blended.astype(np.uint8)
190
+
191
+
192
def annotate_bounding_boxes(image: bytes, bounding_boxes: list[BoundingBox]) -> bytes:
    """Return ``image`` re-encoded as JPEG with every bounding box drawn on it.

    The encoded input is decoded as a colour image and padded with a 25 px
    white border on every side; each box and its label is then drawn onto the
    padded image before it is encoded again.
    """
    border = 25  # Value chosen based on label size

    decoded = cv2.imdecode(np.frombuffer(image, np.uint8), cv2.IMREAD_COLOR)
    canvas = cv2.copyMakeBorder(
        decoded,
        top=border,
        bottom=border,
        left=border,
        right=border,
        borderType=cv2.BORDER_CONSTANT,
        value=(255, 255, 255),
    )

    # Each call annotates the canvas in place with one box and its label.
    for box in bounding_boxes:
        annotate_bounding_box(canvas, box)

    encoded = cv2.imencode(".jpeg", canvas)[1]
    return encoded.tobytes()
src/proxy_lite/browser/browser.py ADDED
@@ -0,0 +1,508 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import logging
3
+ import platform
4
+ import re
5
+ from contextlib import AsyncExitStack
6
+ from pathlib import Path
7
+ from typing import Literal, Optional, Self
8
+
9
+ from playwright.async_api import Browser, BrowserContext, Page, Playwright, async_playwright
10
+ from playwright.async_api import TimeoutError as PlaywrightTimeoutError
11
+ from playwright_stealth import StealthConfig, stealth_async
12
+ from pydantic import Field
13
+ from tenacity import before_sleep_log, retry, stop_after_delay, wait_exponential
14
+
15
+ from proxy_lite.browser.bounding_boxes import POI, BoundingBox, Point, annotate_bounding_boxes
16
+ from proxy_lite.logger import logger
17
+
18
+ import base64
19
+
20
+ SELF_CONTAINED_TAGS = [
21
+ # many of these are non-interactive but keeping them anyway
22
+ "area",
23
+ "base",
24
+ "br",
25
+ "col",
26
+ "embed",
27
+ "hr",
28
+ "img",
29
+ "input",
30
+ "link",
31
+ "meta",
32
+ "param",
33
+ "source",
34
+ "track",
35
+ "wbr",
36
+ ]
37
+
38
+
39
def element_as_text(
    mark_id: int,
    tag: Optional[str] = None,
    text: Optional[str] = None,
    **raw_attributes,
) -> str:
    """Return a one-line, HTML-like text representation of a single element.

    Args:
        mark_id: Identifier printed in square brackets before the element.
        tag: Element tag name; lower-cased. Despite the ``None`` default a
            string must be supplied, because ``tag.lower()`` is called.
        text: Inner text; ``None`` is treated as empty.
        **raw_attributes: Attributes to render. ``None`` and ``False`` values
            are skipped, ``True`` renders the bare attribute name, anything
            else renders as ``name="value"``.

    Returns:
        ``- [id] <tag attrs/>`` for a self-contained tag without text, otherwise
        ``- [id] <tag attrs>text</tag>``. Values and text longer than 2500
        characters are truncated and line breaks are replaced by ``⏎``.
    """
    max_length = 2500
    line_break = r"\r\n|\r|\n"

    rendered = []
    for name, value in raw_attributes.items():
        if value is None or value is False:
            # missing and False attributes are not rendered at all
            continue
        if value is True:
            rendered.append(name)
            continue
        value = str(value)
        if len(value) > max_length:
            value = value[: max_length - 1] + "…"
        rendered.append(f'{name}="{value}"')

    # Leading space separates the attributes from the tag name; with no
    # attributes the lone space is stripped away again.
    attribute_text = (" " + " ".join(rendered)).rstrip()
    tag = tag.lower()

    text = "" if text is None else text
    if len(text) > max_length:
        text = text[: max_length - 1] + "…"

    # sub-out line breaks so elements are easier to distinguish
    attribute_text = re.sub(line_break, "⏎", attribute_text)
    text = re.sub(line_break, "⏎", text)

    if tag in SELF_CONTAINED_TAGS and not text:
        return f"- [{mark_id}] <{tag}{attribute_text}/>"
    if tag in SELF_CONTAINED_TAGS:
        # Unexpected text inside a void element: warn, then render it in full.
        logger.warning(
            f"Got self-contained element '{tag}' which contained text '{text}'.",
        )
    return f"- [{mark_id}] <{tag}{attribute_text}>{text}</{tag}>"
79
+
80
+
81
+ class BrowserSession:
82
def __init__(
    self,
    viewport_width: int = 1280,
    viewport_height: int = 720,
    headless: bool = True,
):
    """Store the launch settings; no browser is started until ``__aenter__``.

    Args:
        viewport_width: Viewport width in pixels for the browser context.
        viewport_height: Viewport height in pixels for the browser context.
        headless: Whether Chromium is launched headless.
    """
    self.viewport_width = viewport_width
    self.viewport_height = viewport_height
    self.headless = headless
    self.playwright: Playwright | None = None
    self.browser: Browser | None = None
    self.context: BrowserContext | None = None
    self._exit_stack: AsyncExitStack | None = None

    # Plain empty lists: this is not a pydantic model, so assigning
    # ``Field(default_factory=list)`` here would store a FieldInfo object
    # rather than a list.
    self.poi_elements: list = []
    self.poi_centroids: list[Point] = []
    self.bounding_boxes: list[BoundingBox] = []
    self.pois: list[POI] = []
100
+
101
async def __aenter__(self) -> Self:
    """Start Playwright, launch Chromium, and prepare a context with one page.

    Returns:
        This session, with ``playwright``, ``browser`` and ``context`` set.
    """
    # NOTE(review): the exit stack is created here and closed in __aexit__, but
    # nothing is registered on it in this method.
    self._exit_stack = AsyncExitStack()
    self.playwright = await async_playwright().start()

    self.browser = await self.playwright.chromium.launch(headless=self.headless)
    self.context = await self.browser.new_context(
        viewport={"width": self.viewport_width, "height": self.viewport_height},
    )
    # Ensure there's at least one page open
    if not self.context.pages:
        await self.context.new_page()

    # 60 s default timeout for both the context and the current page.
    self.context.set_default_timeout(60_000)
    self.current_page.set_default_timeout(60_000)
    await stealth_async(self.current_page, StealthConfig(navigator_user_agent=False))
    # Register the helper scripts that live next to this module as init scripts
    # on the context.
    await self.context.add_init_script(
        path=Path(__file__).with_name("add_custom_select.js"),
    )
    await self.context.add_init_script(
        path=Path(__file__).with_name("find_pois.js"),
    )

    return self
124
+
125
async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
    """Close the browser, stop Playwright, and close the exit stack.

    Each step runs even if an earlier one raises, so a failing
    ``browser.close()`` can no longer leave Playwright running. The first
    exception still propagates to the caller.
    """
    try:
        if self.browser:
            await self.browser.close()
    finally:
        try:
            if self.playwright:
                await self.playwright.stop()
        finally:
            if self._exit_stack:
                await self._exit_stack.aclose()
132
+
133
@property
def current_page(self) -> Optional[Page]:
    """Return the last page of the context, or ``None`` when there is none."""
    if not self.context or not self.context.pages:
        return None
    # The most recently opened page is last in the list.
    return self.context.pages[-1]
138
+
139
@property
def current_url(self) -> Optional[str]:
    """Return the URL of ``current_page``, or ``None`` when no page is open."""
    if not self.current_page:
        return None
    return self.current_page.url
144
+
145
+ # re-run for cases of mid-run redirects
146
@retry(
    wait=wait_exponential(multiplier=1, min=1, max=10),
    stop=stop_after_delay(5),
    reraise=True,
    before_sleep=before_sleep_log(logger, logging.ERROR),
)
async def process_iframe(self, iframe) -> Optional[tuple[dict, dict]]:
    """Collect the points of interest inside one iframe.

    Args:
        iframe: Handle of the iframe element to inspect.

    Returns:
        ``(poi, iframe_offset)`` where ``iframe_offset`` holds the rounded
        ``x``/``y`` of the iframe's bounding box, or ``None`` when the iframe
        has no bounding box, is smaller than 50 px in either dimension, has no
        content frame, yields no POIs, or any error occurs.

    NOTE(review): every ``Exception`` is caught below and turned into ``None``,
    so the ``@retry`` decorator presumably never sees a failure to retry —
    confirm whether retries are actually intended here.
    """
    try:
        # Check iframe visibility and size
        bounding_box = await iframe.bounding_box()
        if not bounding_box:
            return None  # Skip if iframe is not visible

        width, height = bounding_box["width"], bounding_box["height"]
        if width < 50 or height < 50:
            return None

        frame = await iframe.content_frame()
        if not frame:
            return None

        # Runs the two page-side helper functions inside the iframe's own frame.
        poi = await frame.evaluate(
            """() => {
                overwriteDefaultSelectConvergence();
                return findPOIsConvergence();
            }""",
        )
        if not poi:
            return None

        iframe_offset = {"x": round(bounding_box["x"]), "y": round(bounding_box["y"])}
        return poi, iframe_offset
    except Exception as e:
        logger.error(f"Error processing iframe: {e}")
        return None
181
+
182
+ @retry(
183
+ wait=wait_exponential(multiplier=1, min=1, max=10),
184
+ stop=stop_after_delay(5),
185
+ reraise=True,
186
+ before_sleep=before_sleep_log(logger, logging.ERROR),
187
+ )
188
+ async def update_poi(self) -> None:
189
+ try:
190
+ # Wait for basic page load states to ensure the DOM is ready.
191
+ # This is a fundamental wait that should always apply.
192
+ await self.current_page.wait_for_load_state("domcontentloaded", timeout=60000)
193
+ logger.debug(f"DEBUG: wait_for_load_state('domcontentloaded') completed for {self.current_page.url}.")
194
+
195
+ current_url = self.current_page.url
196
+
197
+ # Define common Salesforce URL patterns for different states
198
+ login_url_patterns = [
199
+ "login.salesforce.com",
200
+ "identity.force.com",
201
+ "auth.lightning.force.com",
202
+ "setup.salesforce.com", # Sometimes a setup login redirects here temporarily
203
+ "my.salesforce.com" # Your specific custom domain login redirects here
204
+ ]
205
+
206
+ # This is the main Salesforce Lightning application base URL, typically seen after login.
207
+ # We treat this as an intermediate loading state before the specific target page.
208
+ intermediate_app_url_pattern = "/one/one.app"
209
+
210
+ # Check the current state of the page based on its URL
211
+ is_on_login_page = any(pattern in current_url for pattern in login_url_patterns)
212
+ is_on_intermediate_app_page = intermediate_app_url_pattern in current_url
213
+ # Note: is_on_target_forecast_page checks if the specific target path is in the URL
214
+ is_on_target_forecast_page = "/AccountForecastSettings/home" in current_url
215
+
216
+ # --- CONDITIONAL WAITING LOGIC BASED ON URL ---
217
+ if is_on_target_forecast_page:
218
+ logger.info(f"INFO: Detected target Account Forecast Settings page: {current_url}. Waiting for content.")
219
+ # When on the specific target page, wait for its content and spinners
220
+ spinner_selectors = [
221
+ "div.slds-spinner_container",
222
+ "div.auraLoadingBox",
223
+ "div.dxp_axb_container", # Main overlay from your inspect screenshot
224
+ "div.slds-sprite-astro-x-large" # Specific animated element itself
225
+ ]
226
+ for selector in spinner_selectors:
227
+ try:
228
+ await self.current_page.wait_for_selector(selector, state="hidden", timeout=5000) # Reduced timeout
229
+ logger.debug(f"DEBUG: Spinner element '{selector}' became hidden for {self.current_page.url}.")
230
+ except PlaywrightTimeoutError:
231
+ logger.warning(f"DEBUGGING: Spinner element '{selector}' not detected or did not disappear on {self.current_page.url} within 5s.")
232
+
233
+ # Wait for a known element on the Account Forecast Settings page to ensure content is there.
234
+ try:
235
+ # Added 'h2' for section headers, and a more generic 'div[data-aura-rendered-by]' for Lightning components
236
+ await self.current_page.wait_for_selector("h1.slds-page-header__title, h2, .account-forecast-settings-component, div[data-aura-rendered-by]", state="visible", timeout=15000) # Increased timeout slightly for robust content load
237
+ logger.debug(f"DEBUG: Confirmed main page element visible for {self.current_page.url}.")
238
+ except PlaywrightTimeoutError:
239
+ logger.warning(f"DEBUGGING: Main page element not visible on {self.current_page.url} within 15s. This might indicate incomplete page load despite no spinner.")
240
+
241
+ elif is_on_login_page:
242
+ logger.info(f"INFO: Detected Salesforce login page: {current_url}. Waiting for login elements.")
243
+ # When on a login page, just wait for the login form elements to be visible
244
+ try:
245
+ await self.current_page.wait_for_selector("input[type='email'], input[type='password'], input[type='submit'], #username, #password, #Login", state="visible", timeout=10000)
246
+ logger.debug(f"DEBUG: Login page elements visible on {self.current_page.url}.")
247
+ except PlaywrightTimeoutError:
248
+ logger.warning(f"DEBUGGING: Login page elements not visible on {self.current_page.url} within 10s. This may happen if elements are in an iframe or if page is extremely slow.")
249
+
250
+ elif is_on_intermediate_app_page:
251
+ logger.info(f"INFO: Detected intermediate Salesforce Lightning app loading page: {current_url}. Waiting for network idle and app spinner.")
252
+ # This is the /one/one.app page or similar. Don't wait for specific content, just general load.
253
+ try:
254
+ await self.current_page.wait_for_load_state("networkidle", timeout=30000) # Give it more time for network to settle
255
+ logger.debug(f"DEBUG: Network idle detected on intermediate app page: {current_url}.")
256
+ except PlaywrightTimeoutError:
257
+ logger.warning(f"DEBUGGING: Network idle timeout on intermediate app page: {current_url}. Proceeding anyway.")
258
+
259
+ # Also try to wait for a common full-app spinner to disappear, if present
260
+ try:
261
+ await self.current_page.wait_for_selector('div.app-spinner, div.auraLoadingBox', state='hidden', timeout=15000) # Added auraLoadingBox as it might reappear
262
+ logger.debug(f"DEBUG: App spinner on intermediate page became hidden.")
263
+ except PlaywrightTimeoutError:
264
+ logger.warning(f"DEBUGGING: App spinner on intermediate page not found or did not disappear.")
265
+
266
+ else:
267
+ logger.info(f"INFO: Detected unhandled URL type: {current_url}. Performing generic body wait.")
268
+ # Fallback for any other page, just wait for body to be visible
269
+ try:
270
+ await self.current_page.wait_for_selector("body", timeout=5000, state="visible")
271
+ logger.debug(f"DEBUG: wait_for_selector('body', state='visible') completed for {self.current_page.url}.")
272
+ except PlaywrightTimeoutError:
273
+ logger.warning(f"DEBUGGING: Playwright Timeout (5s) on body selector for {self.current_page.url}. Continuing anyway.")
274
+ pass
275
+
276
+ except PlaywrightTimeoutError as e:
277
+ logger.error(f"ERROR: Timeout waiting for page readiness for {self.current_page.url}: {e}")
278
+ raise # Re-raise if essential waits fail (e.g., initial domcontentloaded)
279
+ except Exception as e:
280
+ logger.error(f"ERROR: An unexpected error occurred during page readiness check for {self.current_page.url}: {e}")
281
+ raise
282
+
283
+ # Rest of update_poi: Run the bounding box javascript code to highlight the points of interest on the page
284
+ page_info = await self.current_page.evaluate(
285
+ """() => {
286
+ overwriteDefaultSelectConvergence();
287
+ return findPOIsConvergence();
288
+ }""",
289
+ )
290
+ # Get the points of interest on the page
291
+ self.poi_elements = page_info["element_descriptions"]
292
+ element_centroids = page_info["element_centroids"]
293
+ try:
294
+ # Select all iframes on the page
295
+ iframes = await self.current_page.query_selector_all("iframe")
296
+
297
+ max_iframes = 10
298
+
299
+ # Define an asynchronous function to process and filter each iframe
300
+ tasks = [asyncio.create_task(self.process_iframe(iframe)) for iframe in iframes[:max_iframes]]
301
+
302
+ results = await asyncio.gather(*tasks)
303
+
304
+ filtered_results = [result for result in results if result is not None]
305
+
306
+ iframes_pois = []
307
+ iframe_offsets = []
308
+
309
+ for poi, offset in filtered_results:
310
+ iframes_pois.append(poi)
311
+ iframe_offsets.append(offset)
312
+
313
+ # Combine the points of interest from the iframes with the main page and adjust the centroids
314
+ for index, iframe_poi in enumerate(iframes_pois):
315
+ self.poi_elements.extend(iframe_poi["element_descriptions"])
316
+ for centroid in iframe_poi["element_centroids"]:
317
+ centroid["x"] += iframe_offsets[index]["x"]
318
+ centroid["y"] += iframe_offsets[index]["y"]
319
+ centroid["left"] += iframe_offsets[index]["x"]
320
+ centroid["top"] += iframe_offsets[index]["y"]
321
+ centroid["right"] += iframe_offsets[index]["x"]
322
+ # Fix: Removed duplicate 'centroid["y"] += iframe_offsets[index]["y"]'
323
+ centroid["bottom"] += iframe_offsets[index]["y"]
324
+ element_centroids.extend(iframe_poi["element_centroids"])
325
+
326
+ except Exception as e:
327
+ logger.error(f"Error in finding iframes: {e}")
328
+
329
+ # Get the centroids of the points of interest
330
+ self.poi_centroids = [Point(x=xy["x"], y=xy["y"]) for xy in element_centroids]
331
+ self.bounding_boxes = [BoundingBox(**xy, label=str(i)) for i, xy in enumerate(element_centroids)]
332
+ self.pois = [
333
+ POI(info=info, element_centroid=centroid, bounding_box=bbox)
334
+ for info, centroid, bbox in zip(
335
+ self.poi_elements,
336
+ self.poi_centroids,
337
+ self.bounding_boxes,
338
+ strict=False,
339
+ )
340
+ ]
341
+
342
@property
def poi_text(self) -> str:
    """Return all points of interest on the page as newline-separated text.

    Every entry of ``self.poi_elements`` is rendered with ``element_as_text``,
    using its position in the list as the mark id. Entries that render to a
    falsy value (for example an empty string) are left out of the result.
    """
    rendered = (
        element_as_text(mark_id=mark_id, **description)
        for mark_id, description in enumerate(self.poi_elements)
    )
    return "\n".join(line for line in rendered if line)
348
+
349
async def screenshot(
    self,
    delay: float = 0.0,
    quality: int = 70,
    type: str = "jpeg",
    scale: str = "css",
) -> tuple[bytes, bytes]:
    """Capture the current page and an annotated copy of the capture.

    Args:
        delay: Seconds to sleep before capturing; skipped when not positive.
        quality: Forwarded unchanged to the page's ``screenshot`` call.
        type: Image format forwarded to the page's ``screenshot`` call.
        scale: Scale mode forwarded to the page's ``screenshot`` call.

    Returns:
        ``(raw_image, annotated_image)`` where the second item is the raw
        image with ``self.bounding_boxes`` drawn on it.
    """
    if delay > 0.0:
        await asyncio.sleep(delay)

    # Refresh the points of interest once, immediately before the capture, so
    # the bounding boxes drawn below describe the state being photographed.
    # Callers that need tighter timing must manage delays and updates themselves.
    await self.update_poi()

    raw_image = await self.current_page.screenshot(type=type, quality=quality, scale=scale)
    annotated_image = annotate_bounding_boxes(image=raw_image, bounding_boxes=self.bounding_boxes)
    return raw_image, annotated_image
366
+
367
async def goto(self, url: str) -> None:
    """Navigate the current page to *url*, waiting with ``wait_until="domcontentloaded"``."""
    await self.current_page.goto(url, wait_until="domcontentloaded")
369
+
370
async def reload(self) -> None:
    """Reload the current page, waiting with ``wait_until="domcontentloaded"``."""
    await self.current_page.reload(wait_until="domcontentloaded")
372
+
373
async def click_tab(self, mark_id: int) -> None:
    """Middle-click the point of interest whose index is *mark_id*.

    The pointer is first moved onto the element's centroid via ``hover`` and
    the middle mouse button is then clicked at the same coordinates.
    """
    target: Point = self.poi_centroids[mark_id]
    await self.hover(target)
    mouse = self.current_page.mouse
    await mouse.click(*target, button="middle")
377
+
378
async def click(self, mark_id: int) -> None:
    """Click the point of interest whose index is *mark_id*.

    The pointer is first moved onto the element's centroid via ``hover`` and
    a default mouse click is then issued at the same coordinates.
    """
    target: Point = self.poi_centroids[mark_id]
    await self.hover(target)
    mouse = self.current_page.mouse
    await mouse.click(*target)
382
+
383
async def enter_text(self, mark_id: int, text: str, submit: bool = False) -> None:
    """Replace the content of the field at *mark_id* with *text*.

    The field is cleared, clicked, and *text* is typed through the keyboard.
    When *submit* is true, ``Enter`` is pressed afterwards.
    """
    await self.clear_text_field(mark_id)
    await self.click(mark_id)
    await self.current_page.keyboard.type(text)

    if not submit:
        return
    await self.current_page.keyboard.press("Enter")
390
+
391
async def scroll(
    self,
    direction: Literal["up", "down", "left", "right"],
    mark_id: Optional[int] = None,
) -> None:
    """Scroll the page, or one marked element, by 80% of its extent.

    Args:
        direction: Which way to scroll.
        mark_id: Index of the point of interest to scroll inside. When
            omitted, the pointer is moved to ``(-1, -1)`` and the viewport
            dimensions are used as the scroll extent.
    """
    if mark_id is not None:
        point: Point = self.poi_centroids[mark_id]
        bbox: BoundingBox = self.bounding_boxes[mark_id]
        extent_x = bbox.right - bbox.left
        extent_y = bbox.bottom - bbox.top
    else:
        point = Point(x=-1, y=-1)
        extent_x = self.viewport_width
        extent_y = self.viewport_height

    # The wheel event is delivered to whatever is under the pointer.
    await self.hover(point=point)

    step_x = int(extent_x * 0.8)
    step_y = int(extent_y * 0.8)
    sign = -1 if direction in ("up", "left") else 1

    # Only the axis matching the direction gets a non-zero delta.
    if direction in ("up", "down"):
        delta_x, delta_y = 0, step_y * sign
    else:
        delta_x, delta_y = step_x * sign, 0

    await self.current_page.mouse.wheel(delta_x, delta_y)
415
+
416
async def go_back(self) -> None:
    """Go one step back in history, closing the tab if its history is exhausted.

    Does nothing when no page is open. If going back lands on
    ``about:blank`` and other pages exist in the context, the current page is
    closed. If it is the only page, the navigation is undone with
    ``go_forward`` and an exception is raised.

    Raises:
        Exception: When the only open page has no previous page.
    """
    if not self.current_page:
        # No tab open: nothing to navigate.
        return

    await self.current_page.go_back(wait_until="domcontentloaded")
    if self.current_page.url != "about:blank":
        return

    if len(self.context.pages) <= 1:
        # Sole tab with no history: restore where we were and report it.
        await self.current_page.go_forward(wait_until="domcontentloaded")
        raise Exception("There is no previous page to go back to.")

    await self.current_page.close()
427
+
428
async def hover(self, point: Point) -> None:
    """Move the mouse pointer of the current page to *point* (unpacked as positional coordinates)."""
    await self.current_page.mouse.move(*point)
430
+
431
async def focus(self, point: Point) -> None:
    """Give keyboard focus to the element located at *point*.

    The lookup runs inside the page: ``document.elementFromPoint`` resolves
    the element under the coordinates and its ``focus()`` method is called
    when both the element and the method exist; otherwise nothing happens.
    """
    # Focus on the element on the page at point (x, y)
    await self.current_page.evaluate(
        """
        ([x, y]) => {
            const element = document.elementFromPoint(x, y);
            if (element && element.focus) {
                element.focus();
            }
        }""",
        # The point is sent as a tuple so the JS side can destructure it as [x, y].
        tuple(point),
    )
443
+
444
async def get_text(self, mark_id: int) -> str:
    """Return the current value or text content of the marked element *mark_id*.

    The element is looked up in the page-side ``marked_elements_convergence``
    array. Its ``value`` is preferred and ``textContent`` is the fallback; an
    empty string is returned when the element is missing or has neither.
    """
    return await self.current_page.evaluate(
        """
        (mark_id) => {
            const element = marked_elements_convergence[mark_id];
            if (element && (element.value !== undefined || element.textContent !== undefined)) {
                return element.value || element.textContent;
            }
            return '';
        }
        """,
        # NOTE(review): the argument is a one-element tuple, so the JS parameter
        # presumably arrives as a one-element array and the index lookup relies on
        # JS coercing it to a property key. Passing ``mark_id`` directly would be
        # clearer; confirm before changing.
        (mark_id,),
    )
457
+
458
async def clear_text_field(self, mark_id: int) -> None:
    """Delete any existing text in the field identified by *mark_id*.

    Nothing is done when the field is empty or whitespace-only. Otherwise the
    field is clicked, its whole content is selected with the key chords that
    fit the host operating system, and the selection is removed with
    ``Backspace``.
    """
    current_text = await self.get_text(mark_id)
    if not current_text.strip():
        return

    await self.click(mark_id)
    if platform.system() == "Darwin":  # selecting all text is OS-specific
        await self.click(mark_id)
        select_all_chords = ("Meta+a",)
    else:
        select_all_chords = ("Control+Home", "Control+Shift+End")

    for chord in select_all_chords:
        await self.current_page.keyboard.press(chord)
    await self.current_page.keyboard.press("Backspace")
471
+
472
async def open_new_tab_and_go_to(self, url: str) -> None:
    """
    Opens a new browser tab/page and navigates to the specified URL.
    Closes the old page if it's not the last one remaining.

    Args:
        url: Address to load in the new tab.

    After navigation the points of interest are refreshed through
    ``update_poi``.
    """
    logger.info(f"Attempting to open a new tab and navigate to: {url}")

    # Remember the active page *before* creating the new one. If
    # ``current_page`` resolves to the most recently opened page (as the
    # comment near the end of this method implies), reading it after
    # ``new_page()`` would already yield the new page and the old tab would
    # never be closed.
    previous_page = self.current_page
    new_page = await self.context.new_page()

    # Close the previous page if it's not the only one left in the context
    if previous_page is not None and previous_page != new_page and len(self.context.pages) > 1:
        try:
            await previous_page.close()
            logger.debug("Closed previous page.")
        except Exception as e:
            # Best effort: a stale tab must not prevent the navigation below.
            logger.warning(f"Could not close previous page (might already be closed or detached): {e}")

    await new_page.goto(url, wait_until="domcontentloaded")
    logger.info(f"Successfully navigated to {url} in a new tab.")
    # After navigation, trigger POI update to reflect the new page's state.
    # Crucial: update_poi uses self.current_page, which is now new_page implicitly
    await self.update_poi()
493
+
494
+
495
# Manual smoke test: only runs when this module is executed as a script.
if __name__ == "__main__":

    async def dummy_test():
        # Opens a visible (non-headless) browser session, loads a page, and
        # writes both a plain and an annotated screenshot to the working directory.
        async with BrowserSession(headless=False) as s:
            page = await s.context.new_page()
            await page.goto("http://google.co.uk")
            # Fixed pause before capturing.
            await asyncio.sleep(5)
            await page.screenshot(path="example.png")
            await s.update_poi()
            # screenshot() returns (raw, annotated); only the annotated image is kept.
            _, annotated_image = await s.screenshot()
            with open("output.png", "wb") as f:
                f.write(annotated_image)

    asyncio.run(dummy_test())
src/proxy_lite/browser/find_pois.js ADDED
@@ -0,0 +1,397 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
// Elements marked by the most recent findPOIsConvergence() call, indexed by mark id.
// Assigned without a declaration keyword, so it is created as an implicit global.
marked_elements_convergence = [];

// Lower-case tag names that are treated as interactive regardless of attributes.
const interactiveTags = new Set([
    'a', 'button', 'details', 'embed', 'input', 'label',
    'menu', 'menuitem', 'object', 'select', 'textarea', 'summary',
    'video', 'audio', 'option', 'iframe'
]);

// Values of the `role` / `aria-role` attributes that mark an element as interactive.
// NOTE(review): 'combobox' is listed twice; harmless inside a Set.
const interactiveRoles = new Set([
    'button', 'menu', 'menuitem', 'link', 'checkbox', 'radio',
    'slider', 'tab', 'tabpanel', 'textbox', 'combobox', 'grid',
    'listbox', 'option', 'progressbar', 'scrollbar', 'searchbox',
    'switch', 'tree', 'treeitem', 'spinbutton', 'tooltip',
    'a-button-inner', 'a-dropdown-button', 'click',
    'menuitemcheckbox', 'menuitemradio', 'a-button-text',
    'button-text', 'button-icon', 'button-icon-only',
    'button-text-icon-only', 'dropdown', 'combobox'
]);
19
+
20
/**
 * Collect the interactive, visible, top-most elements under a root element.
 *
 * Resets the global `marked_elements_convergence` and fills it with one entry
 * per reported rectangle, so that index i of the global, of
 * `element_centroids` and of `element_descriptions` all refer to the same mark.
 *
 * @param {Element|null} input Root to scan; defaults to `document.documentElement`.
 * @returns {{element_descriptions: Object[], element_centroids: Object[]}}
 */
findPOIsConvergence = (input = null) => {

    let rootElement = input ? input : document.documentElement;

    // True when the element has overflowing content on an axis whose computed
    // overflow is 'scroll' or 'auto'.
    function isScrollable(element) {
        if ((input === null) && (element === document.documentElement)) {
            // we can always scroll the full page
            return false;
        }

        const style = window.getComputedStyle(element);

        const hasScrollableYContent = element.scrollHeight > element.clientHeight
        const overflowYScroll = style.overflowY === 'scroll' || style.overflowY === 'auto';

        const hasScrollableXContent = element.scrollWidth > element.clientWidth;
        const overflowXScroll = style.overflowX === 'scroll' || style.overflowX === 'auto';

        return (hasScrollableYContent && overflowYScroll) || (hasScrollableXContent && overflowXScroll);
    }

    // Returns the listeners reported by window.getEventListeners, or {} when
    // that function is not defined or throws.
    function getEventListeners(element) {
        try {
            return window.getEventListeners?.(element) || {};
        } catch (e) {
            return {};
        }
    }

    // An element is interactive if its tag, its attributes, its event
    // handlers, or its scrollability say so.
    function isInteractive(element) {
        if (!element) return false;

        return (hasInteractiveTag(element) ||
            hasInteractiveAttributes(element) ||
            hasInteractiveEventListeners(element)) ||
            isScrollable(element);
    }

    function hasInteractiveTag(element) {
        return interactiveTags.has(element.tagName.toLowerCase());
    }

    // Checks contenteditable, role / aria-role, tabindex, AMP 'on' handlers
    // and a few ARIA state attributes.
    function hasInteractiveAttributes(element) {
        const role = element.getAttribute('role');
        const ariaRole = element.getAttribute('aria-role');
        const tabIndex = element.getAttribute('tabindex');
        const onAttribute = element.getAttribute('on');

        if (element.getAttribute('contenteditable') === 'true') return true;
        if ((role && interactiveRoles.has(role)) ||
            (ariaRole && interactiveRoles.has(ariaRole))) return true;
        // Any explicit tabindex other than '-1' counts as interactive.
        if (tabIndex !== null && tabIndex !== '-1') return true;

        // Add check for AMP's 'on' attribute that starts with 'tap:'
        if (onAttribute && onAttribute.startsWith('tap:')) return true;

        const hasAriaProps = element.hasAttribute('aria-expanded') ||
            element.hasAttribute('aria-pressed') ||
            element.hasAttribute('aria-selected') ||
            element.hasAttribute('aria-checked');

        return hasAriaProps;
    }

    // Detects click handlers set as a property, as an inline attribute, or via
    // framework attributes (ng-click, @click, v-on:click), then falls back to
    // whatever getEventListeners() reports.
    function hasInteractiveEventListeners(element) {
        const hasClickHandler = element.onclick !== null ||
            element.getAttribute('onclick') !== null ||
            element.hasAttribute('ng-click') ||
            element.hasAttribute('@click') ||
            element.hasAttribute('v-on:click');
        if (hasClickHandler) return true;

        const listeners = getEventListeners(element);
        return listeners && (
            listeners.click?.length > 0 ||
            listeners.mousedown?.length > 0 ||
            listeners.mouseup?.length > 0 ||
            listeners.touchstart?.length > 0 ||
            listeners.touchend?.length > 0
        );
    }

    // Sum of width * height over all rects.
    function calculateArea(rects) {
        return rects.reduce((acc, rect) => acc + rect.width * rect.height, 0);
    }

    // Returns the element's client rects, keeping only those whose centre hits
    // the element (or a descendant), shifted by the hosting iframe's position
    // and clamped to the viewport.
    function getElementRects(element, context) {
        const vw = Math.max(document.documentElement.clientWidth || 0, window.innerWidth || 0);
        const vh = Math.max(document.documentElement.clientHeight || 0, window.innerHeight || 0);

        let rects = [...element.getClientRects()];

        // If rects are empty (likely due to Shadow DOM), try to estimate position
        if (rects.length === 0 && element.getBoundingClientRect) {
            rects = [element.getBoundingClientRect()];
        }

        // Get iframe offset if element is in an iframe
        let iframeOffset = { x: 0, y: 0 };
        if (context !== document && context?.defaultView?.frameElement) {
            const iframe = context.defaultView.frameElement;
            if (iframe) {
                const iframeRect = iframe.getBoundingClientRect();
                iframeOffset = {
                    x: iframeRect.left,
                    y: iframeRect.top
                };
            }
        }

        return rects.filter(bb => {
            // The offset is added and then subtracted again, so the hit test
            // below runs in the coordinates of `context`.
            const center_x = bb.left + bb.width / 2 + iframeOffset.x;
            const center_y = bb.top + bb.height / 2 + iframeOffset.y;
            const elAtCenter = context.elementFromPoint(center_x - iframeOffset.x, center_y - iframeOffset.y);

            return elAtCenter === element || element.contains(elAtCenter);
        }).map(bb => {
            const rect = {
                left: Math.max(0, bb.left + iframeOffset.x),
                top: Math.max(0, bb.top + iframeOffset.y),
                right: Math.min(vw, bb.right + iframeOffset.x),
                bottom: Math.min(vh, bb.bottom + iframeOffset.y)
            };
            return {
                ...rect,
                width: rect.right - rect.left,
                height: rect.bottom - rect.top
            };
        });
    }

    // Visible means a non-zero offset box and neither visibility:hidden nor display:none.
    function isElementVisible(element) {
        const style = window.getComputedStyle(element);
        return element.offsetWidth > 0 &&
            element.offsetHeight > 0 &&
            style.visibility !== 'hidden' &&
            style.display !== 'none';
    }

    // True when the element (or one of its descendants) is what a hit test at
    // the element's centre returns, i.e. nothing else covers it.
    function isTopElement(element) {
        let doc = element.ownerDocument;
        if (doc !== window.document) {
            // If in an iframe's document, treat as top
            return true;
        }
        const shadowRoot = element.getRootNode();
        if (shadowRoot instanceof ShadowRoot) {
            // Inside a shadow tree: hit-test through the shadow root and walk
            // up from the hit element looking for `element`.
            const rect = element.getBoundingClientRect();
            const point = { x: rect.left + rect.width / 2, y: rect.top + rect.height / 2 };
            try {
                const topEl = shadowRoot.elementFromPoint(point.x, point.y);
                if (!topEl) return false;
                let current = topEl;
                while (current && current !== shadowRoot) {
                    if (current === element) return true;
                    current = current.parentElement;
                }
                return false;
            } catch (e) {
                // A failing hit test is treated as "on top".
                return true;
            }
        }
        const rect = element.getBoundingClientRect();
        const point = { x: rect.left + rect.width / 2, y: rect.top + rect.height / 2 };
        try {
            const topEl = document.elementFromPoint(point.x, point.y);
            if (!topEl) return false;
            let current = topEl;
            while (current && current !== document.documentElement) {
                if (current === element) return true;
                current = current.parentElement;
            }
            return false;
        } catch (e) {
            // A failing hit test is treated as "on top".
            return true;
        }
    }

    // Collects the visible text under `element`, with images rendered as
    // "[img - ...]" placeholders and whitespace normalised.
    // NOTE(review): the second parameter shadows the global of the same name
    // and defaults to []; the only call site below passes one argument, so the
    // "stop at marked interactive descendants" branch never fires — confirm intent.
    function getVisibleText(element, marked_elements_convergence = []) {
        const blockLikeDisplays = [
            // Basic block elements
            'block', 'flow-root', 'inline-block',
            // Lists
            'list-item',
            // Table elements
            'table', 'inline-table', 'table-row', 'table-cell',
            'table-caption', 'table-header-group', 'table-footer-group',
            'table-row-group',
            // Modern layouts
            'flex', 'inline-flex', 'grid', 'inline-grid'
        ];

        // Check if element is hidden
        const style = window.getComputedStyle(element);
        if (style.display === 'none' || style.visibility === 'hidden') {
            return '';
        }

        let collectedText = [];

        function isMarkedInteractive(el) {
            return marked_elements_convergence.includes(el);
        }

        // Depth-first walk. Returning false aborts the whole traversal;
        // returning true lets the caller continue with the next sibling.
        function traverse(node) {
            if (
                node.nodeType === Node.ELEMENT_NODE &&
                node !== element &&
                isMarkedInteractive(node)
            ) {
                return false;
            }

            if (node.nodeType === Node.TEXT_NODE) {
                const trimmed = node.textContent.trim();
                if (trimmed) {
                    collectedText.push(trimmed);
                }
            } else if (node.nodeType === Node.ELEMENT_NODE) {
                // Skip noscript elements
                if (node.tagName === 'NOSCRIPT') {
                    return true;
                }

                const nodeStyle = window.getComputedStyle(node);

                // Skip hidden elements
                if (nodeStyle.display === 'none' || nodeStyle.visibility === 'hidden') {
                    return true;
                }

                // Add newline before block elements if we have text
                if (blockLikeDisplays.includes(nodeStyle.display) && collectedText.length > 0) {
                    collectedText.push('\n');
                }

                if (node.tagName === 'IMG') {
                    const textParts = [];
                    const alt = node.getAttribute('alt');
                    const title = node.getAttribute('title');
                    const ariaLabel = node.getAttribute('aria-label');
                    // Add more as needed (e.g., 'aria-describedby', 'data-caption', etc.)

                    if (alt) textParts.push(`alt="${alt}"`);
                    if (title) textParts.push(`title="${title}"`);
                    if (ariaLabel) textParts.push(`aria-label="${ariaLabel}"`);

                    if (textParts.length > 0) {
                        collectedText.push(`[img - ${textParts.join(' ')}]`);
                    }
                    return true;
                }

                for (const child of node.childNodes) {
                    const shouldContinue = traverse(child);
                    if (shouldContinue === false) {
                        return false;
                    }
                }

                // Add newline after block elements
                if (blockLikeDisplays.includes(nodeStyle.display)) {
                    collectedText.push('\n');
                }
            }

            return true;
        }

        traverse(element);

        // Join text and normalize whitespace
        // (the '\n' markers pushed above are whitespace too, so they collapse into single spaces)
        return collectedText.join(' ').trim().replace(/\s{2,}/g, ' ').trim();
    }

    // Walks the tree under rootElement, descending into shadow roots, slots
    // and iframes whose document can be read, and returns one item per
    // interactive, visible, top-most element.
    function extractInteractiveItems(rootElement) {
        const items = [];

        function processElement(element, context) {
            if (!element) return;

            // Recursively process elements
            if (element.nodeType === Node.ELEMENT_NODE && isInteractive(element) && isElementVisible(element) && isTopElement(element)) {
                const rects = getElementRects(element, context);
                const area = calculateArea(rects);
                items.push({
                    element: element,
                    area,
                    rects,
                    is_scrollable: isScrollable(element),
                });
            }

            if (element.shadowRoot) {
                // if it's shadow DOM, process elements in the shadow DOM
                Array.from(element.shadowRoot.childNodes || []).forEach(child => {
                    processElement(child, element.shadowRoot);
                });
            }

            if (element.tagName === 'SLOT') {
                // Handle both assigned elements and nodes
                const assigned = element.assignedNodes ? element.assignedNodes() : element.assignedElements();
                assigned.forEach(child => {
                    processElement(child, context);
                });
            }
            else if (element.tagName === 'IFRAME') {
                try {
                    const iframeDoc = element.contentDocument || element.contentWindow?.document;
                    if (iframeDoc && iframeDoc.body) {
                        // Process elements inside iframe
                        processElement(iframeDoc.body, iframeDoc);
                    }
                } catch (e) {
                    // Reading the iframe document threw; skip its contents.
                    console.warn('Unable to access iframe contents:', e);
                }
            } else {
                // if it's regular child elements, process regular child elements
                Array.from(element.children || []).forEach(child => {
                    processElement(child, context);
                });
            }
        }

        processElement(rootElement, document);
        return items;
    }

    // Start every scan with an empty mark list.
    if (marked_elements_convergence) {
        marked_elements_convergence = [];
    }
    let mark_centres = [];
    let marked_element_descriptions = [];
    var items = extractInteractiveItems(rootElement);

    // Emit one mark per rectangle; an element with several client rects gets
    // several marks.
    // NOTE(review): `index` is incremented but never read.
    let index = 0;
    items.forEach(function (item) {
        item.rects.forEach((bbox) => {
            marked_elements_convergence.push(item.element);
            mark_centres.push({
                x: Math.round((bbox.left + bbox.right) / 2),
                y: Math.round((bbox.top + bbox.bottom) / 2),
                left: bbox.left,
                top: bbox.top,
                right: bbox.right,
                bottom: bbox.bottom,
            });
            marked_element_descriptions.push({
                tag: item.element.tagName,
                text: getVisibleText(item.element),
                // NOTE: all other attributes will be shown to the model when present
                // TODO: incorporate child attributes, e.g. <img alt="..."> when img is a child of the link element
                value: item.element.value,
                placeholder: item.element.getAttribute("placeholder"),
                element_type: item.element.getAttribute("type"),
                aria_label: item.element.getAttribute("aria-label"),
                name: item.element.getAttribute("name"),
                required: item.element.getAttribute("required"),
                disabled: item.element.getAttribute("disabled"),
                pattern: item.element.getAttribute("pattern"),
                checked: item.element.getAttribute("checked"),
                minlength: item.element.getAttribute("minlength"),
                maxlength: item.element.getAttribute("maxlength"),
                role: item.element.getAttribute("role"),
                title: item.element.getAttribute("title"),
                scrollable: item.is_scrollable
            });
            index++;
        });
    });

    return {
        element_descriptions: marked_element_descriptions,
        element_centroids: mark_centres
    };
}
src/proxy_lite/cli.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import asyncio
3
+ import base64
4
+ import os
5
+ from pathlib import Path
6
+
7
+ from proxy_lite import Runner, RunnerConfig
8
+ from proxy_lite.gif_maker import create_run_gif
9
+ from proxy_lite.logger import logger
10
+
11
+
12
+ def update_config_from_env(config: RunnerConfig) -> RunnerConfig:
13
+ if os.getenv("PROXY_LITE_API_BASE"):
14
+ config.solver.agent.client.api_base = os.getenv("PROXY_LITE_API_BASE")
15
+ if os.getenv("PROXY_LITE_MODEL"):
16
+ config.solver.agent.client.model_id = os.getenv("PROXY_LITE_MODEL")
17
+ if os.getenv("PROXY_LITE_VIEWPORT_WIDTH"):
18
+ config.environment.viewport_width = int(os.getenv("PROXY_LITE_VIEWPORT_WIDTH"))
19
+ if os.getenv("PROXY_LITE_VIEWPORT_HEIGHT"):
20
+ config.environment.viewport_height = int(os.getenv("PROXY_LITE_VIEWPORT_HEIGHT"))
21
+ return config
22
+
23
+
24
+ def do_command(args):
25
+ do_text = " ".join(args.task)
26
+ logger.info("🤖 Let me help you with that...")
27
+ # Take default config from YAML
28
+ config = RunnerConfig.from_yaml(args.config)
29
+ # Update config from environment variables
30
+ config = update_config_from_env(config)
31
+ # Update config from command-line arguments
32
+ if args.api_base:
33
+ config.solver.agent.client.api_base = args.api_base
34
+ if args.model:
35
+ config.solver.agent.client.model_id = args.model
36
+ if args.homepage:
37
+ config.environment.homepage = args.homepage
38
+ if args.viewport_width:
39
+ config.environment.viewport_width = args.viewport_width
40
+ if args.viewport_height:
41
+ config.environment.viewport_height = args.viewport_height
42
+ o = Runner(config=config)
43
+ result = asyncio.run(o.run(do_text))
44
+
45
+ final_screenshot = result.observations[-1].info["original_image"]
46
+ folder_path = Path(__file__).parent.parent.parent / "screenshots"
47
+ folder_path.mkdir(parents=True, exist_ok=True)
48
+ path = folder_path / f"{result.run_id}.png"
49
+ with open(path, "wb") as f:
50
+ f.write(base64.b64decode(final_screenshot))
51
+ logger.info(f"🤖 Final screenshot saved to {path}")
52
+
53
+ gif_folder_path = Path(__file__).parent.parent.parent / "gifs"
54
+ gif_folder_path.mkdir(parents=True, exist_ok=True)
55
+ gif_path = gif_folder_path / f"{result.run_id}.gif"
56
+ create_run_gif(result, gif_path, duration=1500)
57
+ logger.info(f"🤖 GIF saved to {gif_path}")
58
+
59
+
60
+ def main():
61
+ parser = argparse.ArgumentParser(description="Proxy-Lite")
62
+ parser.add_argument(
63
+ "task",
64
+ type=str,
65
+ help="The task you want to accomplish",
66
+ nargs="*",
67
+ )
68
+ parser.add_argument(
69
+ "--model",
70
+ type=str,
71
+ default=None,
72
+ help="The model to use.",
73
+ )
74
+ parser.add_argument(
75
+ "--api_base",
76
+ type=str,
77
+ default=None,
78
+ help="The API base URL to use.",
79
+ )
80
+ # New option for setting a homepage URL:
81
+ parser.add_argument(
82
+ "--homepage",
83
+ type=str,
84
+ default=None,
85
+ help="The homepage URL to use.",
86
+ )
87
+ # New viewport controls:
88
+ parser.add_argument(
89
+ "--viewport-width",
90
+ type=int,
91
+ default=None,
92
+ help="Viewport width in pixels.",
93
+ )
94
+ parser.add_argument(
95
+ "--viewport-height",
96
+ type=int,
97
+ default=None,
98
+ help="Viewport height in pixels.",
99
+ )
100
+ parser.add_argument(
101
+ "--config",
102
+ type=Path,
103
+ default=Path(__file__).parent / "configs/default.yaml",
104
+ help="Path to config file (default: configs/default.yaml)",
105
+ )
106
+
107
+ args = parser.parse_args()
108
+ do_command(args)
109
+
110
+
111
+ if __name__ == "__main__":
112
+ main()
src/proxy_lite/client.py ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from abc import ABC, abstractmethod
3
+ from functools import cached_property
4
+ from typing import ClassVar, Literal, Optional, Union
5
+
6
+ import httpx
7
+ from httpx import Limits, Timeout
8
+ from openai import AsyncOpenAI
9
+ from openai.types.chat.chat_completion import (
10
+ ChatCompletion,
11
+ )
12
+ from pydantic import BaseModel
13
+
14
+ from proxy_lite.history import MessageHistory
15
+ from proxy_lite.logger import logger
16
+ from proxy_lite.serializer import (
17
+ BaseSerializer,
18
+ OpenAICompatibleSerializer,
19
+ )
20
+ from proxy_lite.tools import Tool
21
+
22
+
23
+ class BaseClientConfig(BaseModel):
24
+ http_timeout: float = 50
25
+ http_concurrent_connections: int = 50
26
+
27
+
28
+ class BaseClient(BaseModel, ABC):
29
+ config: BaseClientConfig
30
+ serializer: ClassVar[BaseSerializer]
31
+
32
+ @abstractmethod
33
+ async def create_completion(
34
+ self,
35
+ messages: MessageHistory,
36
+ temperature: float = 0.7,
37
+ seed: Optional[int] = None,
38
+ tools: Optional[list[Tool]] = None,
39
+ response_format: Optional[type[BaseModel]] = None,
40
+ ) -> ChatCompletion: ...
41
+
42
+ """
43
+ Create completion from model.
44
+ Expect subclasses to adapt from various endpoints that will handle
45
+ requests differently, make sure to raise appropriate warnings.
46
+
47
+ Returns:
48
+ ChatCompletion: OpenAI ChatCompletion format for consistency
49
+ """
50
+
51
+ @classmethod
52
+ def create(cls, config: BaseClientConfig) -> "BaseClient":
53
+ supported_clients = {
54
+ "openai-azure": OpenAIClient,
55
+ "convergence": ConvergenceClient,
56
+ }
57
+ if config.name not in supported_clients:
58
+ error_message = f"Unsupported model: {config.name}."
59
+ raise ValueError(error_message)
60
+ return supported_clients[config.name](config=config)
61
+
62
+ @property
63
+ def http_client(self) -> httpx.AsyncClient:
64
+ return httpx.AsyncClient(
65
+ timeout=Timeout(self.config.http_timeout),
66
+ limits=Limits(
67
+ max_connections=self.config.http_concurrent_connections,
68
+ max_keepalive_connections=self.config.http_concurrent_connections,
69
+ ),
70
+ )
71
+
72
+
73
+ class OpenAIClientConfig(BaseClientConfig):
74
+ name: Literal["openai"] = "openai"
75
+ model_id: str = "gpt-4o"
76
+ api_key: str = os.environ.get("OPENAI_API_KEY")
77
+
78
+
79
+ class OpenAIClient(BaseClient):
80
+ config: OpenAIClientConfig
81
+ serializer: ClassVar[OpenAICompatibleSerializer] = OpenAICompatibleSerializer()
82
+
83
+ @cached_property
84
+ def external_client(self) -> AsyncOpenAI:
85
+ return AsyncOpenAI(
86
+ api_key=self.config.api_key,
87
+ http_client=self.http_client,
88
+ )
89
+
90
+ async def create_completion(
91
+ self,
92
+ messages: MessageHistory,
93
+ temperature: float = 0.7,
94
+ seed: Optional[int] = None,
95
+ tools: Optional[list[Tool]] = None,
96
+ response_format: Optional[type[BaseModel]] = None,
97
+ ) -> ChatCompletion:
98
+ base_params = {
99
+ "model": self.config.model_id,
100
+ "messages": self.serializer.serialize_messages(messages),
101
+ "temperature": temperature,
102
+ }
103
+ optional_params = {
104
+ "seed": seed,
105
+ "tools": self.serializer.serialize_tools(tools) if tools else None,
106
+ "tool_choice": "required" if tools else None,
107
+ "response_format": {"type": "json_object"} if response_format else {"type": "text"},
108
+ }
109
+ base_params.update({k: v for k, v in optional_params.items() if v is not None})
110
+ return await self.external_client.chat.completions.create(**base_params)
111
+
112
+
113
+ class ConvergenceClientConfig(BaseClientConfig):
114
+ name: Literal["convergence"] = "convergence"
115
+ model_id: str = "convergence-ai/proxy-lite-7b"
116
+ api_base: str = "http://localhost:8000/v1"
117
+ api_key: str = "none"
118
+
119
+
120
+ class ConvergenceClient(OpenAIClient):
121
+ config: ConvergenceClientConfig
122
+ serializer: ClassVar[OpenAICompatibleSerializer] = OpenAICompatibleSerializer()
123
+ _model_validated: bool = False
124
+
125
+ async def _validate_model(self) -> None:
126
+ try:
127
+ response = await self.external_client.models.list()
128
+ assert self.config.model_id in [model.id for model in response.data], (
129
+ f"Model {self.config.model_id} not found in {response.data}"
130
+ )
131
+ self._model_validated = True
132
+ logger.debug(f"Model {self.config.model_id} validated and connected to cluster")
133
+ except Exception as e:
134
+ logger.error(f"Error retrieving model: {e}")
135
+ raise e
136
+
137
+ @cached_property
138
+ def external_client(self) -> AsyncOpenAI:
139
+ return AsyncOpenAI(
140
+ api_key=self.config.api_key,
141
+ base_url=self.config.api_base,
142
+ http_client=self.http_client,
143
+ )
144
+
145
+ async def create_completion(
146
+ self,
147
+ messages: MessageHistory,
148
+ temperature: float = 0.7,
149
+ seed: Optional[int] = None,
150
+ tools: Optional[list[Tool]] = None,
151
+ response_format: Optional[type[BaseModel]] = None,
152
+ ) -> ChatCompletion:
153
+ if not self._model_validated:
154
+ await self._validate_model()
155
+ base_params = {
156
+ "model": self.config.model_id,
157
+ "messages": self.serializer.serialize_messages(messages),
158
+ "temperature": temperature,
159
+ }
160
+ optional_params = {
161
+ "seed": seed,
162
+ "tools": self.serializer.serialize_tools(tools) if tools else None,
163
+ "tool_choice": "auto" if tools else None, # vLLM does not support "required"
164
+ "response_format": response_format if response_format else {"type": "text"},
165
+ }
166
+ base_params.update({k: v for k, v in optional_params.items() if v is not None})
167
+ return await self.external_client.chat.completions.create(**base_params)
168
+
169
+
170
+ ClientConfigTypes = Union[OpenAIClientConfig, ConvergenceClientConfig]
171
+ ClientTypes = Union[OpenAIClient, ConvergenceClient]
src/proxy_lite/configs/default.yaml ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ environment:
2
+ name: webbrowser
3
+ annotate_image: true
4
+ screenshot_delay: 2.0
5
+ viewport_width: 1280
6
+ viewport_height: 1920
7
+ include_poi_text: true
8
+ headless: false
9
+ homepage: https://www.google.co.uk
10
+ keep_original_image: true
11
+ solver:
12
+ name: simple
13
+ agent:
14
+ name: proxy_lite
15
+ client:
16
+ name: convergence
17
+ model_id: convergence-ai/proxy-lite-3b
18
+ api_base: https://convergence-ai-demo-api.hf.space/v1
19
+ local_view: true
20
+ task_timeout: 1800
21
+ environment_timeout: 1800
22
+ action_timeout: 1800
23
+ verbose: true
src/proxy_lite/environments/__init__.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Union
2
+
3
+ from .environment_base import (
4
+ Action,
5
+ BaseEnvironment,
6
+ BaseEnvironmentConfig,
7
+ Environments,
8
+ Event,
9
+ EventType,
10
+ Observation,
11
+ )
12
+ from .webbrowser import (
13
+ WebBrowserEnvironment,
14
+ WebBrowserEnvironmentConfig,
15
+ )
16
+
17
# Unions over everything registered with ``Environments`` at import time.
# They are built from the registries, so they only contain the classes that
# the imports above have already registered through the decorators.
EnvironmentConfigTypes = Union[tuple(Environments._environment_config_registry.values())]
EnvironmentTypes = Union[tuple(Environments._environment_registry.values())]


__all__ = [
    "Action",
    "BaseEnvironment",
    "BaseEnvironmentConfig",
    "EnvironmentConfigTypes",
    "EnvironmentTypes",
    "Environments",
    "Event",
    "EventType",
    "Observation",
    "WebBrowserEnvironment",
    "WebBrowserEnvironmentConfig",
]
src/proxy_lite/environments/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (924 Bytes). View file
 
src/proxy_lite/environments/__pycache__/environment_base.cpython-313.pyc ADDED
Binary file (8.85 kB). View file
 
src/proxy_lite/environments/__pycache__/webbrowser.cpython-313.pyc ADDED
Binary file (12.2 kB). View file
 
src/proxy_lite/environments/environment_base.py ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ from abc import ABC, abstractmethod
4
+ from enum import Enum
5
+ from functools import cached_property
6
+ from typing import Any, Literal, Optional, Self
7
+
8
+ from pydantic import BaseModel
9
+
10
+ from proxy_lite.history import ToolCall
11
+ from proxy_lite.tools import Tool, ToolExecutionResponse
12
+
13
+
14
class EventType(str, Enum):
    """String-valued discriminator naming the kind of an event."""

    OBSERVATION = "observation"
    ACTION = "action"
    MESSAGE = "message"
18
+
19
+
20
# Base model for events; ``type`` tells the concrete kinds apart.
class Event(BaseModel):
    type: EventType
22
+
23
+
24
# Snapshot carried by an Observation; every part is optional.
class State(BaseModel):
    text: Optional[str] = None  # textual description of the state
    image: Optional[str] = None  # base64 encoded image
    html: Optional[str] = None  # HTML content, when provided
    tool_responses: Optional[list[ToolExecutionResponse]] = None  # results of executed tool calls
29
+
30
+
31
# Event produced by an environment: the observed state plus step metadata.
class Observation(Event):
    type: Literal[EventType.OBSERVATION] = EventType.OBSERVATION  # fixed discriminator
    state: State
    terminated: bool  # presumably marks the end of the episode -- confirm with callers
    reward: Optional[float] = None
    info: Optional[dict[str, Any]] = None  # free-form extra data
37
+
38
+
39
# Event describing an action: optional free text and/or tool calls to execute.
class Action(Event):
    type: Literal[EventType.ACTION] = EventType.ACTION  # fixed discriminator
    text: Optional[str] = None
    tool_calls: Optional[list[ToolCall]] = None  # may be None when no tool is called
    info: Optional[dict[str, Any]] = None  # free-form extra data
44
+
45
+
46
# Common base class for environment configuration models; declares no fields itself.
class BaseEnvironmentConfig(BaseModel): ...
47
+
48
+
49
# Abstract base class for environments.
#
# Subclasses provide the tools, the initial observation, action execution,
# observation and evaluation. The class works as an async context manager
# (a no-op here) and dispatches tool calls to the objects in ``tools``.
class BaseEnvironment(BaseModel, ABC):
    config: BaseEnvironmentConfig
    logger: logging.Logger | None = None

    class Config:
        # logging.Logger is not a pydantic-native type.
        arbitrary_types_allowed = True

    async def __aenter__(self) -> Self:
        """Enter the environment context; the base implementation acquires nothing."""
        return self

    async def __aexit__(self, exc_type, exc_value, traceback):
        """Leave the environment context; the base implementation releases nothing."""
        pass

    @property
    @abstractmethod
    def info_for_user(self) -> str:
        """Human-readable description of the environment."""
        ...

    @cached_property
    @abstractmethod
    def tools(self) -> list[Tool]:
        """Tools whose methods can be invoked through ``execute_tool``."""
        ...

    @abstractmethod
    async def initialise(self) -> Observation:
        """Prepare the environment and return the first observation."""
        ...

    @abstractmethod
    async def execute_action(self, action: Action) -> Observation:
        """Apply ``action`` and return the resulting observation."""
        ...

    @abstractmethod
    async def observe(self) -> Observation:
        """Return an observation of the current state."""
        ...

    @abstractmethod
    async def evaluate(self, **kwargs: dict[str, Any]) -> dict[str, Any]:
        """Return evaluation results for the environment."""
        ...

    async def execute_tool(self, tool_call: ToolCall) -> ToolExecutionResponse:
        """Dispatch a tool call to the first tool exposing a matching attribute.

        Args:
            tool_call: Call whose ``function`` mapping holds the attribute
                ``name`` to invoke and its JSON-encoded ``arguments``.

        Returns:
            Whatever the awaited tool method returns; callers treat it as a
            ``ToolExecutionResponse``.

        Raises:
            ValueError: If no tool in ``self.tools`` has an attribute with the
                requested name.
        """
        function = tool_call.function
        tool_name = function["name"]
        # NOTE(review): any attribute present on a tool can be invoked by
        # name here, not only methods meant as tools -- consider an allow-list.
        for tool in self.tools:
            if not hasattr(tool, tool_name):
                continue
            arguments = json.loads(function["arguments"])
            # Arguments are sometimes JSON-encoded twice; decode once more.
            if isinstance(arguments, str):
                arguments = json.loads(arguments)
            return await getattr(tool, tool_name)(**arguments)
        msg = f'No tool function with name "{function["name"]}"'
        raise ValueError(msg)

    async def get_info(self) -> dict[str, Any]:
        """Return extra information about the environment; empty by default."""
        return {}
97
+
98
+
99
class Environments:
    """Name-based registry of environment classes and their config classes."""

    # Both registries are class-level and therefore shared process-wide.
    _environment_registry: dict[str, type[BaseEnvironment]] = {}
    _environment_config_registry: dict[str, type[BaseEnvironmentConfig]] = {}

    @classmethod
    def register_environment(cls, name: str):
        """
        Decorator to register an Environment class under a given name.

        Registering a second class under the same name replaces the first.

        Example:
            @Environments.register_environment("my_environment")
            class MyEnvironment(BaseEnvironment):
                ...
        """

        def decorator(env_cls: type[BaseEnvironment]) -> type[BaseEnvironment]:
            cls._environment_registry[name] = env_cls
            return env_cls

        return decorator

    @classmethod
    def register_environment_config(cls, name: str):
        """
        Decorator to register an Environment configuration class under a given name.

        Registering a second class under the same name replaces the first.

        Example:
            @Environments.register_environment_config("my_environment")
            class MyEnvironmentConfig(BaseEnvironmentConfig):
                ...
        """

        def decorator(config_cls: type[BaseEnvironmentConfig]) -> type[BaseEnvironmentConfig]:
            cls._environment_config_registry[name] = config_cls
            return config_cls

        return decorator

    @classmethod
    def get(cls, name: str) -> type[BaseEnvironment]:
        """
        Retrieve a registered Environment class by its name.

        Raises:
            ValueError: If no such environment is found.
        """
        try:
            return cls._environment_registry[name]
        except KeyError:
            # The KeyError is an implementation detail of the lookup; suppress
            # it so the traceback shows only the meaningful ValueError.
            raise ValueError(f"Environment '{name}' not found.") from None

    @classmethod
    def get_config(cls, name: str) -> type[BaseEnvironmentConfig]:
        """
        Retrieve a registered Environment configuration class by its name.

        Raises:
            ValueError: If no such configuration is found.
        """
        try:
            return cls._environment_config_registry[name]
        except KeyError:
            # Same reasoning as in ``get``: hide the internal KeyError.
            raise ValueError(f"Environment config for '{name}' not found.") from None
src/proxy_lite/environments/webbrowser.py ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ from functools import cached_property
3
+ from typing import Any, Literal, Optional, Self
4
+
5
+ from proxy_lite.browser.browser import BrowserSession
6
+ from proxy_lite.environments.environment_base import (
7
+ Action,
8
+ BaseEnvironment,
9
+ BaseEnvironmentConfig,
10
+ Environments,
11
+ Observation,
12
+ State,
13
+ )
14
+ from proxy_lite.tools import BrowserTool, Tool, ToolExecutionResponse
15
+ # Import logger from proxy_lite.logger, or if it's already available via BaseEnvironment
16
+ from proxy_lite.logger import logger # Assuming you want to use the same logger
17
+
18
# Configuration for the web browser environment, registered under "webbrowser".
@Environments.register_environment_config("webbrowser")
class WebBrowserEnvironmentConfig(BaseEnvironmentConfig):
    name: Literal["webbrowser"] = "webbrowser"  # fixed discriminator
    homepage: str = "https://google.com"  # page opened when the environment is initialised
    annotate_image: bool = True  # not read in this module -- presumably consumed elsewhere
    screenshot_delay: float = 1.0  # seconds
    include_html: bool = True  # attach the page HTML to each observation
    include_poi_text: bool = True  # append the browser's POI text to the observation text
    record_pois: bool = True  # store the browser's POIs in the observation info
    viewport_width: int = 1280  # passed to the browser session
    viewport_height: int = 720  # passed to the browser session
    browserbase_timeout: int = 7200  # not read in this module -- TODO confirm unit and consumer
    headless: bool = True  # passed to the browser session
    keep_original_image: bool = False  # also store the un-annotated screenshot in the info
    no_pois_in_image: bool = False  # use the un-annotated screenshot as the observation image
33
+
34
+
35
# Environment backed by a BrowserSession, registered under "webbrowser".
#
# Used as an async context manager: entering starts the browser session,
# leaving closes it. Observations carry a screenshot, the current URL, and
# optionally POI text, HTML and extra info, depending on the configuration.
@Environments.register_environment("webbrowser")
class WebBrowserEnvironment(BaseEnvironment):
    config: WebBrowserEnvironmentConfig
    browser: Optional[BrowserSession] = None
    # True when the previous action was cancelled because the page changed.
    cancelled_last_action: bool = False

    class Config:
        arbitrary_types_allowed = True

    async def __aenter__(self) -> Self:
        """Start the browser session and install any cookies."""
        self.browser = self.browser_session(
            viewport_width=self.config.viewport_width,
            viewport_height=self.config.viewport_height,
            headless=self.config.headless,
        )
        await self.browser.__aenter__()
        if self.cookies:
            await self.browser.context.add_cookies(self.cookies)
        self.logger.info("🌐 [bold blue]Browser session started.[/]")
        return self

    async def __aexit__(self, exc_type, exc_value, traceback):
        """Close the browser session, if one was started."""
        if self.browser is not None:
            await self.browser.__aexit__(exc_type, exc_value, traceback)

    @property
    def info_for_user(self) -> str:
        """Human-readable description of the environment."""
        return "This is a web browser environment. You can navigate the web, search the web, and perform actions on the web."  # noqa: E501

    @cached_property
    def tools(self) -> list[Tool]:
        """Browser tool bound to the current session.

        The list is cached on first access, so it captures whatever
        ``self.browser`` is at that moment.
        """
        return [BrowserTool(session=self.browser)]

    @cached_property
    def browser_session(self) -> type[BrowserSession]:
        """Class used to create the browser session."""
        return BrowserSession

    @property
    def cookies(self) -> list[dict]:
        """Cookies added to the browser context on entry; none by default."""
        return []

    async def _capture_observation(
        self,
        tool_responses: Optional[list[ToolExecutionResponse]] = None,
        extra_info: Optional[dict[str, Any]] = None,
    ) -> Observation:
        """Take a screenshot and build an Observation of the current page.

        Args:
            tool_responses: Tool results to attach to the state; the field is
                left unset when this is None.
            extra_info: Entries placed in ``info`` right after the URL.
        """
        original_img, annotated_img = await self.browser.screenshot(
            delay=self.config.screenshot_delay,
        )
        # Honour ``no_pois_in_image`` for every observation, not only the first.
        shown_img = original_img if self.config.no_pois_in_image else annotated_img
        base64_image = base64.b64encode(shown_img).decode("utf-8")

        html_content = await self.browser.current_page.content() if self.config.include_html else None

        info: dict[str, Any] = {"url": self.browser.current_url}
        if extra_info:
            info.update(extra_info)
        if self.config.record_pois:
            info["pois"] = self.browser.pois
        if self.config.keep_original_image:
            info["original_image"] = base64.b64encode(original_img).decode("utf-8")

        text = f"URL: {self.browser.current_url}"
        if self.config.include_poi_text:
            text += f"\n{self.browser.poi_text}"

        state_fields: dict[str, Any] = {"text": text, "image": base64_image, "html": html_content}
        if tool_responses is not None:
            state_fields["tool_responses"] = tool_responses

        return Observation(
            state=State(**state_fields),
            terminated=False,
            reward=None,
            info=info,
        )

    async def initialise(self) -> Observation:
        """Open the configured homepage and return the first observation.

        Raises:
            Exception: Any navigation error is logged and re-raised.
        """
        self.logger.debug(f"DEBUG: Initialising WebBrowserEnvironment. Homepage: {self.config.homepage}")
        try:
            await self.browser.goto(self.config.homepage)
            self.logger.debug(f"DEBUG: Browser navigated to homepage. Current URL: {self.browser.current_url}")
        except Exception as e:
            self.logger.error(f"ERROR: Failed to navigate to homepage {self.config.homepage}: {e}")
            raise  # Re-raise to propagate the error

        observation = await self._capture_observation()
        self.logger.debug(f"DEBUG: Initial observation captured. URL: {self.browser.current_url}")
        return observation

    async def should_perform_action(self) -> bool:
        """Decide whether the pending action may run.

        Returns False when the POI centroids changed since they were last
        recorded. After a cancelled action the next one runs without
        refreshing the POIs.
        """
        # if cancelled last action, run the action without updating POIs
        if self.cancelled_last_action:
            self.cancelled_last_action = False
            return True

        # check for page changes
        old_points = [tuple(point) for point in self.browser.poi_centroids]
        await self.browser.update_poi()
        new_points = [tuple(point) for point in self.browser.poi_centroids]

        # record if the last action was cancelled
        if old_points != new_points:
            self.cancelled_last_action = True
            return False
        return True

    async def execute_action(self, action: Action) -> Observation:
        """Run the action's tool calls and return the resulting observation.

        Each tool call yields one response: the tool result, the error text if
        the tool raised, or a cancellation notice when the page changed before
        the action could run. ``info["cancelled_tools"]`` tells the cases apart.
        """
        # ``tool_calls`` is optional on Action; treat None as "no tool calls".
        tool_calls = action.tool_calls or []
        responses: list[ToolExecutionResponse] = []
        cancelled_tools_flag = False

        if await self.should_perform_action():
            for tool_call in tool_calls:
                # Perform the chosen action
                try:
                    tool_response: ToolExecutionResponse = await self.execute_tool(
                        tool_call,
                    )
                    tool_response.id = tool_call.id
                    responses.append(tool_response)
                except Exception as e:  # noqa: PERF203
                    # Report the failure back as the tool's response instead of aborting.
                    self.logger.warning("🌐 An error occurred taking action: %s", str(e), exc_info=False)
                    responses.append(ToolExecutionResponse(content=str(e), id=tool_call.id))
        else:
            self.logger.warning("🌐 Page changed since last observation, cancelling action.")
            self.cancelled_last_action = True
            cancelled_tools_flag = True
            for tool_call in tool_calls:
                responses.append(
                    ToolExecutionResponse(
                        content="The page changed before the action could be executed, instead of being ran it was cancelled.",  # noqa: E501
                        id=tool_call.id,
                    )
                )

        return await self._capture_observation(
            tool_responses=responses,
            extra_info={"cancelled_tools": cancelled_tools_flag},
        )

    async def observe(self) -> Observation:
        """Delegate to the browser session's ``observe``."""
        return await self.browser.observe()

    async def evaluate(self, **kwargs: dict[str, Any]) -> dict[str, Any]:
        """Return evaluation results; this environment reports none."""
        return {}

    async def get_info(self) -> dict[str, Any]:
        """Return extra environment information; this environment reports none."""
        return {}
src/proxy_lite/gif_maker.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import re
3
+ import textwrap
4
+ from io import BytesIO
5
+
6
+ from PIL import Image, ImageDraw, ImageFont
7
+
8
+ from proxy_lite.environments.environment_base import Action, Observation
9
+ from proxy_lite.recorder import Run
10
+
11
+
12
def create_run_gif(
    run: Run, output_path: str, white_panel_width: int = 300, duration: int = 1500, resize_factor: int = 4
) -> None:
    """
    Generate a gif from the Run object's history.

    For each Observation record that has an image, the image is decoded from
    its base64 encoded string and scaled down. If the next record is an
    Action, its text is drawn onto a white panel. The observation image and
    the white panel are then concatenated horizontally to produce a frame.

    Parameters:
        run (Run): A Run object with its history containing Observation and Action records.
        output_path (str): The path where the GIF will be saved.
        white_panel_width (int): The width in pixels of the white panel for displaying text
                                 (default 300).
        duration (int): Duration between frames in milliseconds (default 1500).
        resize_factor (int): The factor to resize the image down by (default 4).
                             Must be at least 1.

    Raises:
        ValueError: If resize_factor is below 1, or if no frame could be
            generated from the history.
    """
    if resize_factor < 1:
        raise ValueError("resize_factor must be a positive integer.")

    history = run.history
    font = ImageFont.load_default()
    frames = []

    for index, record in enumerate(history):
        # Only observations that carry an image produce a frame.
        if not isinstance(record, Observation) or not record.state.image:
            continue

        # Decode the base64 image
        obs_img = Image.open(BytesIO(base64.b64decode(record.state.image))).convert("RGB")

        # scale the image down, never below one pixel per side
        scaled_size = (max(1, obs_img.width // resize_factor), max(1, obs_img.height // resize_factor))
        obs_img = obs_img.resize(scaled_size)

        # Use the text of the Action that directly follows this observation, if any
        follower = history[index + 1] if index + 1 < len(history) else None
        action_text = ""
        if isinstance(follower, Action) and follower.text:
            action_text = follower.text

        frames.append(_compose_frame(obs_img, _format_panel_text(action_text), white_panel_width, font))

    if not frames:
        raise ValueError("No frames were generated from the Run object's history.")
    frames[0].save(output_path, save_all=True, append_images=frames[1:], duration=duration, loop=0)


def _format_panel_text(action_text: str) -> str:
    """Condense tagged action text into labelled sections when both tags are present."""
    observation_match = re.search(r"<observation>(.*?)</observation>", action_text, re.DOTALL)
    observation_content = observation_match.group(1).strip() if observation_match else None

    thinking_match = re.search(r"<thinking>(.*?)</thinking>", action_text, re.DOTALL)
    thinking_content = thinking_match.group(1).strip() if thinking_match else None

    if observation_content and thinking_content:
        return f"**OBSERVATION**\n{observation_content}\n\n**THINKING**\n{thinking_content}"
    return action_text


def _wrap_preserving_newlines(text: str, width: int) -> str:
    """Wrap every line of ``text`` to ``width`` characters, keeping existing line breaks."""
    # textwrap.fill on the whole string would turn the newlines into spaces
    # and merge the section headings into the body text.
    return "\n".join(textwrap.fill(line, width=width) for line in text.splitlines())


def _compose_frame(obs_img, panel_text: str, white_panel_width: int, font):
    """Return ``obs_img`` with a white panel showing ``panel_text`` attached on its right."""
    # Create a white panel (same height as the observation image)
    panel = Image.new("RGB", (white_panel_width, obs_img.height), "white")
    draw = ImageDraw.Draw(panel)

    max_chars_per_line = 40
    wrapped_text = _wrap_preserving_newlines(panel_text, max_chars_per_line)

    # Calculate text block size and center it on the panel
    try:
        bbox = draw.multiline_textbbox((0, 0), wrapped_text, font=font)
        text_width, text_height = bbox[2] - bbox[0], bbox[3] - bbox[1]
    except AttributeError:
        # Fallback for older Pillow versions: compute size for each line
        lines = wrapped_text.splitlines() or [wrapped_text]
        line_sizes = [draw.textsize(line, font=font) for line in lines]
        text_width = max(width for width, _ in line_sizes)
        text_height = sum(height for _, height in line_sizes)
    text_x = (white_panel_width - text_width) // 2
    text_y = (obs_img.height - text_height) // 2
    draw.multiline_text((text_x, text_y), wrapped_text, fill="black", font=font, align="center")

    # Create the combined frame by concatenating the observation image and the panel
    combined_frame = Image.new("RGB", (obs_img.width + white_panel_width, obs_img.height))
    combined_frame.paste(obs_img, (0, 0))
    combined_frame.paste(panel, (obs_img.width, 0))
    return combined_frame
110
+
111
+
112
+ # Example usage:
113
if __name__ == "__main__":
    from proxy_lite.recorder import Run

    # Load a previously recorded run by its id and render it as a GIF.
    example_run = Run.load("0abdb4cb-f289-48b0-ba13-35ed1210f7c1")

    # Halve the history length to get the step count (assumes one
    # observation/action pair per step).
    step_count = len(example_run.history) // 2
    print(f"Number of steps: {step_count}")

    gif_path = "trajectory.gif"
    create_run_gif(example_run, gif_path, duration=1000)
    print(f"Trajectory GIF saved to {gif_path}")