kyle8581 commited on
Commit
dd39c08
·
1 Parent(s): d9b575c
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. BrowserGym/.gitignore +154 -0
  2. BrowserGym/.pre-commit-config.yaml +44 -0
  3. BrowserGym/.readthedocs.yaml +32 -0
  4. BrowserGym/LICENSE +13 -0
  5. BrowserGym/Makefile +17 -0
  6. BrowserGym/README.md +254 -0
  7. BrowserGym/browsergym/assistantbench/README.md +21 -0
  8. BrowserGym/browsergym/assistantbench/pyproject.toml +35 -0
  9. BrowserGym/browsergym/assistantbench/requirements.txt +4 -0
  10. BrowserGym/browsergym/assistantbench/src/browsergym/assistantbench/__init__.py +54 -0
  11. BrowserGym/browsergym/assistantbench/src/browsergym/assistantbench/evaluation/evaluate_utils/evaluate_dicts.py +68 -0
  12. BrowserGym/browsergym/assistantbench/src/browsergym/assistantbench/evaluation/evaluate_utils/evaluate_factory.py +28 -0
  13. BrowserGym/browsergym/assistantbench/src/browsergym/assistantbench/evaluation/evaluate_utils/evaluate_numbers.py +34 -0
  14. BrowserGym/browsergym/assistantbench/src/browsergym/assistantbench/evaluation/evaluate_utils/evaluate_strings.py +174 -0
  15. BrowserGym/browsergym/assistantbench/src/browsergym/assistantbench/evaluation/evaluate_utils/utils.py +25 -0
  16. BrowserGym/browsergym/assistantbench/src/browsergym/assistantbench/evaluation/evaluator.py +132 -0
  17. BrowserGym/browsergym/assistantbench/src/browsergym/assistantbench/task.py +142 -0
  18. BrowserGym/browsergym/assistantbench/src/browsergym/assistantbench/utils.py +73 -0
  19. BrowserGym/browsergym/core/README.md +10 -0
  20. BrowserGym/browsergym/core/pyproject.toml +42 -0
  21. BrowserGym/browsergym/core/requirements.txt +8 -0
  22. BrowserGym/browsergym/core/src/browsergym/core/__init__.py +27 -0
  23. BrowserGym/browsergym/core/src/browsergym/core/action/__init__.py +11 -0
  24. BrowserGym/browsergym/core/src/browsergym/core/action/base.py +63 -0
  25. BrowserGym/browsergym/core/src/browsergym/core/action/functions.py +624 -0
  26. BrowserGym/browsergym/core/src/browsergym/core/action/highlevel.py +522 -0
  27. BrowserGym/browsergym/core/src/browsergym/core/action/parsers.py +92 -0
  28. BrowserGym/browsergym/core/src/browsergym/core/action/python.py +112 -0
  29. BrowserGym/browsergym/core/src/browsergym/core/action/utils.py +288 -0
  30. BrowserGym/browsergym/core/src/browsergym/core/chat.py +95 -0
  31. BrowserGym/browsergym/core/src/browsergym/core/chat_files/chatbox.html +243 -0
  32. BrowserGym/browsergym/core/src/browsergym/core/chat_files/chatbox_modern.html +379 -0
  33. BrowserGym/browsergym/core/src/browsergym/core/chat_files/img/send.svg +3 -0
  34. BrowserGym/browsergym/core/src/browsergym/core/constants.py +5 -0
  35. BrowserGym/browsergym/core/src/browsergym/core/env.py +625 -0
  36. BrowserGym/browsergym/core/src/browsergym/core/javascript/frame_mark_elements.js +295 -0
  37. BrowserGym/browsergym/core/src/browsergym/core/javascript/frame_unmark_elements.js +40 -0
  38. BrowserGym/browsergym/core/src/browsergym/core/observation.py +575 -0
  39. BrowserGym/browsergym/core/src/browsergym/core/registration.py +76 -0
  40. BrowserGym/browsergym/core/src/browsergym/core/spaces.py +140 -0
  41. BrowserGym/browsergym/core/src/browsergym/core/task.py +111 -0
  42. BrowserGym/browsergym/core/src/browsergym/utils/mcp_server.py +192 -0
  43. BrowserGym/browsergym/core/src/browsergym/utils/obs.py +554 -0
  44. BrowserGym/browsergym/experiments/README.md +12 -0
  45. BrowserGym/browsergym/experiments/pyproject.toml +65 -0
  46. BrowserGym/browsergym/experiments/requirements.txt +3 -0
  47. BrowserGym/browsergym/experiments/src/bgym/__init__.py +17 -0
  48. BrowserGym/browsergym/experiments/src/browsergym/experiments/__init__.py +2 -0
  49. BrowserGym/browsergym/experiments/src/browsergym/experiments/agent.py +112 -0
  50. BrowserGym/browsergym/experiments/src/browsergym/experiments/benchmark/__init__.py +2 -0
BrowserGym/.gitignore ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .DS_store
2
+ .idea/
3
+ docs/src/generated/
4
+
5
+ # Byte-compiled / optimized / DLL files
6
+ __pycache__/
7
+ *.py[cod]
8
+ *$py.class
9
+
10
+ # C extensions
11
+ *.so
12
+
13
+ # Distribution / packaging
14
+ .Python
15
+ build/
16
+ develop-eggs/
17
+ dist/
18
+ downloads/
19
+ eggs/
20
+ .eggs/
21
+ lib/
22
+ lib64/
23
+ parts/
24
+ sdist/
25
+ var/
26
+ wheels/
27
+ pip-wheel-metadata/
28
+ share/python-wheels/
29
+ *.egg-info/
30
+ .installed.cfg
31
+ *.egg
32
+ MANIFEST
33
+
34
+ # PyInstaller
35
+ # Usually these files are written by a python script from a template
36
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
37
+ *.manifest
38
+ *.spec
39
+
40
+ # Installer logs
41
+ pip-log.txt
42
+ pip-delete-this-directory.txt
43
+
44
+ # Unit test / coverage reports
45
+ htmlcov/
46
+ .tox/
47
+ .nox/
48
+ .coverage
49
+ .coverage.*
50
+ .cache
51
+ nosetests.xml
52
+ coverage.xml
53
+ *.cover
54
+ *.py,cover
55
+ .hypothesis/
56
+ .pytest_cache/
57
+
58
+ # Translations
59
+ *.mo
60
+ *.pot
61
+
62
+ # Django stuff:
63
+ *.log
64
+ local_settings.py
65
+ db.sqlite3
66
+ db.sqlite3-journal
67
+
68
+ # Flask stuff:
69
+ instance/
70
+ .webassets-cache
71
+
72
+ # Scrapy stuff:
73
+ .scrapy
74
+
75
+ # Sphinx documentation
76
+ docs/_build/
77
+
78
+ # PyBuilder
79
+ target/
80
+
81
+ # Jupyter Notebook
82
+ .ipynb_checkpoints
83
+
84
+ # IPython
85
+ profile_default/
86
+ ipython_config.py
87
+
88
+ # pyenv
89
+ .python-version
90
+
91
+ # pipenv
92
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
93
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
94
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
95
+ # install all needed dependencies.
96
+ #Pipfile.lock
97
+
98
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
99
+ __pypackages__/
100
+
101
+ # Celery stuff
102
+ celerybeat-schedule
103
+ celerybeat.pid
104
+
105
+ # SageMath parsed files
106
+ *.sage.py
107
+
108
+ # Environments
109
+ .env
110
+ .venv
111
+ env/
112
+ venv/
113
+ ENV/
114
+ env.bak/
115
+ venv.bak/
116
+
117
+ # Spyder project settings
118
+ .spyderproject
119
+ .spyproject
120
+
121
+ # Rope project settings
122
+ .ropeproject
123
+
124
+ # mkdocs documentation
125
+ /site
126
+
127
+ # mypy
128
+ .mypy_cache/
129
+ .dmypy.json
130
+ dmypy.json
131
+
132
+ # Pyre type checker
133
+ .pyre/
134
+
135
+ # error logs
136
+ error_logs.txt
137
+
138
+ # tests
139
+ tests/results
140
+ tmp.py
141
+ .vscode/**
142
+
143
+ # demo and results
144
+ results/
145
+
146
+ .vscode/launch.json
147
+
148
+ # assistantbench
149
+ tests/assistantbench/assistantbench-predictions-test.jsonl
150
+
151
+ # weblinx
152
+ bg_wl_data/
153
+
154
+ uv.lock
BrowserGym/.pre-commit-config.yaml ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fail_fast: false
2
+
3
+ default_language_version:
4
+ python: python3
5
+
6
+ repos:
7
+ - repo: https://github.com/pre-commit/pre-commit-hooks
8
+ rev: v4.2.0
9
+ hooks:
10
+ - id: trailing-whitespace
11
+ exclude: ^(.*)\.md$
12
+ - id: end-of-file-fixer
13
+ - id: check-yaml
14
+ exclude: ^(.circleci/recipe|recipe) # conda build recipes are templated
15
+ - id: check-added-large-files
16
+ - repo: https://github.com/pocc/pre-commit-hooks
17
+ rev: v1.1.1
18
+ hooks:
19
+ - id: clang-format
20
+ args: [--style=file, -i]
21
+ - id: clang-tidy
22
+ args: [--fix, --fix-errors]
23
+ - repo: https://github.com/psf/black
24
+ rev: 24.2.0
25
+ hooks:
26
+ - id: black
27
+ args: [--config=./pyproject.toml]
28
+ - repo: https://github.com/asottile/blacken-docs
29
+ rev: v1.12.1
30
+ hooks:
31
+ - id: blacken-docs
32
+ args: [ '--line-length', '100' ]
33
+ additional_dependencies: [black]
34
+ - repo: https://github.com/Lucas-C/pre-commit-hooks
35
+ rev: v1.5.5
36
+ hooks:
37
+ - id: forbid-crlf
38
+ - id: remove-crlf
39
+ # Black does not clear tabs in docstrings
40
+ - id: forbid-tabs
41
+ files: '.*\.py$'
42
+ - id: remove-tabs
43
+ files: '.*\.py$'
44
+ args: [ '--whitespaces-count', '4' ]
BrowserGym/.readthedocs.yaml ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # .readthedocs.yaml
2
+ # Read the Docs configuration file
3
+ # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
4
+
5
+ # Required
6
+ version: 2
7
+
8
+ # Set the OS, Python version and other tools you might need
9
+ build:
10
+ os: ubuntu-22.04
11
+ tools:
12
+ python: "3.12"
13
+ # You can also specify other tool versions:
14
+ # nodejs: "19"
15
+ # rust: "1.64"
16
+ # golang: "1.19"
17
+
18
+ # Build documentation in the "docs/" directory with Sphinx
19
+ sphinx:
20
+ configuration: docs/src/conf.py
21
+
22
+ # Optionally build your docs in additional formats such as PDF and ePub
23
+ # formats:
24
+ # - pdf
25
+ # - epub
26
+
27
+ # Optional but recommended, declare the Python requirements required
28
+ # to build your documentation
29
+ # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
30
+ python:
31
+ install:
32
+ - requirements: docs/requirements.txt
BrowserGym/LICENSE ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Copyright 2024 ServiceNow
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License");
4
+ you may not use this file except in compliance with the License.
5
+ You may obtain a copy of the License at
6
+
7
+ http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ Unless required by applicable law or agreed to in writing, software
10
+ distributed under the License is distributed on an "AS IS" BASIS,
11
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ See the License for the specific language governing permissions and
13
+ limitations under the License.
BrowserGym/Makefile ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ install:
2
+ @echo "--- 🚀 Installing project dependencies ---"
3
+ pip install -e ./browsergym/core -e ./browsergym/miniwob -e ./browsergym/webarena -e ./browsergym/visualwebarena/ -e ./browsergym/experiments -e ./browsergym/assistantbench -e ./browsergym/
4
+ playwright install chromium
5
+
6
+ install-demo:
7
+ @echo "--- 🚀 Installing demo dependencies ---"
8
+ pip install -r demo_agent/requirements.txt
9
+ playwright install chromium
10
+
11
+ demo:
12
+ @echo "--- 🚀 Running demo agent ---"
13
+ (set -x && cd demo_agent && python run_demo.py)
14
+
15
+ test-core:
16
+ @echo "--- 🧪 Running tests ---"
17
+ pytest -n auto ./tests/core
BrowserGym/README.md ADDED
@@ -0,0 +1,254 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <div align="center">
2
+
3
+ ![BrowserGym banner](https://github.com/user-attachments/assets/4853f210-43ac-4107-a0d2-95c9c614dbe7)
4
+
5
+ 🛠️ [Setup](#%EF%B8%8F-setup) -
6
+ 🏋 [Usage](#-usage) -
7
+ 💻 [Demo](#-demo) -
8
+ 🌐 [Ecosystem](#-ecosystem) -
9
+ 🚀 [AgentLab](https://github.com/ServiceNow/AgentLab) -
10
+ 🌟 [Contributors](#-contributors) -
11
+ 📄 [Paper](https://arxiv.org/abs/2412.05467) -
12
+ 📝 [Citation](#-citing-this-work)
13
+
14
+ [![pypi](https://badge.fury.io/py/browsergym.svg)](https://pypi.org/project/browsergym/)
15
+ [![PyPI - License](https://img.shields.io/pypi/l/browsergym?style=flat-square)](http://www.apache.org/licenses/LICENSE-2.0)
16
+ [![PyPI - Downloads](https://img.shields.io/pypi/dm/browsergym-core?style=flat-square)](https://pypistats.org/packages/browsergym-core)
17
+ [![GitHub star chart](https://img.shields.io/github/stars/ServiceNow/BrowserGym?style=flat-square)](https://star-history.com/#ServiceNow/BrowserGym)
18
+ [![Code Format](https://github.com/ServiceNow/BrowserGym/actions/workflows/code_format.yml/badge.svg)](https://github.com/ServiceNow/BrowserGym/actions/workflows/code_format.yml)
19
+ [![Tests](https://github.com/ServiceNow/BrowserGym/actions/workflows/unit_tests.yml/badge.svg)](https://github.com/ServiceNow/BrowserGym/actions/workflows/unit_tests.yml)
20
+
21
+ ```sh
22
+ pip install browsergym
23
+ ```
24
+
25
+ </div>
26
+
27
+ > [!WARNING]
28
+ > BrowserGym is meant to provide an open, easy-to-use and extensible framework to accelerate the field of web agent research.
29
+ > It is not meant to be a consumer product. Use with caution!
30
+
31
+ > [!TIP]
32
+ > 🚀 Check out [AgentLab](https://github.com/ServiceNow/AgentLab)✨ !
33
+ > A seamless framework to implement, test, and evaluate your web agents on all BrowserGym benchmarks.
34
+
35
+ https://github.com/ServiceNow/BrowserGym/assets/26232819/e0bfc788-cc8e-44f1-b8c3-0d1114108b85
36
+
37
+ _Example of a GPT4-V agent executing openended tasks (top row, chat interactive), as well as WebArena and WorkArena tasks (bottom row)._
38
+
39
+ BrowserGym includes the following benchmarks by default:
40
+ - [MiniWoB](https://miniwob.farama.org/)
41
+ - [WebArena](https://webarena.dev/)
42
+ - [VisualWebArena](https://jykoh.com/vwa)
43
+ - [WorkArena](https://github.com/ServiceNow/WorkArena)
44
+ - [AssistantBench](https://github.com/oriyor/assistantbench)
45
+ - [WebLINX](https://github.com/McGill-NLP/weblinx) (static benchmark)
46
+
47
+ Designing new web benchmarks with BrowserGym is easy, and simply requires inheriting from the [`AbstractBrowserTask`](https://github.com/ServiceNow/BrowserGym/blob/main/browsergym/core/src/browsergym/core/task.py#L7C7-L7C26) class.
48
+
49
+ ## 🛠️ Setup
50
+
51
+ To use browsergym, install one of the following packages:
52
+ ```sh
53
+ pip install browsergym # (recommended) everything below
54
+ pip install browsergym-experiments # experiment utilities (agent, loop, benchmarks) + everything below
55
+ pip install browsergym-core # core functionalities only (no benchmark, just the openended task)
56
+ pip install browsergym-miniwob # core + miniwob
57
+ pip install browsergym-webarena # core + webarena
58
+ pip install browsergym-visualwebarena # core + visualwebarena
59
+ pip install browsergym-workarena # core + workarena
60
+ pip install browsergym-assistantbench # core + assistantbench
61
+ pip install weblinx-browsergym # core + weblinx
62
+ ```
63
+
64
+ Then setup playwright by running
65
+ ```sh
66
+ playwright install chromium
67
+ ```
68
+
69
+ Finally, each benchmark comes with its own specific setup that requires following additional steps.
70
+ - for MiniWoB++, see [miniwob/README.md](browsergym/miniwob/README.md)
71
+ - for WebArena, see [webarena/README.md](browsergym/webarena/README.md)
72
+ - for VisualWebArena, see [visualwebarena/README.md](browsergym/visualwebarena/README.md)
73
+ - for WorkArena, see [WorkArena](https://github.com/ServiceNow/WorkArena)
74
+ - for AssistantBench, see [assistantbench/README.md](browsergym/assistantbench/README.md)
75
+
76
+ ### 🏗️ Development setup
77
+
78
+ To install browsergym locally for development, use the following commands:
79
+ ```sh
80
+ git clone git@github.com:ServiceNow/BrowserGym.git
81
+ cd BrowserGym
82
+ make install
83
+ ```
84
+
85
+ Contributions are welcome! 😊
86
+
87
+ ## 🏋 Usage
88
+
89
+ Boilerplate code to run an agent on an interactive, open-ended task:
90
+ ```python
91
+ import gymnasium as gym
92
+ import browsergym.core # register the openended task as a gym environment
93
+
94
+ # start an openended environment
95
+ env = gym.make(
96
+ "browsergym/openended",
97
+ task_kwargs={"start_url": "https://www.google.com/"}, # starting URL
98
+ wait_for_user_message=True, # wait for a user message after each agent message sent to the chat
99
+ )
100
+ # run the environment <> agent loop until termination
101
+ obs, info = env.reset()
102
+ while True:
103
+ action = ... # implement your agent here
104
+ obs, reward, terminated, truncated, info = env.step(action)
105
+ if terminated or truncated:
106
+ break
107
+ # release the environment
108
+ env.close()
109
+ ```
110
+
111
+ MiniWoB
112
+ ```python
113
+ import gymnasium as gym
114
+ import browsergym.miniwob # register miniwob tasks as gym environments
115
+
116
+ # start a miniwob task
117
+ env = gym.make("browsergym/miniwob.choose-list")
118
+ ...
119
+
120
+ # list all the available miniwob tasks
121
+ env_ids = [id for id in gym.envs.registry.keys() if id.startswith("browsergym/miniwob")]
122
+ print("\n".join(env_ids))
123
+ ```
124
+
125
+ WorkArena
126
+ ```python
127
+ import gymnasium as gym
128
+ import browsergym.workarena # register workarena tasks as gym environments
129
+
130
+ # start a workarena task
131
+ env = gym.make("browsergym/workarena.servicenow.order-ipad-pro")
132
+ ...
133
+
134
+ # list all the available workarena tasks
135
+ env_ids = [id for id in gym.envs.registry.keys() if id.startswith("browsergym/workarena")]
136
+ print("\n".join(env_ids))
137
+ ```
138
+
139
+ WebArena
140
+ ```python
141
+ import gymnasium as gym
142
+ import browsergym.webarena # register webarena tasks as gym environments
143
+
144
+ # start a webarena task
145
+ env = gym.make("browsergym/webarena.310")
146
+ ...
147
+
148
+ # list all the available webarena tasks
149
+ env_ids = [id for id in gym.envs.registry.keys() if id.startswith("browsergym/webarena")]
150
+ print("\n".join(env_ids))
151
+ ```
152
+
153
+ VisualWebArena
154
+ ```python
155
+ import gymnasium as gym
156
+ import browsergym.visualwebarena # register visualwebarena tasks as gym environments
157
+
158
+ # start a visualwebarena task
159
+ env = gym.make("browsergym/visualwebarena.721")
160
+ ...
161
+
162
+ # list all the available visualwebarena tasks
163
+ env_ids = [id for id in gym.envs.registry.keys() if id.startswith("browsergym/visualwebarena")]
164
+ print("\n".join(env_ids))
165
+ ```
166
+
167
+ AssistantBench
168
+ ```python
169
+ import gymnasium as gym
170
+ import browsergym.assistantbench # register assistantbench tasks as gym environments
171
+
172
+ # start an assistantbench task
173
+ env = gym.make("browsergym/assistantbench.validation.3")
174
+ ...
175
+
176
+ # list all the available assistantbench tasks
177
+ env_ids = [id for id in gym.envs.registry.keys() if id.startswith("browsergym/assistantbench")]
178
+ print("\n".join(env_ids))
179
+ ```
180
+
181
+ ## 💻 Demo
182
+
183
+ If you want to experiment with a demo agent in BrowserGym, follow these steps
184
+ ```sh
185
+ # conda setup
186
+ conda env create -f demo_agent/environment.yml
187
+ conda activate demo_agent
188
+
189
+ # or pip setup
190
+ pip install -r demo_agent/requirements.txt
191
+
192
+ # then download the browser for playwright
193
+ playwright install chromium
194
+ ```
195
+
196
+ Our demo agent uses `openai` as a backend, be sure to set your `OPENAI_API_KEY`.
197
+
198
+ Launch the demo agent as follows
199
+ ```sh
200
+ # openended (interactive chat mode)
201
+ python demo_agent/run_demo.py --task_name openended --start_url https://www.google.com
202
+
203
+ # miniwob
204
+ python demo_agent/run_demo.py --task_name miniwob.click-test
205
+
206
+ # workarena
207
+ python demo_agent/run_demo.py --task_name workarena.servicenow.order-standard-laptop
208
+
209
+ # webarena
210
+ python demo_agent/run_demo.py --task_name webarena.4
211
+
212
+ # visualwebarena
213
+ python demo_agent/run_demo.py --task_name visualwebarena.398
214
+ ```
215
+
216
+ You can customize your experience by changing the `model_name` to your preferred LLM (it uses `gpt-4o-mini` by default), adding screenshots for your VLMs with `use_screenshot`, and much more!
217
+
218
+ ```sh
219
+ python demo_agent/run_demo.py --help
220
+ ```
221
+
222
+ ## 🌐 Ecosystem
223
+
224
+ - [AgentLab](https://github.com/ServiceNow/AgentLab): Seamlessly run agents on benchmarks, collect and analyse traces.
225
+ - [WorkArena(++)](https://github.com/ServiceNow/WorkArena): A benchmark for web agents on the ServiceNow platform.
226
+ - [WebArena](https://github.com/web-arena-x/webarena): A benchmark of realistic web tasks on self-hosted domains.
227
+ - [VisualWebArena](https://github.com/web-arena-x/visualwebarena): A benchmark of realistic visual web tasks on self-hosted domains.
228
+ - [MiniWoB(++)](https://miniwob.farama.org/): A collection of over 100 web tasks on synthetic web pages.
229
+ - [WebLINX](https://github.com/McGill-NLP/weblinx): A dataset of real-world web interaction traces.
230
+ - [AssistantBench](https://github.com/oriyor/assistantbench): A benchmark of realistic and time-consuming tasks on the open web.
231
+ - [DoomArena](https://github.com/ServiceNow/DoomArena): A framework for AI agent security testing which supports injecting attacks into web pages from Browsergym environments.
232
+
233
+ ## 🌟 Contributors
234
+
235
+ [![BrowserGym contributors](https://contrib.rocks/image?repo=ServiceNow/BrowserGym&max=2000)](https://github.com/ServiceNow/BrowserGym/graphs/contributors)
236
+
237
+ ## 📝 Citing This Work
238
+
239
+ Please use the following BibTeX to cite our work:
240
+ ```tex
241
+ @inproceedings{workarena2024,
242
+ title = {{W}ork{A}rena: How Capable are Web Agents at Solving Common Knowledge Work Tasks?},
243
+ author = {Drouin, Alexandre and Gasse, Maxime and Caccia, Massimo and Laradji, Issam H. and Del Verme, Manuel and Marty, Tom and Vazquez, David and Chapados, Nicolas and Lacoste, Alexandre},
244
+ booktitle = {Proceedings of the 41st International Conference on Machine Learning},
245
+ pages = {11642--11662},
246
+ year = {2024},
247
+ editor = {Salakhutdinov, Ruslan and Kolter, Zico and Heller, Katherine and Weller, Adrian and Oliver, Nuria and Scarlett, Jonathan and Berkenkamp, Felix},
248
+ volume = {235},
249
+ series = {Proceedings of Machine Learning Research},
250
+ month = {21--27 Jul},
251
+ publisher = {PMLR},
252
+ url = {https://proceedings.mlr.press/v235/drouin24a.html},
253
+ }
254
+ ```
BrowserGym/browsergym/assistantbench/README.md ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # AssistantBench <> BrowserGym
2
+
3
+ This package provides an implementation for using the [AssistantBench](https://assistantbench.github.io/) benchmark in BrowserGym.
4
+
5
+ Because AssistantBench includes open-ended tasks, setup is extremely easy and simply requires installing the package.
6
+
7
+ Please note that AssistantBench has a hidden test set, so test set predictions will need to be uploaded to the official [leaderboard](https://huggingface.co/spaces/AssistantBench/leaderboard).
8
+
9
+ ## Setting up
10
+
11
+ - Install the package (this is still a wip)
12
+ ```
13
+ pip install browsergym-assistantbench
14
+ ```
15
+
16
+ - Run inference, e.g., run the following commands for demo on a simple toy task
17
+ ```
18
+ python demo_agent/run_demo.py --task_name assistantbench.validation.3
19
+ ```
20
+
21
+ - Test set predictions will be saved to `./assistantbench-predictions-test.jsonl`. To evaluate on the official test set, upload these predictions to the official [leaderboard](https://huggingface.co/spaces/AssistantBench/leaderboard).
BrowserGym/browsergym/assistantbench/pyproject.toml ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [build-system]
2
+ requires = ["hatchling", "hatch-requirements-txt"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "browsergym-assistantbench"
7
+ description = "AssistantBench benchmark for BrowserGym"
8
+ authors = [
9
+ {name = "Ori Yoran"},
10
+ {name = "Maxime Gasse"},
11
+ ]
12
+ readme = "README.md"
13
+ requires-python = ">3.7"
14
+ license = {text = "Apache-2.0"}
15
+ classifiers = [
16
+ "Development Status :: 3 - Alpha",
17
+ "Programming Language :: Python :: 3",
18
+ "Operating System :: OS Independent",
19
+ "Intended Audience :: Science/Research",
20
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
21
+ "License :: OSI Approved :: Apache Software License",
22
+ ]
23
+ dynamic = ["dependencies", "version"]
24
+
25
+ [project.urls]
26
+ homepage = "https://github.com/ServiceNow/BrowserGym"
27
+
28
+ [tool.hatch.version]
29
+ path = "../core/src/browsergym/core/__init__.py"
30
+
31
+ [tool.hatch.metadata.hooks.requirements_txt]
32
+ files = ["requirements.txt"]
33
+
34
+ [tool.hatch.build.targets.wheel]
35
+ packages = ["src/browsergym"]
BrowserGym/browsergym/assistantbench/requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ browsergym-core==0.13.4
2
+ datasets
3
+ scipy
4
+ numpy
BrowserGym/browsergym/assistantbench/src/browsergym/assistantbench/__init__.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from browsergym.core.registration import register_task
2
+
3
+ from . import task
4
+
5
# Gym ids registered by this module, grouped by split and filled in below.
TOY_AB_TASK_IDS = []
VALID_AB_TASK_IDS = []
TEST_AB_TASK_IDS = []


# register a toy easy task for testing implementation
gym_id = "assistantbench.imp.0"
register_task(
    gym_id,
    task.AssistantBenchTask,
    task_kwargs={
        "task_id": "imp.0",
    },
    default_task_kwargs={
        "save_predictions": False,  # can be overridden
    },
)
TOY_AB_TASK_IDS.append(gym_id)

# register the AssistantBench dev set (33 tasks)
for task_id in range(33):
    gym_id = f"assistantbench.validation.{task_id}"
    register_task(
        gym_id,
        task.AssistantBenchTask,
        task_kwargs={
            "task_id": f"validation.{task_id}",
        },
        default_task_kwargs={
            "save_predictions": False,  # can be overridden
        },
    )
    VALID_AB_TASK_IDS.append(gym_id)

# register the AssistantBench test set (181 tasks); predictions are saved by
# default for this split only
for task_id in range(181):
    gym_id = f"assistantbench.test.{task_id}"
    register_task(
        gym_id,
        task.AssistantBenchTask,
        task_kwargs={
            "task_id": f"test.{task_id}",
        },
        default_task_kwargs={
            "save_predictions": True,  # can be overridden
        },
    )
    TEST_AB_TASK_IDS.append(gym_id)

# Every registered id, in registration order: toy, validation, test.
ALL_AB_TASK_IDS = TOY_AB_TASK_IDS + VALID_AB_TASK_IDS + TEST_AB_TASK_IDS
BrowserGym/browsergym/assistantbench/src/browsergym/assistantbench/evaluation/evaluate_utils/evaluate_dicts.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, List
2
+
3
+ import numpy as np
4
+
5
+ from .utils import _align_bags
6
+
7
+
8
def calculate_f1_score(precision, recall):
    """Return the F1 score (harmonic mean) of ``precision`` and ``recall``.

    Returns 0 when the two values sum to zero, which avoids a division by zero.
    """
    denominator = precision + recall
    if denominator == 0:
        return 0
    numerator = 2 * (precision * recall)
    return numerator / denominator
12
+
13
+
14
def calc_recall(pred: Dict, gold: Dict, use_gold_for_eval: bool):
    """Average the per-key scores of ``pred`` over the keys of ``gold``.

    Each key of ``gold`` contributes one score:

    * 0 when the key is missing from ``pred``;
    * 0 when the two values (after ``fix_number``) have different types;
    * otherwise the result of the evaluator selected for the value type.

    Args:
        pred: Dictionary whose values are being scored.
        gold: Dictionary whose keys drive the iteration.
        use_gold_for_eval: Pick the evaluator from the gold value's type when
            True, from the pred value's type when False.

    Returns:
        The mean of the per-key scores, or 0.0 when ``gold`` has no keys
        (averaging an empty list would otherwise produce NaN).

    Raises:
        KeyError: If the (matching) value type has no registered evaluator.
    """
    # Imported lazily: evaluate_factory imports this module at load time.
    from .evaluate_factory import get_evaluator_from_gold_answer

    if not gold:
        return 0.0

    scores = []
    for gold_key, gold_value in gold.items():
        if gold_key not in pred:
            scores.append(0)
            continue
        gold_value = fix_number(gold_value)
        pred_value = fix_number(pred[gold_key])
        # Check the types before looking up an evaluator, so a mismatch scores
        # 0 instead of raising KeyError for a type that has no evaluator.
        if type(pred_value) is not type(gold_value):
            scores.append(0)
            continue
        evaluator = get_evaluator_from_gold_answer(
            type(gold_value) if use_gold_for_eval else type(pred_value)
        )
        scores.append(evaluator(pred_value, gold_value))
    return np.average(scores)
36
+
37
+
38
def fix_number(number):
    """Coerce number-like values to ``float``; return anything else unchanged.

    Strings have every ``$``, ``%`` and ``sqft`` replaced by a space, are
    stripped, and have ``,`` replaced by ``.`` before being parsed with
    ``float``. Strings that still do not parse are returned as given.
    Plain ``int`` values are converted to ``float``. All other values
    (including ``bool`` and ``None``) are returned untouched.
    """
    if isinstance(number, str):
        cleaned = number
        for unit in ("$", "%", "sqft"):
            cleaned = cleaned.replace(unit, " ")
        # NOTE(review): "," is treated as a decimal separator, so "1,000"
        # parses as 1.0 — confirm this is intended for the benchmark data.
        cleaned = cleaned.strip().replace(",", ".")
        try:
            return float(cleaned)
        except ValueError:
            return number
    # Exact type check on purpose: bool is a subclass of int and must stay a bool.
    if type(number) is int:
        return float(number)
    return number
55
+
56
+
57
def evaluate_pair_of_dicts(pred: Dict, gold: Dict):
    """Return the F1 score between a predicted and a gold dictionary.

    Recall is computed over the gold keys (evaluator chosen from the gold
    value type); precision is the same computation with the roles swapped
    (evaluator chosen from the other dictionary's value type).
    """
    gold_side_score = calc_recall(pred, gold, True)
    pred_side_score = calc_recall(gold, pred, False)
    return calculate_f1_score(pred_side_score, gold_side_score)
62
+
63
+
64
def evaluate_dicts(pred: List[Dict], gold: List[Dict]):
    """Score predicted dictionaries against gold dictionaries.

    A prediction is accepted when it is a dict, an empty container, or a list
    whose first element is a dict; anything else scores 0. Predictions with no
    length (``None``, numbers) also score 0 instead of raising ``TypeError``.

    Accepted predictions are aligned to ``gold`` by ``_align_bags`` using
    ``evaluate_pair_of_dicts`` as the pairwise scorer, and the average of the
    returned scores is the result.
    """
    if not isinstance(pred, dict):
        try:
            is_empty = len(pred) == 0
        except TypeError:
            # Unsized prediction: it cannot be a dict or a list of dicts.
            return 0
        # Only the first element is inspected, as in the original check.
        is_dict_list = isinstance(pred, list) and not is_empty and isinstance(pred[0], dict)
        if not (is_empty or is_dict_list):
            return 0
    max_alignment_scores = _align_bags(pred, gold, evaluate_pair_of_dicts)
    return np.average(max_alignment_scores)
BrowserGym/browsergym/assistantbench/src/browsergym/assistantbench/evaluation/evaluate_utils/evaluate_factory.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Union
2
+
3
+ from .evaluate_dicts import evaluate_dicts
4
+ from .evaluate_numbers import evaluate_numbers
5
+ from .evaluate_strings import evaluate_strings
6
+
7
# Evaluators keyed by answer-type label.
EvaluatorFactory = {
    "string": evaluate_strings,
    "number": evaluate_numbers,
    "json": evaluate_dicts,
    "string list": evaluate_strings,
}

# Evaluators keyed by the Python type of an answer value. Booleans and lists
# use the string evaluator; both numeric types use the number evaluator.
EvaluatorFactoryFromType = {
    str: evaluate_strings,
    int: evaluate_numbers,
    float: evaluate_numbers,
    bool: evaluate_strings,
    list: evaluate_strings,
}
21
+
22
+
23
def get_evaluator(evaluator: str):
    """Look up the evaluation function registered under the label ``evaluator``.

    Raises:
        KeyError: If the label is not a key of ``EvaluatorFactory``.
    """
    evaluation_fn = EvaluatorFactory[evaluator]
    return evaluation_fn
25
+
26
+
27
def get_evaluator_from_gold_answer(gold_answer: type):
    """Return the evaluation function registered for an answer *type*.

    Despite its name, ``gold_answer`` must be a type object such as ``str`` or
    ``float`` (for example ``type(value)``), not the answer value itself:
    ``EvaluatorFactoryFromType`` is keyed by types.

    Raises:
        KeyError: If no evaluator is registered for that type.
    """
    return EvaluatorFactoryFromType[gold_answer]
BrowserGym/browsergym/assistantbench/src/browsergym/assistantbench/evaluation/evaluate_utils/evaluate_numbers.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Union
2
+
3
+ import numpy as np
4
+
5
+
6
+ # Renamed calc_z function to distance_function_log
7
def distance_function_log(pred: float, gold: float):
    """Score how close ``pred`` is to ``gold`` on a logarithmic scale.

    The score is ``1 - log(larger / smaller)`` clipped below at 0, where
    "larger" and "smaller" refer to magnitude. Equal values score 1, and the
    score reaches 0 once the values differ by a factor of e or more.

    Special cases:
        * both values zero -> 1;
        * a single zero is replaced by 1e-4 so the ratio is defined;
        * values of opposite sign -> 0.

    Comparing magnitudes keeps the ratio >= 1 for negative pairs too, so the
    score can never exceed 1.
    """
    if pred == gold == 0:
        return 1
    if pred == 0:
        pred = 1e-4
    if gold == 0:
        gold = 1e-4
    # Opposite signs have no meaningful log-ratio; treat them as a complete miss.
    if (pred < 0) != (gold < 0):
        return 0
    if abs(pred) > abs(gold):
        ratio = pred / gold
    else:
        ratio = gold / pred
    return max(0, 1 - np.log(ratio))
18
+
19
+
20
def evaluate_numbers(pred: Union[float, str], gold: float):
    """Score a numeric prediction against a numeric gold answer.

    Values that are not exactly ``float`` or ``int`` are converted with
    ``float()`` first. If either value cannot be converted the score is 0.
    This covers unparsable strings (``ValueError``) and unsupported objects
    such as ``None`` or lists (``TypeError``). Otherwise the score is
    ``distance_function_log(pred, gold)``.
    """
    numbers = []
    for value in (pred, gold):
        if type(value) not in (float, int):
            try:
                value = float(value)
            except (TypeError, ValueError):
                return 0
        numbers.append(value)
    return distance_function_log(*numbers)
BrowserGym/browsergym/assistantbench/src/browsergym/assistantbench/evaluation/evaluate_utils/evaluate_strings.py ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Evaluation for two strings or list of strings.
3
+ Code taken from the DROP benchmark - https://github.com/allenai/allennlp-reading-comprehension/blob/master/allennlp_rc/eval/drop_eval.py
4
+ """
5
+
6
+ import re
7
+ import string
8
+ from typing import List, Set, Tuple, Union
9
+
10
+ import numpy as np
11
+ from scipy.optimize import linear_sum_assignment
12
+
13
+
14
+ # From here through _normalize_answer was originally copied from:
15
+ # https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/
16
+ # Then cleaned up and modified a bit.
17
+ def _remove_articles(text: str) -> str:
18
+ regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
19
+ return re.sub(regex, " ", text)
20
+
21
+
22
+ def _white_space_fix(text: str) -> str:
23
+ return " ".join(text.split())
24
+
25
+
26
+ EXCLUDE = set(string.punctuation)
27
+
28
+
29
def _remove_punc(text: str) -> str:
    """Drop punctuation characters from *text*, leaving numbers untouched.

    Text that parses as a number is returned as-is, so signs and decimal
    points survive.
    """
    if _is_number(text):
        return text
    kept_chars = [ch for ch in text if ch not in EXCLUDE]
    return "".join(kept_chars)
34
+
35
+
36
+ def _lower(text: str) -> str:
37
+ return text.lower()
38
+
39
+
40
+ def _tokenize(text: str) -> List[str]:
41
+ return re.split(" |-", text)
42
+
43
+
44
def _normalize_answer(text: str) -> str:
    """Lower text and remove punctuation, articles and extra whitespace.

    The text is tokenized first; each token is lowercased, stripped of
    punctuation, number-normalized, stripped of articles and whitespace-fixed.
    Tokens that end up blank are dropped and the rest are joined with spaces.
    """
    cleaned_tokens = []
    for token in _tokenize(text):
        cleaned = _lower(token)
        cleaned = _remove_punc(cleaned)
        cleaned = _normalize_number(cleaned)
        cleaned = _remove_articles(cleaned)
        cleaned = _white_space_fix(cleaned)
        if cleaned.strip():
            cleaned_tokens.append(cleaned)
    return " ".join(cleaned_tokens).strip()
54
+
55
+
56
+ def _is_number(text: str) -> bool:
57
+ try:
58
+ float(text)
59
+ return True
60
+ except ValueError:
61
+ return False
62
+
63
+
64
+ def _normalize_number(text: str) -> str:
65
+ if _is_number(text):
66
+ return str(float(text))
67
+ else:
68
+ return text
69
+
70
+
71
def _answer_to_bags(
    answer: Union[str, List[str], Tuple[str, ...]]
) -> Tuple[List[str], List[Set[str]]]:
    """Normalize an answer and split every span into a bag of tokens.

    Args:
        answer: a single string, or a list/tuple of strings (spans).

    Returns:
        A pair ``(normalized_spans, token_bags)`` where ``token_bags[i]`` is
        the set of whitespace-separated tokens of ``normalized_spans[i]``.
    """
    spans = answer if isinstance(answer, (list, tuple)) else [answer]
    normalized_spans: List[str] = [_normalize_answer(span) for span in spans]
    token_bags = [set(span.split()) for span in normalized_spans]
    return normalized_spans, token_bags
85
+
86
+
87
def _align_bags(predicted: List[Set[str]], gold: List[Set[str]]) -> List[float]:
    """
    Find the best one-to-one pairing between gold and predicted token bags
    and return the F1 obtained for each gold bag under that pairing.

    A pair only receives an F1 score when ``_match_numbers_if_present``
    accepts it; otherwise its score stays 0. The result has
    ``max(len(gold), len(predicted))`` entries, and unmatched slots stay 0.
    """
    pair_f1 = np.zeros([len(gold), len(predicted)])
    for g_idx, gold_bag in enumerate(gold):
        for p_idx, pred_bag in enumerate(predicted):
            if not _match_numbers_if_present(gold_bag, pred_bag):
                continue
            pair_f1[g_idx, p_idx] = _compute_f1(pred_bag, gold_bag)

    # linear_sum_assignment minimizes cost, so negate to maximize total F1.
    gold_rows, pred_cols = linear_sum_assignment(-pair_f1)

    best = np.zeros([max(len(gold), len(predicted))])
    for g_idx, p_idx in zip(gold_rows, pred_cols):
        best[g_idx] = max(best[g_idx], pair_f1[g_idx, p_idx])
    return best
103
+
104
+
105
+ def _compute_f1(predicted_bag: Set[str], gold_bag: Set[str]) -> float:
106
+ intersection = len(gold_bag.intersection(predicted_bag))
107
+ if not predicted_bag:
108
+ precision = 1.0
109
+ else:
110
+ precision = intersection / float(len(predicted_bag))
111
+ if not gold_bag:
112
+ recall = 1.0
113
+ else:
114
+ recall = intersection / float(len(gold_bag))
115
+ f1 = (
116
+ (2 * precision * recall) / (precision + recall)
117
+ if not (precision == 0.0 and recall == 0.0)
118
+ else 0.0
119
+ )
120
+ return f1
121
+
122
+
123
def _match_numbers_if_present(gold_bag: Set[str], predicted_bag: Set[str]) -> bool:
    """Check that the prediction shares a number with the gold bag, if any.

    Returns True when the gold bag contains no numeric token, or when at least
    one numeric token appears in both bags; False otherwise.
    """
    gold_numbers = {word for word in gold_bag if _is_number(word)}
    predicted_numbers = {word for word in predicted_bag if _is_number(word)}
    if not gold_numbers:
        return True
    return bool(gold_numbers.intersection(predicted_numbers))
135
+
136
+
137
def get_metrics(
    predicted: Union[str, List[str], Tuple[str, ...]],
    gold: Union[str, List[str], Tuple[str, ...]],
) -> Tuple[float, float]:
    """
    Compute exact match and F1 for a prediction against a gold answer.

    Both arguments are either a string or a list/tuple of strings. Exact match
    is 1.0 when the normalized spans of both answers form the same set and
    have the same count, else 0.0. F1 is the mean of the aligned per-bag F1
    scores, rounded to two decimals.
    """
    predicted_spans, predicted_token_bags = _answer_to_bags(predicted)
    gold_spans, gold_token_bags = _answer_to_bags(gold)

    spans_match = set(predicted_spans) == set(gold_spans) and len(predicted_spans) == len(
        gold_spans
    )
    exact_match = 1.0 if spans_match else 0.0

    per_bag_f1 = _align_bags(predicted_token_bags, gold_token_bags)
    f1 = round(np.mean(per_bag_f1), 2)
    return exact_match, f1
160
+
161
+
162
def evaluate_strings(prediction, gold):
    """Return the mean aligned F1 between *prediction* and *gold*.

    Values that are neither ``list`` nor ``str`` are converted with ``str()``
    first. Any exception raised while scoring yields a score of 0.0.
    """
    if type(prediction) not in (list, str):
        prediction = str(prediction)
    if type(gold) not in (list, str):
        gold = str(gold)
    try:
        _, predicted_token_bags = _answer_to_bags(prediction)
        _, gold_token_bags = _answer_to_bags(gold)
        per_bag_f1 = _align_bags(predicted_token_bags, gold_token_bags)
        return np.mean(per_bag_f1)
    except Exception:
        return 0.0
BrowserGym/browsergym/assistantbench/src/browsergym/assistantbench/evaluation/evaluate_utils/utils.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Callable, List, Set
2
+
3
+ import numpy as np
4
+ from scipy.optimize import linear_sum_assignment
5
+
6
+
7
+ def _align_bags(
8
+ predicted: List[Set[str]],
9
+ gold: List[Set[str]],
10
+ method: Callable[[object, object], float],
11
+ ) -> List[float]:
12
+ """
13
+ Takes gold and predicted answer sets and first finds the optimal 1-1 alignment
14
+ between them and gets maximum metric values over all the answers.
15
+ """
16
+ scores = np.zeros([len(gold), len(predicted)])
17
+ for gold_index, gold_item in enumerate(gold):
18
+ for pred_index, pred_item in enumerate(predicted):
19
+ scores[gold_index, pred_index] = method(pred_item, gold_item)
20
+ row_ind, col_ind = linear_sum_assignment(-scores)
21
+
22
+ max_scores = np.zeros([max(len(gold), len(predicted))])
23
+ for row, column in zip(row_ind, col_ind):
24
+ max_scores[row] = max(max_scores[row], scores[row, column])
25
+ return max_scores
BrowserGym/browsergym/assistantbench/src/browsergym/assistantbench/evaluation/evaluator.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # todo export evaluation to a python package
2
+
3
+ import json
4
+
5
+ import numpy as np
6
+
7
+ from .evaluate_utils.evaluate_factory import get_evaluator
8
+
9
+
10
def find_isnan(samp):
    """Return True when *samp* is NaN, False for anything else.

    Values that ``np.isnan`` cannot handle (strings, None, dicts, ...) and
    arrays whose truth value is ambiguous are reported as not-NaN.
    """
    try:
        return bool(np.isnan(samp))
    except (TypeError, ValueError):
        # TypeError: unsupported input type; ValueError: multi-element array.
        return False
18
+
19
+
20
def fix_ans(answer):
    """Rewrite single-quoted dict text so that it parses as JSON.

    Replaces the single quotes around keys and string values of a
    Python-style dict literal with double quotes. Non-string input is
    returned unchanged.
    """
    if not isinstance(answer, str):
        return answer
    answer = (
        answer.replace("{'", '{"')
        .replace("', '", '", "')
        .replace("': '", '": "')
        .replace("'}", '"}')
    )
    # Keys followed by a non-string value, e.g. {'a': 1}.
    return answer.replace("': ", '": ')
32
+
33
+
34
def parse_answer(answer):
    """Parse the gold answer lines and pick the evaluator label for them.

    Args:
        answer: list of answer lines.

    Returns:
        A pair ``(parsed_answer, label)`` where label is one of "number",
        "json", "string" or "string list".
    """
    if len(answer) == 1:
        only = answer[0]
        number, is_num = fix_number(only)
        if is_num:
            return number, "number"
        try:
            return [json.loads(fix_ans(only))], "json"
        except (TypeError, ValueError):
            # Not a number (checked above) and not JSON: a plain string.
            # json.JSONDecodeError is a subclass of ValueError.
            return only, "string"

    try:
        return [json.loads(fix_ans(item)) for item in answer], "json"
    except (TypeError, ValueError):
        return answer, "string list"
54
+
55
+
56
def fix_number(number):
    """Try to turn *number* into a float.

    For strings, "$", "%" and "sqft" are removed, "," is replaced by "." and
    " square kilometers" is dropped before calling ``float()``.

    Returns:
        A pair ``(value, is_number)``. A string that does not parse is
        returned unchanged with False; an int is returned as a float with
        True; any other value is returned unchanged with True.
    """
    if isinstance(number, str):
        cleaned = number.replace("$", " ").replace("%", " ").replace("sqft", " ").strip()
        cleaned = cleaned.replace(",", ".").replace(" square kilometers", "")
        try:
            return float(cleaned), True
        except ValueError:
            return number, False
    # bool is a subclass of int but is deliberately left untouched here.
    if isinstance(number, int) and not isinstance(number, bool):
        return float(number), True
    return number, True
72
+
73
+
74
def fix_prediction(prediction, gold_answer, evaluator):
    """Normalize a prediction so it can be passed to the chosen evaluator.

    Args:
        prediction: the raw or JSON-decoded prediction.
        gold_answer: the parsed gold answer.
        evaluator: the evaluator label ("number", "json", ...).

    Returns:
        A pair ``(prediction, run_eval)``. ``run_eval`` is False when the
        prediction is empty, or when several values were predicted for a
        float gold answer.
    """
    is_single_numeric_item = (
        type(prediction) == list
        and len(prediction) == 1
        and (
            type(prediction[0]) == int
            or ((type(prediction[0]) == str) and prediction[0].isnumeric())
        )
    )
    if is_single_numeric_item:
        # fix_number returns (value, is_number); keep only the value so the
        # prediction does not become a tuple.
        prediction, _ = fix_number(prediction[0])

    if type(prediction) != list:
        prediction, _ = fix_number(prediction)
        if evaluator == "json":
            try:
                prediction = [json.loads(pred) for pred in prediction.split("\n")]
            except (AttributeError, TypeError, ValueError):
                # Not a string, or a line is not valid JSON: wrap it as-is.
                prediction = [prediction]

    if (hasattr(type(prediction), "__len__")) and (len(prediction) == 0):
        return prediction, False

    if (type(prediction) == list and len(prediction) > 1) and type(gold_answer) == float:
        return prediction, False

    return prediction, True
100
+
101
+
102
def _has_no_content(value):
    """Return True when *value* is an empty sized object or NaN."""
    if hasattr(value, "__len__") and len(value) == 0:
        return True
    return find_isnan(value)


def question_scorer(prediction, gold_answer):
    """Score a prediction against the gold answer.

    Args:
        prediction: the predicted answer, usually a string. It is decoded as
            JSON when possible.
        gold_answer: the gold answer, either a newline-separated string or a
            list of answer lines.

    Returns:
        A pair ``(accuracy, has_ans)``. ``has_ans`` is 0.0 when the
        prediction carries no content, else 1.0.
    """
    try:
        prediction = json.loads(prediction)
    except (TypeError, ValueError):
        pass  # not JSON: score the raw prediction

    if prediction is None:
        # e.g. the agent answered "null": there is nothing to score.
        return 0.0, 0.0

    answer_list = (
        [x for x in gold_answer.split("\n") if len(x.strip()) > 0]
        if type(gold_answer) != list
        else gold_answer
    )
    gold_answer, evaluator = parse_answer(answer_list)
    prediction, run_eval = fix_prediction(prediction, gold_answer, evaluator)

    has_ans = 1.0
    # Values without a length (floats, bools, ...) are only checked for NaN.
    if _has_no_content(prediction):
        has_ans = 0.0
    if type(prediction) == list and all(_has_no_content(pred) for pred in prediction):
        has_ans = 0.0

    if not run_eval:
        return 0.0, has_ans

    metric_eval = get_evaluator(evaluator)
    accuracy = metric_eval(prediction, gold_answer)
    return accuracy, has_ans
BrowserGym/browsergym/assistantbench/src/browsergym/assistantbench/task.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ from typing import Dict, Tuple
4
+
5
+ from datasets import load_dataset
6
+ from playwright.sync_api import Page
7
+
8
+ from browsergym.core.task import AbstractBrowserTask
9
+
10
+ from .evaluation.evaluator import question_scorer
11
+ from .utils import add_prediction_to_jsonl
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+ _DEFAULT_OUTPUT_FILE = None
16
+
17
+
18
def set_default_output_file(output_file: str):
    """Set the module-level default output file path to *output_file*."""
    global _DEFAULT_OUTPUT_FILE
    _DEFAULT_OUTPUT_FILE = output_file
21
+
22
+
23
def get_default_output_file():
    """Return the module-level default output file path (None until set)."""
    return _DEFAULT_OUTPUT_FILE
25
+
26
+
27
+ # Load dataset
28
+
29
# Identifier passed to `load_dataset`.
DATA_DATASET = "AssistantBench/AssistantBench"
# NOTE: this runs at import time, so importing this module loads the dataset.
# NOTE(review): trust_remote_code=True presumably lets the dataset repository
# run its own loading code; confirm this is still required.
all_tasks = load_dataset(DATA_DATASET, trust_remote_code=True)
31
+
32
+
33
+ # Extract answers and tasks for validation and test splits
34
def extract_data(split_name: str) -> Tuple[Dict[str, str], Dict[str, str], Dict[str, str]]:
    """Build the answer, task and id lookups for one dataset split.

    Args:
        split_name: name of the split in ``all_tasks``.

    Returns:
        Three dicts keyed by ``"<split_name>.<row index>"``: gold answers
        (None replaced by ""), task texts, and dataset ids.
    """
    answers: Dict[str, str] = {}
    goals: Dict[str, str] = {}
    source_ids: Dict[str, str] = {}
    # One pass over the split fills all three mappings.
    for i, row in enumerate(all_tasks[split_name]):
        key = f"{split_name}.{i}"
        answer = row["answer"]
        answers[key] = answer if answer is not None else ""
        goals[key] = row["task"]
        source_ids[key] = row["id"]
    return answers, goals, source_ids
43
+
44
+
45
+ # Implementation data for testing
46
def get_implementation_testing_data() -> Tuple[Dict[str, str], Dict[str, str], Dict[str, str]]:
    """Return the single hard-coded task used to test the implementation.

    Returns:
        Three dicts (gold answers, task texts, ids) with the same layout as
        the per-split data, each holding the one key "imp.0".
    """
    gold = {"imp.0": "20"}
    goal = {
        "imp.0": "What is the weather in Paris yesterday in Celsius? Answer with the number only."
    }
    source_id = {"imp.0": "test_imp_id_0"}
    return gold, goal, source_id
54
+
55
+
56
+ # Combine dev, test, and implementation-specific testing splits
57
# Each call returns (gold answers, task texts, dataset ids) for one split.
gold_answers_dev, tasks_dev, ids_dev = extract_data("validation")
gold_answers_test, tasks_test, ids_test = extract_data("test")
gold_answers_impl_testing, tasks_test_impl_testing, ids_imp_testing = (
    get_implementation_testing_data()
)
# Merge the per-split mappings. Keys carry their split prefix
# ("validation.", "test.", "imp."), so later dicts would only override
# earlier ones on an identical key.
gold_answers = {**gold_answers_dev, **gold_answers_test, **gold_answers_impl_testing}
tasks = {**tasks_dev, **tasks_test, **tasks_test_impl_testing}
ids = {**ids_dev, **ids_test, **ids_imp_testing}
65
+
66
+
67
class AssistantBenchTask(AbstractBrowserTask):
    """Browser task built from one AssistantBench question.

    The goal and gold answer are looked up by task id in the module-level
    ``tasks`` / ``gold_answers`` / ``ids`` mappings. The task is validated by
    scoring the agent's last chat message against the gold answer.
    """

    @classmethod
    def get_task_id(cls) -> str:
        """
        Generic class for several task ids, this way of obtaining the task id is not compatible for now.
        """
        raise NotImplementedError

    def __init__(
        self, seed: int, task_id: str, output_file: str = None, save_predictions: bool = False
    ) -> None:
        """
        Args:
            seed (int): Random seed for task initialization.
            task_id (str): Unique identifier for the task (for the BrowserGym environment).
            output_file (str, optional): Path to the output file for saving results, needed for test set.
            save_predictions (bool, optional): Save predictions to the output file (yes/no).
        """
        super().__init__(seed)
        self.locale = "en-US"
        self.timezone_id = "America/New_York"

        self.task_id = task_id
        self.start_url = "https://google.com"
        # All three lookups use the same string key.
        task_key = str(self.task_id)
        self.goal = tasks[task_key]
        self.gold = gold_answers[task_key]
        self.ab_task_id = ids[task_key]
        self.save_predictions = save_predictions

        # Resolution order: constructor argument, then the module-level
        # default, then the ASSISTANTBENCH_OUTPUT_FILE environment variable.
        self.output_file = (
            output_file
            or get_default_output_file()
            or os.getenv("ASSISTANTBENCH_OUTPUT_FILE", None)
        )

        if self.save_predictions and self.output_file:
            logger.info(f"Task prediction will be written to output file {self.output_file}")

    def setup(self, page: Page) -> Tuple[str, dict]:
        """Open the start URL and, if enabled, reserve the task's output entry.

        Returns:
            The task goal and an empty info dict.
        """
        logger.info(f"Navigating to start url: {self.start_url}")
        page.goto(self.start_url, timeout=10000)
        if self.save_predictions and self.output_file:
            # create an empty task entry in the output file (will raise an Exception if the entry is already there)
            add_prediction_to_jsonl(
                file_path=self.output_file,
                task_id=self.ab_task_id,
                prediction="",
                override_if_exists=False,
            )
        return self.goal, {}

    def teardown(self) -> None:
        """Nothing to clean up."""
        pass

    def validate(self, page: Page, chat_messages: list[dict]) -> Tuple[float, bool, str, dict]:
        """Score the agent's answer once it has replied.

        The task is done as soon as the last chat message comes from the
        assistant; that message is the prediction.

        Returns:
            ``(accuracy, done, msg, info)``; accuracy is 0.0 until done.
        """
        accuracy, done, msg, info = 0.0, False, "", {}

        # eval when the agent returns a response
        if chat_messages and chat_messages[-1]["role"] == "assistant":
            done = True
            prediction = chat_messages[-1]["message"]
            if self.save_predictions and self.output_file:
                # update the task entry in the output file
                add_prediction_to_jsonl(
                    file_path=self.output_file,
                    task_id=self.ab_task_id,
                    prediction=prediction,
                    override_if_exists=True,
                )
            # The has-answer flag returned by the scorer is not used here.
            accuracy, _ = question_scorer(prediction, self.gold)

        return accuracy, done, msg, info
BrowserGym/browsergym/assistantbench/src/browsergym/assistantbench/utils.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ import os
4
+ import pathlib
5
+ import time
6
+
7
logger = logging.getLogger(__name__)


def add_prediction_to_jsonl(
    file_path: str, task_id: str, prediction: object, override_if_exists: bool
) -> None:
    """
    Multiprocessing-safe file write.

    Adds or updates the record ``{"id": task_id, "answer": prediction}`` in
    the JSON-lines file at *file_path*, guarded by a lock file next to it
    (same name with a ".lock" suffix).

    Args:
        file_path: path of the JSON-lines file; created if missing.
        task_id: id of the record to add or update.
        prediction: value stored under "answer"; must be JSON-serializable.
        override_if_exists: whether an existing record may be overwritten.

    Raises:
        RuntimeError: if the lock cannot be acquired within 10 seconds.
        ValueError: if a record for *task_id* exists and
            *override_if_exists* is False.
    """
    lock_file_path = pathlib.Path(file_path).with_suffix(".lock")
    lock_max_wait = 10  # 10 seconds

    # Acquire lock (atomic file creation)
    start_time = time.time()
    while True:
        try:
            fd = os.open(lock_file_path, os.O_CREAT | os.O_EXCL | os.O_WRONLY)
            with os.fdopen(fd, "w") as f:
                f.write("lock")
            break
        except FileExistsError:
            # give up if max wait time reached
            seconds_waited = time.time() - start_time
            if seconds_waited >= lock_max_wait:
                raise RuntimeError(
                    f"Lock file could not be acquired after {seconds_waited} seconds ({lock_file_path})"
                )
            # wait for lock release
            logger.info(f"Waiting for lock file to be released: {lock_file_path}")
            time.sleep(1)  # 1 sec

    logger.info(f"Lock file acquired: {lock_file_path}")

    # Everything below runs under the lock. The finally clause releases it
    # even when an error is raised, so a failure cannot leave a stale lock
    # that blocks every later call.
    try:
        # Load existing data, if any
        data = []
        if os.path.exists(file_path):
            with open(file_path, "r") as f:
                data.extend(json.loads(line) for line in f if line.strip())  # Skip empty lines

        # Check if task_id already exists
        existing_record = next((entry for entry in data if entry["id"] == task_id), None)

        # Add or update the record
        if existing_record is None:
            data.append({"id": task_id, "answer": prediction})
        elif override_if_exists:
            existing_record["answer"] = prediction
        else:
            raise ValueError(
                f"Prediction for task ID {repr(task_id)} already exists in file {file_path}."
            )

        # Write data back to the file (this also creates it if missing)
        with open(file_path, "w") as f:
            for entry in data:
                f.write(json.dumps(entry) + "\n")
    finally:
        # Release lock (remove file)
        os.remove(lock_file_path)
        logger.info(f"Lock file released: {lock_file_path}")
BrowserGym/browsergym/core/README.md ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # BrowserGym core
2
+
3
+ This package provides `browsergym.core`, the core functionality of [BrowserGym](https://github.com/ServiceNow/BrowserGym).
4
+
5
+ ## Setup
6
+
7
+ 1. Install the package
8
+ ```sh
9
+ pip install browsergym-core
10
+ ```
BrowserGym/browsergym/core/pyproject.toml ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [build-system]
2
+ requires = ["hatchling", "hatch-requirements-txt"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "browsergym-core"
7
+ description = "BrowserGym: a gym environment for web task automation in the Chromium browser"
8
+ authors = [
9
+ {name = "Rim Assouel"},
10
+ {name = "Léo Boisvert"},
11
+ {name = "Massimo Caccia"},
12
+ {name = "Alex Drouin"},
13
+ {name = "Maxime Gasse"},
14
+ {name = "Imene Kerboua"},
15
+ {name = "Alex Lacoste"},
16
+ {name = "Thibault Le Sellier De Chezelles"},
17
+ {name = "Tom Marty"},
18
+ ]
19
+ readme = "README.md"
20
+ requires-python = ">3.9"
21
+ license = {text = "Apache-2.0"}
22
+ classifiers = [
23
+ "Development Status :: 3 - Alpha",
24
+ "Programming Language :: Python :: 3",
25
+ "Operating System :: OS Independent",
26
+ "Intended Audience :: Science/Research",
27
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
28
+ "License :: OSI Approved :: Apache Software License",
29
+ ]
30
+ dynamic = ["dependencies", "version"]
31
+
32
+ [project.urls]
33
+ homepage = "https://github.com/ServiceNow/BrowserGym"
34
+
35
+ [tool.hatch.version]
36
+ path = "src/browsergym/core/__init__.py"
37
+
38
+ [tool.hatch.metadata.hooks.requirements_txt]
39
+ files = ["requirements.txt"]
40
+
41
+ [tool.hatch.build.targets.wheel]
42
+ packages = ["src/browsergym"]
BrowserGym/browsergym/core/requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ playwright==1.44
2
+ gymnasium>=0.27
3
+ numpy>=1.14
4
+ pyparsing>=3
5
+ Pillow>=10.1
6
+ beautifulsoup4>=4.12
7
+ lxml>=4.9
8
+ mcp[cli]>=1.6.0
BrowserGym/browsergym/core/src/browsergym/core/__init__.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __version__ = "0.13.4"
2
+
3
+ import playwright.sync_api
4
+
5
+ # we use a global playwright instance
6
+ _PLAYWRIGHT = None
7
+
8
+
9
def _set_global_playwright(pw: playwright.sync_api.Playwright):
    """Store *pw* as the module-level playwright instance."""
    global _PLAYWRIGHT
    _PLAYWRIGHT = pw
12
+
13
+
14
def _get_global_playwright():
    """Return the module-level playwright instance, starting one if unset."""
    if not _PLAYWRIGHT:
        _set_global_playwright(playwright.sync_api.sync_playwright().start())
    return _PLAYWRIGHT
21
+
22
+
23
+ # register the open-ended task
24
+ from .registration import register_task
25
+ from .task import OpenEndedTask
26
+
27
+ register_task(OpenEndedTask.get_task_id(), OpenEndedTask)
BrowserGym/browsergym/core/src/browsergym/core/action/__init__.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _DEMO_MODE = False
2
+
3
+
4
def set_global_demo_mode(demo_mode: bool):
    """Set the module-level demo mode flag to *demo_mode*."""
    global _DEMO_MODE
    _DEMO_MODE = demo_mode
7
+
8
+
9
def get_global_demo_mode():
    """Return the module-level demo mode flag."""
    return _DEMO_MODE
BrowserGym/browsergym/core/src/browsergym/core/action/base.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import ABC, abstractmethod
2
+
3
+ import playwright.sync_api
4
+
5
+ from . import get_global_demo_mode
6
+
7
+
8
class AbstractActionSet(ABC):
    """Abstract base class for action sets.

    Subclasses must implement `describe`, `example_action` and
    `to_python_code`.
    """

    def __init__(self, strict: bool = False) -> None:
        # Stored only; this class does not interpret the flag itself.
        self.strict = strict

    @abstractmethod
    def describe(self, with_long_description: bool = True, with_examples: bool = True) -> str:
        """
        Returns a textual description of this action space.
        """

    @abstractmethod
    def example_action(self, abstract: bool) -> str:
        """
        Returns an example action as a string.
        """

    @abstractmethod
    def to_python_code(self, action) -> str:
        """
        Converts the given action to browsergym-compatible python code.

        Args:
            action: the action to convert.

        Returns:
            Executable python code that performs the action in a browsergym environment.
        """
35
+
36
+
37
def execute_python_code(
    code: str,
    page: playwright.sync_api.Page,
    send_message_to_user: callable,
    report_infeasible_instructions: callable,
):
    """
    Run *code* with ``exec`` in a fresh global namespace.

    The namespace holds only the playwright `page`, the two callbacks and a
    `DEMO_MODE` entry taken from the global demo mode setting.

    WARNING: this is not safe! The executed code is not sandboxed.
    https://stackoverflow.com/questions/77655440/can-you-protect-a-python-variable-with-exec

    Args:
        code: the Python code to execute, as a string.
        page: the playwright page that will be made accessible to the code.
        send_message_to_user: utility function that will be made accessible to the code. It should take one text argument.
        report_infeasible_instructions: utility function that will be made accessible to the code. It should take one text argument.
    """
    exec_namespace = dict(
        page=page,
        send_message_to_user=send_message_to_user,
        report_infeasible_instructions=report_infeasible_instructions,
        DEMO_MODE=get_global_demo_mode(),
    )
    exec(code, exec_namespace)
BrowserGym/browsergym/core/src/browsergym/core/action/functions.py ADDED
@@ -0,0 +1,624 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # these are placeholders
2
+ # all these symbols will be available in browsergym actions
3
+ from typing import Literal
4
+
5
+ import playwright.sync_api
6
+
7
+ from .utils import (
8
+ add_demo_mode_effects,
9
+ call_fun,
10
+ get_elem_by_bid,
11
+ highlight_by_box,
12
+ smooth_move_visual_cursor_to,
13
+ )
14
+
15
# Placeholder values for the names the action primitives below refer to.
# NOTE(review): presumably the real objects are supplied when the generated
# action code is executed; confirm against the code that builds it.
page: playwright.sync_api.Page = None
send_message_to_user: callable = None
report_infeasible_instructions: callable = None
demo_mode: Literal["off", "default", "all_blue", "only_visible_elements"] = None
retry_with_force: bool = False
20
+
21
+ """IMPORTANT
22
+ The following primitives are meant to be included in the browsergym action using
23
+ inspect.getsource().
24
+ """
25
+
26
+
27
def send_msg_to_user(text: str):
    """
    Sends a message to the user.

    Examples:
        send_msg_to_user("Based on the results of my search, the city was built in 1751.")
    """
    # Forward the text to the module-level `send_message_to_user` callback.
    send_message_to_user(text)
35
+
36
+
37
def report_infeasible(reason: str):
    """
    Notifies the user that their instructions are infeasible.

    Examples:
        report_infeasible("I cannot follow these instructions because there is no email field in this form.")
    """
    # Forward the reason to the module-level `report_infeasible_instructions` callback.
    report_infeasible_instructions(reason)
45
+
46
+
47
def noop(wait_ms: float = 1000):
    """
    Do nothing, and optionally wait for the given time (in milliseconds).

    Examples:
        noop()
        noop(500)
    """
    # The only effect is the wait on the page.
    page.wait_for_timeout(wait_ms)
56
+
57
+
58
+ # https://playwright.dev/docs/input#text-input
59
def fill(bid: str, value: str):
    """
    Fill out a form field. It focuses the element and triggers an input event with the entered text.
    It works for <input>, <textarea> and [contenteditable] elements.

    Examples:
        fill('237', 'example value')
        fill('45', "multi-line\\nexample")
        fill('a12', "example with \\"quotes\\"")
    """
    elem = get_elem_by_bid(page, bid, demo_mode != "off")
    add_demo_mode_effects(page, elem, bid, demo_mode=demo_mode, move_cursor=False)

    def do(force: bool):
        if demo_mode != "off":
            # Type key by key: about 2 s in total, at least 10 ms per key.
            # An empty value has no keys to pace, and must not divide by zero.
            delay = max(2000 / len(value), 10) if value else 0
            elem.clear(force=force, timeout=500)
            elem.type(value, delay=delay, timeout=0)  # no timeout
        else:
            elem.fill(value, force=force, timeout=500)

    call_fun(do, retry_with_force)
81
+
82
+
83
+ # https://playwright.dev/python/docs/api/class-locator#locator-check
84
def check(bid: str):
    """
    Ensure a checkbox or radio element is checked.

    Examples:
        check('55')
    """
    elem = get_elem_by_bid(page, bid, demo_mode != "off")
    add_demo_mode_effects(page, elem, bid, demo_mode=demo_mode, move_cursor=True)

    # The action is wrapped so that call_fun decides the `force` flag.
    # NOTE(review): presumably it retries with force=True when
    # retry_with_force is set; confirm in .utils.
    def do(force: bool):
        elem.check(force=force, timeout=500)

    call_fun(do, retry_with_force)
98
+
99
+
100
+ # https://playwright.dev/python/docs/api/class-locator#locator-uncheck
101
def uncheck(bid: str):
    """
    Ensure a checkbox or radio element is unchecked.

    Examples:
        uncheck('a5289')
    """
    elem = get_elem_by_bid(page, bid, demo_mode != "off")
    add_demo_mode_effects(page, elem, bid, demo_mode=demo_mode, move_cursor=True)

    # The action is wrapped so that call_fun decides the `force` flag.
    # NOTE(review): presumably it retries with force=True when
    # retry_with_force is set; confirm in .utils.
    def do(force: bool):
        elem.uncheck(force=force, timeout=500)

    call_fun(do, retry_with_force)
115
+
116
+
117
+ # https://playwright.dev/docs/input#select-options
118
def select_option(bid: str, options: str | list[str]):
    """
    Select one or multiple options in a <select> element. You can specify
    option value or label to select. Multiple options can be selected.

    Examples:
        select_option('a48', "blue")
        select_option('c48', ["red", "green", "blue"])
    """
    elem = get_elem_by_bid(page, bid, demo_mode != "off")
    add_demo_mode_effects(page, elem, bid, demo_mode=demo_mode, move_cursor=False)

    # The action is wrapped so that call_fun decides the `force` flag.
    # NOTE(review): presumably it retries with force=True when
    # retry_with_force is set; confirm in .utils.
    def do(force: bool):
        elem.select_option(options, force=force, timeout=500)

    call_fun(do, retry_with_force)
134
+
135
+
136
+ # https://playwright.dev/python/docs/api/class-locator#locator-click
137
def click(
    bid: str,
    button: Literal["left", "middle", "right"] = "left",
    modifiers: list[Literal["Alt", "Control", "ControlOrMeta", "Meta", "Shift"]] = [],
):
    """
    Click an element.

    Examples:
        click('a51')
        click('b22', button="right")
        click('48', button="middle", modifiers=["Shift"])
    """
    # NOTE: `modifiers` has a mutable default; it is only read here, never
    # modified, so the shared list is not a problem in this function.
    elem = get_elem_by_bid(page, bid, demo_mode != "off")
    add_demo_mode_effects(page, elem, bid, demo_mode=demo_mode, move_cursor=True)

    # The action is wrapped so that call_fun decides the `force` flag.
    # NOTE(review): presumably it retries with force=True when
    # retry_with_force is set; confirm in .utils.
    def do(force: bool):
        elem.click(button=button, modifiers=modifiers, force=force, timeout=500)

    call_fun(do, retry_with_force)
157
+
158
+
159
+ # https://playwright.dev/python/docs/api/class-locator#locator-dblclick
160
def dblclick(
    bid: str,
    button: Literal["left", "middle", "right"] = "left",
    modifiers: list[Literal["Alt", "Control", "ControlOrMeta", "Meta", "Shift"]] = [],
):
    """
    Double click an element.

    Examples:
        dblclick('12')
        dblclick('ca42', button="right")
        dblclick('178', button="middle", modifiers=["Shift"])
    """
    elem = get_elem_by_bid(page, bid, demo_mode != "off")
    add_demo_mode_effects(page, elem, bid, demo_mode=demo_mode, move_cursor=True)

    def do(force: bool):
        # Must be dblclick: click() would only perform a single click.
        elem.dblclick(button=button, modifiers=modifiers, force=force, timeout=500)

    call_fun(do, retry_with_force)
180
+
181
+
182
+ # https://playwright.dev/python/docs/api/class-locator#locator-hover
183
def hover(bid: str):
    """
    Hover over an element.

    Examples:
        hover('b8')
    """
    elem = get_elem_by_bid(page, bid, demo_mode != "off")
    # Unlike the other primitives, hover passes highlight_box=False.
    add_demo_mode_effects(
        page, elem, bid, demo_mode=demo_mode, move_cursor=True, highlight_box=False
    )

    # The action is wrapped so that call_fun decides the `force` flag.
    # NOTE(review): presumably it retries with force=True when
    # retry_with_force is set; confirm in .utils.
    def do(force: bool):
        elem.hover(force=force, timeout=500)

    call_fun(do, retry_with_force)
199
+
200
+
201
+ # https://playwright.dev/python/docs/input#keys-and-shortcuts
202
def press(bid: str, key_comb: str):
    """
    Focus the matching element and press a combination of keys. It accepts
    the logical key names that are emitted in the keyboardEvent.key property
    of the keyboard events: Backquote, Minus, Equal, Backslash, Backspace,
    Tab, Delete, Escape, ArrowDown, End, Enter, Home, Insert, PageDown, PageUp,
    ArrowRight, ArrowUp, F1 - F12, Digit0 - Digit9, KeyA - KeyZ, etc. You can
    alternatively specify a single character you'd like to produce such as "a"
    or "#". Following modification shortcuts are also supported: Shift, Control,
    Alt, Meta, ShiftLeft, ControlOrMeta. ControlOrMeta resolves to Control on
    Windows and Linux and to Meta on macOS.

    Examples:
        press('88', 'Backspace')
        press('a26', 'ControlOrMeta+a')
        press('a61', 'Meta+Shift+t')
    """
    elem = get_elem_by_bid(page, bid, demo_mode != "off")
    add_demo_mode_effects(page, elem, bid, demo_mode=demo_mode, move_cursor=False)
    # Called directly: this primitive does not go through call_fun.
    elem.press(key_comb, timeout=500)
222
+
223
+
224
+ # https://playwright.dev/python/docs/api/class-locator#locator-focus
225
def focus(bid: str):
    """
    Focus the matching element.

    Examples:
        focus('b455')
    """
    demo_active = demo_mode != "off"
    target = get_elem_by_bid(page, bid, demo_active)
    # Demo effects are applied without moving the visual cursor.
    add_demo_mode_effects(page, target, bid, demo_mode=demo_mode, move_cursor=False)
    target.focus(timeout=500)
235
+
236
+
237
+ # https://playwright.dev/python/docs/api/class-locator#locator-clear
238
def clear(bid: str):
    """
    Clear the input field.

    Examples:
        clear('996')
    """
    demo_active = demo_mode != "off"
    target = get_elem_by_bid(page, bid, demo_active)
    # Demo effects are applied without moving the visual cursor.
    add_demo_mode_effects(page, target, bid, demo_mode=demo_mode, move_cursor=False)
    target.clear(timeout=500)
248
+
249
+
250
+ # https://playwright.dev/python/docs/input#drag-and-drop
251
def drag_and_drop(from_bid: str, to_bid: str):
    """
    Perform a drag & drop. Hover the element that will be dragged. Press
    left mouse button. Move mouse to the element that will receive the
    drop. Release left mouse button.

    Examples:
        drag_and_drop('56', '498')
    """
    # Both halves of the gesture follow the same recipe: resolve the element,
    # show demo effects, hover it, then press (source) or release (destination)
    # the mouse button. The destination is only resolved after the button is down.
    steps = ((from_bid, page.mouse.down), (to_bid, page.mouse.up))
    for step_bid, mouse_action in steps:
        step_elem = get_elem_by_bid(page, step_bid, demo_mode != "off")
        add_demo_mode_effects(page, step_elem, step_bid, demo_mode=demo_mode, move_cursor=True)
        step_elem.hover(timeout=500)
        mouse_action()
269
+
270
+
271
+ # https://playwright.dev/python/docs/api/class-mouse#mouse-wheel
272
def scroll(delta_x: float, delta_y: float):
    """
    Scroll horizontally and vertically. Amounts in pixels, positive for right or down scrolling, negative for left or up scrolling. Dispatches a wheel event.

    Examples:
        scroll(0, 200)
        scroll(-50.2, -100.5)
    """
    # `page` is a module-level global; the deltas are forwarded unchanged to
    # the mouse wheel API of that page.
    page.mouse.wheel(delta_x, delta_y)
281
+
282
+
283
+ # https://playwright.dev/python/docs/api/class-mouse#mouse-move
284
def mouse_move(x: float, y: float):
    """
    Move the mouse to a location. Uses absolute client coordinates in pixels.
    Dispatches a mousemove event.

    Examples:
        mouse_move(65.2, 158.5)
    """
    # In any demo mode other than "off", animate the visual cursor first;
    # the real mouse move below happens in every mode.
    if demo_mode != "off":
        smooth_move_visual_cursor_to(page, x, y)
    page.mouse.move(x, y)
295
+
296
+
297
+ # https://playwright.dev/python/docs/api/class-mouse#mouse-up
298
def mouse_up(x: float, y: float, button: Literal["left", "middle", "right"] = "left"):
    """
    Move the mouse to a location then release a mouse button. Dispatches
    mousemove and mouseup events.

    Examples:
        mouse_up(250, 120)
        mouse_up(47, 252, 'right')
    """
    demo_active = demo_mode != "off"
    if demo_active:
        # Visual feedback only: animate the cursor and mark the 1x1 target box.
        smooth_move_visual_cursor_to(page, x, y)
        target_box = dict(x=x, y=y, width=1, height=1)
        highlight_by_box(page, target_box)
    mouse = page.mouse
    mouse.move(x, y)
    mouse.up(button=button)
312
+
313
+
314
+ # https://playwright.dev/python/docs/api/class-mouse#mouse-down
315
def mouse_down(x: float, y: float, button: Literal["left", "middle", "right"] = "left"):
    """
    Move the mouse to a location then press and hold a mouse button. Dispatches
    mousemove and mousedown events.

    Examples:
        mouse_down(140.2, 580.1)
        mouse_down(458, 254.5, 'middle')
    """
    demo_active = demo_mode != "off"
    if demo_active:
        # Visual feedback only: animate the cursor and mark the 1x1 target box.
        smooth_move_visual_cursor_to(page, x, y)
        target_box = dict(x=x, y=y, width=1, height=1)
        highlight_by_box(page, target_box)
    mouse = page.mouse
    mouse.move(x, y)
    mouse.down(button=button)
329
+
330
+
331
+ # https://playwright.dev/python/docs/api/class-mouse#mouse-click
332
def mouse_click(x: float, y: float, button: Literal["left", "middle", "right"] = "left"):
    """
    Move the mouse to a location and click a mouse button. Dispatches mousemove,
    mousedown and mouseup events.

    Examples:
        mouse_click(887.2, 68)
        mouse_click(56, 712.56, 'right')
    """
    demo_active = demo_mode != "off"
    if demo_active:
        # Visual feedback only: animate the cursor and mark the 1x1 target box.
        smooth_move_visual_cursor_to(page, x, y)
        target_box = dict(x=x, y=y, width=1, height=1)
        highlight_by_box(page, target_box)
    page.mouse.click(x, y, button=button)
345
+
346
+
347
+ # https://playwright.dev/python/docs/api/class-mouse#mouse-dblclick
348
def mouse_dblclick(x: float, y: float, button: Literal["left", "middle", "right"] = "left"):
    """
    Move the mouse to a location and double click a mouse button. Dispatches
    mousemove, mousedown and mouseup events.

    Examples:
        mouse_dblclick(5, 236)
        mouse_dblclick(87.5, 354, 'right')
    """
    demo_active = demo_mode != "off"
    if demo_active:
        # Visual feedback only: animate the cursor and mark the 1x1 target box.
        smooth_move_visual_cursor_to(page, x, y)
        target_box = dict(x=x, y=y, width=1, height=1)
        highlight_by_box(page, target_box)
    page.mouse.dblclick(x, y, button=button)
361
+
362
+
363
def mouse_drag_and_drop(from_x: float, from_y: float, to_x: float, to_y: float):
    """
    Drag and drop from a location to a location. Uses absolute client
    coordinates in pixels. Dispatches mousemove, mousedown and mouseup
    events.

    Examples:
        mouse_drag_and_drop(10.7, 325, 235.6, 24.54)
    """
    demo_active = demo_mode != "off"
    # Same recipe at both endpoints: optional demo effects, move the mouse,
    # then press the button at the start point or release it at the end point.
    endpoints = (
        ((from_x, from_y), page.mouse.down),
        ((to_x, to_y), page.mouse.up),
    )
    for (x, y), mouse_action in endpoints:
        if demo_active:
            smooth_move_visual_cursor_to(page, x, y)
            highlight_by_box(page, {"x": x, "y": y, "width": 1, "height": 1})
        page.mouse.move(x, y)
        mouse_action()
384
+
385
+
386
+ # https://playwright.dev/python/docs/api/class-keyboard#keyboard-press
387
def keyboard_press(key: str):
    """
    Press a combination of keys. Accepts the logical key names that are
    emitted in the keyboardEvent.key property of the keyboard events:
    Backquote, Minus, Equal, Backslash, Backspace, Tab, Delete, Escape,
    ArrowDown, End, Enter, Home, Insert, PageDown, PageUp, ArrowRight,
    ArrowUp, F1 - F12, Digit0 - Digit9, KeyA - KeyZ, etc. You can
    alternatively specify a single character you'd like to produce such
    as "a" or "#". Following modification shortcuts are also supported:
    Shift, Control, Alt, Meta, ShiftLeft, ControlOrMeta. ControlOrMeta
    resolves to Control on Windows and Linux and to Meta on macOS.

    Examples:
        keyboard_press('Backspace')
        keyboard_press('ControlOrMeta+a')
        keyboard_press('Meta+Shift+t')
    """
    # The Examples section above lists only calls to this action: the examples
    # are extracted from the docstring and shown as sample actions.
    # `page` is a module-level global; the key combination goes to its keyboard.
    page.keyboard.press(key)
406
+
407
+
408
+ # https://playwright.dev/python/docs/api/class-keyboard#keyboard-up
409
def keyboard_up(key: str):
    """
    Release a keyboard key. Dispatches a keyup event. Accepts the logical
    key names that are emitted in the keyboardEvent.key property of the
    keyboard events: Backquote, Minus, Equal, Backslash, Backspace, Tab,
    Delete, Escape, ArrowDown, End, Enter, Home, Insert, PageDown, PageUp,
    ArrowRight, ArrowUp, F1 - F12, Digit0 - Digit9, KeyA - KeyZ, etc.
    You can alternatively specify a single character you'd like to produce
    such as "a" or "#".

    Examples:
        keyboard_up('Shift')
        keyboard_up('c')
    """
    # `page` is a module-level global; the key name is forwarded unchanged.
    page.keyboard.up(key)
424
+
425
+
426
+ # https://playwright.dev/python/docs/api/class-keyboard#keyboard-down
427
def keyboard_down(key: str):
    """
    Press and hold a keyboard key. Dispatches a keydown event. Accepts the
    logical key names that are emitted in the keyboardEvent.key property of
    the keyboard events: Backquote, Minus, Equal, Backslash, Backspace, Tab,
    Delete, Escape, ArrowDown, End, Enter, Home, Insert, PageDown, PageUp,
    ArrowRight, ArrowUp, F1 - F12, Digit0 - Digit9, KeyA - KeyZ, etc. You can
    alternatively specify a single character such as "a" or "#".

    Examples:
        keyboard_down('Shift')
        keyboard_down('c')
    """
    # The examples above must call keyboard_down itself: they are extracted
    # from the docstring and presented as sample uses of this action.
    # `page` is a module-level global; the key name is forwarded unchanged.
    page.keyboard.down(key)
441
+
442
+
443
+ # https://playwright.dev/python/docs/api/class-keyboard#keyboard-type
444
def keyboard_type(text: str):
    """
    Types a string of text through the keyboard. Sends a keydown, keypress/input,
    and keyup event for each character in the text. Modifier keys DO NOT affect
    keyboard_type. Holding down Shift will not type the text in upper case.

    Examples:
        keyboard_type('Hello world!')
    """
    if demo_mode != "off":
        # Spread the typing over roughly 2000 ms, with at least 10 ms per
        # character. max(len(text), 1) keeps an empty string from raising
        # ZeroDivisionError; the delay is irrelevant when nothing is typed.
        delay = max(2000 / max(len(text), 1), 10)
    else:
        delay = None
    page.keyboard.type(text, delay=delay)
458
+
459
+
460
+ # https://playwright.dev/python/docs/api/class-keyboard#keyboard-insert-text
461
def keyboard_insert_text(text: str):
    """
    Insert a string of text in the currently focused element. Dispatches only input
    event, does not emit the keydown, keyup or keypress events. Modifier keys DO NOT
    affect keyboard_insert_text. Holding down Shift will not type the text in upper
    case.

    Examples:
        keyboard_insert_text('Hello world!')
    """
    # `page` is a module-level global; unlike keyboard_type, no per-character
    # delay is computed here, so demo mode has no effect on this action.
    page.keyboard.insert_text(text)
472
+
473
+
474
+ # https://playwright.dev/python/docs/api/class-page#page-goto
475
def goto(url: str):
    """
    Navigate to a url.

    Examples:
        goto('http://www.example.com')
    """
    # `page` is the module-level active page (rebound by the tab actions);
    # the url is passed through unchanged, with no validation here.
    page.goto(url)
483
+
484
+
485
+ # https://playwright.dev/python/docs/api/class-page#page-go-back
486
def go_back():
    """
    Navigate to the previous page in history.

    Examples:
        go_back()
    """
    # Acts on the module-level active page; the call's result is discarded.
    page.go_back()
494
+
495
+
496
+ # https://playwright.dev/python/docs/api/class-page#page-go-forward
497
def go_forward():
    """
    Navigate to the next page in history.

    Examples:
        go_forward()
    """
    # Acts on the module-level active page; the call's result is discarded.
    page.go_forward()
505
+
506
+
507
+ # https://playwright.dev/python/docs/api/class-browsercontext#browser-context-new-page
508
def new_tab():
    """
    Open a new tab. It will become the active one.

    Examples:
        new_tab()
    """
    global page

    # Kept inline (not shared with the other tab actions) so that this
    # function's source remains self-contained.
    pageshow_script = """\
const event = new Event('pageshow', {
    bubbles: true, // Whether the event bubbles up through the DOM or not
    cancelable: false // Whether the event can be canceled
});
window.dispatchEvent(event);
"""
    # the freshly opened page becomes the active page
    browser_context = page.context
    page = browser_context.new_page()
    # trigger the callback that sets this page as active in browsergym
    page.evaluate(pageshow_script)
528
+
529
+
530
+ # https://playwright.dev/python/docs/api/class-page#page-close
531
def tab_close():
    """
    Close the current tab.

    Examples:
        tab_close()
    """
    global page

    # Kept inline (not shared with the other tab actions) so that this
    # function's source remains self-contained.
    pageshow_script = """\
const event = new Event('pageshow', {
    bubbles: true, // Whether the event bubbles up through the DOM or not
    cancelable: false // Whether the event can be canceled
});
window.dispatchEvent(event);
"""
    # Grab the context before closing, it is needed to pick the next page.
    browser_context = page.context
    page.close()
    # set most recent page as active page, or open a new page if needed
    # TODO: do something more elaborate? (active page history)
    page = browser_context.pages[-1] if browser_context.pages else browser_context.new_page()
    # trigger the callback that sets this page as active in browsergym
    page.evaluate(pageshow_script)
557
+
558
+
559
+ # https://playwright.dev/python/docs/api/class-page#page-bring-to-front
560
def tab_focus(index: int):
    """
    Bring tab to front (activate tab).

    Examples:
        tab_focus(2)
    """
    global page  # set the focused page as the active page

    # Kept inline (not shared with the other tab actions) so that this
    # function's source remains self-contained.
    pageshow_script = """\
const event = new Event('pageshow', {
    bubbles: true, // Whether the event bubbles up through the DOM or not
    cancelable: false // Whether the event can be canceled
});
window.dispatchEvent(event);
"""
    # `index` selects from the pages of the current browser context.
    open_pages = page.context.pages
    page = open_pages[index]
    page.bring_to_front()
    # trigger the callback that sets this page as active in browsergym
    page.evaluate(pageshow_script)
580
+
581
+
582
+ # https://playwright.dev/python/docs/input#upload-files
583
def upload_file(bid: str, file: str | list[str]):
    """
    Click an element and wait for a "filechooser" event, then select one
    or multiple input files for upload. Relative file paths are resolved
    relative to the current working directory. An empty list clears the
    selected files.

    Examples:
        upload_file("572", "my_receipt.pdf")
        upload_file("63", ["/home/bob/Documents/image.jpg", "/home/bob/Documents/file.zip"])
    """
    demo_active = demo_mode != "off"
    target = get_elem_by_bid(page, bid, demo_active)
    add_demo_mode_effects(page, target, bid, demo_mode=demo_mode, move_cursor=True)

    # The click happens inside the expectation so the file chooser it opens
    # is captured; the chooser is only read once the block has exited.
    with page.expect_file_chooser() as chooser_info:
        target.click(timeout=500)

    chooser_info.value.set_files(file)
602
+
603
+
604
+ # https://playwright.dev/python/docs/input#upload-files
605
def mouse_upload_file(x: float, y: float, file: str | list[str]):
    """
    Click a location and wait for a "filechooser" event, then select one
    or multiple input files for upload. Relative file paths are resolved
    relative to the current working directory. An empty list clears the
    selected files.

    Examples:
        mouse_upload_file(132.1, 547, "my_receipt.pdf")
        mouse_upload_file(328, 812, ["/home/bob/Documents/image.jpg", "/home/bob/Documents/file.zip"])
    """
    demo_active = demo_mode != "off"
    if demo_active:
        # Visual feedback only: animate the cursor and mark the 1x1 target box.
        smooth_move_visual_cursor_to(page, x, y)
        target_box = dict(x=x, y=y, width=1, height=1)
        highlight_by_box(page, target_box)

    # The click happens inside the expectation so the file chooser it opens
    # is captured; the chooser is only read once the block has exited.
    with page.expect_file_chooser() as chooser_info:
        page.mouse.click(x, y)

    chooser_info.value.set_files(file)
BrowserGym/browsergym/core/src/browsergym/core/action/highlevel.py ADDED
@@ -0,0 +1,522 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import inspect
2
+ import random
3
+ import typing
4
+ from dataclasses import dataclass
5
+
6
+ from . import utils
7
+ from .base import AbstractActionSet
8
+ from .functions import ( # check,; uncheck,
9
+ clear,
10
+ click,
11
+ dblclick,
12
+ drag_and_drop,
13
+ fill,
14
+ focus,
15
+ go_back,
16
+ go_forward,
17
+ goto,
18
+ hover,
19
+ keyboard_down,
20
+ keyboard_insert_text,
21
+ keyboard_press,
22
+ keyboard_type,
23
+ keyboard_up,
24
+ mouse_click,
25
+ mouse_dblclick,
26
+ mouse_down,
27
+ mouse_drag_and_drop,
28
+ mouse_move,
29
+ mouse_up,
30
+ mouse_upload_file,
31
+ new_tab,
32
+ noop,
33
+ press,
34
+ report_infeasible,
35
+ scroll,
36
+ select_option,
37
+ send_msg_to_user,
38
+ tab_close,
39
+ tab_focus,
40
+ upload_file,
41
+ )
42
+ from .parsers import action_docstring_parser, highlevel_action_parser
43
+
44
# Named groups of action functions. HighLevelActionSet concatenates the lists
# of the requested subsets (dropping duplicates, keeping first-seen order), so
# the order of the entries below determines the order of the resulting actions.
# The key "custom" is reserved for user-supplied actions and must not be used here.
ACTION_SUBSETS = {
    "chat": [send_msg_to_user],
    "infeas": [report_infeasible],
    "bid": [
        scroll,
        fill,
        # These are not really needed and might pollute the action space, doing more harm than good
        # check,
        # uncheck,
        select_option,
        click,
        dblclick,
        hover,
        press,
        focus,
        clear,
        drag_and_drop,
        upload_file,
    ],
    "coord": [
        scroll,
        mouse_move,
        mouse_up,
        mouse_down,
        mouse_click,
        mouse_dblclick,
        mouse_drag_and_drop,
        mouse_upload_file,
        keyboard_down,
        keyboard_up,
        keyboard_press,
        keyboard_type,
        keyboard_insert_text,
    ],
    "nav": [go_back, go_forward, goto],
    "tab": [
        tab_close,
        tab_focus,
        new_tab,
    ],
    # adapted from MiniWoB repo
    # https://github.com/Farama-Foundation/miniwob-plusplus/blob/1bab0dffe34e92cc1049fe9443542029bf7e44a9/miniwob/action.py#L122
    "miniwob_all": [
        mouse_move,  # MOVE_COORDS
        mouse_click,  # CLICK_COORDS
        mouse_dblclick,  # DBLCLICK_COORDS
        mouse_down,  # MOUSEDOWN_COORDS
        mouse_up,  # MOUSEUP_COORDS
        scroll,  # SCROLL_UP_COORDS, SCROLL_DOWN_COORDS
        click,  # CLICK_ELEMENT
        keyboard_press,  # PRESS_KEY
        keyboard_type,  # TYPE_TEXT (and substitute for TYPE_FIELD)
        fill,  # FOCUS_ELEMENT_AND_TYPE_TEXT (and substitute for FOCUS_ELEMENT_AND_TYPE_FIELD)
    ],
    # adapted from MiniWoB repo
    # https://github.com/Farama-Foundation/miniwob-plusplus/blob/1bab0dffe34e92cc1049fe9443542029bf7e44a9/miniwob/action.py#L142
    "miniwob_shi17": [
        mouse_click,  # CLICK_COORDS
        mouse_dblclick,  # DBLCLICK_COORDS
        mouse_down,  # MOUSEDOWN_COORDS
        mouse_up,  # MOUSEUP_COORDS
        scroll,  # SCROLL_UP_COORDS, SCROLL_DOWN_COORDS
        keyboard_press,  # PRESS_KEY
    ],
    # adapted from MiniWoB repo
    # https://github.com/Farama-Foundation/miniwob-plusplus/blob/1bab0dffe34e92cc1049fe9443542029bf7e44a9/miniwob/action.py#L160
    "miniwob_liu18": [
        click,  # CLICK_ELEMENT
        fill,  # substitute for FOCUS_ELEMENT_AND_TYPE_FIELD
    ],
    # adapted from MiniWoB repo
    # https://github.com/Farama-Foundation/miniwob-plusplus/blob/1bab0dffe34e92cc1049fe9443542029bf7e44a9/miniwob/action.py#L173
    "miniwob_humphreys22": [
        mouse_move,  # MOVE_COORDS
        mouse_click,  # CLICK_COORDS
        mouse_dblclick,  # DBLCLICK_COORDS
        mouse_down,  # MOUSEDOWN_COORDS
        mouse_up,  # MOUSEUP_COORDS
        scroll,  # SCROLL_UP_COORDS, SCROLL_DOWN_COORDS
        keyboard_press,  # PRESS_KEY
        keyboard_type,  # substitute for TYPE_FIELD
    ],
    # from the webarena paper
    # https://arxiv.org/abs/2307.13854
    # from the webarena source code
    # https://github.com/web-arena-x/webarena/blob/e31c190c9b43f63e5724322b847e00249300df40/browser_env/actions.py#L240
    # from the webarena default prompt
    # https://github.com/web-arena-x/webarena/blob/e31c190c9b43f63e5724322b847e00249300df40/agent/prompts/raw/p_cot_id_actree_2s.py#L13
    # Rows without an action name are original actions with no counterpart in this subset.
    "webarena": [
        #                   # code          | paper             | prompt
        scroll,  #            SCROLL        | scroll(dir)       | scroll [down|up]
        keyboard_press,  #    KEY_PRESS     | press(key_comb)   | press [key_comb]
        #                     MOUSE_CLICK   |                   |
        #                     KEYBOARD_TYPE |                   |
        #                     MOUSE_HOVER   |                   |
        click,  #             CLICK         | click(elem)       | click [id]
        fill,  #              TYPE          | type(elem, text)  | type [id] [content]
        hover,  #             HOVER         | hover(elem)       | hover [id]
        tab_focus,  #         PAGE_FOCUS    | tab_focus(index)  | tab_focus [tab_index]
        new_tab,  #           NEW_TAB       | new_tab()         | new_tab
        go_back,  #           GO_BACK       | go_back()         | go_back
        go_forward,  #        GO_FORWARD    | go_forward()      | go_forward
        goto,  #              GOTO_URL      | goto(url)         | goto [url]
        tab_close,  #         PAGE_CLOSE    | tab_close()       | close_tab
        #                     CHECK         |                   |
        select_option,  #     SELECT_OPTION |                   |
        send_msg_to_user,  #  STOP          | stop(answer)      | stop [answer]
        report_infeasible,  ## explicit unachievable action, equivalent STOP "N/A"
    ],
    # from the visualwebarena paper
    # https://arxiv.org/abs/2401.13649
    # from the visualwebarena source code
    # https://github.com/web-arena-x/visualwebarena/blob/15890922c97a8694e366fde2d7de8dbd1ff63fb5/browser_env/actions.py#L311-L343
    # from the visualwebarena default prompt
    # https://github.com/web-arena-x/visualwebarena/blob/15890922c97a8694e366fde2d7de8dbd1ff63fb5/agent/prompts/jsons/p_cot_id_actree_3s.json#L2
    # Rows without an action name are original actions with no counterpart in this subset.
    "visualwebarena": [
        #                   # code          | paper             | prompt
        scroll,  #            SCROLL        | scroll(dir)       | scroll [down|up]
        keyboard_press,  #    KEY_PRESS     | press(key_comb)   | press [key_comb]
        #                     MOUSE_CLICK   |                   |
        #                     KEYBOARD_TYPE |                   |
        #                     MOUSE_HOVER   |                   |
        click,  #             CLICK         | click(elem)       | click [id]
        fill,  #              TYPE          | type(elem, text)  | type [id] [content]
        hover,  #             HOVER         | hover(elem)       | hover [id]
        tab_focus,  #         PAGE_FOCUS    | tab_focus(index)  | tab_focus [tab_index]
        new_tab,  #           NEW_TAB       | new_tab()         | new_tab
        go_back,  #           GO_BACK       | go_back()         | go_back
        go_forward,  #        GO_FORWARD    | go_forward()      | go_forward
        goto,  #              GOTO_URL      | goto(url)         | goto [url]
        tab_close,  #         PAGE_CLOSE    | tab_close()       | close_tab
        #                     CHECK         |                   |
        select_option,  #     SELECT_OPTION |                   |
        send_msg_to_user,  #  STOP          | stop(answer)      | stop [answer]
        #                     CLEAR         |                   |
        upload_file,  #       UPLOAD        |                   |
        report_infeasible,  ## explicit unachievable action, equivalent STOP "N/A"
    ],
    # from workarena paper
    # https://arxiv.org/abs/2403.07718
    "workarena": [
        scroll,
        fill,
        select_option,
        click,
        dblclick,
        hover,
        press,
        focus,
        clear,
        drag_and_drop,
        send_msg_to_user,
    ],
    # from workarena++ paper
    # https://arxiv.org/abs/2407.05291
    "workarena++": [
        scroll,
        fill,
        select_option,
        click,
        dblclick,
        hover,
        press,
        focus,
        clear,
        drag_and_drop,
        tab_focus,
        new_tab,
        tab_close,
        go_back,
        go_forward,
        goto,
        send_msg_to_user,
        report_infeasible,
    ],
    # from weblinx_browsergym
    # https://github.com/McGill-NLP/agentlab-weblinx-mvp/blob/a91b6d19870c5187d252e70a2e2013511cc6f1d2/weblinx_browsergym/__init__.py#L274-L286
    # Each comment maps the weblinx action (left) to its counterpart here (right);
    # ❌ marks weblinx actions that have no counterpart.
    "weblinx": [
        send_msg_to_user,  # say(speaker="assistant", utterance=[str]) -> send_msg_to_user(text=[str])
        click,  # click(uid=[element id]) -> click(bid=[element id])
        hover,  # hover(uid=[element id]) -> hover(bid=[element id])
        fill,  # textinput(uid=[element id], value=[str]) -> fill(bid=[element id], value=[str])
        # change(uid=[element], value=[str]) -> ❌
        goto,  # load(url=[link]) -> goto(url=[link])
        # submit(uid=[element]) -> click(bid=[element id])
        scroll,  # scroll(x=[int x],y=[int y]) -> scroll(delta_x=[int x], delta_y=[int y])
        # copy(uid=[element],text=[str]) -> ❌
        # paste(uid=[element],text=[str]) -> ❌
        new_tab,  # tabcreate() -> new_tab()
        tab_close,  # tabremove(target=[tabId]) -> tab_close()
        tab_focus,  # tabswitch(origin=[origin tabId],target=[target tabId]) -> tab_focus(index=[target tabid])
    ],
    # from assistantbench paper
    # https://arxiv.org/abs/2407.15711
    "assistantbench": [
        scroll,  # SCROLL
        fill,  # TYPE
        select_option,  # SELECT
        click,  # CLICK
        press,  # PRESS ENTER
        go_back,  # GOBACK
        goto,  # GOTO, SEARCH
        send_msg_to_user,  # TERMINATE
    ],
}
249
+
250
+
251
@dataclass
class HighLevelAction:
    """Descriptive metadata for one action, as stored in HighLevelActionSet.action_set."""

    # entrypoint: callable
    # function name followed by its inspect.signature() rendering
    signature: str
    # description lines parsed from the action's docstring, joined with spaces
    description: str
    # example calls parsed from the docstring, re-rendered as "name(arg, ...)" strings
    examples: list[str]
257
+
258
+
259
class HighLevelActionSet(AbstractActionSet):
    """
    Action set whose actions are written as Python-like function calls.

    The set is assembled from named subsets of action functions (see
    ACTION_SUBSETS) and/or user-supplied callables. For every allowed action,
    its source code is appended to `python_includes`, and its signature,
    description and examples (parsed from its docstring) are stored in
    `action_set`.

    NOTE(review): the text inside the multi-line string templates below is
    emitted at runtime (generated Python code / prompt text), so whitespace in
    them is significant - confirm it against the repository copy.
    """

    # static class variables
    ActionSubset = typing.Literal[
        "chat",
        "infeas",
        "bid",
        "coord",
        "nav",
        "tab",
        "miniwob_all",
        "miniwob_shi17",
        "miniwob_liu18",
        "miniwob_humphreys22",
        "webarena",
        "visualwebarena",
        "workarena",
        "workarena++",
        "weblinx",
        "assistantbench",
        "custom",
    ]
    DemoMode = typing.Literal["off", "default", "all_blue", "only_visible_elements"]

    def __init__(
        self,
        # The list default is never mutated (it is only read or rebound below).
        subsets: typing.Optional[ActionSubset | list[ActionSubset]] = [
            "chat",
            "infeas",
            "bid",
            "nav",
            "tab",
        ],
        custom_actions: typing.Optional[list[typing.Callable]] = None,
        multiaction: bool = True,
        demo_mode: typing.Optional[DemoMode] = None,
        strict: bool = False,
        retry_with_force: bool = False,
    ):
        """
        Build the action set.

        Args:
            subsets: one subset name, or a list of subset names, to include.
                "custom" includes the functions given in `custom_actions`.
            custom_actions: functions to include when "custom" is requested.
            multiaction: whether several actions may be given at once.
            demo_mode: written into the generated code as `demo_mode`; when
                None the generated code falls back to a DEMO_MODE flag.
            strict: forwarded to the parent class; in strict mode the whole
                action string must parse (see to_python_code).
            retry_with_force: written into the generated code as
                `retry_with_force`.

        Raises:
            ValueError: if `subsets` is empty, if a subset name is unknown, if
                "custom" is requested without `custom_actions`, or if two
                allowed actions share the same function name.
        """
        super().__init__(strict)
        self.multiaction = multiaction
        self.demo_mode = demo_mode
        self.retry_with_force = retry_with_force

        if not subsets:
            raise ValueError("'action_subsets' is empty.")

        if isinstance(subsets, str):
            subsets = [subsets]

        allowed_actions = [noop]  # the noop action is always allowed

        # add actions from specified action sets
        for subset in subsets:
            if subset in ACTION_SUBSETS:
                allowed_actions.extend(ACTION_SUBSETS[subset])
            elif subset == "custom":
                if not custom_actions:
                    raise ValueError(
                        "'custom' is in 'action_subsets' but 'custom_actions' is empty."
                    )
                allowed_actions.extend(custom_actions)
            else:
                raise ValueError(f"Unknown high-level action subspace: {subset}")

        # like set() but preserves order
        # https://stackoverflow.com/questions/1653970/does-python-have-an-ordered-set
        allowed_actions = list(dict.fromkeys(allowed_actions))

        # parse the actions and build the action space
        self.action_set: dict[str, HighLevelAction] = {}
        self.python_includes = ""

        # include playwright imports
        self.python_includes += """\
import playwright.sync_api
from typing import Literal


"""
        # set demo_mode and retry_with_force flags
        # (DEMO_MODE is not defined here; presumably it is provided by the
        # environment that executes the generated code - TODO confirm)
        self.python_includes += f"""\
demo_mode={repr(demo_mode)}
retry_with_force={repr(retry_with_force)}

if demo_mode is None:
    demo_mode = "default" if DEMO_MODE else "off"

"""

        # include utility functions
        for _, func in inspect.getmembers(utils, inspect.isfunction):
            self.python_includes += f"""\
{inspect.getsource(func)}


"""

        # parse and include action functions
        for func in allowed_actions:

            # include action function definition in the code
            self.python_includes += f"""\
{inspect.getsource(func)}


"""

            # extract action signature
            signature = f"{func.__name__}{inspect.signature(func)}"

            # parse docstring
            description, examples = action_docstring_parser.parse_string(func.__doc__)

            # reconstruct action description
            description = " ".join(description)

            # reconstruct action examples
            examples = [
                function_name + "(" + ", ".join([repr(arg) for arg in function_args]) + ")"
                for function_name, function_args in examples
            ]

            # distinct functions sharing a name would clash in the generated code
            if func.__name__ in self.action_set:
                raise ValueError(f"Duplicated action '{func.__name__}'")

            self.action_set[func.__name__] = HighLevelAction(
                # entrypoint=func,
                signature=signature,
                description=description,
                examples=examples,
            )

    def example_action(self, abstract: bool, max_examples: int = 3) -> str:
        """
        Returns an example action as a string.

        Args:
            abstract: if True, return a generic sentence describing the
                expected format instead of concrete example calls.
            max_examples: maximum number of examples returned in multi-action
                mode; in single-action mode exactly one example is returned.

        Returns:
            The example text, or an empty string when no action in the set
            provides any example.
        """
        if abstract:
            if self.multiaction:
                return """\
One or several actions, separated by new lines."""
            else:
                return """\
One single action to be executed. You can only use one action at a time."""

        picked_examples = []

        # use fill and click examples if action is present
        for action_name in ["fill", "click", "mouse_click", "keyboard_type"]:
            if action_name in self.action_set:
                picked_examples.extend(self.action_set[action_name].examples)

        # last resort, use all action examples
        if not picked_examples:
            for action in self.action_set.values():
                picked_examples += action.examples

        # shuffle examples (fixed seed, so the selection is reproducible)
        rng = random.Random(1)
        rng.shuffle(picked_examples)

        if self.multiaction:
            return "\n".join(picked_examples[:max_examples])

        # Single-action mode: guard against an action set without any example.
        # Indexing an empty list would raise IndexError, whereas describe()
        # already copes with an empty result.
        return picked_examples[0] if picked_examples else ""

    def describe(self, with_long_description: bool = True, with_examples: bool = True):
        """
        Returns a textual description of this action space.

        Args:
            with_long_description: include each action's description.
            with_examples: include each action's examples, when it has any.

        Returns:
            The description text: the number of actions, one entry per action,
            a note on single/multi-action use, and an example action if one is
            available.
        """
        description = f"""
{len(self.action_set)} different types of actions are available.

"""
        for action in self.action_set.values():
            description += f"""\
{action.signature}
"""

            if with_long_description:
                description += f"""\
    Description: {action.description}
"""
            if with_examples and action.examples:
                description += """\
    Examples:
"""
                for example in action.examples:
                    description += f"""\
        {example}

"""

        if self.multiaction:
            description += """\
Multiple actions can be provided at once, but will be executed sequentially without any feedback from the page.
More than 2-3 actions usually leads to failure or unexpected behavior."""
        else:
            description += """\
Only a single action can be provided at once."""

        example_action = self.example_action(abstract=False)
        if example_action:
            description += f""" Example:
{example_action}
"""
        else:
            description += """\

"""

        return description

    def to_python_code(self, action):
        """
        Converts the given high-level action string to browsergym-compatible python code.

        Args:
            action: the high-level action to parse.

        Returns:
            Executable python code that performs the action in a browsergym environment.

        Raises:
            ValueError: if no action is found, or if several actions are given
                while multiaction is disabled.
            NameError: if an action name is not part of this action set.
            In strict mode the parser is asked to match the whole input, so
            parser errors may propagate as well.
        """
        highlevel_code = action

        # do the actual parsing and convert each high-level action to
        # the corresponding python function call
        if self.strict:
            function_calls = highlevel_action_parser.parse_string(highlevel_code, parse_all=True)
            function_calls = function_calls.as_list()
        else:
            # allow for multiple matches, skip anything in-between
            matches = highlevel_action_parser.search_string(highlevel_code)
            # unpack multiple matches into one flat list of calls
            function_calls = [call for match in matches.as_list() for call in match]

        if not function_calls:
            raise ValueError("Received an empty action.")
        elif len(function_calls) > 1 and not self.multiaction:
            raise ValueError("Received a multi-action, only single-actions are allowed.")

        # function definitions come first, function calls are appended after
        code_parts = [self.python_includes]

        # function calls
        for function_name, function_args in function_calls:
            if function_name not in self.action_set:
                raise NameError(f"Invalid action type '{function_name}'.")
            code_parts.append(
                function_name + "(" + ", ".join([repr(arg) for arg in function_args]) + ")\n"
            )

        # return the constructed python code
        return "".join(code_parts)
516
+
517
+
518
+ # consistency checks
519
# "custom" is reserved for user-supplied actions, so it must not name a built-in subset
assert "custom" not in ACTION_SUBSETS
# the ActionSubset literal must list exactly the built-in subset names plus "custom"
_declared_subsets = set(typing.get_args(HighLevelActionSet.ActionSubset))
assert _declared_subsets == {*ACTION_SUBSETS.keys(), "custom"}
+ )
BrowserGym/browsergym/core/src/browsergym/core/action/parsers.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ast
2
+ import pyparsing as pp
3
+
4
+ from dataclasses import dataclass
5
+ from typing import Any
6
+
7
+
8
@dataclass
class NamedArgument:
    """A keyword argument (``name=value``) of a parsed function call."""

    name: str  # identifier on the left-hand side of "="
    value: Any  # parsed value on the right-hand side of "="

    def __repr__(self):
        """Render the argument as python keyword-argument source, e.g. ``key='value'``."""
        return f"{self.name}={self.value!r}"
15
+
16
+
17
def _build_highlevel_action_parser() -> pp.ParserElement:
    """
    Builds the pyparsing grammar used to parse high-level actions.

    Returns:
        An action parser that accepts Python-like function calls with string, number, list,
        tuple or dict literals (as well as True, False and None) as arguments.
        Example:
            func("a", 42, None, True, [2, 4, "s"], {"a_key": "a_value"}, )
        Keyword arguments (name=value) may follow the positional ones; each one is
        returned as a NamedArgument.
        The parser is loose and accepts multi-line or single-line combinations of calls.
        Example:
            func() func()
            \tfunc()
        Python comments are ignored.
        Example:
            # this is a comment
            func() # this function call will be parsed
            # func() # this one will not
        The parser returns one [function_name, function_args] group for each function
        call in the input.
    """

    def make_keyword(kwd_str, kwd_value):
        # a python keyword literal, replaced by its actual python value when matched
        return pp.Keyword(kwd_str).set_parse_action(pp.replace_with(kwd_value))

    TRUE = make_keyword("True", True)
    FALSE = make_keyword("False", False)
    NONE = make_keyword("None", None)

    LBRACK, RBRACK, LBRACE, RBRACE, LPAREN, RPAREN, COLON = map(pp.Suppress, "[]{}():")

    def eval_string_literal(toks):
        # turn the quoted source text into the actual python string
        return ast.literal_eval(toks[0])

    string = pp.python_quoted_string().set_parse_action(eval_string_literal)
    number = pp.pyparsing_common.number()

    # containers are recursive (they hold elements, which may be containers), hence
    # the forward declarations; names avoid shadowing the `dict` / `list` builtins
    dict_expr = pp.Forward().set_name("dict")  # will be defined later
    list_expr = pp.Forward().set_name("list")  # will be defined later
    tuple_expr = pp.Forward().set_name("tuple")  # will be defined later
    element = (
        string | number | dict_expr | list_expr | tuple_expr | TRUE | FALSE | NONE
    ).set_name("element")

    list_items = pp.DelimitedList(element, allow_trailing_delim=True).set_name(None)
    list_expr <<= pp.Group(LBRACK + pp.Optional(list_items) + RBRACK, aslist=True)
    tuple_expr <<= pp.Group(
        LPAREN + pp.Optional(list_items) + RPAREN, aslist=True
    ).set_parse_action(lambda tokens: tuple(tokens[0]))

    dict_item = pp.Group(string + COLON + element, aslist=True).set_name("dict item")
    dict_items = pp.DelimitedList(dict_item, allow_trailing_delim=True).set_name(None)
    dict_expr <<= pp.Dict(LBRACE + pp.Optional(dict_items) + RBRACE, asdict=True)

    # positional arguments, optionally followed by name=value arguments
    arg = element
    list_args = pp.DelimitedList(arg, allow_trailing_delim=True).set_name(None)
    named_arg = (pp.pyparsing_common.identifier() + pp.Literal("=") + element).set_parse_action(
        lambda tokens: NamedArgument(name=tokens[0], value=tokens[2])
    )
    list_named_args = pp.DelimitedList(named_arg, allow_trailing_delim=True).set_name(None)
    function_call = pp.pyparsing_common.identifier() + pp.Group(
        LPAREN + pp.Optional(list_args) + pp.Optional(list_named_args) + RPAREN, aslist=True
    )

    # an empty delimiter lets consecutive calls be separated by whitespace only
    multiple_function_calls = pp.DelimitedList(pp.Group(function_call), delim="")
    multiple_function_calls.ignore(pp.python_style_comment())

    return multiple_function_calls
82
+
83
+
84
# parser used to extract python-like function calls from an action string
highlevel_action_parser: pp.ParserElement = _build_highlevel_action_parser()

# parser used to process the docstring of high-level actions, in order to describe the
# action space: the words preceding the "Examples:" marker form a first group, and the
# function calls following the marker form a second group
_examples_marker = pp.Literal("Examples:")
_description_words = pp.OneOrMore(pp.Word(pp.printables), stop_on=_examples_marker)
action_docstring_parser: pp.ParserElement = (
    pp.Group(_description_words) + _examples_marker.suppress() + pp.Group(highlevel_action_parser)
)
BrowserGym/browsergym/core/src/browsergym/core/action/python.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
+ from .base import AbstractActionSet
4
+
5
+
6
class PythonActionSet(AbstractActionSet):
    """Action set in which every action is a snippet of raw Playwright python code."""

    def describe(self, with_long_description: bool = True, with_examples: bool = True) -> str:
        """
        Returns a textual description of this action space.

        Args:
            with_long_description: also explain browser context access and how elements
                nested in iframes are located through their `bid` attribute.
            with_examples: also append a list of example actions.
        """
        description = """
Each action consists of executable Python code (python>=3.10) that uses the Playwright library (playwright==1.32)
to interact with the current webpage and the browser context. The currently active webpage is accessible via the
global variable `page`. A function `send_message_to_user(text)` is also accessible and can be used to send a
message to the user, as well as a function `report_infeasible_instructions(reason)` to notify the user when their
instructions are infeasible."""
        if with_long_description:
            description += """
The browser context is in `page.context`, and all open webpages (tabs and popups)
are in `page.context.pages`. Here is is an example of a valid action:
```
frame = page.frame_locator(".result-frame")
button = frame.get_by_text("Submit")
button.click()
```
Here is another example:
```
frame = page.get_by_test_id("a").frame_locator(":scope")
frame.get_by_test_id("a776").click()
```
Note that Playwright's `get_by_test_id()` method is configured to use the `bid` attribute to locate HTML elements,
instead of the default `data-testid`. Also, Playwright's locators can not traverse iframes, so you have to locate
parent iframes first in order to locate an element in an iframe. The `bid` attribute contains all the information
required to recursively locate an element. For example, an element with `bid="ac2"` can be retrieved as follows:
```
frame = page.get_by_test_id("a").frame_locator(":scope")
frame = frame.get_by_test_id("ac").frame_locator(":scope")
elem = frame.get_by_test_id("ac2")
```
"""
        else:
            description += """\

"""
        if with_examples:
            description += """\
Here are other examples of valid actions:
```
page = page.context.new_page()
page.goto("https://www.wikipedia.org/")
```
```
page.get_by_label("Birth date").fill("2020-02-02")
page.get_by_role("link", name="Get started").click()
```
```
page.get_by_label('I agree to the terms above').check()
```
```
page.locator('#area').fill('Hello World!')
```
```
page.get_by_role("textbox").press("Control+ArrowRight")
```
```
send_message_to_user("There are 7 items to choose from.")
```
```
report_infeasible_instructions("I cannot follow these instructions because there is no email field in this form.")
```
"""

        return description

    def example_action(self, abstract: bool) -> str:
        """
        Returns an example action as a string.

        Args:
            abstract: if True, return a description of the expected action format
                instead of a concrete code example.
        """
        if abstract:
            return """\
One single bloc of Python code. Do not include any explanation, only valid Python code."""

        # frame "ba" is nested inside frame "b", so it must be located from the parent
        # frame rather than from the page (locators do not traverse iframes)
        return """\
frame = page.get_by_test_id("b").frame_locator(":scope")
frame = frame.get_by_test_id("ba").frame_locator(":scope")
frame.get_by_test_id("ba2").fill("Hello world!")
frame.get_by_test_id("ba3").click()
"""

    def to_python_code(self, action) -> str:
        """
        Converts the given code action string to browsergym-compatible playwright code.

        Args:
            action: the code action to parse.

        Returns:
            Executable playwright code that performs the action in a browsergym environment.
            When the action contains markdown-style code blocks (anywhere in the text),
            their contents are joined with newlines; otherwise the action is returned as is.
        """
        # markdown-style code snippets, with an optional "python" language tag
        pattern = re.compile(r"```(?:python)?\n(?P<code>[\s\S]*?)```")

        # scan the whole action, so that fences preceded by other text are detected too
        snippets = [match.group("code") for match in pattern.finditer(action)]
        if snippets:
            return "\n".join(snippets)

        # no code block detected, use the code action as is
        return action
BrowserGym/browsergym/core/src/browsergym/core/action/utils.py ADDED
@@ -0,0 +1,288 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Literal
2
+
3
+ import playwright.sync_api
4
+
5
+
6
def get_elem_by_bid(
    page: playwright.sync_api.Page, bid: str, scroll_into_view: bool = False
) -> playwright.sync_api.Locator:
    """
    Locate the element carrying the given bid, descending through every nested frame
    encoded in the bid. Frame bids are successive prefixes of the bid, each made of one
    more character followed by any number of uppercase letters. For instance with the
    bid "abDb123", the element abDb123 is looked up inside frame abDb, which is looked
    up inside frame abD, which is looked up inside frame a, which is looked up in the page.

    Args:
        page: the page to start the lookup from.
        bid: the browsergym id (playwright testid) of the page element.
        scroll_into_view: also call scroll_into_view_if_needed (timeout=500) on every
            frame element along the way and on the target element.

    Returns:
        Playwright locator of the element.

    Raises:
        ValueError: if bid is not a string, or if a frame or the element is not found.
    """
    if not isinstance(bid, str):
        raise ValueError(f"expected a string, got {bid!r}")

    def locate(scope, test_id):
        # find test_id within scope, failing when there is no match
        found = scope.get_by_test_id(test_id)
        if not found.count():
            raise ValueError(f'Could not find element with bid "{bid}"')
        if scroll_into_view:
            found.scroll_into_view_if_needed(timeout=500)
        return found

    scope = page
    end = 0  # length of the bid prefix identifying the next frame

    # dive into each nested frame, until only the numeric suffix of the bid remains
    while end < len(bid) and not bid[end:].isnumeric():
        end += 1
        # allow multi-character frame ids such as aA, bCD etc.
        while end < len(bid) and bid[end].isalpha() and bid[end].isupper():
            end += 1
        scope = locate(scope, bid[:end]).frame_locator(":scope")

    # the current scope should now be the frame where the target element is
    return locate(scope, bid)
50
+
51
+
52
def highlight_by_box(
    page: playwright.sync_api.Page, box: dict, color: Literal["blue", "red"] = "blue"
):
    """
    Highlights the target element based on its bounding box attributes.

    A temporary overlay is added on top of the box, animated with an expanding and
    fading shadow, then removed from the page. Nothing happens when box is empty.

    Args:
        page: the page to draw the highlight in.
        box: bounding box with keys "x", "y", "width" and "height".
        color: color of the highlight, "blue" or "red".
    """

    assert color in ("blue", "red")

    if box:
        left, top, width, height = box["x"], box["y"], box["width"], box["height"]
        # NOTE: declarations inside a style attribute must use CSS property names
        # (border-radius, box-shadow, pointer-events); the camelCase names are only
        # valid on the JS `element.style` object and are ignored in a style attribute
        page.evaluate(
            f"""\
const overlay = document.createElement('div');
document.body.appendChild(overlay);
overlay.setAttribute('style', `
    all: initial;
    position: fixed;
    border: 2px solid transparent;  /* Start with transparent border */
    border-radius: 10px;  /* Add rounded corners */
    box-shadow: 0 0 0px {color};  /* Initial boxShadow with 0px spread */
    left: {left - 2}px;  /* Adjust left position to accommodate initial shadow spread */
    top: {top - 2}px;  /* Adjust top position likewise */
    width: {width}px;
    height: {height}px;
    z-index: 2147483646; /* Maximum value - 1 */
    pointer-events: none; /* Ensure the overlay does not interfere with user interaction */
`);

// Animate the boxShadow to create a "wave" effect
let spread = 0;  // Initial spread radius of the boxShadow
const waveInterval = setInterval(() => {{
    spread += 10;  // Increase the spread radius to simulate the wave moving outward
    overlay.style.boxShadow = `0 0 40px ${{spread}}px {color}`;  // Update boxShadow to new spread radius
    overlay.style.opacity = 1 - spread / 38;  // Gradually decrease opacity to fade out the wave
    if (spread >= 38) {{  // Assuming 76px ~ 2cm spread radius
        clearInterval(waveInterval);  // Stop the animation once the spread radius reaches 2cm
        document.body.removeChild(overlay);  // Remove the overlay from the document
    }}
}}, 200);  // Adjust the interval as needed to control the speed of the wave animation
"""
        )
        # Wait a bit to let users see the highlight
        page.wait_for_timeout(1000)  # Adjust delay as needed
94
+
95
+
96
def smooth_move_visual_cursor_to(
    page: playwright.sync_api.Page, x: float, y: float, speed: float = 400
):
    """
    Smoothly moves the visual cursor to a specific point, with constant
    movement speed. The cursor element is created on first use and stored on the
    page's window; it is detached from the document once no animation uses it anymore.
    This call waits for the duration of the movement before returning.

    Args:
        page: the page to draw the cursor in.
        x: target location X coordinate (in viewport pixels)
        y: target location Y coordinate (in viewport pixels)
        speed: cursor speed (in pixels per second)
    """
    # every variable of the script is declared (let / const) so that nothing leaks
    # into the page's global scope, where it could clobber variables of the page
    movement_time = page.evaluate(
        """\
([targetX, targetY, speed]) => {

    // create cursor if needed
    if (!("browsergym_visual_cursor" in window)) {
        if (window.trustedTypes && window.trustedTypes.createPolicy) {
            window.trustedTypes.createPolicy('default', {
                createHTML: (string, sink) => string
            });
        }
        let cursor = document.createElement('div');
        cursor.setAttribute('id', 'browsergym-visual-cursor');
        cursor.innerHTML = `
            <svg width="50px" height="50px" viewBox="213 106 713 706" fill="none" xmlns="http://www.w3.org/2000/svg">
            <path d="M213.333 106.667L426.667 853.333 512 512 853.333 426.667 213.333 106.667z" fill="blue"/>
            </svg>
        `;
        cursor.setAttribute('style', `
            all: initial;
            position: fixed;
            opacity: 0.7; /* Slightly transparent */
            z-index: 2147483647; /* Maximum value */
            pointer-events: none; /* Ensures the SVG doesn't interfere with page interactions */
        `);

        // Calculate center position within the viewport
        const centerX = window.innerWidth / 2;
        const centerY = window.innerHeight / 2;

        cursor.style.left = `${centerX}px`;
        cursor.style.top = `${centerY}px`;

        // save cursor element
        window.browsergym_visual_cursor = cursor;
        window.browsergym_visual_cursor_n_owners = 0;
    }

    // recover cursor
    let cursor = window.browsergym_visual_cursor;

    // attach cursor to document
    document.body.appendChild(cursor);
    window.browsergym_visual_cursor_n_owners += 1;

    let x = parseFloat(cursor.style.left);
    let y = parseFloat(cursor.style.top);

    const dx = targetX - x;
    const dy = targetY - y;
    const dist = Math.hypot(dx, dy);
    const movement_time = (dist / speed) * 1000;  // seconds to milliseconds
    const still_wait_time = 1000;

    // Adjust steps based on distance to keep movement speed consistent
    // 1 step per 10 pixels of distance, adjust as needed
    const steps = Math.max(1, Math.trunc(dist / 10));

    const step_dx = dx / steps;
    const step_dy = dy / steps;
    const step_wait_time = Math.max(10, movement_time / steps);

    let step = 0;
    let time_still = 0;
    const cursorInterval = setInterval(() => {
        // move cursor
        if (step < steps) {
            x += step_dx;
            y += step_dy;
            cursor.style.left = `${x}px`;
            cursor.style.top = `${y}px`;
        }
        // still cursor (wait a bit)
        else if (time_still < still_wait_time) {
            time_still += step_wait_time;
        }
        // stop and detach cursor
        else {
            clearInterval(cursorInterval);
            window.browsergym_visual_cursor_n_owners -= 1;
            if (window.browsergym_visual_cursor_n_owners <= 0) {
                document.body.removeChild(cursor);
            }
        }
        step += 1;
    }, step_wait_time);

    return movement_time;
}""",
        [x, y, speed],
    )
    page.wait_for_timeout(movement_time)
202
+
203
+
204
def check_for_overlay(
    page: playwright.sync_api.Page, bid: str, element: playwright.sync_api.ElementHandle, box: dict
) -> bool:
    """
    Checks whether the given element is visible, i.e. not covered by other elements.

    When the element carries a "browsergym_visibility_ratio" attribute, the element is
    considered visible if that ratio is at least 0.5. Otherwise, the element is
    considered visible if, at any corner of its bounding box, the outer HTML of the
    topmost element contains the bid.

    Args:
        page: the page containing the element.
        bid: the browsergym id of the element.
        element: the element to check.
        box: bounding box of the element, with keys "x", "y", "width" and "height".

    Returns:
        True if the element is considered visible, False otherwise (including when
        there is no element, or no visibility ratio and no bounding box).
    """
    if not element:
        return False

    visibility = element.get_attribute("browsergym_visibility_ratio")
    if visibility is not None:
        return float(visibility) >= 0.5

    if box:
        # corners
        points_to_check = [
            (box["x"], box["y"]),
            (box["x"] + box["width"], box["y"]),
            (box["x"], box["y"] + box["height"]),
            (box["x"] + box["width"], box["y"] + box["height"]),
        ]

        for x, y in points_to_check:
            # Execute JavaScript to find the topmost element at the point.
            top_element = page.evaluate(
                f"""() => {{
    const el = document.elementFromPoint({x}, {y});
    return el ? el.outerHTML : '';
}}"""
            )

            # Check if the topmost element is the element we're interested in.
            if top_element and bid in top_element:
                return True

    return False
239
+
240
+
241
def add_demo_mode_effects(
    page: playwright.sync_api.Page,
    elem: playwright.sync_api.ElementHandle,
    bid: str,
    demo_mode: Literal["off", "default", "all_blue", "only_visible_elements"],
    move_cursor: bool = True,
    highlight_box: bool = True,
):
    """
    Adds visual effects to the target element.

    Nothing happens when demo_mode is "off" or when the element has no bounding box.

    Args:
        page: the page containing the element.
        elem: the target element.
        bid: the browsergym id of the target element.
        demo_mode: "default" highlights visible elements in blue and the others in red,
            "all_blue" highlights every element in blue, "only_visible_elements" skips
            elements that are not visible and highlights the others in blue.
        move_cursor: whether to move the visual cursor to the center of the element.
        highlight_box: whether to highlight the bounding box of the element.
    """
    if demo_mode == "off":
        return

    box = elem.bounding_box()
    if not box:
        return

    center_x, center_y = box["x"] + box["width"] / 2, box["y"] + box["height"] / 2
    is_top_element = check_for_overlay(page, bid, elem, box)

    if demo_mode == "only_visible_elements" and not is_top_element:
        return

    # red only flags hidden elements in "default" mode; every other case is blue,
    # which also keeps `color` defined should an unexpected mode be received
    color = "red" if demo_mode == "default" and not is_top_element else "blue"

    if move_cursor:
        smooth_move_visual_cursor_to(page, center_x, center_y)

    if highlight_box:
        highlight_by_box(page, box, color=color)
279
+
280
+
281
def call_fun(fun: callable, retry_with_force: bool):
    """
    Call fun(force=False), optionally retrying with force=True on a playwright timeout.

    Args:
        fun: callable accepting a `force` keyword argument.
        retry_with_force: if True, a playwright TimeoutError raised by the first call
            triggers a second call with force=True; if False, the error propagates.
    """
    try:
        fun(force=False)
    except playwright.sync_api.TimeoutError:
        if not retry_with_force:
            raise
        fun(force=True)
BrowserGym/browsergym/core/src/browsergym/core/chat.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ from pathlib import Path
3
+ from typing import Literal
4
+ import logging
5
+ import playwright.sync_api
6
+ import re
7
+ import time
8
+
9
+ from importlib import resources
10
+
11
+ from . import _get_global_playwright, chat_files
12
+
13
+
14
+ CHATBOX_DIR = resources.files(chat_files)
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
class Chat:
    """
    Chat window displayed in its own Chromium browser, which keeps track of the
    messages exchanged with the user.
    """

    def __init__(
        self, headless: bool, chat_size=(500, 800), record_video_dir=None, modern=True
    ) -> None:
        """
        Args:
            headless: whether to launch the chat browser in headless mode.
            chat_size: (width, height) of the chat window, also used as video size.
            record_video_dir: if set, a video of the chat is recorded in the
                "chat_video" sub-directory of this directory.
            modern: use the modern chatbox page rather than the classic one.
        """
        self.messages = []

        # create a new browser, browser context and page for the chat
        pw: playwright.sync_api.Playwright = _get_global_playwright()
        self.browser = pw.chromium.launch(
            headless=headless, args=[f"--window-size={chat_size[0]},{chat_size[1]}"]
        )
        self.context = self.browser.new_context(
            no_viewport=True,
            record_video_dir=Path(record_video_dir) / "chat_video" if record_video_dir else None,
            record_video_size=dict(width=chat_size[0], height=chat_size[1]),
        )
        self.page = self.context.new_page()
        self.recording_start_time = time.time() if record_video_dir else None

        # setup the chat page
        self.page.expose_function(
            "send_user_message", lambda msg: self._js_user_message_received_callback(msg=msg)
        )

        if modern:
            self.page.set_content(get_chatbox_modern(CHATBOX_DIR))
        else:
            self.page.set_content(get_chatbox_classic(CHATBOX_DIR))

    def _js_user_message_received_callback(self, msg: str):
        """Callback function for when a user message is received in the chatbox"""
        utc_time = time.time()
        self.messages.append({"role": "user", "timestamp": utc_time, "message": msg})
        # returning a list as JS doesnt like tuples
        return ["user", time.strftime("%H:%M", time.localtime(utc_time)), msg]

    def add_message(
        self, role: Literal["user", "user_image", "assistant", "info", "infeasible"], msg: str
    ):
        """
        Add a message to the chatbox and update the page accordingly.

        Messages of every role except "info" are also recorded in self.messages.

        Raises:
            ValueError: if the role is not one of the supported roles.
        """
        utc_time = time.time()
        if role not in ("user", "user_image", "assistant", "info", "infeasible"):
            raise ValueError(f"Invalid role: {role}")
        if role in ("user", "user_image", "assistant", "infeasible"):
            self.messages.append({"role": role, "timestamp": utc_time, "message": msg})
        timestamp = time.strftime("%H:%M:%S", time.localtime(utc_time))
        # pass the values as an evaluate argument instead of splicing their python
        # repr() into the JS source, as python string literals are not always valid JS
        self.page.evaluate(
            "([role, timestamp, msg]) => addChatMessage(role, timestamp, msg)",
            [role, timestamp, msg],
        )

    def wait_for_user_message(self):
        """Block until the chat page raises its USER_MESSAGE_RECEIVED flag."""
        logger.info("Waiting for message from user...")
        # reset flag
        self.page.evaluate("USER_MESSAGE_RECEIVED = false;")
        # wait for flag to be raised
        self.page.wait_for_function("USER_MESSAGE_RECEIVED", polling=100, timeout=0)
        logger.info("Message received.")

    def close(self):
        """Close the chat's browser context, then its browser."""
        try:
            self.context.close()
        finally:
            # close the browser even when closing the context fails
            self.browser.close()
78
+
79
+
80
def get_chatbox_modern(chatbox_dir) -> str:
    """
    Returns the HTML content of the modern chatbox page.

    Args:
        chatbox_dir: directory containing "chatbox_modern.html" (any object
            supporting the `/` operator and `read_text`, such as a pathlib.Path).
    """
    # the page declares a UTF-8 charset, do not depend on the platform's default encoding
    return (chatbox_dir / "chatbox_modern.html").read_text(encoding="utf-8")
85
+
86
+
87
def get_chatbox_classic(chatbox_dir) -> str:
    """
    Returns the HTML content of the classic chatbox page, with the assistant image
    embedded as a base64 data URL.

    Args:
        chatbox_dir: directory containing "chatbox.html" and "assistant.png" (any
            object supporting the `/` operator, `read_text` and `read_bytes`, such as
            a pathlib.Path).
    """
    # the page declares a UTF-8 charset, do not depend on the platform's default encoding
    chatbox_html = (chatbox_dir / "chatbox.html").read_text(encoding="utf-8")
    image_base64 = base64.b64encode((chatbox_dir / "assistant.png").read_bytes()).decode("utf-8")

    assistant_image_url = f"data:image/png;base64,{image_base64}"
    # plain substitution: the placeholder is a literal, not a regular expression
    return chatbox_html.replace("<ASSISTANT_IMAGE_URL>", assistant_image_url)
BrowserGym/browsergym/core/src/browsergym/core/chat_files/chatbox.html ADDED
@@ -0,0 +1,243 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+
4
+ <head>
5
+ <meta charset="UTF-8">
6
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
7
+ <title>UI Assistant Chat</title>
8
+ <style>
9
+ .chat-container {
10
+ display: flex;
11
+ flex-flow: column;
12
+ position: fixed;
13
+ bottom: 0;
14
+ right: 0;
15
+ height: 100%;
16
+ width: 100%;
17
+ border: 1px solid black;
18
+ background-color: white;
19
+ padding: 0;
20
+ overflow: hidden;
21
+ box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
22
+ font-family: 'Source Sans Pro', Arial, Helvetica, sans-serif;
23
+ }
24
+
25
+ .chat-header {
26
+ background-color: #032D42;
27
+ color: white;
28
+ padding: 5px;
29
+ padding-left: 15px;
30
+ text-align: center;
31
+ flex: 0 1 auto;
32
+ }
33
+
34
+ .chat-body {
35
+ padding: 10px;
36
+ overflow-y: auto;
37
+ display: flex;
38
+ flex-direction: column;
39
+ flex: 1 1 auto;
40
+ }
41
+
42
+ .chat-debug {
43
+ padding: 10px;
44
+ max-height: 30%;
45
+ overflow-y: auto;
46
+ display: flex;
47
+ flex-direction: column;
48
+ flex: 0 0 auto;
49
+ }
50
+
51
/* Input row pinned below the message areas */
.chat-input-area {
    display: flex;
    flex-flow: row;
    margin-top: 5px;
    padding: 10px;
    border-top: 1px solid #ddd;
    flex: 0 1 50px;
}
60
+
61
+ .chat-input-area form {
62
+ display: flex;
63
+ width: 100%;
64
+ height: 100%;
65
+ }
66
+
67
+ .input-box {
68
+ padding: 5px;
69
+ margin-right: 10px;
70
+ border-radius: 5px;
71
+ border: 1px solid #ccc;
72
+ width: 100%;
73
+ }
74
+
75
+ .submit-button {
76
+ padding: 5px 10px;
77
+ border-radius: 5px;
78
+ background-color: #4CAF50;
79
+ color: white;
80
+ border: none;
81
+ align-self: center;
82
+ }
83
+
84
+ .message {
85
+ display: flex;
86
+ align-items: center;
87
+ margin: 0px;
88
+ padding: 0px;
89
+ }
90
+
91
/* Message bubble */
.message p {
    padding: 10px;
    /* Added padding inside the bubble */
    border-radius: 15px;
    flex-grow: 1;
    /* a non-zero length needs a unit, otherwise the declaration is dropped */
    margin-top: 10px;
    margin-bottom: 0;
}
99
+
100
+ .chat-debug .message p {
101
+ padding: 0;
102
+ border-radius: 0;
103
+ flex-grow: 1;
104
+ margin-top: 0;
105
+ margin-bottom: 0;
106
+ }
107
+
108
+ .user-message {
109
+ background-color: #d1f4d1;
110
+ }
111
+
112
+ .assistant-message {
113
+ background-color: #e0e0e0;
114
+ }
115
+
116
+ .info-message {
117
+ background-color: #f0f0f0;
118
+ color: #707070;
119
+ font-size: 13px;
120
+ }
121
+
122
+ .assistant-image {
123
+ margin: 0px;
124
+ padding: 10px;
125
+ width: 40px;
126
+ }
127
+ </style>
128
+ </head>
129
+
130
+ <body>
131
+
132
+
133
+
134
+ <div class="chat-container">
135
+ <div class="chat-header">
136
+ <h2>BrowserGym</h2>
137
+ </div>
138
+ <div class="chat-body" id="chatBody"></div>
139
+ <div class="chat-debug" id="chatDebug"></div>
140
+ <div class="chat-input-area">
141
+ <form id="chatForm">
142
+ <textarea class="input-box" rows="2" id="inputBox"></textarea>
143
+ <input type="submit" class="submit-button" value="Send">
144
+ </form>
145
+ </div>
146
+ </div>
147
+
148
+ <script>
149
+
150
+ const assistant_image_data = "<ASSISTANT_IMAGE_URL>";
151
+
152
+ var USER_MESSAGE_RECEIVED = false;
153
+
154
// Replace the characters & < > " ' of a string with their HTML entities,
// so that the string can be assigned to innerHTML as plain text.
function escapeHtml(unsafe) {
    const entities = {
        "&": "&amp;",
        "<": "&lt;",
        ">": "&gt;",
        '"': "&quot;",
        "'": "&#039;",
    };
    return unsafe.replace(/[&<>"']/g, (ch) => entities[ch]);
}
162
+
163
// Append a message to the chat.
//   role:      "user", "assistant" or "info" (any other role throws a TypeError)
//   timestamp: display time sent by the python side, not shown by this chatbox
//   msg:       message text
// The python side calls addChatMessage(role, timestamp, msg); the two-argument
// form addChatMessage(role, msg) used within this page is supported as well.
// NOTE(review): the python side also accepts the roles "user_image" and
// "infeasible", which this chatbox rejects - confirm whether they should be handled.
function addChatMessage(role, timestamp, msg) {
    if (msg === undefined) {
        // two-argument form: the second argument is the message
        msg = timestamp;
        timestamp = undefined;
    }

    const chatBody = document.getElementById('chatBody');
    const chatDebug = document.getElementById('chatDebug');
    const msgContainer = document.createElement('div');
    msgContainer.className = 'message';

    const text = document.createElement('p');
    text.innerHTML = escapeHtml(msg);

    const assistant_img = document.createElement('img');
    assistant_img.src = assistant_image_data;
    assistant_img.alt = 'Assistant';
    assistant_img.className = 'assistant-image';

    switch (role) {
        case "user":
            text.className = 'user-message';
            msgContainer.appendChild(text);
            chatBody.appendChild(msgContainer);
            break;
        case "assistant":
            text.className = 'assistant-message';
            msgContainer.appendChild(assistant_img); // Add the image to the message container
            msgContainer.appendChild(text);
            chatBody.appendChild(msgContainer);
            break;
        case "info":
            text.className = 'info-message';
            // info messages are inserted as raw HTML (not escaped)
            text.innerHTML = msg;
            msgContainer.appendChild(text);
            // hide previous debug messages
            for (const previous of chatDebug.children) {
                previous.style.display = 'none';
            }
            chatDebug.appendChild(msgContainer);
            break;
        default:
            throw new TypeError(`Illegal role "${role}".`);
    }

    chatBody.scrollTop = chatBody.scrollHeight;

    // flag polled by the python side while waiting for a user message
    if (role === "user") {
        USER_MESSAGE_RECEIVED = true;
    }
}
210
+
211
// Fallback: declare a no-op send_user_message when no function of that name
// exists yet, so that send_msg() can always call it.
// NOTE(review): presumably the real implementation is the function exposed by
// playwright under the same name - confirm against the python side.
if (typeof send_user_message !== 'function') {
    function send_user_message(msg) {
        // This will be overloaded by playwright
    }
}
216
+
217
const inputBox = document.getElementById('inputBox');

// Send a user message: notify the backend, display the message in the chat,
// then clear the input box. Blank (whitespace-only) messages are ignored.
function send_msg(msg) {
    if (!msg.trim()) {
        return;
    }
    send_user_message(msg);
    addChatMessage('user', msg);
    inputBox.value = '';
}

// Enter sends the message, Shift+Enter keeps the default behavior
inputBox.onkeypress = (e) => {
    const isPlainEnter = e.key === 'Enter' && !e.shiftKey;
    if (isPlainEnter) {
        e.preventDefault();
        send_msg(inputBox.value);
    }
};

// the "Send" button submits the form: send the message instead of reloading the page
document.getElementById('chatForm').onsubmit = function (event) {
    event.preventDefault();
    send_msg(inputBox.value);
    return false;
}
239
+ </script>
240
+
241
+ </body>
242
+
243
+ </html>
BrowserGym/browsergym/core/src/browsergym/core/chat_files/chatbox_modern.html ADDED
@@ -0,0 +1,379 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+
4
+ <head>
5
+ <meta charset="UTF-8">
6
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
7
+ <title>UI Assistant Chat</title>
8
<style>
    /* Base typography shared by the page and the message input. */
    body {
        font-family: 'Gilroy', sans-serif;
    }

    textarea {
        font-family: 'Gilroy', sans-serif;
    }

    /* Full-window column layout holding the debug area, chat log and input. */
    .chat-container {
        position: fixed;
        bottom: 0;
        right: 0;
        height: 100%;
        width: 100%;
        border: 1px solid black;
        background-color: black;
        padding: 0;
        overflow: hidden;
        box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
        display: flex;
        flex-direction: column;
        min-height: 0;
    }

    /* Decorative background gradient, kept behind the content. */
    .gradient {
        height: 100%;
        width: 100%;
        position: absolute;
        top: 0;
        background: linear-gradient(180deg, #002239 0%, rgba(0, 34, 57, 0) 100%);
        z-index: -1;
    }

    /* Thin animated bar (hidden by default through an inline style). */
    #chatui-generating-indicator {
        position: absolute;
        height: 100vh;
        width: 8px;
    }

    #chatui-generating-indicator-gradient {
        height: 100%;
        width: 100%;
        animation: 1.5s ease alternate infinite thinking;
        background: linear-gradient(0deg, #032D42 0%, #50CED8 100%);
        background-size: 400% 400%;
    }

    @keyframes thinking {
        0% {
            background-position: 0% 0%;
        }

        100% {
            background-position: 0% 100%;
        }
    }

    /* Pushes messages to the bottom of the chat body while it is not full. */
    .spacer {
        flex: 1;
    }

    .chat-wrapper {
        padding: 0px 48px 48px 48px;
        display: flex;
        flex-flow: column;
        flex: 1;
        min-height: 0;
    }

    .chat-body {
        padding: 10px;
        overflow-y: auto;
        display: flex;
        flex-direction: column;
        flex: 1 1 auto;
        /* Hide scrollbar for engines implementing the standard property (e.g. Firefox) */
        scrollbar-width: none;
    }

    /* Hide scrollbar for Chrome, Safari and Opera */
    .chat-body::-webkit-scrollbar {
        display: none;
    }

    .chat-debug {
        padding: 10px;
        max-height: 45%;
        overflow-y: auto;
        display: flex;
        flex-direction: column;
        flex: 0 0 auto;
        /* Hide scrollbar for engines implementing the standard property (e.g. Firefox) */
        scrollbar-width: none;
    }

    /* Hide scrollbar for Chrome, Safari and Opera */
    .chat-debug::-webkit-scrollbar {
        display: none;
    }

    .chat-input-area {
        display: flex;
        flex-flow: row;
        margin-top: 48px;
        padding: 10px;
        padding-left: 18px;
        flex: 0 1 50px;
        background-color: #022435;
        border-radius: 12px;
    }

    .chat-input-area form {
        display: flex;
        width: 100%;
        height: 100%;
    }

    .input-box {
        padding: 5px;
        margin-right: 10px;
        border-radius: 5px;
        width: 100%;
        background-color: transparent;
        color: white;
        border: none;
        outline: none;
        resize: none;
        font-size: 18px;
        /* Minimum starting height */
        min-height: 100px;
        /* Maximum height */
        max-height: 300px;
        /* Allows scrolling within the input box if content exceeds max height */
        overflow-y: auto;
        /* Automatically adjust height, but limited by other CSS properties */
        height: auto;
        /* Hide scrollbar for engines implementing the standard property (e.g. Firefox) */
        scrollbar-width: none;
    }

    /* Hide scrollbar for Chrome, Safari and Opera */
    .input-box::-webkit-scrollbar {
        display: none;
    }

    /* Icon-only send button; the arrow is an inline SVG background. */
    .submit-button {
        margin-left: 10px;
        background-color: #022435;
        color: #9AABB3;
        font-weight: bold;
        cursor: pointer;
        background-image: url('data:image/svg+xml,<svg width="14" height="13" viewBox="0 0 14 13" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M13.7038 6.04336L0.709549 0.0460291C0.528509 -0.0375275 0.315131 -0.00553368 0.166559 0.127445C0.0179865 0.260423 -0.0373753 0.468963 0.0256778 0.658123L1.97297 6.50001L0.0256778 12.3419C-0.0373753 12.5311 0.0179865 12.7396 0.166559 12.8726C0.315131 13.0056 0.528509 13.0375 0.709549 12.954L13.7038 6.95666C13.8817 6.87718 14.0001 6.69465 14.0001 6.50001C14.0001 6.3048 13.8819 6.12293 13.7038 6.04336ZM2.8604 6.00001L1.33983 1.4383L11.2235 6.00001H2.8604ZM11.2235 7.00001L1.33983 11.5617L2.8604 7.00001H11.2235Z" fill="%234F6C7B"/></svg>');
        background-repeat: no-repeat;
        background-position: center;
        background-size: 20px 20px;
        width: 60px;
        border: none;
        border-radius: 4px;
    }

    .submit-button:hover {
        background-color: #03334a;
    }

    .message {
        display: flex;
        align-items: center;
        margin: 0px;
        padding: 0px;
        margin-bottom: 10px;
    }

    .message p {
        margin-bottom: 0;
    }

    .user-message {
        background-color: transparent;
        color: white;
        font-size: 20px;
    }

    /* The label text is overridden per message through the --before-content variable. */
    .user-message::before {
        content: var(--before-content, "You");
        color: #09A2BF;
        display: block;
        margin-bottom: 4px;
        font-size: 10px;
        text-transform: uppercase;
    }

    .assistant-message {
        background-color: transparent;
        color: #7ACA87;
        font-size: 20px;
    }

    /* The label text is overridden per message through the --before-content variable. */
    .assistant-message::before {
        content: var(--before-content, "Bot");
        color: #29A93E;
        display: block;
        margin-bottom: 4px;
        font-size: 10px;
        text-transform: uppercase;
    }

    .info-message {
        color: #afadad;
        font-size: 22px;
        background: #04334b;
        padding: 10px;
        border-radius: 4px;
        width: 100%;
    }

    .assistant-image {
        margin: 0px;
        padding: 10px;
        width: 40px;
    }
</style>
233
+ </head>
234
+
235
+ <body>
236
<!-- Chat UI skeleton: debug/info area, decorative gradient, "generating" indicator,
     scrollable message log and the message form. Element ids are looked up by the script. -->
<div class="chat-container">
    <div class="chat-debug" id="chatDebug"></div>
    <div class="gradient">
    </div>
    <div id="chatui-generating-indicator" style="display: none;">
        <div id="chatui-generating-indicator-gradient"></div>
    </div>
    <div class="chat-wrapper">
        <div class="chat-body" id="chatBody">
            <div class="spacer"></div>
        </div>
        <div class="chat-input-area">
            <form id="chatForm">
                <textarea class="input-box" id="inputBox" placeholder="How can I help you?"
                    title="Ask any question or type exit to quit."></textarea>
                <!-- <input> is a void element: it must not have a closing tag. The button shows
                     only an icon (empty value), so it gets an explicit accessible name. -->
                <input type="submit" class="submit-button" value="" aria-label="Send message">
            </form>
        </div>
    </div>
</div>
257
+
258
+ <script>
259
+
260
// Source of the assistant avatar image. "<ASSISTANT_IMAGE_URL>" looks like a template
// placeholder that is substituted before the page is loaded — TODO confirm with the
// code that serves this file. Not read by any active code in this script.
const assistant_image_data = "<ASSISTANT_IMAGE_URL>";

// Set to true by addChatMessage() once a message with role "user" has been added.
// Declared with `var`, presumably so that code outside this script can read it from
// the global object — verify against the caller.
var USER_MESSAGE_RECEIVED = false;
263
+
264
/**
 * Escape the five HTML-significant characters (& < > " ') in a string so that it
 * can be inserted into markup as plain text.
 *
 * @param {string} unsafe - raw text.
 * @returns {string} the text with special characters replaced by HTML entities.
 */
function escapeHtml(unsafe) {
    const entities = {
        "&": "&amp;",
        "<": "&lt;",
        ">": "&gt;",
        '"': "&quot;",
        "'": "&#039;",
    };
    // single pass over the string; each special character maps to exactly one entity
    return unsafe.replace(/[&<>"']/g, (character) => entities[character]);
}
272
/**
 * Turn every newline character of a string into an HTML line break.
 *
 * @param {string} text - text that may contain "\n" characters.
 * @returns {string} the text with each "\n" replaced by "<br>".
 */
function addHtmlLineBreaks(text) {
    const lines = text.split("\n");
    return lines.join("<br>");
}
275
/**
 * Build the markup of an <img> element displaying the given URL.
 *
 * The URL is HTML-escaped before being placed in the `src` attribute, so a value
 * containing quotes or angle brackets cannot close the attribute and inject markup.
 * Escaped characters are decoded back by the HTML parser, so valid URLs (including
 * ones with "&" in the query string) are unaffected.
 *
 * @param {string} url - image URL (or data URL).
 * @returns {string} HTML source of the image element.
 */
function toHtmlImage(url) {
    return `<img src="${escapeHtml(String(url))}">`;
}
278
+
279
/**
 * Append a message to the chat UI.
 *
 * Messages with role "user", "user_image", "assistant" and "infeasible" go to the
 * chat log (#chatBody). Messages with role "info" go to the debug area (#chatDebug),
 * where previously shown info messages are hidden first.
 *
 * All textual content is HTML-escaped before being written to innerHTML, including
 * user messages, so text containing "<" or "&" is displayed literally instead of
 * being interpreted as markup.
 *
 * @param {string} role - one of "user", "user_image", "assistant", "infeasible", "info".
 * @param {string} timeString - time label shown in the message header (unused for "info").
 * @param {string} msg - message text, or the image URL when role is "user_image".
 * @throws {TypeError} if the role is not one of the supported values.
 */
function addChatMessage(role, timeString, msg) {
    const chatBody = document.getElementById('chatBody');
    const chatDebug = document.getElementById('chatDebug');
    const msgContainer = document.createElement('div');
    msgContainer.className = 'message';

    const text = document.createElement('div');

    switch (role) {
        case "user":
            text.className = 'user-message';
            text.innerHTML = addHtmlLineBreaks(escapeHtml(msg));
            text.style.setProperty('--before-content', `"${timeString} - You"`);
            msgContainer.appendChild(text);
            chatBody.appendChild(msgContainer);
            break;
        case "user_image":
            text.className = 'user-message';
            text.innerHTML = toHtmlImage(msg);
            text.style.setProperty('--before-content', `"${timeString} - You"`);
            msgContainer.appendChild(text);
            chatBody.appendChild(msgContainer);
            break;
        case "assistant":
            text.className = 'assistant-message';
            text.innerHTML = addHtmlLineBreaks(escapeHtml(msg));
            text.style.setProperty('--before-content', `"${timeString} - Bot"`);
            msgContainer.appendChild(text);
            chatBody.appendChild(msgContainer);
            break;
        case "infeasible":
            text.className = 'assistant-message';
            text.innerHTML = addHtmlLineBreaks(escapeHtml(msg));
            text.style.setProperty('--before-content', `"${timeString} - Bot (abort)"`);
            msgContainer.appendChild(text);
            chatBody.appendChild(msgContainer);
            break;
        case "info":
            text.className = 'info-message';
            text.innerHTML = addHtmlLineBreaks(escapeHtml(msg));
            msgContainer.appendChild(text);
            // hide previous debug messages, only the latest one stays visible
            for (const previousInfo of chatDebug.children) {
                previousInfo.style.display = 'none';
            }
            chatDebug.appendChild(msgContainer);
            break;
        default:
            throw new TypeError(`Illegal role "${role}".`);
    }

    // keep the most recent message in view
    chatBody.scrollTop = chatBody.scrollHeight;

    if (role === "user") {
        USER_MESSAGE_RECEIVED = true;
    }
}
342
+
343
// Fallback definition of send_user_message(), used only when no function with that
// name already exists when this script runs. The stub does nothing and returns
// undefined.
if (typeof send_user_message !== 'function') {
    function send_user_message(msg) {
        // This will be overloaded by playwright
    }
}
348
+
349
const inputBox = document.getElementById('inputBox');

/**
 * Forward a message typed by the user to send_user_message() and display the
 * message it returns.
 *
 * Blank messages (empty or whitespace only) are ignored. send_user_message() is
 * expected to resolve to a [role, timeString, message] triple; when it does not
 * (e.g. the no-op fallback is in place and returns undefined) nothing is displayed
 * and the input is left untouched instead of failing with a TypeError.
 *
 * @param {string} msg - content of the input box.
 * @returns {Promise<void>}
 */
async function send_msg(msg) {
    if (!msg.trim()) {
        return;
    }
    const strings = await send_user_message(msg);
    if (!Array.isArray(strings) || strings.length < 3) {
        return;
    }
    addChatMessage(strings[0], strings[1], strings[2]);
    inputBox.value = '';
}
358
+
359
// Pressing Enter sends the message; Shift+Enter keeps the default behaviour
// (inserting a new line in the textarea).
inputBox.onkeypress = function (keyEvent) {
    const isPlainEnter = keyEvent.key === 'Enter' && !keyEvent.shiftKey;
    if (!isPlainEnter) {
        return;
    }
    keyEvent.preventDefault();
    send_msg(inputBox.value);
};

// Clicking the submit button sends the message without reloading the page.
document.getElementById('chatForm').onsubmit = (submitEvent) => {
    submitEvent.preventDefault();
    send_msg(inputBox.value);
    return false;
};
// Example calls, addChatMessage(role, timeString, msg):
// addChatMessage('info', '12:00:00', 'Hello World');
// addChatMessage('assistant', '12:00:00', 'Hello assistant');
// addChatMessage('user', '12:00:00', 'Hello user');
374
+
375
+ </script>
376
+
377
+ </body>
378
+
379
+ </html>
BrowserGym/browsergym/core/src/browsergym/core/chat_files/img/send.svg ADDED
BrowserGym/browsergym/core/src/browsergym/core/constants.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
# Name of the attribute used as the BrowserGym element identifier ("bid").
BROWSERGYM_ID_ATTRIBUTE = "bid"  # Playwright's default is "data-testid"
# Presumably the attribute names under which element visibility ratio and
# set-of-marks information are stored — TODO confirm against the code that uses them.
BROWSERGYM_VISIBILITY_ATTRIBUTE = "browsergym_visibility_ratio"
BROWSERGYM_SETOFMARKS_ATTRIBUTE = "browsergym_set_of_marks"

# Maximum number of attempts when extracting an observation.
EXTRACT_OBS_MAX_TRIES = 5
BrowserGym/browsergym/core/src/browsergym/core/env.py ADDED
@@ -0,0 +1,625 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import logging
3
+ import re
4
+ import time
5
+ from abc import ABC
6
+ from pathlib import Path
7
+ from typing import Any, Callable, Literal, Optional
8
+
9
+ import gymnasium as gym
10
+ import numpy as np
11
+ import playwright.sync_api
12
+
13
+ from . import _get_global_playwright
14
+ from .action.base import execute_python_code
15
+ from .action.highlevel import HighLevelActionSet
16
+ from .chat import Chat
17
+ from .constants import BROWSERGYM_ID_ATTRIBUTE, EXTRACT_OBS_MAX_TRIES
18
+ from .observation import (
19
+ MarkingError,
20
+ _post_extract,
21
+ _pre_extract,
22
+ extract_dom_extra_properties,
23
+ extract_dom_snapshot,
24
+ extract_focused_element_bid,
25
+ extract_merged_axtree,
26
+ extract_screenshot,
27
+ )
28
+ from .spaces import AnyBox, AnyDict, Float, Unicode
29
+ from .task import AbstractBrowserTask
30
+
31
+ logger = logging.getLogger(__name__)
32
+
33
+
34
+ def _try_to_extract_legacy_goal(goal: list):
35
+ legacy_goal_strings = []
36
+ for message in goal:
37
+ if message["type"] == "text":
38
+ legacy_goal_strings.append(message["text"])
39
+ else:
40
+ logger.debug(
41
+ f"Message type {repr(message['type'])} present in the goal, cannot be converted to legacy text-only format."
42
+ )
43
+ legacy_goal_strings.append(
44
+ 'WARNING: This goal cannot be converted to a text-only goal format. Use the new goal format instead ("goal_object" field). Any agent reading this should abort immediately.'
45
+ )
46
+ break
47
+ legacy_goal = "\n".join(legacy_goal_strings)
48
+
49
+ return legacy_goal
50
+
51
+
52
+ class BrowserEnv(gym.Env, ABC):
53
+ """The main BrowserGym class, which encapsulates instruction-following Web browsing into a Gymnasium environment."""
54
+
55
+ # gym metadata
56
+ metadata = {"render_modes": None}
57
+
58
+ def __init__(
59
+ self,
60
+ # task-related arguments
61
+ task_entrypoint: type[AbstractBrowserTask],
62
+ task_kwargs: dict = {},
63
+ viewport: Optional[dict] = None, # will override the task's viewport
64
+ slow_mo: Optional[int] = None, # will override the task's slow_mo
65
+ timeout: Optional[int] = None, # will override the task's timeout
66
+ locale: Optional[str] = None, # will override the task's locale
67
+ timezone_id: Optional[str] = None, # will override the task's timezone_id
68
+ tags_to_mark: Literal["all", "standard_html"] = "standard_html",
69
+ # interactive / debugging arguments
70
+ headless: bool = True,
71
+ wait_for_user_message: bool = False,
72
+ terminate_on_infeasible: bool = True,
73
+ resizeable_window: bool = False,
74
+ record_video_dir: Optional[str] = None,
75
+ pw_chromium_kwargs: dict = {},
76
+ pw_context_kwargs: dict = {},
77
+ # agent-related arguments
78
+ action_mapping: Optional[callable] = HighLevelActionSet().to_python_code,
79
+ ):
80
+ """
81
+ Instantiate a ready to use BrowserEnv gym environment.
82
+
83
+ Args:
84
+ task_entrypoint: a callable that returns a new task object from a seed. Used for creating a new task during `reset()`.
85
+ task_kwargs: additional arguments passed to `task_entrypoint`.
86
+ viewport: desired viewport size. This will override the value defined by the task, which might change its behaviour and difficulty. Should only be set for debugging/testing.
87
+ slow_mo: desired slow_mo value for Playwright. This will override the value defined by the task, which might change its behaviour and difficulty. Should only be set for debugging/testing.
88
+ timeout: desired timeout value for Playwright. This will override the value defined by the task, which might change its behaviour and difficulty. Should only be set for debugging/testing.
89
+ locale: desired user locale for Playwright, for example en-GB, de-DE, etc. This will override the value defined by the task, which might change its behaviour and difficulty. Should only be set for debugging/testing.
90
+ timezone_id. desired timezone for Playwright, for example "Pacific/Tahiti". This will override the value defined by the task, which might change its behaviour and difficulty. Should only be set for debugging/testing.
91
+ tags_to_mark: which HTML tags should be marked by BrowserGym and receive a bid. Value "all" will mark every element in the page, while "standard_html" (default) will only mark standard html tags.
92
+ headless: whether the browser should run in headless mode or not. This will affect the viewport size, which might change the behaviour and difficulty of the task. Headless mode should only be disabled for debugging/testing.
93
+ wait_for_user_message: whether the environment should pause and wait for a user message in the chat after a new message is sent by the agent. Useful for running agents in interactive mode.
94
+ resizeable_window: whether the browser window should be resizeable or not. This will affect the viewport size, which might change the behaviour and difficulty of the task. Should only be set for debugging/testing.
95
+ record_video_dir: if set, indicates a directory to which viewport videos will be recorded.
96
+ pw_chromium_kwargs: extra parameters for the playwright Browser. Should only be used for debugging/testing.
97
+ pw_context_kwargs: extra parameters for the playwright BrowserContext. Should only be used for debugging/testing.
98
+ action_mapping: if set, the environment will use this function to map every received action to executable Python code.
99
+
100
+ """
101
+ super().__init__()
102
+ self.task_entrypoint = task_entrypoint
103
+ self.task_kwargs = dict(**task_kwargs)
104
+ self.viewport = viewport
105
+ self.slow_mo = slow_mo
106
+ self.timeout = timeout
107
+ self.locale = locale
108
+ self.timezone_id = timezone_id
109
+ self.tags_to_mark = tags_to_mark
110
+ self.headless = headless
111
+ self.wait_for_user_message = wait_for_user_message
112
+ self.terminate_on_infeasible = terminate_on_infeasible
113
+ self.resizeable_window = resizeable_window
114
+ self.record_video_dir = record_video_dir
115
+ self.pw_chromium_kwargs = pw_chromium_kwargs
116
+ self.pw_context_kwargs = pw_context_kwargs
117
+ self.action_mapping = action_mapping
118
+
119
+ # check argument values
120
+ assert tags_to_mark in ("all", "standard_html")
121
+
122
+ # task
123
+ self.task = None
124
+
125
+ # playwright
126
+ self.browser: playwright.sync_api.Browser = None
127
+ self.context: playwright.sync_api.BrowserContext = None
128
+ self.page: playwright.sync_api.Page = None
129
+ self.page_history: dict = {}
130
+
131
+ # chat
132
+ self.chat: Chat = None
133
+
134
+ # observation space
135
+ self.observation_space = gym.spaces.Dict(
136
+ {
137
+ "chat_messages": gym.spaces.Sequence(
138
+ gym.spaces.Dict(
139
+ {
140
+ "role": Unicode(),
141
+ "timestamp": Float(),
142
+ "message": Unicode(),
143
+ }
144
+ )
145
+ ),
146
+ "goal": Unicode(),
147
+ "goal_object": gym.spaces.Sequence(AnyDict()),
148
+ "open_pages_urls": gym.spaces.Sequence(Unicode()),
149
+ "open_pages_titles": gym.spaces.Sequence(Unicode()),
150
+ "active_page_index": gym.spaces.Box(
151
+ low=0, high=255, dtype=int
152
+ ), # TODO: change to an Integer (breaking change for users)
153
+ "url": Unicode(),
154
+ "screenshot": AnyBox(
155
+ low=0,
156
+ high=255,
157
+ shape=(-1, -1, 3),
158
+ dtype=np.uint8,
159
+ ), # swapped axes (height, width, RGB)
160
+ "dom_object": AnyDict(),
161
+ "axtree_object": AnyDict(),
162
+ "extra_element_properties": AnyDict(),
163
+ "focused_element_bid": Unicode(),
164
+ "last_action": Unicode(),
165
+ "last_action_error": Unicode(),
166
+ "elapsed_time": gym.spaces.Box(
167
+ low=0, high=np.inf, dtype=float
168
+ ), # TODO: change to a Float (breaking change for users)
169
+ }
170
+ )
171
+
172
+ # action space
173
+ self.action_space = Unicode()
174
+
175
+ def close(self):
176
+ # stop the task
177
+ if self.task:
178
+ self.task.teardown()
179
+ self.task = None
180
+ # close the chat
181
+ if self.chat:
182
+ self.chat.close()
183
+ self.chat = None
184
+ # close the browser context
185
+ if self.context:
186
+ self.context.close()
187
+ self.context = None
188
+ # close the browser
189
+ if self.browser:
190
+ self.browser.close()
191
+ self.browser = None
192
+
193
+ def reset(self, seed=None, *args, **kwargs):
194
+ super().reset(seed=seed, *args, **kwargs)
195
+ self.np_random = None # make sure all randomness is handled by the task
196
+
197
+ if self.task:
198
+ self.task.teardown()
199
+ self.context.close()
200
+ self.chat.close()
201
+ self.browser.close()
202
+
203
+ # create a new task
204
+ self.task = self.task_entrypoint(seed=seed, **self.task_kwargs)
205
+
206
+ def override_property(task, env, property):
207
+ """Extract property value from env if not None, otherwise from task."""
208
+ env_value = getattr(env, property)
209
+ task_value = getattr(task, property)
210
+ if env_value is None:
211
+ return task_value
212
+ else:
213
+ if task_value is not None:
214
+ logger.warning(
215
+ f"Overriding the task's {property} parameter ({repr(task_value)} => {repr(env_value)}). This might change the task's behaviour and difficulty."
216
+ )
217
+ return env_value
218
+
219
+ # fetch task's desired parameters for browser setup
220
+ viewport = override_property(self.task, self, "viewport")
221
+ slow_mo = override_property(self.task, self, "slow_mo")
222
+ timeout = override_property(self.task, self, "timeout")
223
+ locale = override_property(self.task, self, "locale")
224
+ timezone_id = override_property(self.task, self, "timezone_id")
225
+
226
+ # use the global Playwright instance
227
+ pw: playwright.sync_api.Playwright = _get_global_playwright()
228
+ # important: change playwright's test id attribute from "data-testid" to "bid"
229
+ pw.selectors.set_test_id_attribute(BROWSERGYM_ID_ATTRIBUTE)
230
+
231
+ # create a new browser
232
+ self.browser = pw.chromium.launch(
233
+ headless=self.headless,
234
+ slow_mo=slow_mo,
235
+ args=(
236
+ [f"--window-size={viewport['width']},{viewport['height']}"]
237
+ if self.resizeable_window
238
+ else None
239
+ ),
240
+ # will raise an Exception if above args are overriden
241
+ **self.pw_chromium_kwargs,
242
+ )
243
+
244
+ # create a new browser context for pages
245
+ self.context = self.browser.new_context(
246
+ no_viewport=True if self.resizeable_window else None,
247
+ viewport=viewport if not self.resizeable_window else None,
248
+ record_video_dir=(
249
+ Path(self.record_video_dir) / "task_video" if self.record_video_dir else None
250
+ ),
251
+ record_video_size=viewport,
252
+ locale=locale,
253
+ timezone_id=timezone_id,
254
+ # will raise an Exception if above args are overriden
255
+ **self.pw_context_kwargs,
256
+ )
257
+
258
+ # set default timeout
259
+ self.context.set_default_timeout(timeout)
260
+
261
+ # hack: keep track of the active page with a javascript callback
262
+ # there is no concept of active page in playwright
263
+ # https://github.com/microsoft/playwright/issues/2603
264
+ self.context.expose_binding(
265
+ "browsergym_page_activated", lambda source: self._activate_page_from_js(source["page"])
266
+ )
267
+ self.context.add_init_script(
268
+ r"""
269
+ window.browsergym_page_activated();
270
+ window.addEventListener("focus", () => {window.browsergym_page_activated();}, {capture: true});
271
+ window.addEventListener("focusin", () => {window.browsergym_page_activated();}, {capture: true});
272
+ window.addEventListener("load", () => {window.browsergym_page_activated();}, {capture: true});
273
+ window.addEventListener("pageshow", () => {window.browsergym_page_activated();}, {capture: true});
274
+ window.addEventListener("mousemove", () => {window.browsergym_page_activated();}, {capture: true});
275
+ window.addEventListener("mouseup", () => {window.browsergym_page_activated();}, {capture: true});
276
+ window.addEventListener("mousedown", () => {window.browsergym_page_activated();}, {capture: true});
277
+ window.addEventListener("wheel", () => {window.browsergym_page_activated();}, {capture: true});
278
+ window.addEventListener("keyup", () => {window.browsergym_page_activated();}, {capture: true});
279
+ window.addEventListener("keydown", () => {window.browsergym_page_activated();}, {capture: true});
280
+ window.addEventListener("input", () => {window.browsergym_page_activated();}, {capture: true});
281
+ window.addEventListener("touchstart", () => {window.browsergym_page_activated();}, {capture: true});
282
+ window.addEventListener("touchend", () => {window.browsergym_page_activated();}, {capture: true});
283
+ document.addEventListener("visibilitychange", () => {
284
+ if (document.visibilityState === "visible") {
285
+ window.browsergym_page_activated();
286
+ }
287
+ }, {capture: true});
288
+ """
289
+ )
290
+
291
+ # create the chat
292
+ self.chat = Chat(
293
+ headless=self.headless,
294
+ chat_size=(500, max(viewport["height"], 800)),
295
+ record_video_dir=self.record_video_dir,
296
+ )
297
+
298
+ # create a new page
299
+ self.page = self.context.new_page()
300
+ recording_start_time = time.time()
301
+
302
+ # setup the task
303
+ task_goal, task_info = self.task.setup(page=self.page)
304
+
305
+ # process the task goal
306
+
307
+ # no goal specified
308
+ if task_goal is None:
309
+ self.goal_object = []
310
+ # convert text-only goal (legacy) to new format
311
+ elif isinstance(task_goal, str):
312
+ self.goal_object = [{"type": "text", "text": task_goal}]
313
+ # new format goal with multiple texts and images (OpenAI style)
314
+ elif isinstance(task_goal, list):
315
+ self.goal_object = task_goal
316
+ else:
317
+ raise ValueError(f"task_goal should be of type str or list, got {task_goal.__class__}")
318
+
319
+ # initialize the chat
320
+ self.chat.add_message(
321
+ role="assistant",
322
+ msg="Hi! I am your UI assistant, I can perform web tasks for you. What can I help you with?",
323
+ )
324
+
325
+ # send task goal (if any) to the chat
326
+ for message in self.goal_object:
327
+ match message["type"]:
328
+ case "text":
329
+ self.chat.add_message(role="user", msg=message["text"])
330
+ case "image_url":
331
+ image_src = message["image_url"]
332
+ if isinstance(image_src, dict):
333
+ image_src = image_src["url"]
334
+ self.chat.add_message(role="user_image", msg=image_src)
335
+ case _:
336
+ raise ValueError(
337
+ f"Unknown message type {repr(message['type'])} in the task goal."
338
+ )
339
+
340
+ self._wait_dom_loaded()
341
+
342
+ # after the task's setup, the active page might have changed
343
+ # perform a safety check
344
+ self._active_page_check()
345
+
346
+ # init start time
347
+ self.start_time = time.time()
348
+
349
+ # no action yet
350
+ self.last_action = ""
351
+ self.last_action_error = ""
352
+ self.infeasible_message_received = False
353
+
354
+ # if asked, wait for user message
355
+ self._wait_for_user_message()
356
+
357
+ # extract obs and info from environment
358
+ obs = self._get_obs()
359
+
360
+ info = {}
361
+ info["task_info"] = task_info
362
+
363
+ # TODO this is a bit hacky, find a better solution to record videos
364
+ if self.record_video_dir:
365
+ info["recording_start_time"] = recording_start_time
366
+ info["recording_file"] = str(self.page.video.path())
367
+ info["chat"] = {
368
+ "recording_start_time": self.chat.recording_start_time,
369
+ "recording_file": str(self.chat.page.video.path()),
370
+ }
371
+
372
+ return obs, info
373
+
374
+ def pre_step(self) -> tuple[dict[str, Any], Callable, Callable]:
375
+ info = {}
376
+ info["action_exec_start"] = time.time()
377
+ info["action_exec_timeout"] = 0
378
+
379
+ def send_message_to_user(text: str):
380
+ if not isinstance(text, str):
381
+ raise ValueError(f"Forbidden value: {text} is not a string")
382
+ self.chat.add_message(role="assistant", msg=text)
383
+
384
+ def report_infeasible_instructions(reason: str):
385
+ if not isinstance(reason, str):
386
+ raise ValueError(f"Forbidden value: {reason} is not a string")
387
+ self.chat.add_message(role="infeasible", msg=reason)
388
+ self.infeasible_message_received = True
389
+
390
+ # try to execute the action
391
+ logger.debug("Executing action")
392
+ return info, send_message_to_user, report_infeasible_instructions
393
+
394
+ def step(self, action: str) -> tuple[dict[str, Any], float, bool, bool, dict[str, Any]]:
395
+ """
396
+ Execute the action in the environment.
397
+
398
+ Args:
399
+ action: the action to execute. This should be a string with code or a function call
400
+
401
+ Returns:
402
+ obs: the observation after executing the action
403
+ reward: the reward received after executing the action
404
+ terminated: whether the episode is terminated or not
405
+ truncated: whether the episode is truncated or not
406
+ info: additional information about the step
407
+ """
408
+ self.last_action = action
409
+ info, send_message_to_user, report_infeasible_instructions = self.pre_step()
410
+ try:
411
+ if self.action_mapping:
412
+ code = self.action_mapping(action)
413
+ else:
414
+ code = action
415
+ execute_python_code(
416
+ code,
417
+ self.page,
418
+ send_message_to_user=send_message_to_user,
419
+ report_infeasible_instructions=report_infeasible_instructions,
420
+ )
421
+ self.last_action_error = ""
422
+ except Exception as e:
423
+ self.last_action_error = f"{type(e).__name__}: {e}"
424
+ match = re.match("TimeoutError: Timeout ([0-9]+)ms exceeded.", self.last_action_error)
425
+ if match:
426
+ info["action_exec_timeout"] = float(match.groups()[0]) / 1000 # ms to sec
427
+ return self.post_step(info)
428
+
429
+ def post_step(
430
+ self, info: dict[str, Any], validate: bool = True
431
+ ) -> tuple[dict[str, Any], float, bool, bool, dict[str, Any]]:
432
+ """
433
+ Post step method, called after executing the action.
434
+ This method is responsible for extracting the observation after the action.
435
+ It also prepares reward, task status, user message and other step info.
436
+ Args:
437
+ info: dictionary containing information about the step
438
+ Returns:
439
+ obs: the observation after executing the action
440
+ reward: the reward received after executing the action
441
+ terminated: whether the episode is terminated or not
442
+ truncated: whether the episode is truncated or not
443
+ info: additional information about the step
444
+ """
445
+ logger.debug("Action executed")
446
+ info["action_exec_stop"] = time.time()
447
+
448
+ # wait a bit (for the JavaScript callback to set the active page)
449
+ time.sleep(0.5) # wait for JS events to be fired (half a second)
450
+ self.context.cookies() # trigger all waiting Playwright callbacks on the stack (hack, see https://playwright.dev/java/docs/multithreading)
451
+
452
+ # wait for the network to idle before extracting the observation, reward etc.
453
+ self._wait_dom_loaded()
454
+
455
+ if validate:
456
+ # after the action is executed, the active page might have changed
457
+ # perform a safety check
458
+ self._active_page_check()
459
+ logger.debug("Active page checked")
460
+
461
+ # if asked, wait for user message
462
+ self._wait_for_user_message()
463
+ logger.debug("User message done")
464
+
465
+ logger.debug("Initiating task validation")
466
+ # extract reward, done, user_message, info (task-specific)
467
+ reward, done, user_message, task_info = self._task_validate()
468
+ info["task_info"] = task_info
469
+ logger.debug("Task validation done")
470
+ else:
471
+ reward = 0
472
+ done = False
473
+ user_message = None
474
+ info["task_info"] = {}
475
+ logger.debug("Task validation skipped")
476
+
477
+ # add any user message sent by the task to the chat
478
+ if user_message:
479
+ self.chat.add_message(role="user", msg=user_message)
480
+
481
+ # extract observation (generic)
482
+ obs = self._get_obs()
483
+ logger.debug("Observation extracted")
484
+
485
+ # new step API wants a 5-tuple (gymnasium)
486
+ terminated = done or (
487
+ self.terminate_on_infeasible and self.infeasible_message_received
488
+ ) # task or agent can terminate the episode
489
+ truncated: bool = False
490
+ return obs, reward, terminated, truncated, info
491
+
492
+ def _task_validate(self):
493
+ # back-up these in case validate() navigates pages and messes the history
494
+ prev_active_page = self.page
495
+ prev_page_history = self.page_history.copy()
496
+ # call validate
497
+ reward, done, user_message, info = self.task.validate(self.page, self.chat.messages)
498
+
499
+ # safety fix, in case validate() did mess up the active page and/or page history
500
+ if prev_active_page != self.page or prev_page_history != self.page_history:
501
+ logger.debug(
502
+ "The active page and / or page history has changed during task.validate(). A recovery fix will be applied."
503
+ )
504
+ self.page = prev_active_page
505
+ self.page_history = prev_page_history
506
+
507
+ return reward, done, user_message, info
508
+
509
+ def _wait_for_user_message(self):
510
+ # if last message is from the assistant, wait for a user message to continue
511
+ # TODO: be smarter about when to wait for a user message (different action from the assistant?)
512
+ if self.chat.messages[-1]["role"] == "assistant" and self.wait_for_user_message:
513
+ self.chat.wait_for_user_message()
514
+
515
+ def _wait_dom_loaded(self):
516
+ for page in self.context.pages:
517
+ try:
518
+ page.wait_for_load_state("domcontentloaded", timeout=3000)
519
+ except playwright.sync_api.Error:
520
+ pass
521
+ for frame in page.frames:
522
+ try:
523
+ frame.wait_for_load_state("domcontentloaded", timeout=3000)
524
+ except playwright.sync_api.Error:
525
+ pass
526
+
527
+ def _activate_page_from_js(self, page: playwright.sync_api.Page):
528
+ logger.debug(f"_activate_page_from_js(page) called, page={str(page)}")
529
+ if not page.context == self.context:
530
+ raise RuntimeError(
531
+ f"Unexpected: activating a page that belongs to a different browser context ({page})."
532
+ )
533
+
534
+ # add the activated page to the page history (or move it to last which is the most recent)
535
+ if page in self.page_history:
536
+ self.page_history[page] = self.page_history.pop(
537
+ page
538
+ ) # move page to the end of dictionnary
539
+ else:
540
+ self.page_history[page] = None # add page to the end of dictionnary
541
+
542
+ self.page = page
543
+
544
+ def _active_page_check(self):
545
+ # make sure there is always a page open
546
+ # if all pages have been closed, create a new page
547
+ if len(self.context.pages) == 0:
548
+ logger.warning("All pages are closed, opening a new page.")
549
+ self.page = self.context.new_page()
550
+
551
+ # if the active page got closed, get the last active page from the history
552
+ while self.page_history and (self.page.is_closed() or self.page not in self.context.pages):
553
+ self.page_history.pop(self.page) # remove active page from history
554
+ self.page = list(self.page_history.keys())[
555
+ -1
556
+ ] # set last active page as the active page (most recent)
557
+
558
+ # active page should share the same browser context with the environment
559
+ if self.page not in self.context.pages:
560
+ raise RuntimeError(
561
+ f"Unexpected: active page is not part of the browser context's open pages ({self.page})."
562
+ )
563
+
564
+ # active page should not be closed
565
+ if self.page.is_closed():
566
+ raise RuntimeError(f"Unexpected: active page has been closed ({self.page}).")
567
+
568
+ def _get_obs(self):
569
+
570
+ for retries_left in reversed(range(EXTRACT_OBS_MAX_TRIES)):
571
+ try:
572
+ # pre-extraction, mark dom elements (set bid, set dynamic attributes like value and checked)
573
+ _pre_extract(self.page, tags_to_mark=self.tags_to_mark, lenient=(retries_left == 0))
574
+
575
+ dom = extract_dom_snapshot(self.page)
576
+ axtree = extract_merged_axtree(self.page)
577
+ focused_element_bid = extract_focused_element_bid(self.page)
578
+ extra_properties = extract_dom_extra_properties(dom)
579
+ except (playwright.sync_api.Error, MarkingError) as e:
580
+ err_msg = str(e)
581
+ # try to add robustness to async events (detached / deleted frames)
582
+ if retries_left > 0 and (
583
+ "Frame was detached" in err_msg
584
+ or "Frame with the given frameId is not found" in err_msg
585
+ or "Execution context was destroyed" in err_msg
586
+ or "Frame has been detached" in err_msg
587
+ or "Cannot mark a child frame without a bid" in err_msg
588
+ or "Cannot read properties of undefined" in err_msg
589
+ ):
590
+ logger.warning(
591
+ f"An error occurred while extracting the dom and axtree. Retrying ({retries_left}/{EXTRACT_OBS_MAX_TRIES} tries left).\n{repr(e)}"
592
+ )
593
+ # post-extract cleanup (ARIA attributes)
594
+ _post_extract(self.page)
595
+ time.sleep(0.5)
596
+ continue
597
+ else:
598
+ raise e
599
+ break
600
+
601
+ # post-extraction cleanup of temporary info in dom
602
+ _post_extract(self.page)
603
+
604
+ # obs is generic to all tasks
605
+ obs = {
606
+ "chat_messages": tuple(copy.deepcopy(self.chat.messages)),
607
+ "goal": _try_to_extract_legacy_goal(self.goal_object), # legacy goal, deprecated
608
+ "goal_object": tuple(
609
+ copy.deepcopy(self.goal_object)
610
+ ), # new goal format, list of messages openai style
611
+ "open_pages_urls": tuple(page.url for page in self.context.pages),
612
+ "open_pages_titles": tuple(page.title() for page in self.context.pages),
613
+ "active_page_index": np.asarray([self.context.pages.index(self.page)]),
614
+ "url": self.page.url, # redundant with "open_pages_urls" and "active_page_index"
615
+ "screenshot": extract_screenshot(self.page),
616
+ "dom_object": dom,
617
+ "axtree_object": axtree,
618
+ "extra_element_properties": extra_properties,
619
+ "focused_element_bid": focused_element_bid,
620
+ "last_action": self.last_action,
621
+ "last_action_error": self.last_action_error,
622
+ "elapsed_time": np.asarray([time.time() - self.start_time]),
623
+ }
624
+
625
+ return obs
BrowserGym/browsergym/core/src/browsergym/core/javascript/frame_mark_elements.js ADDED
@@ -0,0 +1,295 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/**
 * Go through all DOM elements in the frame (including shadowDOMs), give them unique browsergym
 * identifiers (bid), and store custom data in ARIA attributes.
 *
 * Arguments (received as a single array):
 *   parent_bid:    prefix prepended to every bid generated in this frame ("" means no prefix).
 *   bid_attr_name: name of the HTML attribute in which each element's bid is written.
 *   tags_to_mark:  "all" to mark every element, "standard_html" to mark only standard HTML tags.
 *
 * Returns: an array of warning messages (strings) collected during marking.
 * Throws:  while processing elements, if tags_to_mark is not recognized, or if an element
 *          already carries the bid attribute on the first visit of the frame.
 */
async ([parent_bid, bid_attr_name, tags_to_mark]) => {

    // standard html tags
    // https://www.w3schools.com/tags/
    const html_tags = new Set([
        "a", "abbr", "acronym", "address", "applet", "area", "article", "aside", "audio",
        "b", "base", "basefont", "bdi", "bdo", "big", "blockquote", "body", "br", "button",
        "canvas", "caption", "center", "cite", "code", "col", "colgroup", "data", "datalist",
        "dd", "del", "details", "dfn", "dialog", "dir", "div", "dl", "dt", "em", "embed",
        "fieldset", "figcaption", "figure", "font", "footer", "form", "frame", "frameset",
        "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hgroup", "hr", "html", "i",
        "iframe", "img", "input", "ins", "kbd", "label", "legend", "li", "link", "main",
        "map", "mark", "menu", "meta", "meter", "nav", "noframes", "noscript", "object",
        "ol", "optgroup", "option", "output", "p", "param", "picture", "pre", "progress",
        "q", "rp", "rt", "ruby", "s", "samp", "script", "search", "section", "select",
        "small", "source", "span", "strike", "strong", "style", "sub", "summary", "sup",
        "svg", "table", "tbody", "td", "template", "textarea", "tfoot", "th", "thead",
        "time", "title", "tr", "track", "tt", "u", "ul", "var", "video", "wbr"
    ]);
    const set_of_marks_tags = new Set([
        "input", "textarea", "select", "button", "a", "iframe", "video", "li", "td", "option"
    ]);

    let browsergym_first_visit = false;
    // if not yet set, set the frame (local) element counter to 0
    if (!("browsergym_elem_counter" in window)) {
        window.browsergym_elem_counter = 0;
        window.browsergym_frame_id_generator = new IFrameIdGenerator();
        browsergym_first_visit = true;
    }
    // mechanism for computing all element's visibility
    // the intersection observer will set the visibility ratio of elements entering / exiting the viewport
    // a set is used to keep track of not-yet-visited elements
    let elems_to_be_visited = new Set();
    let intersection_observer = new IntersectionObserver(
        entries => {
            entries.forEach(entry => {
                let elem = entry.target;
                elem.setAttribute('browsergym_visibility_ratio', Math.round(entry.intersectionRatio * 100) / 100);
                if (elems_to_be_visited.has(elem)) {
                    elems_to_be_visited.delete(elem);
                }
            })
        },
        {
            threshold: [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
        }
    )

    let all_bids = new Set();

    // get all DOM elements in the current frame (does not include elements in shadowDOMs)
    let elements = Array.from(document.querySelectorAll('*'));
    let som_buttons = [];
    // declared locally: a bare `i = 0` would create / overwrite a global variable of the page
    let i = 0;
    while (i < elements.length) {
        const elem = elements[i];
        // add shadowDOM elements to the elements array, right after their host,
        // so that document order is preserved
        // TODO: do we really need the order preserved?
        if (elem.shadowRoot !== null) {
            elements.splice(i + 1, 0, ...elem.shadowRoot.querySelectorAll("*"));
        }
        i++;
        // decide if the current element should be marked or not
        switch (tags_to_mark) {
            // mark all elements
            case "all":
                break;
            // mark only standard HTML tags
            case "standard_html":
                if (!elem.tagName || !html_tags.has(elem.tagName.toLowerCase())) {
                    // continue the loop, i.e., move on to the next element
                    continue;
                }
                break;
            // non-recognized argument
            default:
                throw new Error(`Invalid value for parameter \"tags_to_mark\": ${JSON.stringify(tags_to_mark)}`);
        }
        // Processing element
        // register intersection callback on element, and keep track of element for waiting later
        elem.setAttribute('browsergym_visibility_ratio', 0);
        elems_to_be_visited.add(elem);
        intersection_observer.observe(elem);
        // write dynamic element values to the DOM
        if (typeof elem.value !== 'undefined') {
            elem.setAttribute("value", elem.value);
        }
        // write dynamic checked properties to the DOM
        if (typeof elem.checked !== 'undefined') {
            if (elem.checked === true) {
                elem.setAttribute("checked", "");
            }
            else {
                elem.removeAttribute("checked");
            }
        }
        // add the element global id (browsergym id) to a custom HTML attribute
        // https://playwright.dev/docs/locators#locate-by-test-id
        // recover the element id if it has one already, else compute a new element id
        let elem_global_bid = null;
        if (elem.hasAttribute(bid_attr_name)) {
            // throw an error if the attribute is already set while this is the first visit of the page
            if (browsergym_first_visit) {
                throw new Error(`Attribute ${bid_attr_name} already used in element ${elem.outerHTML}`);
            }
            elem_global_bid = elem.getAttribute(bid_attr_name);
            // if the bid has already been encountered, then this is a duplicate and a new bid should be set
            if (all_bids.has(elem_global_bid)) {
                console.log(`BrowserGym: duplicate bid ${elem_global_bid} detected, generating a new one`);
                elem_global_bid = null;
            }
        }
        if (elem_global_bid === null) {
            let elem_local_id = null;
            // iFrames get alphabetical ids: 'a', 'b', ..., 'z', 'aA', 'aB' etc.
            if (['iframe', 'frame'].includes(elem.tagName.toLowerCase())) {
                elem_local_id = `${window.browsergym_frame_id_generator.next()}`;
            }
            // other elements get numerical ids: '0', '1', '2', ...
            else {
                elem_local_id = `${window.browsergym_elem_counter++}`;
            }
            if (parent_bid == "") {
                elem_global_bid = `${elem_local_id}`;
            }
            else {
                elem_global_bid = `${parent_bid}${elem_local_id}`;
            }
            elem.setAttribute(bid_attr_name, `${elem_global_bid}`);
        }
        all_bids.add(elem_global_bid);

        // Hack: store custom data inside ARIA attributes (will be available in DOM and AXTree)
        //  - elem_global_bid: global element identifier (unique over multiple frames)
        // TODO: add more data if needed (x, y coordinates, bounding box, is_visible, is_clickable etc.)
        push_bid_to_attribute(elem_global_bid, elem, "aria-roledescription");
        push_bid_to_attribute(elem_global_bid, elem, "aria-description");  // fallback for generic nodes

        // set-of-marks flag (He et al. 2024)
        // https://github.com/MinorJerry/WebVoyager/blob/main/utils.py
        elem.setAttribute("browsergym_set_of_marks", "0");
        // click at center activates self or a child
        if (["self", "child"].includes(whoCapturesCenterClick(elem))) {
            // has valid tag name, or has click event, or triggers a pointer cursor
            if (set_of_marks_tags.has(elem.tagName.toLowerCase()) || (elem.onclick != null) || (window.getComputedStyle(elem).cursor == "pointer")) {
                let rect = elem.getBoundingClientRect();
                let area = (rect.right - rect.left) * (rect.bottom - rect.top);
                // area is large enough
                if (area >= 20) {
                    // is not a child of a button (role, type, tag) set to be marked
                    if (som_buttons.every(button => !button.contains(elem))) {
                        // is not the sole child of span that has a role and is set to be marked
                        let parent = elem.parentElement;
                        if (!(parent && parent.tagName.toLowerCase() == "span" && parent.children.length === 1 && parent.getAttribute("role") && parent.getAttribute("browsergym_set_of_marks") === "1")) {
                            // all checks have passed, flag the element for inclusion in set-of-marks
                            elem.setAttribute("browsergym_set_of_marks", "1");
                            if (elem.matches('button, a, input[type="button"], div[role="button"]')) {
                                som_buttons.push(elem)
                            }
                            // lastly, remove the set-of-marks flag from all parents, if any
                            while (parent) {
                                if (parent.getAttribute("browsergym_set_of_marks") === "1") {
                                    parent.setAttribute("browsergym_set_of_marks", "0")
                                }
                                parent = parent.parentElement;
                            }
                        }
                    }
                }
            }
        }
    }

    // declared locally: a bare assignment would leak `warning_msgs` into the page's global scope
    const warning_msgs = [];

    // wait for all elements to be visited for visibility
    let visibility_marking_timeout = 1000;  // ms
    try {
        await until(() => elems_to_be_visited.size == 0, visibility_marking_timeout);
    } catch {
        warning_msgs.push(`Frame marking: not all elements have been visited by the intersection_observer after ${visibility_marking_timeout} ms`);
    }
    // disconnect intersection observer
    intersection_observer.disconnect();

    return warning_msgs;
}
197
+
198
/**
 * Wait until the predicate `f` returns a truthy value.
 *
 * f:        zero-argument predicate, polled repeatedly.
 * timeout:  maximum waiting time in milliseconds.
 * interval: polling period in milliseconds (default 40).
 *
 * Resolves (with no value) as soon as f() is truthy; rejects with an Error once more than
 * `timeout` ms have elapsed without f() becoming truthy.
 */
async function until(f, timeout, interval=40) {
    return new Promise((resolve, reject) => {
        const start_time = Date.now();
        // immediate check: when already satisfied, do not schedule any polling timer
        if (f()) {
            resolve();
            return;
        }
        // loop check
        const wait = setInterval(() => {
            if (f()) {
                clearInterval(wait);
                resolve();
            } else if (Date.now() - start_time > timeout) {
                clearInterval(wait);
                reject(new Error(`until(): condition not met after ${timeout} ms`));
            }
        }, interval);
    });
}
217
+
218
+
219
/**
 * Tell which element is in the foreground at the center of `element`'s bounding rectangle.
 *
 * Returns "nobody" when no element is found there, "self" when it is `element` itself,
 * "child" when it is contained in `element`, and "non-descendant" otherwise.
 */
function whoCapturesCenterClick(element){
    const box = element.getBoundingClientRect();
    const center_x = (box.left + box.right) / 2;
    const center_y = (box.top + box.bottom) / 2;
    // element in the foreground at position (center_x, center_y)
    const top_elem = elementFromPoint(center_x, center_y);
    if (!top_elem) {
        return "nobody";
    }
    if (top_elem === element) {
        return "self";
    }
    return element.contains(top_elem) ? "child" : "non-descendant";
}
234
+
235
/**
 * Prepend `browsergym_id_<bid> ` to the value of attribute `attr` on `elem`.
 * Any existing attribute content is kept after the prefix (empty string when absent).
 */
function push_bid_to_attribute(bid, elem, attr){
    const previous_content = elem.hasAttribute(attr) ? elem.getAttribute(attr) : "";
    elem.setAttribute(attr, `browsergym_id_${bid} ${previous_content}`);
}
243
+
244
/**
 * Like document.elementFromPoint(x, y), but keeps descending into the shadow root of the
 * element found, until there is no shadow root left or the lookup stops making progress.
 * Returns the last element found (possibly null).
 */
function elementFromPoint(x, y) {
    let root = document;
    let previous = null;
    let current = null;

    while (true) {
        previous = current;
        current = root.elementFromPoint(x, y);
        // null / undefined when there is no element or no (open) shadow root to dive into
        root = current?.shadowRoot;
        if (!root || current === previous) {
            break;
        }
    }

    return current;
}
257
+
258
// https://stackoverflow.com/questions/12504042/what-is-a-method-that-can-be-used-to-increment-letters#answer-12504061
/**
 * Generator of alphabetical identifiers: 'a', 'b', ..., 'z', 'aA', 'aB', ...
 * The first character is taken as-is from `chars`, all following characters are upper-cased.
 */
class IFrameIdGenerator {
    constructor(chars = 'abcdefghijklmnopqrstuvwxyz') {
        this._chars = chars;
        // digits of the next id, least-significant digit first
        this._nextId = [0];
    }

    // Return the next identifier and advance the internal counter.
    next() {
        const most_significant = this._nextId.length - 1;
        const id = this._nextId
            .map((digit, position) => {
                const char = this._chars[digit];
                // all but first character must be upper-cased (a, aA, bCD)
                return position < most_significant ? char.toUpperCase() : char;
            })
            .reverse()  // digits are stored least-significant first
            .join('');
        this._increment();
        return id;
    }

    // Add one to the counter, carrying over and growing by one digit when all digits wrap.
    _increment() {
        let position = 0;
        while (position < this._nextId.length) {
            this._nextId[position] += 1;
            if (this._nextId[position] < this._chars.length) {
                return;
            }
            this._nextId[position] = 0;
            position += 1;
        }
        this._nextId.push(0);
    }

    // Infinite iterator over successive identifiers.
    *[Symbol.iterator]() {
        for (;;) {
            yield this.next();
        }
    }
}
BrowserGym/browsergym/core/src/browsergym/core/javascript/frame_unmark_elements.js ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/**
 * Go through all DOM elements in the frame (including shadowDOMs),
 * and cleanup previously stored data in ARIA attributes.
 */
() => {
    // all DOM elements of the current frame; shadowDOM content is spliced in as hosts are met
    const elements = Array.from(document.querySelectorAll('*'));
    for (let idx = 0; idx < elements.length; idx++) {
        const elem = elements[idx];
        // insert shadowDOM elements right after their host, so that order is preserved
        // TODO: do we really need the order preserved?
        if (elem.shadowRoot !== null) {
            elements.splice(idx + 1, 0, ...elem.shadowRoot.querySelectorAll("*"));
        }
        // Hack: remove custom data (global browsergym identifier) stored in ARIA attributes
        pop_bid_from_attribute(elem, "aria-description");
        pop_bid_from_attribute(elem, "aria-roledescription");
    }
}
27
+
28
/**
 * Strip a leading `browsergym_id...` token (and the whitespace character following it) from
 * attribute `attr` of `elem`. The attribute is removed when nothing remains; elements
 * without the attribute are left untouched.
 */
function pop_bid_from_attribute(elem, attr) {
    if (!elem.hasAttribute(attr)) {
        return;
    }
    const remaining = elem.getAttribute(attr).replace(/^browsergym_id[^\s]*\s/, '');
    if (remaining) {
        elem.setAttribute(attr, remaining);
    } else {
        elem.removeAttribute(attr);
    }
}
BrowserGym/browsergym/core/src/browsergym/core/observation.py ADDED
@@ -0,0 +1,575 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import io
3
+ import logging
4
+ import pkgutil
5
+ import re
6
+ from typing import Literal
7
+
8
+ import numpy as np
9
+ import PIL.Image
10
+ import playwright.sync_api
11
+
12
+ from .constants import BROWSERGYM_ID_ATTRIBUTE as BID_ATTR
13
+ from .constants import BROWSERGYM_SETOFMARKS_ATTRIBUTE as SOM_ATTR
14
+ from .constants import BROWSERGYM_VISIBILITY_ATTRIBUTE as VIS_ATTR
15
+
16
# NOTE(review): not referenced anywhere in the visible part of this module; presumably a
# retry budget for frame marking used by callers. Confirm usage or remove.
MARK_FRAMES_MAX_TRIES = 3


# module-level logger
logger = logging.getLogger(__name__)
20
+
21
+
22
class MarkingError(Exception):
    """Raised when DOM marking fails, e.g. a child frame's element carries no bid."""
24
+
25
+
26
def _pre_extract(
    page: playwright.sync_api.Page,
    tags_to_mark: Literal["all", "standard_html"] = "standard_html",
    lenient: bool = False,
):
    """
    Pre-extraction routine: marks DOM elements in every frame of the page (sets their bid and
    dynamic attributes like value and checked).

    Args:
        page: the playwright page to mark.
        tags_to_mark: which elements to mark, forwarded as-is to the marking script.
        lenient: if True, child frames whose frame element has no bid are skipped with a
            warning; if False, a MarkingError is raised for them.

    Raises:
        MarkingError: a child frame's element has no bid and `lenient` is False.
    """
    marking_script = pkgutil.get_data(__name__, "javascript/frame_mark_elements.js").decode(
        "utf-8"
    )

    # the frame traversal is done here rather than in JS, because of the Same-Origin Policy
    # (the content of an iframe cannot be accessed from another one)
    def mark_frames_recursive(frame, frame_bid: str):
        assert frame_bid == "" or re.match(r"^[a-z][a-zA-Z]*$", frame_bid)
        logger.debug(f"Marking frame {repr(frame_bid)}")

        # mark all DOM elements in the frame (the frame's own bid is used as a prefix),
        # and relay the warnings reported by the script, if any
        for warning in frame.evaluate(marking_script, [frame_bid, BID_ATTR, tags_to_mark]):
            logger.warning(warning)

        # then descend into the child frames
        for child in frame.child_frames:
            # detached frames cannot be marked
            if child.is_detached():
                continue
            # weird frames (pdf viewer in <embed>)
            child_elem = child.frame_element()
            if not child_elem.content_frame() == child:
                logger.warning(f"Skipping frame '{child.name}' for marking, seems problematic.")
                continue
            # sandboxed frames with blocked script execution
            sandbox_attr = child_elem.get_attribute("sandbox")
            if sandbox_attr is not None and "allow-scripts" not in sandbox_attr.split():
                continue
            child_bid = child_elem.get_attribute(BID_ATTR)
            if child_bid is None:
                if not lenient:
                    raise MarkingError("Cannot mark a child frame without a bid.")
                logger.warning("Cannot mark a child frame without a bid. Skipping frame.")
                continue
            mark_frames_recursive(child, frame_bid=child_bid)

    # start from the main frame, which has no bid prefix
    mark_frames_recursive(page.main_frame, frame_bid="")
80
+
81
+
82
def _post_extract(page: playwright.sync_api.Page):
    """
    Post-extraction routine: runs the unmarking script in every frame of the page, which
    cleans up the temporary data stored in ARIA attributes.

    Frames that cannot or need not be processed are skipped: weird frames (pdf viewer in
    <embed>), sandboxed frames with blocked script execution, frames whose element has no
    bid, and frames that get detached along the way.

    Args:
        page: the playwright page to clean up.

    Raises:
        playwright.sync_api.Error: any Playwright error other than a detached frame.
    """
    js_frame_unmark_elements = pkgutil.get_data(
        __name__, "javascript/frame_unmark_elements.js"
    ).decode("utf-8")

    # we can't run this loop in JS due to Same-Origin Policy
    # (can't access the content of an iframe from another one)
    for frame in page.frames:
        try:
            if not frame == page.main_frame:
                # fetch the frame element once; every call is a round-trip to the browser
                frame_elem = frame.frame_element()
                # deal with weird frames (pdf viewer in <embed>)
                if not frame_elem.content_frame() == frame:
                    logger.warning(
                        f"Skipping frame '{frame.name}' for unmarking, seems problematic."
                    )
                    continue
                # deal with sandboxed frames with blocked script execution
                sandbox_attr = frame_elem.get_attribute("sandbox")
                if sandbox_attr is not None and "allow-scripts" not in sandbox_attr.split():
                    continue
                # deal with frames without a BID
                if frame_elem.get_attribute(BID_ATTR) is None:
                    continue

            frame.evaluate(js_frame_unmark_elements)
        except playwright.sync_api.Error as e:
            # a frame detached in the meantime has nothing left to clean up
            if not any(msg in str(e) for msg in ("Frame was detached", "Frame has been detached")):
                raise
113
+
114
+
115
def extract_screenshot(page: playwright.sync_api.Page):
    """
    Extracts the screenshot image of a Playwright page using Chrome DevTools Protocol.

    Args:
        page: the playwright page of which to extract the screenshot.

    Returns:
        A screenshot of the page, in the form of a 3D array (height, width, rgb).

    """

    cdp = page.context.new_cdp_session(page)
    try:
        cdp_answer = cdp.send(
            "Page.captureScreenshot",
            {
                "format": "png",
            },
        )
    finally:
        # always release the CDP session, even when the capture fails
        cdp.detach()

    # bytes of a png file
    png_bytes = base64.b64decode(cdp_answer["data"])
    with io.BytesIO(png_bytes) as f:
        # load png as a PIL image
        img = PIL.Image.open(f)
        # convert to RGB (3 channels)
        img = img.convert(mode="RGB")
        # convert to a numpy array
        img = np.array(img)

    return img
148
+
149
+
150
# we could handle more data items here if needed
__BID_EXPR = r"([a-zA-Z0-9]+)"
# DOTALL: the original ARIA content following the bid may span several lines
__DATA_REGEXP = re.compile(r"^browsergym_id_" + __BID_EXPR + r"\s?" + r"(.*)", re.DOTALL)


def extract_data_items_from_aria(string: str, log_level: int = logging.NOTSET):
    """
    Utility function to extract temporary data stored in the ARIA attributes of a node.

    Args:
        string: content of the ARIA attribute, expected to start with "browsergym_id_<bid>".
        log_level: logging level used to report strings that carry no BrowserGym data.

    Returns:
        A pair (data_items, original_aria). On success, data_items is a tuple holding the
        extracted bid, and original_aria is the remainder of the string (possibly empty,
        possibly multi-line). When the string does not match, data_items is an empty list
        and original_aria is the input string, unchanged.
    """

    match = __DATA_REGEXP.fullmatch(string)
    if not match:
        logger.log(
            level=log_level,
            msg=f"Failed to extract BrowserGym data from ARIA string: {repr(string)}",
        )
        return [], string

    # all groups but the last are data items, the last one is the original ARIA content
    *data_items, original_aria = match.groups()
    return tuple(data_items), original_aria
172
+
173
+
174
def extract_dom_snapshot(
    page: playwright.sync_api.Page,
    computed_styles=None,
    include_dom_rects: bool = True,
    include_paint_order: bool = True,
    temp_data_cleanup: bool = True,
):
    """
    Extracts the DOM snapshot of a Playwright page using Chrome DevTools Protocol.

    Args:
        page: the playwright page of which to extract the DOM snapshot.
        computed_styles: whitelist of computed styles to return. None (the default) means an
            empty whitelist.
        include_dom_rects: whether to include DOM rectangles (offsetRects, clientRects, scrollRects) in the snapshot.
        include_paint_order: whether to include paint orders in the snapshot.
        temp_data_cleanup: whether to clean up the temporary data stored in the ARIA attributes.

    Returns:
        A document snapshot, including the full DOM tree of the root node (including iframes,
        template contents, and imported documents) in a flattened array, as well as layout
        and white-listed computed style information for the nodes. Shadow DOM in the returned
        DOM tree is flattened.

    """
    # avoid a mutable default argument shared across calls
    if computed_styles is None:
        computed_styles = []

    cdp = page.context.new_cdp_session(page)
    try:
        dom_snapshot = cdp.send(
            "DOMSnapshot.captureSnapshot",
            {
                "computedStyles": computed_styles,
                "includeDOMRects": include_dom_rects,
                "includePaintOrder": include_paint_order,
            },
        )
    finally:
        # always release the CDP session, even when the capture fails
        cdp.detach()

    # if requested, remove temporary data stored in the ARIA attributes of each node
    if temp_data_cleanup:
        pop_bids_from_attribute(dom_snapshot, "aria-roledescription")
        pop_bids_from_attribute(dom_snapshot, "aria-description")

    return dom_snapshot
215
+
216
+
217
def pop_bids_from_attribute(dom_snapshot, attr: str):
    """
    Removes, in place, the BrowserGym data stored in attribute `attr` of every node of a DOM
    snapshot. Attributes left empty by the cleanup are removed from their node.

    Args:
        dom_snapshot: the DOM snapshot to clean up (modified in place).
        attr: name of the attribute to clean up.
    """
    strings = dom_snapshot["strings"]
    # nothing to clean up when the attribute name does not appear in the string table
    try:
        target_name_id = strings.index(attr)
    except ValueError:
        return

    # several nodes may share the same value string, which must be cleaned up only once
    cleaned_value_ids = set()
    for document in dom_snapshot["documents"]:
        for node_attributes in document["nodes"]["attributes"]:
            # attributes are stored as a flat list of (name id, value id) pairs
            for pos in range(0, len(node_attributes), 2):
                name_id = node_attributes[pos]
                value_id = node_attributes[pos + 1]
                if name_id != target_name_id:
                    continue
                if value_id not in cleaned_value_ids:
                    # strip the stored data and update the string in the metadata
                    _, cleaned_value = extract_data_items_from_aria(strings[value_id])
                    strings[value_id] = cleaned_value
                    cleaned_value_ids.add(value_id)
                # remove target attribute (name and value) if empty
                if strings[value_id] == "":
                    del node_attributes[pos : pos + 2]
                # once the target attribute is found, stop searching this node
                break
249
+
250
+
251
def extract_dom_extra_properties(dom_snapshot):
    """
    Collects extra properties for every node of a DOM snapshot that carries a bid attribute.

    Args:
        dom_snapshot: a DOM snapshot, indexed the way this function reads it ("strings",
            "documents", and per-document "nodes" / "layout" tables).

    Returns:
        A dict mapping each bid to a dict with keys "visibility" (float or None), "bbox"
        (bounds shifted to absolute coordinates, or None when the node is not rendered),
        "clickable" (bool) and "set_of_marks" (bool or None).
    """
    strings = dom_snapshot["strings"]
    documents = dom_snapshot["documents"]

    def to_string(idx):
        return None if idx == -1 else strings[idx]

    def find_string_id(name):
        # id of `name` in the string table, -1 when absent
        try:
            return strings.index(name)
        except ValueError:
            return -1

    # pre-locate important string ids
    bid_string_id = find_string_id(BID_ATTR)
    vis_string_id = find_string_id(VIS_ATTR)
    som_string_id = find_string_id(SOM_ATTR)

    # build the iframe tree (DFS from the first frame)
    doc_properties = {0: {"parent": None}}

    pending_docs = [0]
    while pending_docs:
        doc = pending_docs.pop()  # DFS
        document = documents[doc]

        # register the child documents, each attached to its hosting node in this document
        content_docs = document["nodes"]["contentDocumentIndex"]
        for node, child_doc in zip(content_docs["index"], content_docs["value"]):
            doc_properties[child_doc] = {
                "parent": {
                    "doc": doc,  # parent frame index
                    "node": node,  # node index within the parent frame
                }
            }
            pending_docs.append(child_doc)

        props = doc_properties[doc]

        # recover the absolute x and y position of the frame node in the parent (if any)
        origin_x, origin_y = 0, 0
        parent = props["parent"]
        if parent:
            parent_doc = parent["doc"]
            parent_layout = documents[parent_doc]["layout"]
            try:
                layout_idx = parent_layout["nodeIndex"].index(parent["node"])
            except ValueError:
                layout_idx = -1
            if layout_idx >= 0:
                node_bounds = parent_layout["bounds"][layout_idx]  # can be empty?
                # absolute position of parent + relative position of frame node within parent
                parent_pos = doc_properties[parent_doc]["abs_pos"]
                origin_x = parent_pos["x"] + node_bounds[0]
                origin_y = parent_pos["y"] + node_bounds[1]

        # the frame's absolute position also accounts for its scrolling offset
        props["abs_pos"] = {
            "x": origin_x - document["scrollOffsetX"],
            "y": origin_y - document["scrollOffsetY"],
        }

        # one entry of default properties per node in the document
        nodes = [
            {
                "bid": None,  # to be filled (str)
                "visibility": None,  # to be filled (float)
                "bbox": None,  # to be filled (list)
                "clickable": False,  # to be filled (bool)
                "set_of_marks": None,  # to be filled (bool)
            }
            for _ in document["nodes"]["parentIndex"]
        ]
        props["nodes"] = nodes

        # extract clickable property
        for node_idx in document["nodes"]["isClickable"]["index"]:
            nodes[node_idx]["clickable"] = True

        # extract bid, visibility and set-of-marks properties (attribute-based)
        for node_idx, node_attrs in enumerate(document["nodes"]["attributes"]):
            node_props = nodes[node_idx]
            # attributes are stored as a flat list of (name id, value id) pairs
            for pos in range(0, len(node_attrs), 2):
                name_string_id = node_attrs[pos]
                value_string_id = node_attrs[pos + 1]
                if name_string_id == bid_string_id:
                    node_props["bid"] = to_string(value_string_id)
                if name_string_id == vis_string_id:
                    node_props["visibility"] = float(to_string(value_string_id))
                if name_string_id == som_string_id:
                    node_props["set_of_marks"] = to_string(value_string_id) == "1"

        # extract bbox property (in absolute coordinates)
        layout = document["layout"]
        for node_idx, bounds, client_rect in zip(
            layout["nodeIndex"], layout["bounds"], layout["clientRects"]
        ):
            # empty clientRect means element is not actually rendered
            if not client_rect:
                nodes[node_idx]["bbox"] = None
                continue
            # bounds gives the relative position within the document,
            # which is shifted by the document's absolute position
            bbox = bounds.copy()
            bbox[0] += props["abs_pos"]["x"]
            bbox[1] += props["abs_pos"]["y"]
            nodes[node_idx]["bbox"] = bbox

        # Note: other fields available in the snapshot, not used here:
        #   document["nodes"]: nodeType, nodeName, nodeValue, textValue, inputValue,
        #     inputChecked, optionSelected, pseudoType, pseudoIdentifier
        #   document["textBoxes"]
        #   document["layout"]: offsetRects, scrollRects, paintOrders

    # collect the extra properties of all nodes with a browsergym_id attribute
    extra_properties = {}
    for props in doc_properties.values():
        for node in props["nodes"]:
            bid = node["bid"]
            if not bid:
                continue
            if bid in extra_properties:
                logger.warning(f"duplicate {BID_ATTR}={repr(bid)} attribute detected")
            extra_properties[bid] = {
                extra_prop: node[extra_prop]
                for extra_prop in ("visibility", "bbox", "clickable", "set_of_marks")
            }

    return extra_properties
412
+
413
+
414
def extract_all_frame_axtrees(page: playwright.sync_api.Page):
    """
    Extracts the AXTree of all frames (main document and iframes) of a Playwright page using Chrome DevTools Protocol.

    The BrowserGym data stored in the "roledescription" property and in the "description" of
    each node is removed from the tree, and the extracted bid (if any) is stored under the
    node's "browsergym_id" key.

    Args:
        page: the playwright page of which to extract the frame AXTrees.

    Returns:
        A dictionary of AXTrees (as returned by Chrome DevTools Protocol) indexed by frame IDs.

    """
    cdp = page.context.new_cdp_session(page)
    try:
        # extract the frame tree
        frame_tree = cdp.send(
            "Page.getFrameTree",
            {},
        )

        # extract all frame IDs into a list
        # (stack-based, i.e. depth-first, traversal of the frame tree)
        frame_ids = []
        frames_to_process = [frame_tree["frameTree"]]
        while frames_to_process:
            frame = frames_to_process.pop()
            frames_to_process.extend(frame.get("childFrames", []))
            frame_ids.append(frame["frame"]["id"])

        # extract the AXTree of each frame
        frame_axtrees = {
            frame_id: cdp.send(
                "Accessibility.getFullAXTree",
                {"frameId": frame_id},
            )
            for frame_id in frame_ids
        }
    finally:
        # always release the CDP session, even when an extraction fails
        cdp.detach()

    # extract browsergym data from ARIA attributes
    for ax_tree in frame_axtrees.values():
        for node in ax_tree["nodes"]:
            data_items = []
            # look for data in the node's "roledescription" property
            if "properties" in node:
                for i, prop in enumerate(node["properties"]):
                    if prop["name"] == "roledescription":
                        data_items, new_value = extract_data_items_from_aria(prop["value"]["value"])
                        prop["value"]["value"] = new_value
                        # remove the "roledescription" property if empty
                        # (deleting while enumerating is safe here: the loop exits right after)
                        if new_value == "":
                            del node["properties"][i]
                        break
            # look for data in the node's "description" (fallback plan)
            if "description" in node:
                data_items_bis, new_value = extract_data_items_from_aria(
                    node["description"]["value"]
                )
                node["description"]["value"] = new_value
                if new_value == "":
                    del node["description"]
                if not data_items:
                    data_items = data_items_bis
            # add the extracted "browsergym" data to the AXTree
            if data_items:
                (browsergym_id,) = data_items
                node["browsergym_id"] = browsergym_id
    return frame_axtrees
485
+
486
+
487
def extract_merged_axtree(page: playwright.sync_api.Page):
    """
    Extracts the merged AXTree of a Playwright page (main document and iframes AXTrees merged) using Chrome DevTools Protocol.

    Args:
        page: the playwright page of which to extract the merged AXTree.

    Returns:
        A merged AXTree (same format as those returned by Chrome DevTools Protocol).

    """
    frame_axtrees = extract_all_frame_axtrees(page)

    cdp = page.context.new_cdp_session(page)
    try:
        # merge all AXTrees into one
        merged_axtree = {"nodes": []}
        for ax_tree in frame_axtrees.values():
            merged_axtree["nodes"].extend(ax_tree["nodes"])
            # connect each iframe node to the corresponding AXTree root node
            for node in ax_tree["nodes"]:
                if node["role"]["value"] == "Iframe":
                    frame_id = (
                        cdp.send("DOM.describeNode", {"backendNodeId": node["backendDOMNodeId"]})
                        .get("node", {})
                        .get("frameId", None)
                    )
                    if not frame_id:
                        logger.warning(
                            f"AXTree merging: unable to recover frameId of node with backendDOMNodeId {repr(node['backendDOMNodeId'])}, skipping"
                        )
                    # it seems Page.getFrameTree() from CDP omits certain Frames (empty frames?)
                    # if a frame is not found in the extracted AXTrees, we just ignore it
                    elif frame_id in frame_axtrees:
                        # root node should always be the first node in the AXTree
                        frame_root_node = frame_axtrees[frame_id]["nodes"][0]
                        assert frame_root_node["frameId"] == frame_id
                        node["childIds"].append(frame_root_node["nodeId"])
                    else:
                        logger.warning(
                            f"AXTree merging: extracted AXTree does not contain frameId '{frame_id}', skipping"
                        )
    finally:
        # always release the CDP session, even when the merge fails
        cdp.detach()

    return merged_axtree
533
+
534
+
535
def extract_focused_element_bid(page: playwright.sync_api.Page):
    """
    Finds the bid of the focused element of a page, diving through shadow DOMs (in JS) and
    through iframes (with playwright).

    Args:
        page: the playwright page in which to look for the focused element.

    Returns:
        The bid of the focused element, or an empty string if there is none or if the
        lookup times out.
    """
    # this JS code will dive through ShadowDOMs
    extract_focused_element_with_bid_script = """\
() => {
    // This recursive function traverses shadow DOMs
    function getActiveElement(root) {
        const active_element = root.activeElement;

        if (!active_element) {
            return null;
        }

        if (active_element.shadowRoot) {
            return getActiveElement(active_element.shadowRoot);
        } else {
            return active_element;
        }
    }
    return getActiveElement(document);
}"""
    # this playwright code will dive through iFrames
    current_frame = page
    bid = ""
    try:
        while current_frame:
            active_element = current_frame.evaluate_handle(
                extract_focused_element_with_bid_script, BID_ATTR
            ).as_element()
            if not active_element:
                break
            # when the focused element hosts a frame, keep looking inside that frame
            current_frame = active_element.content_frame()
            bid = active_element.get_attribute(BID_ATTR)
    except playwright.sync_api.TimeoutError:
        bid = ""

    # convert null / None to empty string
    return bid or ""
BrowserGym/browsergym/core/src/browsergym/core/registration.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from functools import partial
2
+ from typing import Type
3
+
4
+ import gymnasium as gym
5
+
6
+ from .env import BrowserEnv
7
+ from .task import AbstractBrowserTask
8
+
9
+
10
class frozen_partial:
    """
    Callable wrapper that pins some keyword arguments of a function.

    The pinned keyword arguments are always passed to the wrapped function, and
    any attempt to pass one of them again at call time raises a ValueError.
    """

    def __init__(self, func, **frozen_kwargs):
        self.frozen_kwargs = frozen_kwargs
        self.func = func

    def __call__(self, *args, **kwargs):
        # refuse call-time keywords that collide with a pinned keyword
        overridden = set(self.frozen_kwargs) & set(kwargs)
        if overridden:
            raise ValueError(f"Illegal attempt to override frozen parameters {overridden}.")

        # call-time keywords first, pinned keywords last
        return self.func(*args, **{**kwargs, **self.frozen_kwargs})
29
+
30
+
31
def register_task(
    id: str,
    task_class: Type[AbstractBrowserTask],
    task_kwargs: dict = None,
    default_task_kwargs: dict = None,
    nondeterministic: bool = True,
    *args,
    **kwargs,
):
    """
    Registers a browser task as a gym environment with its unique id.

    Args:
        id: the id of the task to register (will be prepended by "browsergym/").
        task_class: the task class to register.
        task_kwargs: frozen task arguments (can not be overloaded at environment creation time).
            Defaults to no frozen arguments.
        default_task_kwargs: default task arguments (can be overloaded at environment creation time).
            Defaults to no default arguments.
        nondeterministic: whether the task cannot be guaranteed deterministic transitions.
        *args: additional sequential arguments for either the gym or the browsergym environment.
        **kwargs: additional keyword arguments for either the gym or the browsergym environment.

    Raises:
        ValueError: if a task parameter appears in both task_kwargs and default_task_kwargs.
    """
    # None stands for "no arguments"; avoids sharing a mutable default dict between calls
    task_kwargs = {} if task_kwargs is None else task_kwargs
    default_task_kwargs = {} if default_task_kwargs is None else default_task_kwargs

    # check overlap between frozen and default task_kwargs
    clashing_kwargs = set(task_kwargs) & set(default_task_kwargs)  # key set intersection
    if clashing_kwargs:
        raise ValueError(
            f"Illegal attempt to register Browsergym environment {id} with both frozen and default values for task parameters {clashing_kwargs}."
        )

    # freeze task_kwargs (cannot be overridden at environment creation)
    task_entrypoint = frozen_partial(task_class, **task_kwargs)

    # pre-set default_task_kwargs (can be overridden at environment creation)
    task_entrypoint = partial(task_entrypoint, **default_task_kwargs)

    # NOTE(review): *args is forwarded positionally while id / entry_point are passed by
    # keyword; presumably any positional argument would collide with them - confirm.
    gym.register(
        id=f"browsergym/{id}",
        entry_point=lambda *env_args, **env_kwargs: BrowserEnv(
            task_entrypoint, *env_args, **env_kwargs
        ),
        nondeterministic=nondeterministic,
        *args,
        **kwargs,
    )
BrowserGym/browsergym/core/src/browsergym/core/spaces.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Borrowed from https://github.com/Farama-Foundation/miniwob-plusplus/blob/553daee55ea0b2cc32b181a474083ab4cad782a1/miniwob/spaces.py"""
2
+
3
+ from typing import Any
4
+
5
+ import numpy as np
6
+ from gymnasium.spaces import Space
7
+ from numpy.typing import NDArray
8
+
9
+
10
class Unicode(Space):
    """
    Space whose members are Python strings.
    """

    def __init__(self):
        super().__init__()

    def __eq__(self, other: Any) -> bool:
        """Two spaces are equivalent when both are ``Unicode`` spaces."""
        return isinstance(other, Unicode)

    def __repr__(self) -> str:
        """Return the textual form of this space."""
        return "Unicode()"

    def contains(self, x: Any) -> bool:
        """Tell whether x belongs to this space, i.e. is a ``str`` (characters are not inspected)."""
        return isinstance(x, str)
30
+
31
+
32
class Float(Space):
    """
    Space whose members are Python floats.
    """

    def __init__(self):
        super().__init__()

    def __eq__(self, other: Any) -> bool:
        """Two spaces are equivalent when both are ``Float`` spaces."""
        return isinstance(other, Float)

    def __repr__(self) -> str:
        """Return the textual form of this space."""
        return "Float()"

    def contains(self, x: Any) -> bool:
        """Tell whether x belongs to this space, i.e. is a ``float`` instance."""
        return isinstance(x, float)
51
+
52
+
53
class Integer(Space):
    """
    Space whose members are Python integers.
    """

    def __init__(self):
        super().__init__()

    def __eq__(self, other: Any) -> bool:
        """Two spaces are equivalent when both are ``Integer`` spaces."""
        return isinstance(other, Integer)

    def __repr__(self) -> str:
        """Return the textual form of this space."""
        return "Integer()"

    def contains(self, x: Any) -> bool:
        """Tell whether x belongs to this space, i.e. is an ``int`` instance."""
        return isinstance(x, int)
72
+
73
+
74
class AnyDict(Space):
    """Space whose members are dictionaries of any content."""

    def __eq__(self, other: Any) -> bool:
        """Two spaces are equivalent when both are ``AnyDict`` spaces."""
        return isinstance(other, AnyDict)

    def __repr__(self) -> str:
        """Return the textual form of this space."""
        return "AnyDict()"

    def contains(self, x: Any) -> bool:
        """Tell whether x belongs to this space, i.e. is a ``dict`` (keys and values are not inspected)."""
        return isinstance(x, dict)
89
+
90
+
91
class Anything(Space):
    """Space that accepts every object as a member."""

    def __eq__(self, other: Any) -> bool:
        """Two spaces are equivalent when both are ``Anything`` spaces."""
        return isinstance(other, Anything)

    def __repr__(self) -> str:
        """Return the textual form of this space."""
        return "Anything()"

    def contains(self, x: Any) -> bool:
        """Every object belongs to this space."""
        return True
102
+
103
+
104
class AnyBox(Space[NDArray[Any]]):
    """
    A space of bounded arrays whose shape may contain wildcard dimensions.

    A dimension set to -1 in ``shape`` matches any size along that axis; all
    other dimensions must match exactly. Members must lie within [low, high].
    """

    def __init__(self, low, high, shape, dtype):
        super().__init__(shape, dtype)
        self.low = low
        self.high = high

    def contains(self, x: Any) -> bool:
        """Return boolean specifying if x is a valid member of this space."""
        if not isinstance(x, np.ndarray):
            try:
                x = np.asarray(x, dtype=self.dtype)
            except (ValueError, TypeError):
                return False

        return bool(
            np.can_cast(x.dtype, self.dtype)
            and len(x.shape) == len(self.shape)
            # each space dimension must equal the array dimension, or be the -1 wildcard
            and all(dim in (xdim, -1) for xdim, dim in zip(x.shape, self.shape))
            and np.all(x >= self.low)
            and np.all(x <= self.high)
        )

    def __repr__(self) -> str:
        """Gives a string representation of this space."""
        return f"AnyBox(low={repr(self.low)}, high={repr(self.high)}, shape={repr(self.shape)}, dtype={repr(self.dtype)})"

    def __eq__(self, other: Any) -> bool:
        """Check whether ``other`` is equivalent to this instance."""
        # np.array_equal reduces the bounds comparison to a single boolean, so array-valued
        # low / high do not raise "truth value of an array is ambiguous" inside the `and` chain
        return bool(
            isinstance(other, AnyBox)
            and self.shape == other.shape
            and self.dtype == other.dtype
            and np.array_equal(self.low, other.low)
            and np.array_equal(self.high, other.high)
        )
BrowserGym/browsergym/core/src/browsergym/core/task.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import ABC, abstractmethod
2
+ from typing import Tuple
3
+
4
+ import numpy as np
5
+ import playwright.sync_api
6
+
7
+
8
class AbstractBrowserTask(ABC):
    """
    Abstract class for browsergym tasks.

    Subclasses must implement setup() and validate(); cheat() and teardown() are optional.
    """

    @classmethod
    def get_task_id(cls):
        """Return the identifier of the task. Not implemented in this base class."""
        raise NotImplementedError

    def __init__(self, seed: int) -> None:
        """
        Args:
            seed: seed of the task's random number generator.
        """
        # initiate a random number generator
        self.random = np.random.RandomState(seed)

        # task properties, will be used to set up the browsergym environment
        # default values, can be overridden in children classes
        self.viewport = {"width": 1280, "height": 720}
        self.slow_mo = 1000  # ms
        self.timeout = 5000  # ms
        self.locale = None  # see https://playwright.dev/python/docs/api/class-browser#browser-new-context-option-locale
        self.timezone_id = None  # see https://playwright.dev/python/docs/api/class-browser#browser-new-context-option-timezone-id

    @abstractmethod
    def setup(self, page: playwright.sync_api.Page) -> tuple[str, dict]:
        """
        Set up everything needed to execute the task.

        Args:
            page: the active playwright page.

        Returns:
            goal: str, goal of the task.
            info: dict, custom information from the task.
        """

    @abstractmethod
    def validate(
        self, page: playwright.sync_api.Page, chat_messages: list[dict]
    ) -> Tuple[float, bool, str, dict]:
        """
        Validate the task was completed successfully

        Args:
            page: the active playwright page.
            chat_messages: the chat messages (OpenEndedTask reads the "role" and "message"
                keys of each message).

        Returns:
            reward: float, the reward obtained since last call to validate().
            done: boolean flag, indicates if the task has finished or not (be it success or fail).
            message: string, a new user message for the chat.
            info: dictionary, custom information from the task.

        """

    def cheat(self, page: playwright.sync_api.Page, chat_messages: list[dict]) -> None:
        """
        Solve the task using a pre-defined solution (optional).

        Raises NotImplementedError unless overridden.
        """
        raise NotImplementedError

    def teardown(self) -> None:
        """
        Tear down the task and clean up any resource / data created by the task (optional).

        Does nothing unless overridden.
        """
        pass
75
+
76
+
77
class OpenEndedTask(AbstractBrowserTask):
    """
    Task without a success criterion: it opens a start page and only ends when
    a user chat message says "exit".
    """

    @classmethod
    def get_task_id(cls):
        return "openended"

    def __init__(self, seed: int, start_url: str, goal: str = None) -> None:
        """
        Args:
            seed: random seed.
            start_url: str, the url for the starting page.
            goal: str, the initial goal.

        """
        super().__init__(seed)
        self.goal = goal
        self.start_url = start_url

    def setup(self, page: playwright.sync_api.Page) -> tuple[str, dict]:
        """Navigate to the start url, then hand back the goal and an empty info dict."""
        page.goto(self.start_url, timeout=10000)
        return self.goal, {}

    def teardown(self) -> None:
        """Nothing to clean up."""

    def validate(
        self, page: playwright.sync_api.Page, chat_messages: list[str]
    ) -> Tuple[float, bool, str, dict]:
        """Reward is always 0; the task is done once a user message equals "exit"."""
        user_asked_to_exit = any(
            entry["role"] == "user" and entry["message"] == "exit" for entry in chat_messages
        )
        return 0, user_asked_to_exit, "", {}
BrowserGym/browsergym/core/src/browsergym/utils/mcp_server.py ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # MCP server for BrowserGym
2
+ import argparse
3
+ import asyncio
4
+ import re
5
+ from collections.abc import AsyncIterator
6
+ from contextlib import asynccontextmanager
7
+ from dataclasses import dataclass, field
8
+ from typing import Callable
9
+
10
+ import gymnasium as gym
11
+ from mcp.server.fastmcp import FastMCP
12
+
13
+ from browsergym.core.action.highlevel import ACTION_SUBSETS, HighLevelActionSet
14
+ from browsergym.core.env import BrowserEnv
15
+
16
+
17
@dataclass
class BgymConfig:
    """Settings of the BrowserGym MCP server; filled from the CLI arguments at module level."""

    # forwarded to gym.make as `headless`
    headless: bool = True
    # forwarded to gym.make as `timeout`
    timeout_ms: int = 10000
    # forwarded to gym.make as `record_video_dir`
    record_video_dir: str | None = None
    # given to HighLevelActionSet and exposed to the action functions module
    demo_mode: HighLevelActionSet.DemoMode = "default"
    # names of the actions whose tool is registered with validate=True
    validate_actions: list[str] = field(default_factory=list)
24
+
25
+
26
@dataclass
class AppContext:
    """Objects yielded by app_lifespan and shared with the tools for the server's lifetime."""

    # environment created by gym.make (fn_wrapper unwraps it down to the BrowserEnv)
    gym: BrowserEnv
    # server settings
    config: BgymConfig
    # id of the gym task the environment was created from
    task_id: str
    # action set whose to_python_code is used as the environment's action mapping
    actions: HighLevelActionSet
32
+
33
+
34
def get_cli_args():
    """
    Parse the command-line options of the MCP server.

    Unrecognized options are tolerated and dropped (parse_known_args).

    Returns:
        The argparse namespace holding task_id, headless, record_video_dir,
        demo_mode, timeout_ms, subset and validate_actions.
    """
    cli = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="BrowserGym MCP server",
        usage="python browsergym/core/src/browsergym/utils/%(prog)s [options]",
        epilog="To run Dev UI: mcp dev browsergym/core/src/browsergym/utils/mcp_server.py -e browsergym/core/",
    )
    cli.add_argument(
        "-t", "--task_id", default="browsergym/openended", type=str, help="Task ID to run"
    )
    cli.add_argument("-l", "--headless", action="store_true", help="Run in headless mode")
    cli.add_argument(
        "-r",
        "--record_video_dir",
        default=None,
        type=str,
        help="Directory to save recorded videos",
    )
    cli.add_argument(
        "--demo_mode",
        default="off",
        type=str,
        choices=["off", "default", "all_blue", "only_visible_elements"],
        help="Demo mode for action set",
    )
    cli.add_argument(
        "--timeout_ms", default=10000, type=int, help="Timeout in milliseconds for each step"
    )
    cli.add_argument(
        "--subset",
        default="workarena++",
        type=str,
        choices=ACTION_SUBSETS.keys(),
        help="Subset of actions to use",
    )
    cli.add_argument(
        "--validate_actions",
        default=["click", "goto"],
        type=str,
        nargs="+",
        help="Names of actions for which validation should be performed",
    )
    recognized, _unrecognized = cli.parse_known_args()
    return recognized
90
+
91
+
92
# The CLI is parsed at import time: the lifespan handler and the tool registration below
# read these module-level values.
args = get_cli_args()
task_id = args.task_id
config = BgymConfig(
    headless=args.headless,
    timeout_ms=args.timeout_ms,
    record_video_dir=args.record_video_dir,
    demo_mode=args.demo_mode,
    validate_actions=args.validate_actions,
)
101
+
102
+
103
@asynccontextmanager
async def app_lifespan(server: FastMCP) -> AsyncIterator[AppContext]:
    """
    Manage application lifecycle with type-safe context.

    Creates the gym environment from the module-level settings and resets it on startup,
    yields the shared AppContext, and closes the environment on shutdown.

    Args:
        server: the FastMCP server the lifespan belongs to (not used here).

    Yields:
        The AppContext shared with the tools.
    """
    # Initialize on startup (blocking calls are moved off the event loop thread)
    actions = HighLevelActionSet(demo_mode=config.demo_mode, subsets=args.subset)
    _gym: BrowserEnv = await asyncio.to_thread(
        gym.make,
        task_id,
        headless=config.headless,
        record_video_dir=config.record_video_dir,
        action_mapping=actions.to_python_code,
        timeout=config.timeout_ms,
        task_kwargs={"start_url": "about:blank"},
    )  # type: ignore

    try:
        # reset inside the try block, so that the environment is closed even if reset fails
        # NOTE(review): assumes close() is safe to call after a failed reset() - confirm.
        await asyncio.to_thread(_gym.reset)
        yield AppContext(gym=_gym, config=config, task_id=task_id, actions=actions)
    finally:
        # Cleanup on shutdown
        await asyncio.to_thread(_gym.close)
124
+
125
+
126
# The MCP server instance; app_lifespan creates and closes the gym environment around it.
mcp = FastMCP("BrowserGym", lifespan=app_lifespan)
127
+
128
+
129
def format_func_call(func: Callable, args, kwargs) -> str:
    """
    Render a call to func as source-like text, e.g. ``click('a12', button='left')``.

    Args:
        func: the called function (only its __name__ is used).
        args: the positional arguments of the call.
        kwargs: the keyword arguments of the call.

    Returns:
        The function name followed by the repr of every argument, in parentheses.
    """
    positional = ", ".join(map(repr, args))
    keyword = ", ".join(f"{name}={value!r}" for name, value in kwargs.items())
    # drop the empty side, so that no dangling separator is produced
    rendered = ", ".join(part for part in (positional, keyword) if part)
    return f"{func.__name__}({rendered})"
134
+
135
+
136
def fn_wrapper(func: Callable, validate: bool = True):
    """
    Wrap an action function into an async callable that runs it against the server's gym.

    Args:
        func: a function of the action space; the function of the same name is looked up
            in browsergym.core.action.functions at call time.
        validate: passed as second argument to the gym's post_step.

    Returns:
        An async function carrying the name and docstring of func, with __wrapped__ set to func.
    """

    async def decorator(*args, **kwargs):
        """
        Decorator to execute function from the action space in the context of the gym.
        1. Loads the parent module of the function to use as function context
        2. Executes the pre_step method of the gym
        3. Sets up the module vars from the current state of the gym
        4. Executes the function from this module and handles any exceptions
        5. Executes the post_step method of the gym

        """
        gym: BrowserEnv = mcp.get_context().request_context.lifespan_context.gym  # type: ignore
        while not isinstance(gym, BrowserEnv):
            gym = (
                gym.env
            )  # gym library wraps the BrowserEnv in a few layers (usually 2) of wrappers, this loop unwraps them

        # Load the parent module of the function to use as function context
        import browsergym.core.action.functions as fn_context

        fn = getattr(fn_context, func.__name__)

        gym.last_action = format_func_call(fn, args, kwargs)
        info, send_message_to_user, report_infeasible_instructions = await asyncio.to_thread(
            gym.pre_step
        )

        # Set up the module vars from the current state of the gym
        # NOTE(review): these are attributes of the shared module, so concurrent tool calls
        # would presumably overwrite each other's context - confirm calls are serialized.
        fn_context.send_message_to_user = send_message_to_user
        fn_context.report_infeasible_instructions = report_infeasible_instructions
        fn_context.page = gym.page
        fn_context.demo_mode = config.demo_mode

        try:
            # called directly from the coroutine, unlike pre_step / post_step
            fn(*args, **kwargs)
            gym.last_action_error = ""
        except Exception as e:
            # best effort: the error is reported through the gym instead of being raised
            gym.last_action_error = f"{type(e).__name__}: {e}"
            # extract the timeout (ms) from the error text and report it in seconds
            match = re.match("TimeoutError: Timeout ([0-9]+)ms exceeded.", gym.last_action_error)
            if match:
                info["action_exec_timeout"] = float(match.groups()[0]) / 1000

        results = await asyncio.to_thread(gym.post_step, info, validate)
        return results

    # make the wrapper look like the wrapped action (name, docstring, __wrapped__)
    decorator.__wrapped__ = func  # type: ignore
    decorator.__name__ = func.__name__
    decorator.__doc__ = func.__doc__
    return decorator
185
+
186
+
187
# Expose every action of the selected subset as an MCP tool; validation is only enabled
# for the actions listed in config.validate_actions.
for fn in ACTION_SUBSETS[args.subset]:
    validate = fn.__name__ in config.validate_actions
    mcp.add_tool(fn_wrapper(fn, validate))

# When executed as a script, serve over stdio.
if __name__ == "__main__":
    mcp.run(transport="stdio")
BrowserGym/browsergym/core/src/browsergym/utils/obs.py ADDED
@@ -0,0 +1,554 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ast
2
+ import logging
3
+ import math
4
+ import re
5
+ from collections import defaultdict
6
+
7
+ import numpy as np
8
+ import PIL.Image
9
+ import PIL.ImageDraw
10
+ import PIL.ImageFont
11
+ from bs4 import BeautifulSoup
12
+
13
+ from browsergym.core.constants import BROWSERGYM_ID_ATTRIBUTE as BID_ATTR
14
+ from browsergym.core.constants import BROWSERGYM_SETOFMARKS_ATTRIBUTE as SOM_ATTR
15
+ from browsergym.core.constants import BROWSERGYM_VISIBILITY_ATTRIBUTE as VIS_ATTR
16
+
17
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Default value of the `ignored_roles` argument of flatten_axtree_to_str:
# nodes with one of these roles are not printed.
IGNORED_AXTREE_ROLES = ["LineBreak"]

# Default value of the `ignored_properties` argument of flatten_axtree_to_str:
# node properties with one of these names are not printed.
IGNORED_AXTREE_PROPERTIES = (
    "editable",
    "readonly",
    "level",
    "settable",
    "multiline",
    "invalid",
    "focusable",
)
30
+
31
+
32
def flatten_dom_to_str(
    dom_snapshot,
    extra_properties: dict = None,
    with_visible: bool = False,
    with_clickable: bool = False,
    with_center_coords: bool = False,
    with_bounding_box_coords: bool = False,
    with_som: bool = False,
    filter_visible_only: bool = False,
    filter_with_bid_only: bool = False,
    filter_som_only: bool = False,
    coord_decimals: int = 0,
    hide_bid_if_invisible: bool = False,
    hide_all_bids: bool = False,
) -> str:
    """
    Formats a DOM snapshot into a string text

    Args:
        dom_snapshot: the DOM snapshot; its "strings" and "documents" entries are read.
        extra_properties: per-bid properties ("visibility", "bbox", "clickable",
            "set_of_marks"). Required by every with_* / filter_* option and by
            hide_bid_if_invisible.
        with_visible, with_clickable, with_center_coords, with_bounding_box_coords, with_som:
            print the corresponding extra attribute on the elements.
        filter_visible_only, filter_with_bid_only, filter_som_only: element filters,
            see _process_bid.
        coord_decimals: number of decimals of the printed coordinates.
        hide_bid_if_invisible: do not print the bid of elements whose visibility is below 0.5.
        hide_all_bids: do not print any bid.

    Returns:
        The prettified HTML text of the first document (sub-documents are inlined).

    Raises:
        ValueError: if an option requires extra_properties and it was not given.
    """

    def to_string(idx):
        # -1 marks a missing string
        if idx == -1:
            return None
        else:
            return dom_snapshot["strings"][idx]

    def bid_is_hidden(bid) -> bool:
        # tells whether the bid attribute must be left out of the printed element
        if hide_all_bids or bid is None:
            return True
        if not hide_bid_if_invisible:
            return False
        if extra_properties is None:
            # was an AttributeError on None; same error as in _process_bid for this misuse
            raise ValueError("extra_properties argument required")
        return extra_properties.get(bid, {}).get("visibility", 0) < 0.5

    def parse_document(document_idx) -> str:
        # adapted from [natbot](https://github.com/nat/natbot)

        nodes = dom_snapshot["documents"][document_idx]["nodes"]
        node_children = defaultdict(list)

        for node_idx in range(len(nodes["nodeName"])):
            parent_idx = nodes["parentIndex"][node_idx]
            if parent_idx != -1:
                node_children[parent_idx].append(node_idx)

        # node index -> index of the document it embeds, built once instead of scanning the
        # list for every node (first occurrence wins, like list.index)
        content_documents = nodes["contentDocumentIndex"]
        sub_document_of_node = {}
        for owner_idx, sub_idx in zip(content_documents["index"], content_documents["value"]):
            sub_document_of_node.setdefault(owner_idx, sub_idx)

        def dfs(node_idx: int, parent_node_skipped: bool) -> str:

            # https://developer.mozilla.org/en-US/docs/Web/API/Node/nodeType
            # https://developer.mozilla.org/en-US/docs/Web/API/Node/nodeName
            # https://developer.mozilla.org/en-US/docs/Web/API/Node/nodeValue

            node_type = nodes["nodeType"][node_idx]
            node_name = to_string(nodes["nodeName"][node_idx])
            node_value = to_string(nodes["nodeValue"][node_idx])
            html_before = ""
            html_after = ""
            skip_node = False

            # text nodes: print text content only if parent was not skipped
            if node_type == 3:  # node_name == "#text"
                if not parent_node_skipped and node_value is not None:
                    html_before += node_value

            # CData nodes: print content only if parent was not skipped
            elif node_type == 4:  # node_name == "#cdata-section":
                if not parent_node_skipped and node_value is not None:
                    # NOTE(review): standard CDATA syntax is "<![CDATA[...]]>"; kept as is
                    # because it changes the produced text - confirm whether it is a typo.
                    html_before += f"<!CDATA[[{node_value}]]>"

            # processing instructions, comments, documents, doctypes, document fragments: don't print
            elif node_type in (7, 8, 9, 10, 11):
                skip_node = True

            # now we should have an element node
            else:
                assert node_type == 1

                tag_name = node_name.lower().strip()
                attributes = []  # to be printed as attributes with the tag
                bid = None

                # parse node attributes (flat list of name index, value index pairs)
                node_attr_idxs = nodes["attributes"][node_idx]
                for i in range(0, len(node_attr_idxs), 2):
                    attr_name = to_string(node_attr_idxs[i])
                    attr_value = to_string(node_attr_idxs[i + 1])

                    # extract and print bid
                    if attr_name == BID_ATTR:
                        bid = attr_value
                    # ignore browsergym attributes
                    elif attr_name in (VIS_ATTR, SOM_ATTR):
                        pass
                    # print other attributes
                    else:
                        if attr_value is None:
                            # attribute value missing
                            attributes.append(f"{attr_name}")
                        else:
                            # attribute value present
                            attributes.append(f'{attr_name}="{attr_value}"')

                skip_node, extra_attributes_to_print = _process_bid(
                    bid,
                    extra_properties=extra_properties,
                    with_visible=with_visible,
                    with_clickable=with_clickable,
                    with_center_coords=with_center_coords,
                    with_bounding_box_coords=with_bounding_box_coords,
                    with_som=with_som,
                    filter_visible_only=filter_visible_only,
                    filter_with_bid_only=filter_with_bid_only,
                    filter_som_only=filter_som_only,
                    coord_decimals=coord_decimals,
                )

                # insert extra attributes before regular attributes
                attributes = extra_attributes_to_print + attributes

                # insert bid as first attribute
                if not bid_is_hidden(bid):
                    attributes.insert(0, f'bid="{bid}"')

                if not skip_node:
                    # print node opening tag, with its attributes
                    html_before += f"<{tag_name}" + " ".join([""] + attributes) + ">"
                    # print node closing tag
                    html_after += f"</{tag_name}>"

            html = ""
            html += html_before

            # recursively print iframe nodes if any
            if node_idx in sub_document_of_node:
                html += parse_document(document_idx=sub_document_of_node[node_idx])

            # recursively print children nodes if any
            for child_idx in node_children[node_idx]:
                html += dfs(node_idx=child_idx, parent_node_skipped=skip_node)

            html += html_after

            return html

        html = dfs(node_idx=0, parent_node_skipped=False)

        # Format the HTML document with indentation
        soup = BeautifulSoup(html, "lxml")
        html = soup.prettify()

        return html

    html = parse_document(document_idx=0)

    return html
185
+
186
+
187
+ def _get_coord_str(coord, decimals):
188
+ if isinstance(coord, str):
189
+ coord = list(map(float, ast.literal_eval(coord)))
190
+
191
+ coord_format = f".{decimals}f"
192
+ coord_str = ",".join([f"{c:{coord_format}}" for c in coord])
193
+ return f"({coord_str})"
194
+
195
+
196
+ def _process_bid(
197
+ bid,
198
+ extra_properties: dict = None,
199
+ with_visible: bool = False,
200
+ with_clickable: bool = False,
201
+ with_center_coords: bool = False,
202
+ with_bounding_box_coords: bool = False,
203
+ with_som: bool = False,
204
+ filter_visible_only: bool = False,
205
+ filter_with_bid_only: bool = False,
206
+ filter_som_only: bool = False,
207
+ coord_decimals: int = 0,
208
+ ):
209
+ """
210
+ Process extra attributes and attribute-based filters, for the element with the given bid.
211
+
212
+ Returns:
213
+ A flag indicating if the element should be skipped or not (due to filters).
214
+ Attributes to be printed, as a list of "x=y" strings.
215
+ """
216
+
217
+ if extra_properties is None:
218
+ if any(
219
+ (
220
+ with_visible,
221
+ with_clickable,
222
+ with_center_coords,
223
+ with_bounding_box_coords,
224
+ with_som,
225
+ filter_visible_only,
226
+ filter_with_bid_only,
227
+ filter_som_only,
228
+ )
229
+ ):
230
+ raise ValueError("extra_properties argument required")
231
+ else:
232
+ extra_properties = {}
233
+
234
+ skip_element = False
235
+ attributes_to_print = []
236
+
237
+ if bid is None:
238
+ # skip nodes without a bid (if requested)
239
+ if filter_with_bid_only:
240
+ skip_element = True
241
+ if filter_som_only:
242
+ skip_element = True
243
+ if filter_visible_only:
244
+ # element without bid have no visibility mark, they could be visible or non-visible
245
+ # TODO we consider them as visible. Is this what we want? Now that duplicate bids are handled, should we mark all non-html elements?
246
+ pass # keep elements without visible property
247
+ # skip_element = True # filter elements without visible property
248
+
249
+ # parse extra browsergym properties, if node has a bid
250
+ else:
251
+ if bid in extra_properties:
252
+ node_vis = extra_properties[bid]["visibility"]
253
+ node_bbox = extra_properties[bid]["bbox"]
254
+ node_is_clickable = extra_properties[bid]["clickable"]
255
+ node_in_som = extra_properties[bid]["set_of_marks"]
256
+ node_is_visible = node_vis >= 0.5
257
+ # skip non-visible nodes (if requested)
258
+ if filter_visible_only and not node_is_visible:
259
+ skip_element = True
260
+ if filter_som_only and not node_in_som:
261
+ skip_element = True
262
+ # print extra attributes if requested (with new names)
263
+ if with_som and node_in_som:
264
+ attributes_to_print.insert(0, f"som")
265
+ if with_visible and node_is_visible:
266
+ attributes_to_print.insert(0, f"visible")
267
+ if with_clickable and node_is_clickable:
268
+ attributes_to_print.insert(0, f"clickable")
269
+ if with_center_coords and node_bbox is not None:
270
+ x, y, width, height = node_bbox
271
+ center = (x + width / 2, y + height / 2)
272
+ attributes_to_print.insert(0, f'center="{_get_coord_str(center, coord_decimals)}"')
273
+ if with_bounding_box_coords and node_bbox is not None:
274
+ x, y, width, height = node_bbox
275
+ box = (x, y, x + width, y + height)
276
+ attributes_to_print.insert(0, f'box="{_get_coord_str(box, coord_decimals)}"')
277
+
278
+ return skip_element, attributes_to_print
279
+
280
+
281
def flatten_axtree_to_str(
    AX_tree,
    extra_properties: dict = None,
    with_visible: bool = False,
    with_clickable: bool = False,
    with_center_coords: bool = False,
    with_bounding_box_coords: bool = False,
    with_som: bool = False,
    skip_generic: bool = True,
    filter_visible_only: bool = False,
    filter_with_bid_only: bool = False,
    filter_som_only: bool = False,
    coord_decimals: int = 0,
    ignored_roles=IGNORED_AXTREE_ROLES,
    ignored_properties=IGNORED_AXTREE_PROPERTIES,
    remove_redundant_static_text: bool = True,
    hide_bid_if_invisible: bool = False,
    hide_all_children: bool = False,
    hide_all_bids: bool = False,
) -> str:
    """
    Formats the accessibility tree into a string text

    Args:
        AX_tree: the accessibility tree; its "nodes" list is read, the first node being
            used as the root.
        extra_properties: per-bid properties ("visibility", "bbox", "clickable",
            "set_of_marks"). Required by every with_* / filter_* option and by
            hide_bid_if_invisible.
        with_visible, with_clickable, with_center_coords, with_bounding_box_coords, with_som:
            print the corresponding extra attribute on the nodes.
        skip_generic: do not print "generic" nodes that have no attribute to print.
        filter_visible_only, filter_with_bid_only, filter_som_only: node filters,
            see _process_bid.
        coord_decimals: number of decimals of the printed coordinates.
        ignored_roles: roles of the nodes that are not printed.
        ignored_properties: names of the node properties that are not printed.
        remove_redundant_static_text: do not print StaticText nodes whose name is
            contained in the name of their parent.
        hide_bid_if_invisible: do not print the bid of nodes whose visibility is below 0.5.
        hide_all_children: do not print the children of filtered nodes.
        hide_all_bids: do not print any bid.

    Returns:
        One line per printed node, indented with tabs according to its depth.

    Raises:
        ValueError: if an option requires extra_properties and it was not given.
    """
    node_id_to_idx = {}
    for idx, node in enumerate(AX_tree["nodes"]):
        node_id_to_idx[node["nodeId"]] = idx

    def bid_is_hidden(bid) -> bool:
        # tells whether the bid must be left out of the printed node
        if hide_all_bids or bid is None:
            return True
        if not hide_bid_if_invisible:
            return False
        if extra_properties is None:
            # was an AttributeError on None; same error as in _process_bid for this misuse
            raise ValueError("extra_properties argument required")
        return extra_properties.get(bid, {}).get("visibility", 0) < 0.5

    def dfs(node_idx: int, depth: int, parent_node_filtered: bool, parent_node_name: str) -> str:
        tree_str = ""
        node = AX_tree["nodes"][node_idx]
        indent = "\t" * depth
        skip_node = False  # node will not be printed, with no effect on children nodes
        filter_node = False  # node will not be printed, possibly along with its children nodes
        node_role = node["role"]["value"]
        node_name = ""

        if node_role in ignored_roles:
            skip_node = True
        elif "name" not in node:
            skip_node = True
        else:
            node_name = node["name"]["value"]
            if "value" in node and "value" in node["value"]:
                node_value = node["value"]["value"]
            else:
                node_value = None

            # extract bid
            bid = node.get("browsergym_id", None)

            # extract node attributes
            attributes = []
            for prop in node.get("properties", []):
                if "value" not in prop:
                    continue
                if "value" not in prop["value"]:
                    continue

                prop_name = prop["name"]
                prop_value = prop["value"]["value"]

                if prop_name in ignored_properties:
                    continue
                elif prop_name in ("required", "focused", "atomic"):
                    # flag-like properties: printed by name only, and only when truthy
                    if prop_value:
                        attributes.append(prop_name)
                else:
                    attributes.append(f"{prop_name}={repr(prop_value)}")

            if skip_generic and node_role == "generic" and not attributes:
                skip_node = True

            if hide_all_children and parent_node_filtered:
                skip_node = True

            if node_role == "StaticText":
                if parent_node_filtered:
                    skip_node = True
                elif remove_redundant_static_text and node_name in parent_node_name:
                    skip_node = True
            else:
                filter_node, extra_attributes_to_print = _process_bid(
                    bid,
                    extra_properties=extra_properties,
                    with_visible=with_visible,
                    with_clickable=with_clickable,
                    with_center_coords=with_center_coords,
                    with_bounding_box_coords=with_bounding_box_coords,
                    with_som=with_som,
                    filter_visible_only=filter_visible_only,
                    filter_with_bid_only=filter_with_bid_only,
                    filter_som_only=filter_som_only,
                    coord_decimals=coord_decimals,
                )

                # if either is True, skip the node
                skip_node = skip_node or filter_node

                # insert extra attributes before regular attributes
                attributes = extra_attributes_to_print + attributes

            # actually print the node string
            if not skip_node:
                if node_role == "generic" and not node_name:
                    node_str = f"{node_role}"
                else:
                    node_str = f"{node_role} {repr(node_name.strip())}"

                if not bid_is_hidden(bid):
                    node_str = f"[{bid}] " + node_str

                if node_value is not None:
                    node_str += f" value={repr(node_value)}"

                if attributes:
                    node_str += ", ".join([""] + attributes)

                tree_str += f"{indent}{node_str}"

        for child_node_id in node["childIds"]:
            # ignore unknown children and self-references
            if child_node_id not in node_id_to_idx or child_node_id == node["nodeId"]:
                continue
            # mark this to save some tokens
            child_depth = depth if skip_node else (depth + 1)
            child_str = dfs(
                node_id_to_idx[child_node_id],
                child_depth,
                parent_node_filtered=filter_node,
                parent_node_name=node_name,
            )
            if child_str:
                if tree_str:
                    tree_str += "\n"
                tree_str += child_str

        return tree_str

    tree_str = dfs(0, 0, False, "")
    return tree_str
427
+
428
+
429
def overlay_som(
    screenshot: np.typing.ArrayLike,
    extra_properties: dict,
    fontsize: int = 12,
    linewidth: int = 2,
    tag_margin: int = 2,
):
    """
    Overlay set-of-marks annotations on a screenshot.

    For every entry of `extra_properties` whose "set_of_marks" and "bbox" values are
    both truthy, a dashed black rectangle is drawn around the bounding box and a
    black tag showing the bid in white text is pasted near the box's upper-left
    corner. Boxes whose area is below 20 are skipped with a warning.

    Args:
        screenshot: image data accepted by `PIL.Image.fromarray`.
        extra_properties: mapping from bid to a properties dict holding at least the
            "set_of_marks" and "bbox" keys; "bbox" unpacks as (x, y, width, height).
        fontsize: size handed to `PIL.ImageFont.load_default` for the tag text.
        linewidth: width of the dashed bounding-box lines.
        tag_margin: margin around the tag text (on top of a 1px border).

    Returns:
        The annotated image as a numpy array, converted to RGB (3 channels).
    """
    black = (0, 0, 0, 255)
    white = (255, 255, 255, 255)

    # draw on an RGBA copy of the screenshot
    canvas = PIL.Image.fromarray(screenshot).copy().convert(mode="RGBA")
    pen = PIL.ImageDraw.Draw(canvas)
    font = PIL.ImageFont.load_default(size=fontsize)

    # Adapted from https://stackoverflow.com/questions/51908563/dotted-or-dashed-line-with-python-pillow/58885306#58885306
    def linedashed(
        draw: PIL.ImageDraw.Draw, x0, y0, x1, y1, fill, width, dash_length=4, nodash_length=8
    ):
        """Draw a dashed segment from (x0, y0) to (x1, y1) using `draw.line`."""
        span_x = x1 - x0  # can be negative
        span_y = y1 - y0  # can be negative
        total = math.hypot(span_x, span_y)
        if total == 0:
            return  # nothing to draw, and avoids dividing by zero below
        unit_x = span_x / total  # x increment per unit of line length
        unit_y = span_y / total  # y increment per unit of line length
        offset = 0
        while offset < total:
            # the last dash is clipped to the end of the segment
            stop = min(offset + dash_length, total)
            draw.line(
                (
                    round(x0 + unit_x * offset),
                    round(y0 + unit_y * offset),
                    round(x0 + unit_x * stop),
                    round(y0 + unit_y * stop),
                ),
                fill=fill,
                width=width,
            )
            offset += dash_length + nodash_length

    for bid, properties in extra_properties.items():
        if not (properties["set_of_marks"] and properties["bbox"]):
            continue

        x, y, width, height = properties["bbox"]
        x0, y0 = x, y
        x1, y1 = x + width, y + height

        # skip small boxes
        area = (x1 - x0) * (y1 - y0)
        if area < 20:
            logger.warning(
                f'som overlay: skipping bid "{bid}" due to bbox too small (area={area})'
            )
            continue

        # dashed bounding box: walk the four corners and close the loop
        corners = ((x0, y0), (x1, y0), (x1, y1), (x0, y1))
        for (start_x, start_y), (end_x, end_y) in zip(corners, corners[1:] + corners[:1]):
            linedashed(pen, start_x, start_y, end_x, end_y, fill=black, width=linewidth)

        # text box of the bid as (left, top, right, bottom)
        left, top, right, bottom = font.getbbox(bid)

        # tag size, including margins
        tag_size = (
            (right - left + 2 * (tag_margin + 1)),
            (bottom - top + 2 * (tag_margin + 1)),
        )

        # black tag with the bid in white, framed by a 1px white outline
        tag_img = PIL.Image.new("RGBA", tag_size, "black")
        tag_draw = PIL.ImageDraw.Draw(tag_img)
        tag_draw.text(
            (-left + tag_margin + 1, -top + tag_margin + 1),
            bid,
            font=font,
            fill=white,
            spacing=0,
        )
        tag_draw.rectangle(
            (0, 0, tag_size[0] - 1, tag_size[1] - 1),
            fill=None,
            outline=white,
            width=1,
        )

        # paste the tag in the source image, upper left of the bounding box
        tag_pos = [round(x + 0), round(y - tag_size[1] / 2 + 4)]
        canvas.paste(tag_img, tag_pos)

    # back to RGB (3 channels), returned as a numpy array
    return np.array(canvas.convert(mode="RGB"))
530
+
531
+
532
def prune_html(html):
    """
    Return a simplified, prettified version of an HTML string.

    Newlines are replaced by spaces and HTML comments are removed before parsing.
    The parsed tags are then visited in reverse order:
      - `html` and `body` tags are unwrapped (their content is kept);
      - `style`, `link`, `script` and `br` tags are removed entirely;
      - `div`, `span`, `i` and `p` tags whose only attribute is `bid` are removed
        when empty and unwrapped otherwise.

    Args:
        html: the HTML markup, as a string.

    Returns:
        The pruned markup as produced by `BeautifulSoup.prettify()`.
    """
    wrapper_tags = ("html", "body")
    useless_tags = ("style", "link", "script", "br")
    structural_tags = ("div", "span", "i", "p")

    text = re.sub(r"\n", " ", html)
    # remove html comments
    text = re.sub(r"<!--(.*?)-->", "", text, flags=re.MULTILINE)

    soup = BeautifulSoup(text, "lxml")
    for tag in reversed(soup.find_all()):
        name = tag.name
        if name in wrapper_tags:
            # drop the tag itself, keep what it contains
            tag.unwrap()
            continue
        if name in useless_tags:
            tag.decompose()
            continue
        if name in structural_tags and len(tag.attrs) == 1 and tag.has_attr("bid"):
            # purely structural tag: keep its content if it has any
            if tag.contents:
                tag.unwrap()
            else:
                tag.decompose()

    return soup.prettify()
BrowserGym/browsergym/experiments/README.md ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # BrowserGym experiments
2
+
3
+ This package provides `browsergym.experiments`, a suite of experimentation tools for [BrowserGym](https://github.com/ServiceNow/BrowserGym).
4
+
5
+ As a convenience namespace, it also provides `bgym`.
6
+
7
+ ## Setup
8
+
9
+ 1. Install the package
10
+ ```sh
11
+ pip install browsergym-experiments
12
+ ```
BrowserGym/browsergym/experiments/pyproject.toml ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [build-system]
2
+ requires = ["hatchling", "hatch-requirements-txt"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "browsergym-experiments"
7
+ description = "Experimentation tools for BrowserGym"
8
+ authors = [
9
+ {name = "Massimo Caccia"},
10
+ {name = "Alex Lacoste"},
11
+ {name = "Thibault Le Sellier De Chezelles"},
12
+ {name = "Maxime Gasse"},
13
+ ]
14
+ readme = "README.md"
15
+ requires-python = ">3.7"
16
+ license = {text = "Apache-2.0"}
17
+ classifiers = [
18
+ "Development Status :: 3 - Alpha",
19
+ "Programming Language :: Python :: 3",
20
+ "Operating System :: OS Independent",
21
+ "Intended Audience :: Science/Research",
22
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
23
+ "License :: OSI Approved :: Apache Software License",
24
+ ]
25
+ dynamic = ["dependencies", "version"]
26
+
27
+ [project.optional-dependencies]
28
+ miniwob = [
29
+ "browsergym-miniwob",
30
+ ]
31
+ workarena = [
32
+ "browsergym-workarena",
33
+ ]
34
+ webarena = [
35
+ "browsergym-webarena",
36
+ ]
37
+ visualwebarena = [
38
+ "browsergym-visualwebarena",
39
+ ]
40
+ assistantbench = [
41
+ "browsergym-assistantbench",
42
+ ]
43
+ weblinx = [
44
+ "weblinx_browsergym",
45
+ ]
46
# "all" aggregates every benchmark extra through self-referencing extras, so the
# name used here must match this package's own name ("browsergym-experiments").
all = [
    "browsergym-experiments[miniwob]",
    "browsergym-experiments[workarena]",
    "browsergym-experiments[webarena]",
    "browsergym-experiments[visualwebarena]",
    "browsergym-experiments[assistantbench]",
    "browsergym-experiments[weblinx]",
]
54
+
55
+ [project.urls]
56
+ homepage = "https://github.com/ServiceNow/BrowserGym"
57
+
58
+ [tool.hatch.version]
59
+ path = "../core/src/browsergym/core/__init__.py"
60
+
61
+ [tool.hatch.metadata.hooks.requirements_txt]
62
+ files = ["requirements.txt"]
63
+
64
+ [tool.hatch.build.targets.wheel]
65
+ packages = ["src/browsergym", "src/bgym"]
BrowserGym/browsergym/experiments/requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ browsergym-core==0.13.4
2
+ tiktoken>=0.4
3
+ dataclasses-json
BrowserGym/browsergym/experiments/src/bgym/__init__.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from browsergym.core.action.base import AbstractActionSet
2
+ from browsergym.core.action.highlevel import HighLevelActionSet
3
+ from browsergym.core.action.python import PythonActionSet
4
+ from browsergym.experiments.agent import Agent, AgentInfo
5
+ from browsergym.experiments.benchmark import (
6
+ DEFAULT_BENCHMARKS,
7
+ Benchmark,
8
+ HighLevelActionSetArgs,
9
+ )
10
+ from browsergym.experiments.loop import (
11
+ AbstractAgentArgs,
12
+ EnvArgs,
13
+ ExpArgs,
14
+ ExpResult,
15
+ StepInfo,
16
+ StepTimestamps,
17
+ )
BrowserGym/browsergym/experiments/src/browsergym/experiments/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ from .agent import Agent, AgentInfo
2
+ from .loop import AbstractAgentArgs, EnvArgs, ExpArgs, get_exp_result
BrowserGym/browsergym/experiments/src/browsergym/experiments/agent.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import ABC, abstractmethod
2
+ from dataclasses import dataclass, field
3
+ from typing import Any
4
+
5
+ from browsergym.core.action.base import AbstractActionSet
6
+ from browsergym.core.action.highlevel import HighLevelActionSet
7
+ from browsergym.utils.obs import flatten_axtree_to_str, flatten_dom_to_str, prune_html
8
+
9
+
10
def default_obs_preprocessor(obs: dict) -> dict:
    """
    Build the default agent-facing observation from a raw observation.

    A shallow copy of `obs` is augmented with three text entries ("dom_txt",
    "axtree_txt" and "pruned_html") and stripped of the raw "dom_object" and
    "axtree_object" entries. The dict passed in is not modified.

    Args:
        obs: raw observation holding at least "dom_object" and "axtree_object".

    Returns:
        The new observation dict.
    """
    processed = obs.copy()  # shallow copy, the caller's dict stays as it was

    # text versions of the DOM and AXTree
    dom_text = flatten_dom_to_str(processed["dom_object"])
    processed["dom_txt"] = dom_text
    processed["axtree_txt"] = flatten_axtree_to_str(processed["axtree_object"])
    processed["pruned_html"] = prune_html(dom_text)

    # raw entries the agent won't use, and that we don't want to record
    for raw_key in ("dom_object", "axtree_object"):
        del processed[raw_key]

    return processed


# defaults picked up by the `Agent` base class
DEFAULT_ACTION_SET: AbstractActionSet = HighLevelActionSet()
DEFAULT_OBS_PREPROCESSOR: callable = default_obs_preprocessor
24
+
25
+
26
@dataclass
class AgentInfo:
    """
    Extra information an agent returns alongside its action.

    On top of regular attribute access, a few dict-style helpers are available
    (`info[key]`, `key in info`, `info.get(key)`, `info.pop(key)`); all of them are
    thin wrappers around attribute lookup on the instance.
    """

    # optional chain of thought
    think: str = None
    # presumably the messages exchanged with the LLM — confirm against callers
    chat_messages: list = None
    # extra statistics, a fresh dict per instance
    stats: dict = field(default_factory=dict)
    markdown_page: str = ""
    html_page: str = ""
    # additional free-form information
    extra_info: dict = None

    def get(self, key, default=None):
        """Return the attribute named `key`, or `default` when it does not exist."""
        return getattr(self, key, default)

    def pop(self, key, default=None):
        """
        Return the attribute named `key`, or `default` when it does not exist.

        Unlike `dict.pop`, nothing is removed: the attribute keeps its value.
        """
        return getattr(self, key, default)

    def __getitem__(self, key):
        """Return the attribute named `key` (AttributeError when it does not exist)."""
        return getattr(self, key)

    def __contains__(self, key):
        """Tell whether the instance has an attribute named `key`."""
        try:
            getattr(self, key)
        except AttributeError:
            return False
        return True
46
+
47
+
48
class Agent(ABC):
    """
    Template class spelling out the interface an agent must expose to interact
    with a browsergym environment.

    Attributes:
        action_set: AbstractActionSet
            The set of actions the agent can take in the environment.
            Meant to be overloaded by your agent (optional).
            Defaults to BrowserGym's high-level action set.
    """

    action_set: AbstractActionSet = DEFAULT_ACTION_SET

    def obs_preprocessor(self, obs: dict) -> Any:
        """
        Pre-process an observation before it is fed to `get_action()`.
        Meant to be overloaded by your agent (optional).
        By default, the base observation is augmented with text versions of the DOM and AXTREE.

        Why this mapping? It happens within the experiment loop, so that the resulting
        observation gets recorded in the execution traces, and statistics can be computed from it.
        """
        preprocess = DEFAULT_OBS_PREPROCESSOR
        return preprocess(obs)

    @abstractmethod
    def get_action(self, obs: Any) -> tuple[str, AgentInfo]:
        """
        Update the agent with the current observation, and return its next action (plus an
        AgentInfo object carrying optional extra information).

        Parameters:
        -----------
        obs:
            The current observation of the environment, after it has been processed by `obs_preprocessor()`.
            By default, a BrowserGym observation is a dict with the following entries:
            - "chat_messages": list[str], messages between the agent and the user.
            - "goal": str, the current goal.
            - "open_pages_urls": list[str], open pages.
            - "active_page_index": int, the index of the active page.
            - "url": str, the current URL.
            - "screenshot": 3D np.array, the current screenshot.
            - "dom_object": dict, the current DOM object. See DOMSnapshot from chrome devtools.
            - "axtree_object": dict, the current AXTREE object. See Accessibility Tree from chrome devtools.
            - "extra_element_properties": dict[bid, dict[name, value]] extra
                properties of elements in the DOM.
            - "focused_element_bid": str, the bid of the focused element.
            - "last_action": str, the last action executed.
            - "last_action_error": str, the error of the last action.
            - "elapsed_time": float, the time elapsed since the start of the episode.

        Returns:
        --------
        action: str
            The action to be processed by `action_mapping()` (if any), and executed in the environment.
        info: AgentInfo
            Additional information about the action, with the following fields:
            - "think": optional chain of thought
            - "chat_messages": list of messages with the LLM
            - "stats": dict of extra statistics that will be saved and
                aggregated.
            - "markdown_page": str, string that will be displayed by agentlab's xray tool.
            - "html_page": str.
            - "extra_info": dict, additional information that will be saved
                and aggregated.
        """
BrowserGym/browsergym/experiments/src/browsergym/experiments/benchmark/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ from .base import Benchmark, HighLevelActionSetArgs
2
+ from .configs import DEFAULT_BENCHMARKS