Spaces:

LangAGI-Lab
/

Web-Shepherd-Demo

Running

App Files Files Community

kyle8581 committed on May 18

Commit

dd39c08

1 Parent(s): d9b575c

upload

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

BrowserGym/.gitignore +154 -0
BrowserGym/.pre-commit-config.yaml +44 -0
BrowserGym/.readthedocs.yaml +32 -0
BrowserGym/LICENSE +13 -0
BrowserGym/Makefile +17 -0
BrowserGym/README.md +254 -0
BrowserGym/browsergym/assistantbench/README.md +21 -0
BrowserGym/browsergym/assistantbench/pyproject.toml +35 -0
BrowserGym/browsergym/assistantbench/requirements.txt +4 -0
BrowserGym/browsergym/assistantbench/src/browsergym/assistantbench/__init__.py +54 -0
BrowserGym/browsergym/assistantbench/src/browsergym/assistantbench/evaluation/evaluate_utils/evaluate_dicts.py +68 -0
BrowserGym/browsergym/assistantbench/src/browsergym/assistantbench/evaluation/evaluate_utils/evaluate_factory.py +28 -0
BrowserGym/browsergym/assistantbench/src/browsergym/assistantbench/evaluation/evaluate_utils/evaluate_numbers.py +34 -0
BrowserGym/browsergym/assistantbench/src/browsergym/assistantbench/evaluation/evaluate_utils/evaluate_strings.py +174 -0
BrowserGym/browsergym/assistantbench/src/browsergym/assistantbench/evaluation/evaluate_utils/utils.py +25 -0
BrowserGym/browsergym/assistantbench/src/browsergym/assistantbench/evaluation/evaluator.py +132 -0
BrowserGym/browsergym/assistantbench/src/browsergym/assistantbench/task.py +142 -0
BrowserGym/browsergym/assistantbench/src/browsergym/assistantbench/utils.py +73 -0
BrowserGym/browsergym/core/README.md +10 -0
BrowserGym/browsergym/core/pyproject.toml +42 -0
BrowserGym/browsergym/core/requirements.txt +8 -0
BrowserGym/browsergym/core/src/browsergym/core/__init__.py +27 -0
BrowserGym/browsergym/core/src/browsergym/core/action/__init__.py +11 -0
BrowserGym/browsergym/core/src/browsergym/core/action/base.py +63 -0
BrowserGym/browsergym/core/src/browsergym/core/action/functions.py +624 -0
BrowserGym/browsergym/core/src/browsergym/core/action/highlevel.py +522 -0
BrowserGym/browsergym/core/src/browsergym/core/action/parsers.py +92 -0
BrowserGym/browsergym/core/src/browsergym/core/action/python.py +112 -0
BrowserGym/browsergym/core/src/browsergym/core/action/utils.py +288 -0
BrowserGym/browsergym/core/src/browsergym/core/chat.py +95 -0
BrowserGym/browsergym/core/src/browsergym/core/chat_files/chatbox.html +243 -0
BrowserGym/browsergym/core/src/browsergym/core/chat_files/chatbox_modern.html +379 -0
BrowserGym/browsergym/core/src/browsergym/core/chat_files/img/send.svg +3 -0
BrowserGym/browsergym/core/src/browsergym/core/constants.py +5 -0
BrowserGym/browsergym/core/src/browsergym/core/env.py +625 -0
BrowserGym/browsergym/core/src/browsergym/core/javascript/frame_mark_elements.js +295 -0
BrowserGym/browsergym/core/src/browsergym/core/javascript/frame_unmark_elements.js +40 -0
BrowserGym/browsergym/core/src/browsergym/core/observation.py +575 -0
BrowserGym/browsergym/core/src/browsergym/core/registration.py +76 -0
BrowserGym/browsergym/core/src/browsergym/core/spaces.py +140 -0
BrowserGym/browsergym/core/src/browsergym/core/task.py +111 -0
BrowserGym/browsergym/core/src/browsergym/utils/mcp_server.py +192 -0
BrowserGym/browsergym/core/src/browsergym/utils/obs.py +554 -0
BrowserGym/browsergym/experiments/README.md +12 -0
BrowserGym/browsergym/experiments/pyproject.toml +65 -0
BrowserGym/browsergym/experiments/requirements.txt +3 -0
BrowserGym/browsergym/experiments/src/bgym/__init__.py +17 -0
BrowserGym/browsergym/experiments/src/browsergym/experiments/__init__.py +2 -0
BrowserGym/browsergym/experiments/src/browsergym/experiments/agent.py +112 -0
BrowserGym/browsergym/experiments/src/browsergym/experiments/benchmark/__init__.py +2 -0

BrowserGym/.gitignore ADDED Viewed

	@@ -0,0 +1,154 @@

+.DS_store
+.idea/
+docs/src/generated/
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+.python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# error logs
+error_logs.txt
+# tests
+tests/results
+tmp.py
+.vscode/**
+# demo and results
+results/
+.vscode/launch.json
+# assistantbench
+tests/assistantbench/assistantbench-predictions-test.jsonl
+# weblinx
+bg_wl_data/
+uv.lock

BrowserGym/.pre-commit-config.yaml ADDED Viewed

	@@ -0,0 +1,44 @@

+fail_fast: false
+default_language_version:
+  python: python3
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.2.0
+    hooks:
+      - id: trailing-whitespace
+        exclude: ^(.*)\.md$
+      - id: end-of-file-fixer
+      - id: check-yaml
+        exclude: ^(.circleci/recipe|recipe)  # conda build recipes are templated
+      - id: check-added-large-files
+  - repo: https://github.com/pocc/pre-commit-hooks
+    rev: v1.1.1
+    hooks:
+      - id: clang-format
+        args: [--style=file, -i]
+      - id: clang-tidy
+        args: [--fix, --fix-errors]
+  - repo: https://github.com/psf/black
+    rev: 24.2.0
+    hooks:
+      - id: black
+        args: [--config=./pyproject.toml]
+  - repo: https://github.com/asottile/blacken-docs
+    rev: v1.12.1
+    hooks:
+    - id: blacken-docs
+      args: [ '--line-length', '100' ]
+      additional_dependencies: [black]
+  - repo: https://github.com/Lucas-C/pre-commit-hooks
+    rev: v1.5.5
+    hooks:
+    - id: forbid-crlf
+    - id: remove-crlf
+    # Black does not clear tabs in docstrings
+    - id: forbid-tabs
+      files: '.*\.py$'
+    - id: remove-tabs
+      files: '.*\.py$'
+      args: [ '--whitespaces-count', '4' ]

BrowserGym/.readthedocs.yaml ADDED Viewed

	@@ -0,0 +1,32 @@

+# .readthedocs.yaml
+# Read the Docs configuration file
+# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
+# Required
+version: 2
+# Set the OS, Python version and other tools you might need
+build:
+  os: ubuntu-22.04
+  tools:
+    python: "3.12"
+    # You can also specify other tool versions:
+    # nodejs: "19"
+    # rust: "1.64"
+    # golang: "1.19"
+# Build documentation in the "docs/" directory with Sphinx
+sphinx:
+  configuration: docs/src/conf.py
+# Optionally build your docs in additional formats such as PDF and ePub
+# formats:
+#    - pdf
+#    - epub
+# Optional but recommended, declare the Python requirements required
+# to build your documentation
+# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
+python:
+   install:
+   - requirements: docs/requirements.txt

BrowserGym/LICENSE ADDED Viewed

	@@ -0,0 +1,13 @@

+   Copyright 2024 ServiceNow
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+     http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

BrowserGym/Makefile ADDED Viewed

	@@ -0,0 +1,17 @@

+install:
+	@echo "--- 🚀 Installing project dependencies ---"
+	pip install -e ./browsergym/core -e ./browsergym/miniwob -e ./browsergym/webarena -e ./browsergym/visualwebarena/ -e ./browsergym/experiments -e ./browsergym/assistantbench -e ./browsergym/
+	playwright install chromium
+install-demo:
+	@echo "--- 🚀 Installing demo dependencies ---"
+	pip install -r demo_agent/requirements.txt
+	playwright install chromium
+demo:
+	@echo "--- 🚀 Running demo agent ---"
+	(set -x && cd demo_agent && python run_demo.py)
+test-core:
+	@echo "--- 🧪 Running tests ---"
+	pytest -n auto ./tests/core

BrowserGym/README.md ADDED Viewed

	@@ -0,0 +1,254 @@

+<div align="center">
+![BrowserGym banner](https://github.com/user-attachments/assets/4853f210-43ac-4107-a0d2-95c9c614dbe7)
+🛠️ [Setup](#%EF%B8%8F-setup) -
+🏋 [Usage](#-usage) -
+💻 [Demo](#-demo) -
+🌐 [Ecosystem](#-ecosystem) -
+🚀 [AgentLab](https://github.com/ServiceNow/AgentLab) -
+🌟 [Contributors](#-contributors) -
+📄 [Paper](https://arxiv.org/abs/2412.05467) -
+📝 [Citation](#-citing-this-work)
+[![pypi](https://badge.fury.io/py/browsergym.svg)](https://pypi.org/project/browsergym/)
+[![PyPI - License](https://img.shields.io/pypi/l/browsergym?style=flat-square)](http://www.apache.org/licenses/LICENSE-2.0)
+[![PyPI - Downloads](https://img.shields.io/pypi/dm/browsergym-core?style=flat-square)](https://pypistats.org/packages/browsergym-core)
+[![GitHub star chart](https://img.shields.io/github/stars/ServiceNow/BrowserGym?style=flat-square)](https://star-history.com/#ServiceNow/BrowserGym)
+[![Code Format](https://github.com/ServiceNow/BrowserGym/actions/workflows/code_format.yml/badge.svg)](https://github.com/ServiceNow/BrowserGym/actions/workflows/code_format.yml)
+[![Tests](https://github.com/ServiceNow/BrowserGym/actions/workflows/unit_tests.yml/badge.svg)](https://github.com/ServiceNow/BrowserGym/actions/workflows/unit_tests.yml)
+```sh
+pip install browsergym
+```
+</div>
+> [!WARNING]
+> BrowserGym is meant to provide an open, easy-to-use and extensible framework to accelerate the field of web agent research.
+> It is not meant to be a consumer product. Use with caution!
+> [!TIP]
+> 🚀 Check out [AgentLab](https://github.com/ServiceNow/AgentLab)✨ !
+> A seamless framework to implement, test, and evaluate your web agents on all BrowserGym benchmarks.
+https://github.com/ServiceNow/BrowserGym/assets/26232819/e0bfc788-cc8e-44f1-b8c3-0d1114108b85
+_Example of a GPT4-V agent executing openended tasks (top row, chat interactive), as well as WebArena and WorkArena tasks (bottom row)._
+BrowserGym includes the following benchmarks by default:
+ - [MiniWoB](https://miniwob.farama.org/)
+ - [WebArena](https://webarena.dev/)
+ - [VisualWebArena](https://jykoh.com/vwa)
+ - [WorkArena](https://github.com/ServiceNow/WorkArena)
+ - [AssistantBench](https://github.com/oriyor/assistantbench)
+ - [WebLINX](https://github.com/McGill-NLP/weblinx) (static benchmark)
+Designing new web benchmarks with BrowserGym is easy, and simply requires inheriting from the [`AbstractBrowserTask`](https://github.com/ServiceNow/BrowserGym/blob/main/browsergym/core/src/browsergym/core/task.py#L7C7-L7C26) class.
+## 🛠️ Setup
+To use browsergym, install one of the following packages:
+```sh
+pip install browsergym  # (recommended) everything below
+pip install browsergym-experiments  # experiment utilities (agent, loop, benchmarks) + everything below
+pip install browsergym-core  # core functionalities only (no benchmark, just the openended task)
+pip install browsergym-miniwob  # core + miniwob
+pip install browsergym-webarena  # core + webarena
+pip install browsergym-visualwebarena  # core + visualwebarena
+pip install browsergym-workarena  # core + workarena
+pip install browsergym-assistantbench  # core + assistantbench
+pip install weblinx-browsergym  # core + weblinx
+```
+Then setup playwright by running
+```sh
+playwright install chromium
+```
+Finally, each benchmark comes with its own specific setup that requires following additional steps.
+ - for MiniWoB++, see [miniwob/README.md](browsergym/miniwob/README.md)
+ - for WebArena, see [webarena/README.md](browsergym/webarena/README.md)
+ - for VisualWebArena, see [visualwebarena/README.md](browsergym/visualwebarena/README.md)
+ - for WorkArena, see [WorkArena](https://github.com/ServiceNow/WorkArena)
+ - for AssistantBench, see [assistantbench/README.md](browsergym/assistantbench/README.md)
+### 🏗️ Development setup
+To install browsergym locally for development, use the following commands:
+```sh
+git clone git@github.com:ServiceNow/BrowserGym.git
+cd BrowserGym
+make install
+```
+Contributions are welcome! 😊
+## 🏋 Usage
+Boilerplate code to run an agent on an interactive, open-ended task:
+```python
+import gymnasium as gym
+import browsergym.core  # register the openended task as a gym environment
+# start an openended environment
+env = gym.make(
+    "browsergym/openended",
+    task_kwargs={"start_url": "https://www.google.com/"},  # starting URL
+    wait_for_user_message=True,  # wait for a user message after each agent message sent to the chat
+)
+# run the environment <> agent loop until termination
+obs, info = env.reset()
+while True:
+    action = ...  # implement your agent here
+    obs, reward, terminated, truncated, info = env.step(action)
+    if terminated or truncated:
+        break
+# release the environment
+env.close()
+```
+MiniWoB
+```python
+import gymnasium as gym
+import browsergym.miniwob  # register miniwob tasks as gym environments
+# start a miniwob task
+env = gym.make("browsergym/miniwob.choose-list")
+...
+# list all the available miniwob tasks
+env_ids = [id for id in gym.envs.registry.keys() if id.startswith("browsergym/miniwob")]
+print("\n".join(env_ids))
+```
+WorkArena
+```python
+import gymnasium as gym
+import browsergym.workarena  # register workarena tasks as gym environments
+# start a workarena task
+env = gym.make("browsergym/workarena.servicenow.order-ipad-pro")
+...
+# list all the available workarena tasks
+env_ids = [id for id in gym.envs.registry.keys() if id.startswith("browsergym/workarena")]
+print("\n".join(env_ids))
+```
+WebArena
+```python
+import gymnasium as gym
+import browsergym.webarena  # register webarena tasks as gym environments
+# start a webarena task
+env = gym.make("browsergym/webarena.310")
+...
+# list all the available webarena tasks
+env_ids = [id for id in gym.envs.registry.keys() if id.startswith("browsergym/webarena")]
+print("\n".join(env_ids))
+```
+VisualWebArena
+```python
+import gymnasium as gym
+import browsergym.visualwebarena  # register visualwebarena tasks as gym environments
+# start a visualwebarena task
+env = gym.make("browsergym/visualwebarena.721")
+...
+# list all the available visualwebarena tasks
+env_ids = [id for id in gym.envs.registry.keys() if id.startswith("browsergym/visualwebarena")]
+print("\n".join(env_ids))
+```
+AssistantBench
+```python
+import gymnasium as gym
+import browsergym.assistantbench  # register assistantbench tasks as gym environments
+# start an assistantbench task
+env = gym.make("browsergym/assistantbench.validation.3")
+...
+# list all the available assistantbench tasks
+env_ids = [id for id in gym.envs.registry.keys() if id.startswith("browsergym/assistantbench")]
+print("\n".join(env_ids))
+```
+## 💻 Demo
+If you want to experiment with a demo agent in BrowserGym, follow these steps
+```sh
+# conda setup
+conda env create -f demo_agent/environment.yml
+conda activate demo_agent
+# or pip setup
+pip install -r demo_agent/requirements.txt
+# then download the browser for playwright
+playwright install chromium
+```
+Our demo agent uses `openai` as a backend, be sure to set your `OPENAI_API_KEY`.
+Launch the demo agent as follows
+```sh
+# openended (interactive chat mode)
+python demo_agent/run_demo.py --task_name openended --start_url https://www.google.com
+# miniwob
+python demo_agent/run_demo.py --task_name miniwob.click-test
+# workarena
+python demo_agent/run_demo.py --task_name workarena.servicenow.order-standard-laptop
+# webarena
+python demo_agent/run_demo.py --task_name webarena.4
+# visualwebarena
+python demo_agent/run_demo.py --task_name visualwebarena.398
+```
+You can customize your experience by changing the `model_name` to your preferred LLM (it uses `gpt-4o-mini` by default), adding screenshots for your VLMs with `use_screenshot`, and much more!
+```sh
+python demo_agent/run_demo.py --help
+```
+## 🌐 Ecosystem
+- [AgentLab](https://github.com/ServiceNow/AgentLab): Seamlessly run agents on benchmarks, collect and analyse traces.
+- [WorkArena(++)](https://github.com/ServiceNow/WorkArena): A benchmark for web agents on the ServiceNow platform.
+- [WebArena](https://github.com/web-arena-x/webarena): A benchmark of realistic web tasks on self-hosted domains.
+- [VisualWebArena](https://github.com/web-arena-x/visualwebarena): A benchmark of realistic visual web tasks on self-hosted domains.
+- [MiniWoB(++)](https://miniwob.farama.org/): A collection of over 100 web tasks on synthetic web pages.
+- [WebLINX](https://github.com/McGill-NLP/weblinx): A dataset of real-world web interaction traces.
+- [AssistantBench](https://github.com/oriyor/assistantbench): A benchmark of realistic and time-consuming tasks on the open web.
+- [DoomArena](https://github.com/ServiceNow/DoomArena): A framework for AI agent security testing which supports injecting attacks into web pages from Browsergym environments.
+## 🌟 Contributors
+[![BrowserGym contributors](https://contrib.rocks/image?repo=ServiceNow/BrowserGym&max=2000)](https://github.com/ServiceNow/BrowserGym/graphs/contributors)
+## 📝 Citing This Work
+Please use the following BibTeX to cite our work:
+```tex
+@inproceedings{workarena2024,
+    title = {{W}ork{A}rena: How Capable are Web Agents at Solving Common Knowledge Work Tasks?},
+    author = {Drouin, Alexandre and Gasse, Maxime and Caccia, Massimo and Laradji, Issam H. and Del Verme, Manuel and Marty, Tom and Vazquez, David and Chapados, Nicolas and Lacoste, Alexandre},
+    booktitle = {Proceedings of the 41st International Conference on Machine Learning},
+    pages = {11642--11662},
+    year = {2024},
+    editor = {Salakhutdinov, Ruslan and Kolter, Zico and Heller, Katherine and Weller, Adrian and Oliver, Nuria and Scarlett, Jonathan and Berkenkamp, Felix},
+    volume = {235},
+    series = {Proceedings of Machine Learning Research},
+    month = {21--27 Jul},
+    publisher = {PMLR},
+    url = {https://proceedings.mlr.press/v235/drouin24a.html},
+}
+```

BrowserGym/browsergym/assistantbench/README.md ADDED Viewed

	@@ -0,0 +1,21 @@

+# AssistantBench <> BrowserGym
+This package provides an implementation for using the [AssistantBench](https://assistantbench.github.io/) benchmark in BrowserGym.
+Because AssistantBench includes open-ended tasks, setup is extremely easy and simply requires installing the package.
+Please note that AssistantBench has a hidden test set, so test set predictions will need to be uploaded to the official [leaderboard](https://huggingface.co/spaces/AssistantBench/leaderboard).
+## Setting up
+- Install the package (this is still a wip)
+```
+pip install browsergym-assistantbench
+```
+- Run inference, e.g., run the following commands for demo on a simple toy task
+```
+python demo_agent/run_demo.py --task_name assistantbench.validation.3
+```
+- Test set predictions will be saved to `./assistantbench-predictions-test.jsonl`. To evaluate on the official test set, upload these predictions to the official [leaderboard](https://huggingface.co/spaces/AssistantBench/leaderboard).

BrowserGym/browsergym/assistantbench/pyproject.toml ADDED Viewed

	@@ -0,0 +1,35 @@

+[build-system]
+requires = ["hatchling", "hatch-requirements-txt"]
+build-backend = "hatchling.build"
+[project]
+name = "browsergym-assistantbench"
+description = "AssistantBench benchmark for BrowserGym"
+authors = [
+    {name = "Ori Yoran"},
+    {name = "Maxime Gasse"},
+]
+readme = "README.md"
+requires-python = ">3.7"
+license = {text = "Apache-2.0"}
+classifiers = [
+    "Development Status :: 3 - Alpha",
+    "Programming Language :: Python :: 3",
+    "Operating System :: OS Independent",
+    "Intended Audience :: Science/Research",
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
+    "License :: OSI Approved :: Apache Software License",
+]
+dynamic = ["dependencies", "version"]
+[project.urls]
+homepage = "https://github.com/ServiceNow/BrowserGym"
+[tool.hatch.version]
+path = "../core/src/browsergym/core/__init__.py"
+[tool.hatch.metadata.hooks.requirements_txt]
+files = ["requirements.txt"]
+[tool.hatch.build.targets.wheel]
+packages = ["src/browsergym"]

BrowserGym/browsergym/assistantbench/requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+browsergym-core==0.13.4
+datasets
+scipy
+numpy

BrowserGym/browsergym/assistantbench/src/browsergym/assistantbench/__init__.py ADDED Viewed

	@@ -0,0 +1,54 @@

+from browsergym.core.registration import register_task
+from . import task
TOY_AB_TASK_IDS = []
VALID_AB_TASK_IDS = []
TEST_AB_TASK_IDS = []

# One row per AssistantBench split:
#   (split name, number of tasks, default value of `save_predictions`, list collecting the gym ids)
#   - "imp":        a single toy easy task for testing the implementation
#   - "validation": the AssistantBench dev set
#   - "test":       the AssistantBench test set (`save_predictions` defaults to True)
_AB_SPLITS = (
    ("imp", 1, False, TOY_AB_TASK_IDS),
    ("validation", 33, False, VALID_AB_TASK_IDS),
    ("test", 181, True, TEST_AB_TASK_IDS),
)

# Register every task of every split as "assistantbench.<split>.<index>", in the
# order toy -> validation -> test. `gym_id` and `task_id` deliberately keep their
# original names so the module namespace ends up with the same final values.
for _split, _num_tasks, _save_predictions, _split_task_ids in _AB_SPLITS:
    for task_id in range(_num_tasks):
        gym_id = f"assistantbench.{_split}.{task_id}"
        register_task(
            gym_id,
            task.AssistantBenchTask,
            task_kwargs={
                "task_id": f"{_split}.{task_id}",
            },
            default_task_kwargs={
                "save_predictions": _save_predictions,  # can be overridden
            },
        )
        _split_task_ids.append(gym_id)

ALL_AB_TASK_IDS = TOY_AB_TASK_IDS + VALID_AB_TASK_IDS + TEST_AB_TASK_IDS

BrowserGym/browsergym/assistantbench/src/browsergym/assistantbench/evaluation/evaluate_utils/evaluate_dicts.py ADDED Viewed

	@@ -0,0 +1,68 @@

+from typing import Dict, List
+import numpy as np
+from .utils import _align_bags
def calculate_f1_score(precision, recall):
    """Return the F1 score (harmonic mean) of `precision` and `recall`.

    Returns 0 when `precision + recall` is zero, where the harmonic mean is
    undefined, instead of dividing by zero.
    """
    total = precision + recall
    if total == 0:
        return 0
    return 2 * (precision * recall) / total
def calc_recall(pred: Dict, gold: Dict, use_gold_for_eval: bool):
    """Average, over the keys of `gold`, how well `pred` reproduces each value.

    For every key of `gold`:
      * a key missing from `pred` scores 0;
      * both values are passed through `fix_number`; if the resulting types
        differ the key scores 0;
      * otherwise the key is scored by the evaluator registered for that type.

    Args:
        pred: dict whose values are being scored.
        gold: dict providing the reference keys and values.
        use_gold_for_eval: kept for backward compatibility. It used to select
            whether the evaluator was looked up from the gold or from the
            predicted value's type. The evaluator is now only looked up once
            both types are known to be identical, so the flag no longer
            affects the result.

    Returns:
        The mean of the per-key scores. NOTE: for an empty `gold` this is NaN,
        because `np.average` of an empty list is NaN.

    Raises:
        KeyError: if two values share a type that has no registered evaluator.
    """
    # Imported here because evaluate_factory itself imports this module.
    from .evaluate_factory import get_evaluator_from_gold_answer

    recall = []
    for gold_key, gold_value in gold.items():
        if gold_key not in pred:
            recall.append(0)
            continue
        gold_value = fix_number(gold_value)
        pred_value = fix_number(pred[gold_key])
        # Check the types BEFORE looking up an evaluator: a mismatch scores 0
        # and must not fail just because one of the types is unsupported.
        if type(pred_value) is not type(gold_value):
            recall.append(0)
            continue
        evaluator = get_evaluator_from_gold_answer(type(gold_value))
        recall.append(evaluator(pred_value, gold_value))
    avg_recall = np.average(recall)
    return avg_recall
def fix_number(number):
    """Convert number-like answers to `float`, leaving everything else untouched.

    * `int` (exact type, so `bool` is NOT converted) -> `float`.
    * `str`: "$", "%" and "sqft" are replaced by spaces, surrounding whitespace
      is stripped and "," is replaced by "." before parsing. If the result
      parses as a float it is returned; otherwise the ORIGINAL string is
      returned unchanged.
    * any other type is returned as-is.
    """
    if type(number) is int:  # exact type check: True/False must pass through
        return float(number)
    if type(number) is not str:
        return number

    cleaned = number
    for token in ("$", "%", "sqft"):
        cleaned = cleaned.replace(token, " ")
    cleaned = cleaned.strip().replace(",", ".")
    try:
        return float(cleaned)
    except ValueError:
        # Not a number after clean-up: hand back the untouched input.
        return number
def evaluate_pair_of_dicts(pred: Dict, gold: Dict):
    """Return the F1 score between one predicted dict and one gold dict."""
    # Recall: how much of `gold` is recovered by `pred`.
    recall_score = calc_recall(pred, gold, True)
    # Precision: the same computation with the two dicts swapped.
    precision_score = calc_recall(gold, pred, False)
    return calculate_f1_score(precision_score, recall_score)
def evaluate_dicts(pred: List[Dict], gold: List[Dict]):
    """Score a list of predicted dicts against a list of gold dicts.

    The dicts are aligned one-to-one with `_align_bags`, using
    `evaluate_pair_of_dicts` as the pairwise score, and the aligned scores are
    averaged.

    Args:
        pred: list of predicted dicts. A single dict is treated as a
            one-element list. Anything else (a non-list, or a list containing
            a non-dict) scores 0.
        gold: list of gold dicts. A single dict is treated as a one-element list.

    Returns:
        0 for an unusable prediction, otherwise the mean aligned score.
    """
    # A bare dict used to pass the type guard and then crash during alignment
    # (iterating a dict yields its keys); wrap it instead.
    if isinstance(pred, dict):
        pred = [pred]
    if isinstance(gold, dict):
        gold = [gold]
    # Reject anything that is not a list of dicts. This also covers unsized
    # predictions (numbers, None), which previously raised TypeError in len().
    if not isinstance(pred, list) or not all(isinstance(item, dict) for item in pred):
        return 0
    max_alignment_scores = _align_bags(pred, gold, evaluate_pair_of_dicts)
    return np.average(max_alignment_scores)

BrowserGym/browsergym/assistantbench/src/browsergym/assistantbench/evaluation/evaluate_utils/evaluate_factory.py ADDED Viewed

	@@ -0,0 +1,28 @@

+from typing import Union
+from .evaluate_dicts import evaluate_dicts
+from .evaluate_numbers import evaluate_numbers
+from .evaluate_strings import evaluate_strings
# Evaluation functions keyed by an answer-kind name.
EvaluatorFactory = {
    "string": evaluate_strings,
    "number": evaluate_numbers,
    "json": evaluate_dicts,
    "string list": evaluate_strings,
}

# Evaluation functions keyed by the Python type of an answer value.
EvaluatorFactoryFromType = {
    str: evaluate_strings,
    int: evaluate_numbers,
    float: evaluate_numbers,
    bool: evaluate_strings,
    list: evaluate_strings,
}


def get_evaluator(evaluator: str):
    """Return the evaluation function registered under the name `evaluator`.

    Raises:
        KeyError: if `evaluator` is not a key of `EvaluatorFactory`.
    """
    return EvaluatorFactory[evaluator]


def get_evaluator_from_gold_answer(gold_answer: Union[type, str, int, float, bool, list]):
    """Return the evaluation function matching the type of a gold answer.

    Args:
        gold_answer: either a type (e.g. `float`, as passed by `calc_recall`)
            or an answer value, in which case its type is used.

    Raises:
        KeyError: if the type is not a key of `EvaluatorFactoryFromType`.
    """
    answer_type = gold_answer if isinstance(gold_answer, type) else type(gold_answer)
    return EvaluatorFactoryFromType[answer_type]

BrowserGym/browsergym/assistantbench/src/browsergym/assistantbench/evaluation/evaluate_utils/evaluate_numbers.py ADDED Viewed

	@@ -0,0 +1,34 @@

+from typing import Union
+import numpy as np
+# Renamed calc_z function to distance_function_log
def distance_function_log(pred: float, gold: float):
    """Score how close `pred` is to `gold` on a logarithmic scale, in [0, 1].

    The score is `1 - ln(ratio)`, clipped at 0, where `ratio >= 1` is the
    larger magnitude divided by the smaller one. Identical values score 1 and
    the score reaches 0 once the values differ by a factor of e or more.

    Special cases:
      * both values are 0 -> 1;
      * a single 0 is replaced by 1e-4 so the ratio is defined;
      * values of opposite sign -> 0 (the log of a negative ratio is undefined).

    Returns:
        int 0/1 for the special cases, otherwise a float in [0, 1].
    """
    if pred == gold == 0:
        return 1
    if pred == 0:
        pred = 1e-4
    if gold == 0:
        gold = 1e-4
    # Opposite signs: no meaningful ratio; score 0 without computing log(<0).
    if (pred > 0) != (gold > 0):
        return 0
    # Compare magnitudes, not raw values. With two negative numbers a raw
    # `pred > gold` test picks the ratio below 1, whose negative log pushed
    # the score above 1.
    if abs(pred) > abs(gold):
        ratio = pred / gold
    else:
        ratio = gold / pred
    return max(0, 1 - np.log(ratio))
def evaluate_numbers(pred: Union[float, str], gold: float):
    """Score a numeric prediction against a numeric gold answer.

    Values that are not exactly `float` or `int` are converted with `float()`.
    If either conversion fails the score is 0; otherwise the score is
    `distance_function_log(pred, gold)`.

    Args:
        pred: predicted value (a number or something `float()` can parse).
        gold: gold value (a number or something `float()` can parse).

    Returns:
        0 when a value cannot be converted, else the log-distance score.
    """
    try:
        if type(pred) not in (float, int):
            pred = float(pred)
        if type(gold) not in (float, int):
            gold = float(gold)
    except (TypeError, ValueError):
        # ValueError: unparsable string. TypeError: None, list, dict, ...
        return 0
    return distance_function_log(pred, gold)

BrowserGym/browsergym/assistantbench/src/browsergym/assistantbench/evaluation/evaluate_utils/evaluate_strings.py ADDED Viewed

	@@ -0,0 +1,174 @@

+"""
+Evaluation for two strings or list of strings.
+Code taken from the DROP benchmark - https://github.com/allenai/allennlp-reading-comprehension/blob/master/allennlp_rc/eval/drop_eval.py
+"""
+import re
+import string
+from typing import List, Set, Tuple, Union
+import numpy as np
+from scipy.optimize import linear_sum_assignment
+# From here through _normalize_answer was originally copied from:
+# https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/
+# Then cleaned up and modified a bit.
+def _remove_articles(text: str) -> str:
+    regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
+    return re.sub(regex, " ", text)
+def _white_space_fix(text: str) -> str:
+    return " ".join(text.split())
# Characters stripped by `_remove_punc`.
EXCLUDE = set(string.punctuation)


def _remove_punc(text: str) -> str:
    """Drop punctuation characters from `text`, unless `text` parses as a number.

    Numbers are returned untouched so that e.g. a decimal point or a sign
    survives.
    """
    if _is_number(text):
        return text
    kept_chars = [ch for ch in text if ch not in EXCLUDE]
    return "".join(kept_chars)
+def _lower(text: str) -> str:
+    return text.lower()
+def _tokenize(text: str) -> List[str]:
+    return re.split(" |-", text)
def _normalize_answer(text: str) -> str:
    """Lower text and remove punctuation, articles and extra whitespace.

    The text is tokenized first; each token is lowercased, stripped of
    punctuation, number-normalized, stripped of articles and
    whitespace-collapsed, in that order. Tokens that end up blank are dropped
    and the rest are joined with single spaces.
    """
    cleaned_tokens = []
    for token in _tokenize(text):
        token = _lower(token)
        token = _remove_punc(token)
        token = _normalize_number(token)
        token = _remove_articles(token)
        token = _white_space_fix(token)
        if token.strip():
            cleaned_tokens.append(token)
    return " ".join(cleaned_tokens).strip()
+def _is_number(text: str) -> bool:
+    try:
+        float(text)
+        return True
+    except ValueError:
+        return False
def _normalize_number(text: str) -> str:
    """Rewrite numeric text in canonical `str(float(...))` form.

    Non-numeric text is returned unchanged.
    """
    if not _is_number(text):
        return text
    return str(float(text))
def _answer_to_bags(
    answer: Union[str, List[str], Tuple[str, ...]]
) -> Tuple[List[str], List[Set[str]]]:
    """Normalize an answer into spans and per-span token bags.

    A list or tuple is treated as several spans; anything else is a single
    span.

    Returns:
        A pair `(normalized_spans, token_bags)`: the normalized text of each
        span, and the set of whitespace-separated tokens of each normalized
        span.
    """
    raw_spans = answer if isinstance(answer, (list, tuple)) else [answer]
    normalized_spans: List[str] = [_normalize_answer(span) for span in raw_spans]
    token_bags = [set(span.split()) for span in normalized_spans]
    return normalized_spans, token_bags
def _align_bags(predicted: List[Set[str]], gold: List[Set[str]]) -> List[float]:
    """
    Find the optimal 1-1 alignment between gold and predicted token bags and
    return the F1 obtained for each aligned gold bag.

    A gold/predicted pair only receives a non-zero score when
    `_match_numbers_if_present` accepts it. The returned array has
    `max(len(gold), len(predicted))` entries; positions without an aligned
    pair stay 0.
    """
    num_gold, num_pred = len(gold), len(predicted)
    scores = np.zeros([num_gold, num_pred])
    for g, gold_bag in enumerate(gold):
        for p, pred_bag in enumerate(predicted):
            if not _match_numbers_if_present(gold_bag, pred_bag):
                continue
            scores[g, p] = _compute_f1(pred_bag, gold_bag)
    # linear_sum_assignment minimizes cost, so negate to maximize total F1.
    gold_rows, pred_cols = linear_sum_assignment(-scores)

    max_scores = np.zeros([max(num_gold, num_pred)])
    for g, p in zip(gold_rows, pred_cols):
        max_scores[g] = max(max_scores[g], scores[g, p])
    return max_scores
+def _compute_f1(predicted_bag: Set[str], gold_bag: Set[str]) -> float:
+    intersection = len(gold_bag.intersection(predicted_bag))
+    if not predicted_bag:
+        precision = 1.0
+    else:
+        precision = intersection / float(len(predicted_bag))
+    if not gold_bag:
+        recall = 1.0
+    else:
+        recall = intersection / float(len(gold_bag))
+    f1 = (
+        (2 * precision * recall) / (precision + recall)
+        if not (precision == 0.0 and recall == 0.0)
+        else 0.0
+    )
+    return f1
def _match_numbers_if_present(gold_bag: Set[str], predicted_bag: Set[str]) -> bool:
    """Return True unless the gold bag has numbers and the prediction shares none.

    That is: True when the gold bag contains no numeric token, or when at
    least one numeric token appears in both bags; False otherwise.
    """
    gold_numbers = {word for word in gold_bag if _is_number(word)}
    predicted_numbers = {word for word in predicted_bag if _is_number(word)}
    if not gold_numbers:
        return True
    return bool(gold_numbers.intersection(predicted_numbers))
def get_metrics(
    predicted: Union[str, List[str], Tuple[str, ...]],
    gold: Union[str, List[str], Tuple[str, ...]],
) -> Tuple[float, float]:
    """
    Takes a predicted answer and a gold answer (each either a string or a list
    or tuple of strings) and returns `(exact_match, f1)`.

    `exact_match` is 1.0 when both answers have the same number of spans and
    the same set of normalized spans, else 0.0. `f1` is the mean of the
    aligned per-bag F1 scores, rounded to two decimals.
    """
    predicted_spans, predicted_token_bags = _answer_to_bags(predicted)
    gold_spans, gold_token_bags = _answer_to_bags(gold)

    spans_match = set(predicted_spans) == set(gold_spans) and len(predicted_spans) == len(
        gold_spans
    )
    exact_match = 1.0 if spans_match else 0.0

    f1 = round(np.mean(_align_bags(predicted_token_bags, gold_token_bags)), 2)
    return exact_match, f1
+def evaluate_strings(prediction, gold):
+    """
+    Return the mean per-bag F1 between `prediction` and `gold`.
+    Inputs that are neither a list nor a str are converted with str() first.
+    Any exception raised while scoring yields a score of 0.0.
+    """
+    if type(prediction) not in (list, str):
+        prediction = str(prediction)
+    if type(gold) not in (list, str):
+        gold = str(gold)
+    try:
+        pred_bags = _answer_to_bags(prediction)
+        ref_bags = _answer_to_bags(gold)
+        return np.mean(_align_bags(pred_bags[1], ref_bags[1]))
+    except Exception:
+        return 0.0

BrowserGym/browsergym/assistantbench/src/browsergym/assistantbench/evaluation/evaluate_utils/utils.py ADDED Viewed

	@@ -0,0 +1,25 @@

+from typing import Callable, List, Set
+import numpy as np
+from scipy.optimize import linear_sum_assignment
+def _align_bags(
+    predicted: List[Set[str]],
+    gold: List[Set[str]],
+    method: Callable[[object, object], float],
+) -> List[float]:
+    """
+    Score every (gold, predicted) pair with `method(pred_item, gold_item)`, find the
+    optimal 1-1 alignment between the two lists, and return one score per slot.
+    The result has max(len(gold), len(predicted)) entries; slots that receive no
+    aligned gold item stay at 0.
+    """
+    n_gold, n_pred = len(gold), len(predicted)
+    scores = np.zeros([n_gold, n_pred])
+    for g_idx, gold_item in enumerate(gold):
+        for p_idx, pred_item in enumerate(predicted):
+            scores[g_idx, p_idx] = method(pred_item, gold_item)
+    # linear_sum_assignment minimises total cost, so negate to maximise total score.
+    gold_rows, pred_cols = linear_sum_assignment(-scores)
+    best = np.zeros([max(n_gold, n_pred)])
+    for g_idx, p_idx in zip(gold_rows, pred_cols):
+        best[g_idx] = max(best[g_idx], scores[g_idx, p_idx])
+    return best

BrowserGym/browsergym/assistantbench/src/browsergym/assistantbench/evaluation/evaluator.py ADDED Viewed

	@@ -0,0 +1,132 @@

+# todo export evaluation to a python package
+import json
+import numpy as np
+from .evaluate_utils.evaluate_factory import get_evaluator
+def find_isnan(samp):
+    """
+    Return True when `np.isnan(samp)` is truthy, False otherwise.
+    Inputs np.isnan cannot handle (strings, None, ...) and results whose truth
+    value cannot be evaluated (multi-element arrays) are reported as False.
+    """
+    try:
+        return bool(np.isnan(samp))
+    except Exception:
+        # Best-effort check. `Exception` (instead of a bare `except:`) keeps
+        # KeyboardInterrupt/SystemExit propagating.
+        return False
+def fix_ans(answer):
+    """
+    Rewrite a single-quoted, Python-dict-looking string so it uses the double quotes
+    JSON requires (e.g. "{'a': 'b'}" becomes '{"a": "b"}').
+    Inputs that do not support the string replacements are returned unchanged.
+    """
+    try:
+        answer = (
+            answer.replace("{'", '{"')
+            .replace("', '", '", "')
+            .replace("': '", '": "')
+            .replace("'}", '"}')
+        )
+        # Keys followed by a non-string value, e.g. "'a': 1".
+        return answer.replace("': ", '": ')
+    except Exception:
+        # Best-effort. `Exception` (instead of a bare `except:`) keeps
+        # KeyboardInterrupt/SystemExit propagating.
+        return answer
+def parse_answer(answer):
+    """
+    Classify a gold answer given as a list of lines and convert it accordingly.
+    Returns a `(parsed_answer, kind)` pair where kind is one of "number", "json",
+    "string" (single line) or "string list" (several lines that are not all JSON).
+    """
+    if len(answer) == 1:
+        single = answer[0]
+        ans, is_num = fix_number(single)
+        if is_num:
+            return ans, "number"
+        try:
+            return [json.loads(fix_ans(single))], "json"
+        except Exception:
+            # fix_number already rejected this value above, so it is a plain string.
+            return single, "string"
+    try:
+        return [json.loads(fix_ans(ex)) for ex in answer], "json"
+    except Exception:
+        return answer, "string list"
+def fix_number(number):
+    """
+    Try to interpret `number` as a float.
+    Returns `(value, is_number)`:
+      * str: "$", "%" and "sqft" are replaced by spaces, "," becomes ".", the suffix
+        " square kilometers" is dropped, and the result is parsed with float(); on
+        failure the original string is returned with False.
+      * int: converted to float, True.
+      * anything else: returned unchanged with True.
+    """
+    # Exact type checks are deliberate: bool (an int subclass) must fall through to
+    # the last branch, as it always has.
+    if type(number) is str:
+        cleaned = number.replace("$", " ").replace("%", " ").replace("sqft", " ").strip()
+        cleaned = cleaned.replace(",", ".").replace(" square kilometers", "")
+        try:
+            return float(cleaned), True
+        except ValueError:
+            # float() on a str only fails with ValueError.
+            return number, False
+    if type(number) is int:
+        return float(number), True
+    return number, True
+def fix_prediction(prediction, gold_answer, evaluator):
+    """
+    Normalise a model prediction before it is scored.
+    Returns `(prediction, run_eval)`; `run_eval` is False when the prediction is
+    empty, or when it is a list of several items but the gold answer is a float.
+    Args:
+        prediction: the decoded model answer (scalar, string or list).
+        gold_answer: the parsed gold answer; only its type is inspected here.
+        evaluator: evaluator kind as returned by parse_answer (e.g. "json").
+    """
+    if (
+        type(prediction) == list
+        and len(prediction) == 1
+        and (
+            type(prediction[0]) == int
+            or ((type(prediction[0]) == str) and prediction[0].isnumeric())
+        )
+    ):
+        # fix_number returns (value, is_number); keep only the value. Assigning the
+        # whole tuple made `prediction` a 2-tuple that was then scored as the answer.
+        prediction, _ = fix_number(prediction[0])
+    if type(prediction) != list:
+        prediction, _ = fix_number(prediction)
+        if evaluator == "json":
+            try:
+                prediction = [json.loads(pred) for pred in prediction.split("\n")]
+            except Exception:
+                # Not a string, or not one JSON document per line: wrap as-is.
+                prediction = [prediction]
+    if (hasattr(type(prediction), "__len__")) and (len(prediction) == 0):
+        return prediction, False
+    if (type(prediction) == list and len(prediction) > 1) and type(gold_answer) == float:
+        return prediction, False
+    return prediction, True
+def question_scorer(prediction, gold_answer):
+    """
+    Score a prediction against a gold answer.
+    Returns `(accuracy, has_ans)`, where `has_ans` is 0.0 when the prediction is
+    empty or NaN (or a list made only of such items) and 1.0 otherwise.
+    Args:
+        prediction: raw model answer; decoded as JSON when possible.
+        gold_answer: gold answer, either a newline-separated string or a list.
+    """
+    try:
+        prediction = json.loads(prediction)
+    except Exception:
+        # Not JSON (or not a string): score the raw value.
+        pass
+    answer_list = (
+        [x for x in gold_answer.split("\n") if len(x.strip()) > 0]
+        if type(gold_answer) != list
+        else gold_answer
+    )
+    gold_answer, evaluator = parse_answer(answer_list)
+    prediction, run_eval = fix_prediction(prediction, gold_answer, evaluator)
+    def _is_empty(value):
+        # Same guard as fix_prediction: only sized values can be empty. This keeps
+        # JSON-decoded scalars such as true/null from raising on len().
+        return hasattr(type(value), "__len__") and len(value) == 0
+    has_ans = 1.0
+    if _is_empty(prediction) or find_isnan(prediction):
+        has_ans = 0.0
+    if type(prediction) == list:
+        if all(_is_empty(pred) or find_isnan(pred) for pred in prediction):
+            has_ans = 0.0
+    if not run_eval:
+        return 0.0, has_ans
+    metric_eval = get_evaluator(evaluator)
+    accuracy = metric_eval(prediction, gold_answer)
+    return accuracy, has_ans

BrowserGym/browsergym/assistantbench/src/browsergym/assistantbench/task.py ADDED Viewed

	@@ -0,0 +1,142 @@

+import logging
+import os
+from typing import Dict, Tuple
+from datasets import load_dataset
+from playwright.sync_api import Page
+from browsergym.core.task import AbstractBrowserTask
+from .evaluation.evaluator import question_scorer
+from .utils import add_prediction_to_jsonl
+logger = logging.getLogger(__name__)
+_DEFAULT_OUTPUT_FILE = None
+def set_default_output_file(output_file: str):
+    """Set the module-wide default output file returned by get_default_output_file()."""
+    global _DEFAULT_OUTPUT_FILE
+    _DEFAULT_OUTPUT_FILE = output_file
+def get_default_output_file():
+    """Return the module-wide default output file (None until set_default_output_file() is called)."""
+    return _DEFAULT_OUTPUT_FILE
+# Load the AssistantBench dataset once, when this module is imported.
+# NOTE: trust_remote_code=True allows the dataset repository's loading script to run locally.
+DATA_DATASET = "AssistantBench/AssistantBench"
+all_tasks = load_dataset(DATA_DATASET, trust_remote_code=True)
+# Build the per-split lookup tables (answers, task texts, dataset ids)
+def extract_data(split_name: str) -> Tuple[Dict[str, str], Dict[str, str], Dict[str, str]]:
+    """
+    Return three dicts for the given split, all keyed by "<split_name>.<row index>":
+    gold answers (a missing answer becomes ""), task texts, and dataset row ids.
+    """
+    answers: Dict[str, str] = {}
+    goals: Dict[str, str] = {}
+    row_ids: Dict[str, str] = {}
+    for i, row in enumerate(all_tasks[split_name]):
+        key = f"{split_name}.{i}"
+        answers[key] = row["answer"] if row["answer"] is not None else ""
+        goals[key] = row["task"]
+        row_ids[key] = row["id"]
+    return answers, goals, row_ids
+# Hard-coded single-task split used to test the implementation itself
+def get_implementation_testing_data() -> Tuple[Dict[str, str], Dict[str, str], Dict[str, str]]:
+    """
+    Return (gold answers, task texts, ids) for the one built-in "imp.0" task, in the
+    same three-dict layout that extract_data produces.
+    """
+    key = "imp.0"
+    answers = {key: "20"}
+    goals = {
+        key: "What is the weather in Paris yesterday in Celsius? Answer with the number only."
+    }
+    task_ids = {key: "test_imp_id_0"}
+    return answers, goals, task_ids
+# Combine dev, test, and implementation-specific testing splits
+# Keys are prefixed with their split name ("validation.", "test.", "imp."), so the
+# merged dicts below cannot collide across splits.
+gold_answers_dev, tasks_dev, ids_dev = extract_data("validation")
+gold_answers_test, tasks_test, ids_test = extract_data("test")
+gold_answers_impl_testing, tasks_test_impl_testing, ids_imp_testing = (
+    get_implementation_testing_data()
+)
+# Module-level lookup tables read by AssistantBenchTask.__init__.
+gold_answers = {**gold_answers_dev, **gold_answers_test, **gold_answers_impl_testing}
+tasks = {**tasks_dev, **tasks_test, **tasks_test_impl_testing}
+ids = {**ids_dev, **ids_test, **ids_imp_testing}
+class AssistantBenchTask(AbstractBrowserTask):
+    """
+    BrowserGym task for one AssistantBench question: the browser starts on Google and
+    the agent's last chat message is scored against the gold answer.
+    """
+    @classmethod
+    def get_task_id(cls) -> str:
+        """
+        Generic class for several task ids, this way of obtaining the task id is not compatible for now.
+        """
+        raise NotImplementedError
+    def __init__(
+        self, seed: int, task_id: str, output_file: str = None, save_predictions: bool = False
+    ) -> None:
+        """
+        Args:
+            seed (int): Random seed for task initialization.
+            task_id (str): Unique identifier for the task (for the BrowserGym environment).
+            output_file (str, optional): Path to the output file for saving results, needed for test set.
+            save_predictions (bool, optional): Save predictions to the output file (yes/no).
+        """
+        super().__init__(seed)
+        self.locale = "en-US"
+        self.timezone_id = "America/New_York"
+        self.task_id = task_id
+        self.start_url = "https://google.com"
+        # The three lookup tables share the same string keys; use one normalised key
+        # for all of them (the ids lookup previously skipped the str() conversion).
+        task_key = str(self.task_id)
+        self.goal = tasks[task_key]
+        self.gold = gold_answers[task_key]
+        self.ab_task_id = ids[task_key]
+        self.save_predictions = save_predictions
+        self.output_file = output_file
+        # set output_file using the global default value, if not provided in constructor
+        if not self.output_file:
+            self.output_file = get_default_output_file()
+        # use env variable in last resort
+        if not self.output_file:
+            self.output_file = os.getenv("ASSISTANTBENCH_OUTPUT_FILE", None)
+        if self.save_predictions and self.output_file:
+            logger.info(f"Task prediction will be written to output file {self.output_file}")
+    def setup(self, page: Page) -> Tuple[str, dict]:
+        """
+        Navigate to the start URL and, when predictions are saved, reserve an empty
+        entry for this task in the output file. Returns the goal text and an empty info dict.
+        """
+        logger.info(f"Navigating to start url: {self.start_url}")
+        page.goto(self.start_url, timeout=10000)
+        if self.save_predictions and self.output_file:
+            # create an empty task entry in the output file (will raise an Exception if the entry is already there)
+            add_prediction_to_jsonl(
+                file_path=self.output_file,
+                task_id=self.ab_task_id,
+                prediction="",
+                override_if_exists=False,
+            )
+        return self.goal, {}
+    def teardown(self) -> None:
+        """Nothing to clean up for this task."""
+        pass
+    def validate(self, page: Page, chat_messages: list[dict]) -> Tuple[float, bool, str, dict]:
+        """
+        Score the episode. The task is done as soon as the last chat message comes from
+        the assistant; that message is the prediction. Returns (accuracy, done, msg, info).
+        """
+        accuracy, done, msg, info = 0.0, False, "", {}
+        # eval when the agent returns a response
+        if chat_messages and chat_messages[-1]["role"] == "assistant":
+            done = True
+            prediction = chat_messages[-1]["message"]
+            if self.save_predictions and self.output_file:
+                # update the task entry in the output file
+                add_prediction_to_jsonl(
+                    file_path=self.output_file,
+                    task_id=self.ab_task_id,
+                    prediction=prediction,
+                    override_if_exists=True,
+                )
+            # has_ans is not part of the validate() return value.
+            accuracy, _has_ans = question_scorer(prediction, self.gold)
+        return accuracy, done, msg, info

BrowserGym/browsergym/assistantbench/src/browsergym/assistantbench/utils.py ADDED Viewed

	@@ -0,0 +1,73 @@

+import json
+import logging
+import os
+import pathlib
+import time
+logger = logging.getLogger(__name__)
+def add_prediction_to_jsonl(
+    file_path: str, task_id: str, prediction: object, override_if_exists: bool
+) -> None:
+    """
+    Multiprocessing-safe file write.
+    Adds (or updates) the `{"id": task_id, "answer": prediction}` record of a JSONL
+    file while holding an exclusive lock file next to it (same path, ".lock" suffix).
+    Args:
+        file_path: path of the JSONL file; created if missing.
+        task_id: value of the record's "id" field.
+        prediction: JSON-serialisable value stored under "answer".
+        override_if_exists: replace an existing record for `task_id` instead of failing.
+    Raises:
+        RuntimeError: the lock could not be acquired within 10 seconds.
+        ValueError: a record for `task_id` exists and `override_if_exists` is False.
+    """
+    lock_file_path = pathlib.Path(file_path).with_suffix(".lock")
+    lock_max_wait = 10  # 10 seconds
+    # Acquire lock (atomic file creation)
+    start_time = time.time()
+    while True:
+        try:
+            fd = os.open(lock_file_path, os.O_CREAT | os.O_EXCL | os.O_WRONLY)
+            with os.fdopen(fd, "w") as f:
+                f.write("lock")
+            break
+        except FileExistsError:
+            # give up if max wait time reached
+            seconds_waited = time.time() - start_time
+            if seconds_waited >= lock_max_wait:
+                raise RuntimeError(
+                    f"Lock file could not be acquired after {seconds_waited} seconds ({lock_file_path})"
+                )
+            # wait for lock release
+            logger.info(f"Waiting for lock file to be released: {lock_file_path}")
+            time.sleep(1)  # 1 sec
+    logger.info(f"Lock file acquired: {lock_file_path}")
+    # From here on the lock is held: release it on every exit path. Without the
+    # try/finally, the "already exists" ValueError below (or a malformed line) left the
+    # lock file behind and every later call timed out.
+    try:
+        # Load existing data, if any
+        data = []
+        if os.path.exists(file_path):
+            with open(file_path, "r") as f:
+                data.extend(json.loads(line) for line in f if line.strip())  # Skip empty lines
+        # Check if task_id already exists
+        existing_record = next((entry for entry in data if entry["id"] == task_id), None)
+        # Add or update the record
+        if existing_record is None:
+            data.append({"id": task_id, "answer": prediction})
+        elif override_if_exists:
+            existing_record["answer"] = prediction
+        else:
+            raise ValueError(
+                f"Prediction for task ID {repr(task_id)} already exists in file {file_path}."
+            )
+        # Write data back to the file (this also creates it when it did not exist)
+        with open(file_path, "w") as f:
+            for entry in data:
+                f.write(json.dumps(entry) + "\n")
+    finally:
+        # Release lock (remove file)
+        os.remove(lock_file_path)
+        logger.info(f"Lock file released: {lock_file_path}")

BrowserGym/browsergym/core/README.md ADDED Viewed

	@@ -0,0 +1,10 @@

+# BrowserGym core
+This package provides `browsergym.core`, the core functionality of [BrowserGym](https://github.com/ServiceNow/BrowserGym).
+## Setup
+1. Install the package
+```sh
+pip install browsergym-core
+```

BrowserGym/browsergym/core/pyproject.toml ADDED Viewed

	@@ -0,0 +1,42 @@

+[build-system]
+requires = ["hatchling", "hatch-requirements-txt"]
+build-backend = "hatchling.build"
+[project]
+name = "browsergym-core"
+description = "BrowserGym: a gym environment for web task automation in the Chromium browser"
+authors = [
+    {name = "Rim Assouel"},
+    {name = "Léo Boisvert"},
+    {name = "Massimo Caccia"},
+    {name = "Alex Drouin"},
+    {name = "Maxime Gasse"},
+    {name = "Imene Kerboua"},
+    {name = "Alex Lacoste"},
+    {name = "Thibault Le Sellier De Chezelles"},
+    {name = "Tom Marty"},
+]
+readme = "README.md"
+requires-python = ">3.9"
+license = {text = "Apache-2.0"}
+classifiers = [
+    "Development Status :: 3 - Alpha",
+    "Programming Language :: Python :: 3",
+    "Operating System :: OS Independent",
+    "Intended Audience :: Science/Research",
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
+    "License :: OSI Approved :: Apache Software License",
+]
+dynamic = ["dependencies", "version"]
+[project.urls]
+homepage = "https://github.com/ServiceNow/BrowserGym"
+[tool.hatch.version]
+path = "src/browsergym/core/__init__.py"
+[tool.hatch.metadata.hooks.requirements_txt]
+files = ["requirements.txt"]
+[tool.hatch.build.targets.wheel]
+packages = ["src/browsergym"]

BrowserGym/browsergym/core/requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+playwright==1.44
+gymnasium>=0.27
+numpy>=1.14
+pyparsing>=3
+Pillow>=10.1
+beautifulsoup4>=4.12
+lxml>=4.9
+mcp[cli]>=1.6.0

BrowserGym/browsergym/core/src/browsergym/core/__init__.py ADDED Viewed

	@@ -0,0 +1,27 @@

+__version__ = "0.13.4"
+import playwright.sync_api
+# we use a global playwright instance
+_PLAYWRIGHT = None
+def _set_global_playwright(pw: playwright.sync_api.Playwright):
+    """Store `pw` as the module-wide Playwright instance returned by _get_global_playwright()."""
+    global _PLAYWRIGHT
+    _PLAYWRIGHT = pw
+def _get_global_playwright():
+    """
+    Return the module-wide Playwright instance, starting one via
+    `sync_playwright().start()` on first use.
+    """
+    if _PLAYWRIGHT:
+        return _PLAYWRIGHT
+    _set_global_playwright(playwright.sync_api.sync_playwright().start())
+    return _PLAYWRIGHT
+# register the open-ended task
+# NOTE(review): these imports sit below the playwright helpers, presumably because the
+# imported modules depend on this package being initialised first - confirm before moving them.
+from .registration import register_task
+from .task import OpenEndedTask
+register_task(OpenEndedTask.get_task_id(), OpenEndedTask)

BrowserGym/browsergym/core/src/browsergym/core/action/__init__.py ADDED Viewed

	@@ -0,0 +1,11 @@

+_DEMO_MODE = False
+def set_global_demo_mode(demo_mode: bool):
+    """Set the module-wide demo-mode flag returned by get_global_demo_mode()."""
+    global _DEMO_MODE
+    _DEMO_MODE = demo_mode
+def get_global_demo_mode():
+    """Return the module-wide demo-mode flag (False until set_global_demo_mode() changes it)."""
+    return _DEMO_MODE

BrowserGym/browsergym/core/src/browsergym/core/action/base.py ADDED Viewed

	@@ -0,0 +1,63 @@

+from abc import ABC, abstractmethod
+import playwright.sync_api
+from . import get_global_demo_mode
+class AbstractActionSet(ABC):
+    """
+    Abstract interface of an action set: it can describe itself, give an example
+    action, and convert an action into browsergym-compatible python code.
+    """
+    def __init__(self, strict: bool = False):
+        # Only stored here; how `strict` is interpreted is left to subclasses.
+        self.strict = strict
+    @abstractmethod
+    def describe(self, with_long_description: bool = True, with_examples: bool = True) -> str:
+        """
+        Returns a textual description of this action space.
+        """
+    @abstractmethod
+    def example_action(self, abstract: bool) -> str:
+        """
+        Returns an example action as a string.
+        """
+    @abstractmethod
+    def to_python_code(self, action) -> str:
+        """
+        Converts the given action to browsergym-compatible python code.
+        Args:
+            action: the action to convert.
+        Returns:
+            Executable python code that performs the action in a browsergym environment.
+        """
+def execute_python_code(
+    code: str,
+    page: playwright.sync_api.Page,
+    send_message_to_user: callable,
+    report_infeasible_instructions: callable,
+):
+    """
+    Run `code` with exec() in a fresh global namespace that exposes only the playwright
+    `page`, the two callbacks, and the current demo-mode flag (as `DEMO_MODE`).
+    WARNING: this is not safe! exec() gives the code full access to the interpreter.
+    https://stackoverflow.com/questions/77655440/can-you-protect-a-python-variable-with-exec
+    Args:
+        code: the Python code to execute, as a string.
+        page: the playwright page that will be made accessible to the code.
+        send_message_to_user: utility function that will be made accessible to the code. It should take one text argument.
+        report_infeasible_instructions: utility function that will be made accessible to the code. It should take one text argument.
+    """
+    exec_namespace = dict(
+        page=page,
+        send_message_to_user=send_message_to_user,
+        report_infeasible_instructions=report_infeasible_instructions,
+        DEMO_MODE=get_global_demo_mode(),
+    )
+    exec(code, exec_namespace)

BrowserGym/browsergym/core/src/browsergym/core/action/functions.py ADDED Viewed

	@@ -0,0 +1,624 @@

+# these are placeholders
+# all these symbols will be available in browsergym actions
+from typing import Literal
+import playwright.sync_api
+from .utils import (
+    add_demo_mode_effects,
+    call_fun,
+    get_elem_by_bid,
+    highlight_by_box,
+    smooth_move_visual_cursor_to,
+)
+page: playwright.sync_api.Page = None
+send_message_to_user: callable = None
+report_infeasible_instructions: callable = None
+demo_mode: Literal["off", "default", "all_blue", "only_visible_elements"] = None
+retry_with_force: bool = False
+"""IMPORTANT
+The following primitives are meant to be included in the browsergym action using
+inspect.getsource().
+"""
+def send_msg_to_user(text: str):
+    """
+    Sends a message to the user.
+    Examples:
+        send_msg_to_user("Based on the results of my search, the city was built in 1751.")
+    """
+    # `send_message_to_user` is a module-level placeholder (None in this file); the real
+    # callback must be provided in the namespace where this source is executed.
+    send_message_to_user(text)
+def report_infeasible(reason: str):
+    """
+    Notifies the user that their instructions are infeasible.
+    Examples:
+        report_infeasible("I cannot follow these instructions because there is no email field in this form.")
+    """
+    # `report_infeasible_instructions` is a module-level placeholder (None in this file);
+    # the real callback must be provided in the namespace where this source is executed.
+    report_infeasible_instructions(reason)
+def noop(wait_ms: float = 1000):
+    """
+    Do nothing, and optionally wait for the given time (in milliseconds).
+    Examples:
+        noop()
+        noop(500)
+    """
+    # The wait is delegated to Playwright's page.wait_for_timeout.
+    page.wait_for_timeout(wait_ms)
+# https://playwright.dev/docs/input#text-input
+def fill(bid: str, value: str):
+    """
+    Fill out a form field. It focuses the element and triggers an input event with the entered text.
+    It works for <input>, <textarea> and [contenteditable] elements.
+    Examples:
+        fill('237', 'example value')
+        fill('45', "multi-line\\nexample")
+        fill('a12', "example with \\"quotes\\"")
+    """
+    elem = get_elem_by_bid(page, bid, demo_mode != "off")
+    add_demo_mode_effects(page, elem, bid, demo_mode=demo_mode, move_cursor=False)
+    def do(force: bool):
+        if demo_mode != "off":
+            # Demo mode types the text key by key: about 2 s in total, at least 10 ms per key.
+            # An empty value has nothing to type, so skip the division (it used to raise
+            # ZeroDivisionError) and just clear the field.
+            delay = max(2000 / len(value), 10) if value else 0
+            elem.clear(force=force, timeout=500)
+            elem.type(value, delay=delay, timeout=0)  # no timeout
+        else:
+            elem.fill(value, force=force, timeout=500)
+    call_fun(do, retry_with_force)
+# https://playwright.dev/python/docs/api/class-locator#locator-check
+def check(bid: str):
+    """
+    Ensure a checkbox or radio element is checked.
+    Examples:
+        check('55')
+    """
+    elem = get_elem_by_bid(page, bid, demo_mode != "off")
+    add_demo_mode_effects(page, elem, bid, demo_mode=demo_mode, move_cursor=True)
+    # `call_fun` decides the `force` value passed to `do`; presumably it retries with
+    # force=True when `retry_with_force` is set - confirm in .utils.
+    def do(force: bool):
+        elem.check(force=force, timeout=500)
+    call_fun(do, retry_with_force)
+# https://playwright.dev/python/docs/api/class-locator#locator-uncheck
+def uncheck(bid: str):
+    """
+    Ensure a checkbox or radio element is unchecked.
+    Examples:
+        uncheck('a5289')
+    """
+    elem = get_elem_by_bid(page, bid, demo_mode != "off")
+    add_demo_mode_effects(page, elem, bid, demo_mode=demo_mode, move_cursor=True)
+    # `call_fun` decides the `force` value passed to `do`; presumably it retries with
+    # force=True when `retry_with_force` is set - confirm in .utils.
+    def do(force: bool):
+        elem.uncheck(force=force, timeout=500)
+    call_fun(do, retry_with_force)
+# https://playwright.dev/docs/input#select-options
+def select_option(bid: str, options: str | list[str]):
+    """
+    Select one or multiple options in a <select> element. You can specify
+    option value or label to select. Multiple options can be selected.
+    Examples:
+        select_option('a48', "blue")
+        select_option('c48', ["red", "green", "blue"])
+    """
+    elem = get_elem_by_bid(page, bid, demo_mode != "off")
+    # No cursor animation for this action (move_cursor=False).
+    add_demo_mode_effects(page, elem, bid, demo_mode=demo_mode, move_cursor=False)
+    # `call_fun` decides the `force` value passed to `do`; presumably it retries with
+    # force=True when `retry_with_force` is set - confirm in .utils.
+    def do(force: bool):
+        elem.select_option(options, force=force, timeout=500)
+    call_fun(do, retry_with_force)
+# https://playwright.dev/python/docs/api/class-locator#locator-click
+def click(
+    bid: str,
+    button: Literal["left", "middle", "right"] = "left",
+    modifiers: list[Literal["Alt", "Control", "ControlOrMeta", "Meta", "Shift"]] = [],
+):
+    """
+    Click an element.
+    Examples:
+        click('a51')
+        click('b22', button="right")
+        click('48', button="middle", modifiers=["Shift"])
+    """
+    # NOTE: the `modifiers` default is a shared list object; it is only read here, never mutated.
+    elem = get_elem_by_bid(page, bid, demo_mode != "off")
+    add_demo_mode_effects(page, elem, bid, demo_mode=demo_mode, move_cursor=True)
+    # `call_fun` decides the `force` value passed to `do`; presumably it retries with
+    # force=True when `retry_with_force` is set - confirm in .utils.
+    def do(force: bool):
+        elem.click(button=button, modifiers=modifiers, force=force, timeout=500)
+    call_fun(do, retry_with_force)
+# https://playwright.dev/python/docs/api/class-locator#locator-dblclick
+def dblclick(
+    bid: str,
+    button: Literal["left", "middle", "right"] = "left",
+    modifiers: list[Literal["Alt", "Control", "ControlOrMeta", "Meta", "Shift"]] = [],
+):
+    """
+    Double click an element.
+    Examples:
+        dblclick('12')
+        dblclick('ca42', button="right")
+        dblclick('178', button="middle", modifiers=["Shift"])
+    """
+    # NOTE: the `modifiers` default is a shared list object; it is only read here, never mutated.
+    elem = get_elem_by_bid(page, bid, demo_mode != "off")
+    add_demo_mode_effects(page, elem, bid, demo_mode=demo_mode, move_cursor=True)
+    def do(force: bool):
+        # Must be Locator.dblclick: this action previously issued a single click.
+        elem.dblclick(button=button, modifiers=modifiers, force=force, timeout=500)
+    call_fun(do, retry_with_force)
+# https://playwright.dev/python/docs/api/class-locator#locator-hover
+def hover(bid: str):
+    """
+    Hover over an element.
+    Examples:
+        hover('b8')
+    """
+    elem = get_elem_by_bid(page, bid, demo_mode != "off")
+    # Hover is the only bid-based action here that disables the highlight box.
+    add_demo_mode_effects(
+        page, elem, bid, demo_mode=demo_mode, move_cursor=True, highlight_box=False
+    )
+    # `call_fun` decides the `force` value passed to `do`; presumably it retries with
+    # force=True when `retry_with_force` is set - confirm in .utils.
+    def do(force: bool):
+        elem.hover(force=force, timeout=500)
+    call_fun(do, retry_with_force)
+# https://playwright.dev/python/docs/input#keys-and-shortcuts
+def press(bid: str, key_comb: str):
+    """
+    Focus the matching element and press a combination of keys. It accepts
+    the logical key names that are emitted in the keyboardEvent.key property
+    of the keyboard events: Backquote, Minus, Equal, Backslash, Backspace,
+    Tab, Delete, Escape, ArrowDown, End, Enter, Home, Insert, PageDown, PageUp,
+    ArrowRight, ArrowUp, F1 - F12, Digit0 - Digit9, KeyA - KeyZ, etc. You can
+    alternatively specify a single character you'd like to produce such as "a"
+    or "#". Following modification shortcuts are also supported: Shift, Control,
+    Alt, Meta, ShiftLeft, ControlOrMeta. ControlOrMeta resolves to Control on
+    Windows and Linux and to Meta on macOS.
+    Examples:
+        press('88', 'Backspace')
+        press('a26', 'ControlOrMeta+a')
+        press('a61', 'Meta+Shift+t')
+    """
+    elem = get_elem_by_bid(page, bid, demo_mode != "off")
+    add_demo_mode_effects(page, elem, bid, demo_mode=demo_mode, move_cursor=False)
+    # Single attempt: unlike click/check/fill, this does not go through call_fun / retry_with_force.
+    elem.press(key_comb, timeout=500)
+# https://playwright.dev/python/docs/api/class-locator#locator-focus
+def focus(bid: str):
+    """
+    Focus the matching element.
+    Examples:
+        focus('b455')
+    """
+    elem = get_elem_by_bid(page, bid, demo_mode != "off")
+    add_demo_mode_effects(page, elem, bid, demo_mode=demo_mode, move_cursor=False)
+    # Single attempt: unlike click/check/fill, this does not go through call_fun / retry_with_force.
+    elem.focus(timeout=500)
+# https://playwright.dev/python/docs/api/class-locator#locator-clear
+def clear(bid: str):
+    """
+    Clear the input field.
+    Examples:
+        clear('996')
+    """
+    elem = get_elem_by_bid(page, bid, demo_mode != "off")
+    add_demo_mode_effects(page, elem, bid, demo_mode=demo_mode, move_cursor=False)
+    # Single attempt: unlike click/check/fill, this does not go through call_fun / retry_with_force.
+    elem.clear(timeout=500)
+# https://playwright.dev/python/docs/input#drag-and-drop
+def drag_and_drop(from_bid: str, to_bid: str):
+    """
+    Perform a drag & drop. Hover the element that will be dragged. Press
+    left mouse button. Move mouse to the element that will receive the
+    drop. Release left mouse button.
+    Examples:
+        drag_and_drop('56', '498')
+    """
+    # Step 1: hover the source element and press the mouse button.
+    from_elem = get_elem_by_bid(page, from_bid, demo_mode != "off")
+    add_demo_mode_effects(page, from_elem, from_bid, demo_mode=demo_mode, move_cursor=True)
+    from_elem.hover(timeout=500)
+    page.mouse.down()
+    # Step 2: the target is resolved only after the button is down, then hovered and released on.
+    to_elem = get_elem_by_bid(page, to_bid, demo_mode != "off")
+    add_demo_mode_effects(page, to_elem, to_bid, demo_mode=demo_mode, move_cursor=True)
+    to_elem.hover(timeout=500)
+    page.mouse.up()
+# https://playwright.dev/python/docs/api/class-mouse#mouse-wheel
+def scroll(delta_x: float, delta_y: float):
+    """
+    Scroll horizontally and vertically. Amounts in pixels, positive for right or down scrolling, negative for left or up scrolling. Dispatches a wheel event.
+    Examples:
+        scroll(0, 200)
+        scroll(-50.2, -100.5)
+    """
+    # Direct call to Playwright's mouse wheel API; no demo-mode effects are applied here.
+    page.mouse.wheel(delta_x, delta_y)
+# https://playwright.dev/python/docs/api/class-mouse#mouse-move
+def mouse_move(x: float, y: float):
+    """
+    Move the mouse to a location. Uses absolute client coordinates in pixels.
+    Dispatches a mousemove event.
+    Examples:
+        mouse_move(65.2, 158.5)
+    """
+    # In demo mode the visual cursor is animated first; the real mouse move always follows.
+    if demo_mode != "off":
+        smooth_move_visual_cursor_to(page, x, y)
+    page.mouse.move(x, y)
+# https://playwright.dev/python/docs/api/class-mouse#mouse-up
+def mouse_up(x: float, y: float, button: Literal["left", "middle", "right"] = "left"):
+    """
+    Move the mouse to a location then release a mouse button. Dispatches
+    mousemove and mouseup events.
+    Examples:
+        mouse_up(250, 120)
+        mouse_up(47, 252, 'right')
+    """
+    # Demo mode only adds visuals: cursor animation plus a 1x1 highlight box at the target point.
+    if demo_mode != "off":
+        smooth_move_visual_cursor_to(page, x, y)
+        highlight_by_box(page, {"x": x, "y": y, "width": 1, "height": 1})
+    page.mouse.move(x, y)
+    page.mouse.up(button=button)
+# https://playwright.dev/python/docs/api/class-mouse#mouse-down
+def mouse_down(x: float, y: float, button: Literal["left", "middle", "right"] = "left"):
+    """
+    Move the mouse to a location then press and hold a mouse button. Dispatches
+    mousemove and mousedown events.
+    Examples:
+        mouse_down(140.2, 580.1)
+        mouse_down(458, 254.5, 'middle')
+    """
+    # Demo mode only adds visuals: cursor animation plus a 1x1 highlight box at the target point.
+    if demo_mode != "off":
+        smooth_move_visual_cursor_to(page, x, y)
+        highlight_by_box(page, {"x": x, "y": y, "width": 1, "height": 1})
+    page.mouse.move(x, y)
+    page.mouse.down(button=button)
+# https://playwright.dev/python/docs/api/class-mouse#mouse-click
+def mouse_click(x: float, y: float, button: Literal["left", "middle", "right"] = "left"):
+    """
+    Move the mouse to a location and click a mouse button. Dispatches mousemove,
+    mousedown and mouseup events.
+    Examples:
+        mouse_click(887.2, 68)
+        mouse_click(56, 712.56, 'right')
+    """
+    # Demo mode only adds visuals: cursor animation plus a 1x1 highlight box at the target point.
+    if demo_mode != "off":
+        smooth_move_visual_cursor_to(page, x, y)
+        highlight_by_box(page, {"x": x, "y": y, "width": 1, "height": 1})
+    page.mouse.click(x, y, button=button)
+# https://playwright.dev/python/docs/api/class-mouse#mouse-dblclick
+def mouse_dblclick(x: float, y: float, button: Literal["left", "middle", "right"] = "left"):
+    """
+    Move the mouse to a location and double click a mouse button. Dispatches
+    mousemove, mousedown and mouseup events.
+    Examples:
+        mouse_dblclick(5, 236)
+        mouse_dblclick(87.5, 354, 'right')
+    """
+    # Demo mode only adds visuals: cursor animation plus a 1x1 highlight box at the target point.
+    if demo_mode != "off":
+        smooth_move_visual_cursor_to(page, x, y)
+        highlight_by_box(page, {"x": x, "y": y, "width": 1, "height": 1})
+    page.mouse.dblclick(x, y, button=button)
+def mouse_drag_and_drop(from_x: float, from_y: float, to_x: float, to_y: float):
+    """
+    Drag and drop from a location to a location. Uses absolute client
+    coordinates in pixels. Dispatches mousemove, mousedown and mouseup
+    events.
+    Examples:
+        mouse_drag_and_drop(10.7, 325, 235.6, 24.54)
+    """
+    # Step 1: move to the start point (with demo visuals if enabled) and press the button.
+    if demo_mode != "off":
+        x, y = from_x, from_y
+        smooth_move_visual_cursor_to(page, x, y)
+        highlight_by_box(page, {"x": x, "y": y, "width": 1, "height": 1})
+    page.mouse.move(from_x, from_y)
+    page.mouse.down()
+    # Step 2: move to the end point (same visuals) and release the button.
+    if demo_mode != "off":
+        x, y = to_x, to_y
+        smooth_move_visual_cursor_to(page, x, y)
+        highlight_by_box(page, {"x": x, "y": y, "width": 1, "height": 1})
+    page.mouse.move(to_x, to_y)
+    page.mouse.up()
+# https://playwright.dev/python/docs/api/class-keyboard#keyboard-press
+def keyboard_press(key: str):
+    """
+    Press a combination of keys. Accepts the logical key names that are
+    emitted in the keyboardEvent.key property of the keyboard events:
+    Backquote, Minus, Equal, Backslash, Backspace, Tab, Delete, Escape,
+    ArrowDown, End, Enter, Home, Insert, PageDown, PageUp, ArrowRight,
+    ArrowUp, F1 - F12, Digit0 - Digit9, KeyA - KeyZ, etc. You can
+    alternatively specify a single character you'd like to produce such
+    as "a" or "#". Following modification shortcuts are also supported:
+    Shift, Control, Alt, Meta, ShiftLeft, ControlOrMeta. ControlOrMeta
+    resolves to Control on Windows and Linux and to Meta on macOS.
+    Examples:
+        keyboard_press('Backspace')
+        keyboard_press('ControlOrMeta+a')
+        keyboard_press('Meta+Shift+t')
+        keyboard_press('PageDown')
+    """
+    # Page-level key press: no element is focused first and no demo-mode effects are applied.
+    page.keyboard.press(key)
+# https://playwright.dev/python/docs/api/class-keyboard#keyboard-up
+def keyboard_up(key: str):  # the docstring is parsed at runtime into the action description and examples
+    """
+    Release a keyboard key. Dispatches a keyup event. Accepts the logical
+    key names that are emitted in the keyboardEvent.key property of the
+    keyboard events: Backquote, Minus, Equal, Backslash, Backspace, Tab,
+    Delete, Escape, ArrowDown, End, Enter, Home, Insert, PageDown, PageUp,
+    ArrowRight, ArrowUp, F1 - F12, Digit0 - Digit9, KeyA - KeyZ, etc.
+    You can alternatively specify a single character you'd like to produce
+    such as "a" or "#".
+    Examples:
+        keyboard_up('Shift')
+        keyboard_up('c')
+    """
+    page.keyboard.up(key)  # `page` is a global, not a parameter; presumably supplied by the executing environment
+# https://playwright.dev/python/docs/api/class-keyboard#keyboard-down
+def keyboard_down(key: str):
+    """
+    Press and holds a keyboard key. Dispatches a keydown event. Accepts the
+    logical key names that are emitted in the keyboardEvent.key property of
+    the keyboard events: Backquote, Minus, Equal, Backslash, Backspace, Tab,
+    Delete, Escape, ArrowDown, End, Enter, Home, Insert, PageDown, PageUp,
+    ArrowRight, ArrowUp, F1 - F12, Digit0 - Digit9, KeyA - KeyZ, etc. You can
+    alternatively specify a single character such as "a" or "#".
+    Examples:
+        keyboard_up('Shift')
+        keyboard_up('c')
+    """
+    page.keyboard.down(key)
+# https://playwright.dev/python/docs/api/class-keyboard#keyboard-type
+def keyboard_type(text: str):
+    """
+    Types a string of text through the keyboard. Sends a keydown, keypress/input,
+    and keyup event for each character in the text. Modifier keys DO NOT affect
+    keyboard_type. Holding down Shift will not type the text in upper case.
+    Examples:
+        keyboard_type('Hello world!')
+    """
+    if demo_mode != "off":
+        delay = max(2000 / len(text), 10)
+    else:
+        delay = None
+    page.keyboard.type(text, delay=delay)
+# https://playwright.dev/python/docs/api/class-keyboard#keyboard-insert-text
+def keyboard_insert_text(text: str):  # the docstring is parsed at runtime into the action description and examples
+    """
+    Insert a string of text in the currently focused element. Dispatches only input
+    event, does not emit the keydown, keyup or keypress events. Modifier keys DO NOT
+    affect keyboard_insert_text. Holding down Shift will not type the text in upper
+    case.
+    Examples:
+        keyboard_insert_text('Hello world!')
+    """
+    page.keyboard.insert_text(text)  # `page` is a global, not a parameter; presumably supplied by the executing environment
+# https://playwright.dev/python/docs/api/class-page#page-goto
+def goto(url: str):  # the docstring is parsed at runtime into the action description and examples
+    """
+    Navigate to a url.
+    Examples:
+        goto('http://www.example.com')
+    """
+    page.goto(url)  # acts on the global active `page`; the url is passed through unchanged
+# https://playwright.dev/python/docs/api/class-page#page-go-back
+def go_back():  # the docstring is parsed at runtime into the action description and examples
+    """
+    Navigate to the previous page in history.
+    Examples:
+        go_back()
+    """
+    page.go_back()  # acts on the global active `page`; the call's return value is discarded
+# https://playwright.dev/python/docs/api/class-page#page-go-forward
+def go_forward():  # the docstring is parsed at runtime into the action description and examples
+    """
+    Navigate to the next page in history.
+    Examples:
+        go_forward()
+    """
+    page.go_forward()  # acts on the global active `page`; the call's return value is discarded
+# https://playwright.dev/python/docs/api/class-browsercontext#browser-context-new-page
+def new_tab():  # the docstring is parsed at runtime into the action description and examples
+    """
+    Open a new tab. It will become the active one.
+    Examples:
+        new_tab()
+    """
+    global page  # the module-level active page is rebound below
+    # set the new page as the active page (created in the same browser context as the current one)
+    page = page.context.new_page()
+    # trigger the callback that sets this page as active in browsergym
+    # (a synthetic 'pageshow' event is dispatched; presumably browsergym listens for it, verify)
+    page.evaluate(
+        """\
+const event = new Event('pageshow', {
+    bubbles: true,  // Whether the event bubbles up through the DOM or not
+    cancelable: false  // Whether the event can be canceled
+});
+window.dispatchEvent(event);
+"""
+    )
+# https://playwright.dev/python/docs/api/class-page#page-close
+def tab_close():
+    """
+    Close the current tab.
+    Examples:
+        tab_close()
+    """
+    global page
+    context = page.context
+    page.close()
+    # set most recent page as active page, or open a new page if needed
+    if context.pages:
+        # TODO: do something more elaborate? (active page history)
+        page = context.pages[-1]
+    else:
+        page = context.new_page()
+    # trigger the callback that sets this page as active in browsergym
+    page.evaluate(
+        """\
+const event = new Event('pageshow', {
+    bubbles: true,  // Whether the event bubbles up through the DOM or not
+    cancelable: false  // Whether the event can be canceled
+});
+window.dispatchEvent(event);
+"""
+    )
+# https://playwright.dev/python/docs/api/class-page#page-bring-to-front
+def tab_focus(index: int):  # the docstring is parsed at runtime into the action description and examples
+    """
+    Bring tab to front (activate tab).
+    Examples:
+        tab_focus(2)
+    """
+    global page  # set the focused page as the active page
+    page = page.context.pages[index]  # `index` is used as-is to index the context's page collection; no bounds check here
+    page.bring_to_front()
+    # trigger the callback that sets this page as active in browsergym
+    # (a synthetic 'pageshow' event is dispatched; presumably browsergym listens for it, verify)
+    page.evaluate(
+        """\
+const event = new Event('pageshow', {
+    bubbles: true,  // Whether the event bubbles up through the DOM or not
+    cancelable: false  // Whether the event can be canceled
+});
+window.dispatchEvent(event);
+"""
+    )
+# https://playwright.dev/python/docs/input#upload-files
+def upload_file(bid: str, file: str | list[str]):  # the docstring is parsed at runtime into the action description and examples
+    """
+    Click an element and wait for a "filechooser" event, then select one
+    or multiple input files for upload. Relative file paths are resolved
+    relative to the current working directory. An empty list clears the
+    selected files.
+    Examples:
+        upload_file("572", "my_receipt.pdf")
+        upload_file("63", ["/home/bob/Documents/image.jpg", "/home/bob/Documents/file.zip"])
+    """
+    elem = get_elem_by_bid(page, bid, demo_mode != "off")  # third argument is scroll_into_view: only enabled in demo mode
+    add_demo_mode_effects(page, elem, bid, demo_mode=demo_mode, move_cursor=True)
+    with page.expect_file_chooser() as fc_info:  # the click happens inside this block so the file chooser it opens is captured
+        elem.click(timeout=500)  # short click timeout (presumably milliseconds, per Playwright convention)
+    file_chooser = fc_info.value
+    file_chooser.set_files(file)  # `file` is forwarded unchanged (single path or list of paths)
+# https://playwright.dev/python/docs/input#upload-files
+def mouse_upload_file(x: float, y: float, file: str | list[str]):  # the docstring is parsed at runtime into the action description and examples
+    """
+    Click a location and wait for a "filechooser" event, then select one
+    or multiple input files for upload. Relative file paths are resolved
+    relative to the current working directory. An empty list clears the
+    selected files.
+    Examples:
+        mouse_upload_file(132.1, 547, "my_receipt.pdf")
+        mouse_upload_file(328, 812, ["/home/bob/Documents/image.jpg", "/home/bob/Documents/file.zip"])
+    """
+    if demo_mode != "off":  # visual effects only; they do not affect the click itself
+        smooth_move_visual_cursor_to(page, x, y)
+        highlight_by_box(page, {"x": x, "y": y, "width": 1, "height": 1})
+    with page.expect_file_chooser() as fc_info:  # the click happens inside this block so the file chooser it opens is captured
+        page.mouse.click(x, y)
+    file_chooser = fc_info.value
+    file_chooser.set_files(file)  # `file` is forwarded unchanged (single path or list of paths)

BrowserGym/browsergym/core/src/browsergym/core/action/highlevel.py ADDED Viewed

	@@ -0,0 +1,522 @@

+import inspect
+import random
+import typing
+from dataclasses import dataclass
+from . import utils
+from .base import AbstractActionSet
+from .functions import (  # check,; uncheck,
+    clear,
+    click,
+    dblclick,
+    drag_and_drop,
+    fill,
+    focus,
+    go_back,
+    go_forward,
+    goto,
+    hover,
+    keyboard_down,
+    keyboard_insert_text,
+    keyboard_press,
+    keyboard_type,
+    keyboard_up,
+    mouse_click,
+    mouse_dblclick,
+    mouse_down,
+    mouse_drag_and_drop,
+    mouse_move,
+    mouse_up,
+    mouse_upload_file,
+    new_tab,
+    noop,
+    press,
+    report_infeasible,
+    scroll,
+    select_option,
+    send_msg_to_user,
+    tab_close,
+    tab_focus,
+    upload_file,
+)
+from .parsers import action_docstring_parser, highlevel_action_parser
+ACTION_SUBSETS = {  # subset name -> list of action functions; looked up by HighLevelActionSet.__init__
+    "chat": [send_msg_to_user],
+    "infeas": [report_infeasible],
+    "bid": [
+        scroll,
+        fill,
+        # These are not really needed and might pollute the action space, doing more harm than good
+        # check,
+        # uncheck,
+        select_option,
+        click,
+        dblclick,
+        hover,
+        press,
+        focus,
+        clear,
+        drag_and_drop,
+        upload_file,
+    ],
+    "coord": [
+        scroll,
+        mouse_move,
+        mouse_up,
+        mouse_down,
+        mouse_click,
+        mouse_dblclick,
+        mouse_drag_and_drop,
+        mouse_upload_file,
+        keyboard_down,
+        keyboard_up,
+        keyboard_press,
+        keyboard_type,
+        keyboard_insert_text,
+    ],
+    "nav": [go_back, go_forward, goto],
+    "tab": [
+        tab_close,
+        tab_focus,
+        new_tab,
+    ],
+    # adapted from MiniWoB repo
+    # https://github.com/Farama-Foundation/miniwob-plusplus/blob/1bab0dffe34e92cc1049fe9443542029bf7e44a9/miniwob/action.py#L122
+    "miniwob_all": [
+        mouse_move,  #     MOVE_COORDS
+        mouse_click,  #    CLICK_COORDS
+        mouse_dblclick,  # DBLCLICK_COORDS
+        mouse_down,  #     MOUSEDOWN_COORDS
+        mouse_up,  #       MOUSEUP_COORDS
+        scroll,  #         SCROLL_UP_COORDS, SCROLL_DOWN_COORDS
+        click,  #          CLICK_ELEMENT
+        keyboard_press,  # PRESS_KEY
+        keyboard_type,  #  TYPE_TEXT (and substitute for TYPE_FIELD)
+        fill,  #           FOCUS_ELEMENT_AND_TYPE_TEXT (and substitute for FOCUS_ELEMENT_AND_TYPE_FIELD)
+    ],
+    # adapted from MiniWoB repo
+    # https://github.com/Farama-Foundation/miniwob-plusplus/blob/1bab0dffe34e92cc1049fe9443542029bf7e44a9/miniwob/action.py#L142
+    "miniwob_shi17": [
+        mouse_click,  #    CLICK_COORDS
+        mouse_dblclick,  # DBLCLICK_COORDS
+        mouse_down,  #     MOUSEDOWN_COORDS
+        mouse_up,  #       MOUSEUP_COORDS
+        scroll,  #         SCROLL_UP_COORDS, SCROLL_DOWN_COORDS
+        keyboard_press,  # PRESS_KEY
+    ],
+    # adapted from MiniWoB repo
+    # https://github.com/Farama-Foundation/miniwob-plusplus/blob/1bab0dffe34e92cc1049fe9443542029bf7e44a9/miniwob/action.py#L160
+    "miniwob_liu18": [
+        click,  # CLICK_ELEMENT
+        fill,  #  substitute for FOCUS_ELEMENT_AND_TYPE_FIELD
+    ],
+    # adapted from MiniWoB repo
+    # https://github.com/Farama-Foundation/miniwob-plusplus/blob/1bab0dffe34e92cc1049fe9443542029bf7e44a9/miniwob/action.py#L173
+    "miniwob_humphreys22": [
+        mouse_move,  #     MOVE_COORDS
+        mouse_click,  #    CLICK_COORDS
+        mouse_dblclick,  # DBLCLICK_COORDS
+        mouse_down,  #     MOUSEDOWN_COORDS
+        mouse_up,  #       MOUSEUP_COORDS
+        scroll,  #         SCROLL_UP_COORDS, SCROLL_DOWN_COORDS
+        keyboard_press,  # PRESS_KEY
+        keyboard_type,  #  substitute for TYPE_FIELD
+    ],
+    # from the webarena paper
+    # https://arxiv.org/abs/2307.13854
+    # from the webarena source code
+    # https://github.com/web-arena-x/webarena/blob/e31c190c9b43f63e5724322b847e00249300df40/browser_env/actions.py#L240
+    # from the webarena default prompt
+    # https://github.com/web-arena-x/webarena/blob/e31c190c9b43f63e5724322b847e00249300df40/agent/prompts/raw/p_cot_id_actree_2s.py#L13
+    "webarena": [
+        #                   #     code      |      paper       |      prompt
+        scroll,  #            SCROLL        | scroll(dir)      | scroll [down|up]
+        keyboard_press,  #    KEY_PRESS     | press(key_comb)  | press [key_comb]
+        #                     MOUSE_CLICK   |                  |
+        #                     KEYBOARD_TYPE |                  |
+        #                     MOUSE_HOVER   |                  |
+        click,  #             CLICK         | click(elem)      | click [id]
+        fill,  #              TYPE          | type(elem, text) | type [id] [content]
+        hover,  #             HOVER         | hover(elem)      | hover [id]
+        tab_focus,  #         PAGE_FOCUS    | tab_focus(index) | tab_focus [tab_index]
+        new_tab,  #           NEW_TAB       | new_tab()        | new_tab
+        go_back,  #           GO_BACK       | go_back()        | go_back
+        go_forward,  #        GO_FORWARD    | go_forward()     | go_forward
+        goto,  #              GOTO_URL      | goto(url)        | goto [url]
+        tab_close,  #         PAGE_CLOSE    | tab_close()      | close_tab
+        #                     CHECK         |                  |
+        select_option,  #     SELECT_OPTION |                  |
+        send_msg_to_user,  #  STOP          | stop(answer)     | stop [answer]
+        report_infeasible,  ## explicit unachievable action, equivalent to STOP "N/A"
+    ],
+    # from the visualwebarena paper
+    # https://arxiv.org/abs/2401.13649
+    # from the visualwebarena source code
+    # https://github.com/web-arena-x/visualwebarena/blob/15890922c97a8694e366fde2d7de8dbd1ff63fb5/browser_env/actions.py#L311-L343
+    # from the visualwebarena default prompt
+    # https://github.com/web-arena-x/visualwebarena/blob/15890922c97a8694e366fde2d7de8dbd1ff63fb5/agent/prompts/jsons/p_cot_id_actree_3s.json#L2
+    "visualwebarena": [
+        #                   #     code      |      paper       |      prompt
+        scroll,  #            SCROLL        | scroll(dir)      | scroll [down|up]
+        keyboard_press,  #    KEY_PRESS     | press(key_comb)  | press [key_comb]
+        #                     MOUSE_CLICK   |                  |
+        #                     KEYBOARD_TYPE |                  |
+        #                     MOUSE_HOVER   |                  |
+        click,  #             CLICK         | click(elem)      | click [id]
+        fill,  #              TYPE          | type(elem, text) | type [id] [content]
+        hover,  #             HOVER         | hover(elem)      | hover [id]
+        tab_focus,  #         PAGE_FOCUS    | tab_focus(index) | tab_focus [tab_index]
+        new_tab,  #           NEW_TAB       | new_tab()        | new_tab
+        go_back,  #           GO_BACK       | go_back()        | go_back
+        go_forward,  #        GO_FORWARD    | go_forward()     | go_forward
+        goto,  #              GOTO_URL      | goto(url)        | goto [url]
+        tab_close,  #         PAGE_CLOSE    | tab_close()      | close_tab
+        #                     CHECK         |                  |
+        select_option,  #     SELECT_OPTION |                  |
+        send_msg_to_user,  #  STOP          | stop(answer)     | stop [answer]
+        #                     CLEAR         |                  |
+        upload_file,  #       UPLOAD        |                  |
+        report_infeasible,  ## explicit unachievable action, equivalent to STOP "N/A"
+    ],
+    # from workarena paper
+    # https://arxiv.org/abs/2403.07718
+    "workarena": [
+        scroll,
+        fill,
+        select_option,
+        click,
+        dblclick,
+        hover,
+        press,
+        focus,
+        clear,
+        drag_and_drop,
+        send_msg_to_user,
+    ],
+    # from workarena++ paper
+    # https://arxiv.org/abs/2407.05291
+    "workarena++": [
+        scroll,
+        fill,
+        select_option,
+        click,
+        dblclick,
+        hover,
+        press,
+        focus,
+        clear,
+        drag_and_drop,
+        tab_focus,
+        new_tab,
+        tab_close,
+        go_back,
+        go_forward,
+        goto,
+        send_msg_to_user,
+        report_infeasible,
+    ],
+    # from weblinx_browsergym
+    # https://github.com/McGill-NLP/agentlab-weblinx-mvp/blob/a91b6d19870c5187d252e70a2e2013511cc6f1d2/weblinx_browsergym/__init__.py#L274-L286
+    "weblinx": [
+        send_msg_to_user,  # say(speaker="assistant", utterance=[str]) -> send_msg_to_user(text=[str])
+        click,  # click(uid=[element id]) -> click(bid=[element id])
+        hover,  # hover(uid=[element id]) -> hover(bid=[element id])
+        fill,  # textinput(uid=[element id], value=[str]) -> fill(bid=[element id], value=[str])
+        # change(uid=[element], value=[str]) -> ❌
+        goto,  # load(url=[link]) -> goto(url=[link])
+        # submit(uid=[element]) -> click(bid=[element id])
+        scroll,  # scroll(x=[int x],y=[int y]) -> scroll(delta_x=[int x], delta_y=[int y])
+        # copy(uid=[element],text=[str]) -> ❌
+        # paste(uid=[element],text=[str]) -> ❌
+        new_tab,  # tabcreate() -> new_tab()
+        tab_close,  # tabremove(target=[tabId]) -> tab_close()
+        tab_focus,  # tabswitch(origin=[origin tabId],target=[target tabId]) -> tab_focus(index=[target tabid])
+    ],
+    # from assistantbench paper
+    # https://arxiv.org/abs/2407.15711
+    "assistantbench": [
+        scroll,  # SCROLL
+        fill,  # TYPE
+        select_option,  # SELECT
+        click,  # CLICK
+        press,  # PRESS ENTER
+        go_back,  # GOBACK
+        goto,  # GOTO, SEARCH
+        send_msg_to_user,  # TERMINATE
+    ],
+}
+@dataclass
+class HighLevelAction:  # textual description of one action, filled in by HighLevelActionSet.__init__
+    # entrypoint: callable
+    signature: str  # function name immediately followed by its inspect.signature() rendering
+    description: str  # docstring words that precede "Examples:", joined with single spaces
+    examples: list[str]  # example calls parsed from the docstring, re-rendered as "name(arg, ...)" strings
+class HighLevelActionSet(AbstractActionSet):
+    # static class variables
+    ActionSubset = typing.Literal[
+        "chat",
+        "infeas",
+        "bid",
+        "coord",
+        "nav",
+        "tab",
+        "miniwob_all",
+        "miniwob_shi17",
+        "miniwob_liu18",
+        "miniwob_humphreys22",
+        "webarena",
+        "visualwebarena",
+        "workarena",
+        "workarena++",
+        "weblinx",
+        "assistantbench",
+        "custom",
+    ]
+    DemoMode = typing.Literal["off", "default", "all_blue", "only_visible_elements"]
+    def __init__(
+        self,
+        subsets: typing.Optional[ActionSubset | list[ActionSubset]] = [
+            "chat",
+            "infeas",
+            "bid",
+            "nav",
+            "tab",
+        ],
+        custom_actions: typing.Optional[list[callable]] = None,
+        multiaction: bool = True,
+        demo_mode: typing.Optional[DemoMode] = None,
+        strict: bool = False,
+        retry_with_force: bool = False,
+    ):
+        super().__init__(strict)
+        self.multiaction = multiaction
+        self.demo_mode = demo_mode
+        self.retry_with_force = retry_with_force
+        if not subsets:
+            raise ValueError(f"'action_subsets' is empty.")
+        if isinstance(subsets, str):
+            subsets = [subsets]
+        allowed_actions = [noop]  # the noop action is always allowed
+        # add actions from specified action sets
+        if subsets:
+            for subset in subsets:
+                if subset in ACTION_SUBSETS:
+                    allowed_actions.extend(ACTION_SUBSETS[subset])
+                elif subset == "custom":
+                    if not custom_actions:
+                        raise ValueError(
+                            "'custom' is in 'action_subsets' but 'custom_actions' is empty."
+                        )
+                    allowed_actions.extend(custom_actions)
+                else:
+                    raise ValueError(f"Unknown high-level action subspace: {subset}")
+        # like set() but preserves order
+        # https://stackoverflow.com/questions/1653970/does-python-have-an-ordered-set
+        allowed_actions = list(dict.fromkeys(allowed_actions).keys())
+        # parse the actions and build the action space
+        self.action_set: dict[str, HighLevelAction] = {}
+        self.python_includes = ""
+        # include playwright imports
+        self.python_includes += f"""\
+import playwright.sync_api
+from typing import Literal
+"""
+        # set demo_mode and retry_with_force flags
+        self.python_includes += f"""\
+demo_mode={repr(demo_mode)}
+retry_with_force={repr(retry_with_force)}
+if demo_mode is None:
+    demo_mode = "default" if DEMO_MODE else "off"
+"""
+        # include utility functions
+        for _, func in inspect.getmembers(utils, inspect.isfunction):
+            self.python_includes += f"""\
+{inspect.getsource(func)}
+"""
+        # parse and include action functions
+        for func in allowed_actions:
+            # include action function definition in the code
+            self.python_includes += f"""\
+{inspect.getsource(func)}
+"""
+            # extract action signature
+            signature = f"{func.__name__}{inspect.signature(func)}"
+            # parse docstring
+            description, examples = action_docstring_parser.parse_string(func.__doc__)
+            # reconstruct action description
+            description = " ".join(description)
+            # reconstruct action examples
+            examples = [
+                function_name + "(" + ", ".join([repr(arg) for arg in function_args]) + ")"
+                for function_name, function_args in examples
+            ]
+            if func.__name__ in self.action_set:
+                raise ValueError(f"Duplicated action '{func.__name__}'")
+            self.action_set[func.__name__] = HighLevelAction(
+                # entrypoint=func,
+                signature=signature,
+                description=description,
+                examples=examples,
+            )
+    def example_action(self, abstract: bool, max_examples: int = 3) -> str:
+        """
+        Returns an example action as a string.
+        """
+        if abstract:
+            if self.multiaction:
+                return """\
+One or several actions, separated by new lines."""
+            else:
+                return """\
+One single action to be executed. You can only use one action at a time."""
+        else:
+            picked_examples = []
+            # use fill and click examples if action is present
+            for action_name in ["fill", "click", "mouse_click", "keyboard_type"]:
+                if action_name in self.action_set:
+                    picked_examples.extend(self.action_set[action_name].examples)
+            # last resort, use all action examples
+            if not picked_examples:
+                for _, action in self.action_set.items():
+                    picked_examples += action.examples
+            # shuffle examples
+            rng = random.Random(1)
+            rng.shuffle(picked_examples)
+            if self.multiaction:
+                return "\n".join(picked_examples[:max_examples])
+            else:
+                return picked_examples[0]
+    def describe(self, with_long_description: bool = True, with_examples: bool = True):
+        """
+        Returns a textual description of this action space.
+        """
+        description = f"""
+{len(self.action_set)} different types of actions are available.
+"""
+        for _, action in self.action_set.items():
+            description += f"""\
+{action.signature}
+"""
+            if with_long_description:
+                description += f"""\
+    Description: {action.description}
+"""
+            if with_examples and action.examples:
+                description += f"""\
+    Examples:
+"""
+                for example in action.examples:
+                    description += f"""\
+        {example}
+"""
+        if self.multiaction:
+            description += f"""\
+Multiple actions can be provided at once, but will be executed sequentially without any feedback from the page.
+More than 2-3 actions usually leads to failure or unexpected behavior."""
+        else:
+            description += f"""\
+Only a single action can be provided at once."""
+        example_action = self.example_action(abstract=False)
+        if example_action:
+            description += f""" Example:
+{example_action}
+"""
+        else:
+            description += f"""\
+"""
+        return description
+    def to_python_code(self, action):
+        """
+        Converts the given high-level action string to browsergym-compatible python code.
+        Args:
+            action: the high-level action to parse.
+        Returns:
+            Executable python code that performs the action in a browsergym environment.
+        """
+        highlevel_code = action
+        # do the actual parsing and convert each high-level action to
+        # the corresponding python function call
+        if self.strict:
+            function_calls = highlevel_action_parser.parse_string(highlevel_code, parse_all=True)
+            function_calls = function_calls.as_list()
+        else:
+            function_calls = highlevel_action_parser.search_string(
+                highlevel_code
+            )  # allow for multiple matches, skip anything in-between
+            function_calls = sum(function_calls.as_list(), [])  # unpack multiple matches
+        if not function_calls:
+            raise ValueError("Received an empty action.")
+        elif len(function_calls) > 1 and not self.multiaction:
+            raise ValueError("Received a multi-action, only single-actions are allowed.")
+        python_code = ""
+        # function definitions
+        python_code += self.python_includes
+        # function calls
+        for function_name, function_args in function_calls:
+            if function_name not in self.action_set:
+                raise NameError(f"Invalid action type '{function_name}'.")
+            python_code += (
+                function_name + "(" + ", ".join([repr(arg) for arg in function_args]) + ")\n"
+            )
+        # return the constructed python code
+        return python_code
+# consistency checks, run at import time: "custom" must not be a key of ACTION_SUBSETS, and the ActionSubset literal must list exactly the ACTION_SUBSETS keys plus "custom"
+assert "custom" not in ACTION_SUBSETS
+assert set(typing.get_args(HighLevelActionSet.ActionSubset)) == set(
+    list(ACTION_SUBSETS.keys()) + ["custom"]
+)

BrowserGym/browsergym/core/src/browsergym/core/action/parsers.py ADDED Viewed

	@@ -0,0 +1,92 @@

+import ast
+import pyparsing as pp
+from dataclasses import dataclass
+from typing import Any
+@dataclass
+class NamedArgument:
+    name: str
+    value: Any
+    def __repr__(self):
+        return f"{self.name}={repr(self.value)}"
+def _build_highlevel_action_parser() -> pp.ParserElement:
+    """
+    Returns:
+        An action parser that accepts Python-like function calls with string, number, list or dict literals as arguments.
+        Example:
+            func("a", 42, None, True, [2, 4, "s"], {"a_key": "a_value"}, )
+        The parser is loose and accepts multi-line or single-line combinations af calls.
+        Example:
+            func() func()
+            \tfunc()
+        Python comments are ignored.
+        Example:
+            # this is a comment
+            func()    # this function call will be parsed
+            # func()  # this one will not
+        The parser will return a list of (function_name, function_args) tuples, one for each function call in the input.
+        The parser will raise exceptions
+    """
+    def make_keyword(kwd_str, kwd_value):
+        return pp.Keyword(kwd_str).set_parse_action(pp.replace_with(kwd_value))
+    TRUE = make_keyword("True", True)
+    FALSE = make_keyword("False", False)
+    NONE = make_keyword("None", None)
+    LBRACK, RBRACK, LBRACE, RBRACE, LPAREN, RPAREN, COLON = map(pp.Suppress, "[]{}():")
+    def literal_eval(toks):
+        return ast.literal_eval(toks[0])
+    string = pp.python_quoted_string().set_parse_action(literal_eval)
+    number = pp.pyparsing_common.number()
+    dict = pp.Forward().set_name("dict")  # will be defined later
+    list = pp.Forward().set_name("list")  # will be defined later
+    _tuple = pp.Forward().set_name("tuple")  # will be defined later
+    element = (string | number | dict | list | _tuple | TRUE | FALSE | NONE).set_name("element")
+    list_items = pp.DelimitedList(element, allow_trailing_delim=True).set_name(None)
+    list << pp.Group(LBRACK + pp.Optional(list_items) + RBRACK, aslist=True)
+    _tuple << pp.Group(LPAREN + pp.Optional(list_items) + RPAREN, aslist=True).set_parse_action(
+        lambda tokens: tuple(tokens[0])
+    )
+    dict_item = pp.Group(string + COLON + element, aslist=True).set_name("dict item")
+    dict_items = pp.DelimitedList(dict_item, allow_trailing_delim=True).set_name(None)
+    dict << pp.Dict(LBRACE + pp.Optional(dict_items) + RBRACE, asdict=True)
+    arg = element
+    list_args = pp.DelimitedList(arg, allow_trailing_delim=True).set_name(None)
+    named_arg = (pp.pyparsing_common.identifier() + pp.Literal("=") + element).set_parse_action(
+        lambda tokens: NamedArgument(name=tokens[0], value=tokens[2])
+    )
+    list_named_args = pp.DelimitedList(named_arg, allow_trailing_delim=True).set_name(None)
+    function_call = pp.pyparsing_common.identifier() + pp.Group(
+        LPAREN + pp.Optional(list_args) + pp.Optional(list_named_args) + RPAREN, aslist=True
+    )
+    multiple_function_calls = pp.DelimitedList(pp.Group(function_call), delim="")
+    multiple_function_calls.ignore(pp.python_style_comment())
+    parser = multiple_function_calls
+    return parser
+# this one will be used to extract python-like function calls (built once, when the module is imported)
+highlevel_action_parser: pp.ParserElement = _build_highlevel_action_parser()
+# this one will be used to process the docstring in high-level actions, in order to describe the action space: a group of description words up to "Examples:", then a group of example function calls
+action_docstring_parser: pp.ParserElement = (
+    pp.Group(pp.OneOrMore(pp.Word(pp.printables), stop_on=pp.Literal("Examples:")))
+    + pp.Literal("Examples:").suppress()
+    + pp.Group(highlevel_action_parser)
+)

BrowserGym/browsergym/core/src/browsergym/core/action/python.py ADDED Viewed

	@@ -0,0 +1,112 @@

+import re
+from .base import AbstractActionSet
+class PythonActionSet(AbstractActionSet):
+    def describe(self, with_long_description: bool = True, with_examples: bool = True):
+        """
+        Returns a textual description of this action space.
+        """
+        description = f"""
+Each action consists of executable Python code (python>=3.10) that uses the Playwright library (playwright==1.32)
+to interact with the current webpage and the browser context. The currently active webpage is accessible via the
+global variable `page`. A function `send_message_to_user(text)` is also accessible and can be used to send a
+message to the user, as well as a function `report_infeasible_instructions(reason)` to notify the user when their
+instructions are infeasible."""
+        if with_long_description:
+            description += f"""
+The browser context is in `page.context`, and all open webpages (tabs and popups)
+are in `page.context.pages`. Here is is an example of a valid action:
+```
+frame = page.frame_locator(".result-frame")
+button = frame.get_by_text("Submit")
+button.click()
+```
+Here is another example:
+```
+frame = page.get_by_test_id("a").frame_locator(":scope")
+frame.get_by_test_id("a776").click()
+```
+Note that Playwright's `get_by_test_id()` method is configured to use the `bid` attribute to locate HTML elements,
+instead of the default `data-testid`. Also, Playwright's locators can not traverse iframes, so you have to locate
+parent iframes first in order to locate an element in an iframe. The `bid` attribute contains all the information
+required to recursively locate an element. For example, an element with `bid="ac2"` can be retrieved as follows:
+```
+frame = page.get_by_test_id("a").frame_locator(":scope")
+frame = frame.get_by_test_id("ac").frame_locator(":scope")
+elem = frame.get_by_test_id("ac2")
+```
+"""
+        else:
+            description += f"""\
+"""
+        if with_examples:
+            description += f"""\
+Here are other examples of valid actions:
+```
+page = page.context.new_page()
+page.goto("https://www.wikipedia.org/")
+```
+```
+page.get_by_label("Birth date").fill("2020-02-02")
+page.get_by_role("link", name="Get started").click()
+```
+```
+page.get_by_label('I agree to the terms above').check()
+```
+```
+page.locator('#area').fill('Hello World!')
+```
+```
+page.get_by_role("textbox").press("Control+ArrowRight")
+```
+```
+send_message_to_user("There are 7 items to choose from.")
+```
+```
+report_infeasible_instructions("I cannot follow these instructions because there is no email field in this form.")
+```
+"""
+        return description
+    def example_action(self, abstract: bool) -> str:
+        """
+        Returns an example action as a string.
+        """
+        if abstract:
+            return """\
+One single bloc of Python code. Do not include any explanation, only valid Python code."""
+        else:
+            return """\
+frame = page.get_by_test_id("b").frame_locator(":scope")
+frame = page.get_by_test_id("ba").frame_locator(":scope")
+frame.get_by_test_id("ba2").fill("Hello world!")
+frame.get_by_test_id("ba3").click()
+"""
+    def to_python_code(self, action):
+        """
+        Converts the given code action string to browsergym-compatible playwright code.
+        Args:
+            action: the code action to parse.
+        Returns:
+            Executable playwright code that performs the action in a browsergym environment.
+        """
+        python_code = ""
+        # extract markdown-style code snippets if detected
+        pattern = re.compile(r"```(?:python)?\n(?P<code>[\s\S]*?)```")
+        if pattern.match(action):
+            python_code += "\n".join([match.group("code") for match in pattern.finditer(action)])
+        # otherwise just use the code action as is
+        else:
+            python_code += action
+        # return the produced playwright code
+        return python_code

BrowserGym/browsergym/core/src/browsergym/core/action/utils.py ADDED Viewed

	@@ -0,0 +1,288 @@

+from typing import Literal
+import playwright.sync_api
+def get_elem_by_bid(
+    page: playwright.sync_api.Page, bid: str, scroll_into_view: bool = False
+) -> playwright.sync_api.Locator:
+    """
+    Parse the given bid to sequentially locate every nested frame leading to the bid, then
+    locate the bid element. Bids are expected to take the form "abDb123", which means
+    the element abDb123 is located inside frame abDAb, which is located inside frame abDA,
+    which is located inside frame a, which is located inside the page's main frame.
+    Args:
+        bid: the browsergym id (playwright testid) of the page element.
+        scroll_into_view: try to scroll element into view, unless it is completely visible.
+    Returns:
+        Playwright element.
+        Bounding box of the element.
+    """
+    if not isinstance(bid, str):
+        raise ValueError(f"expected a string, got {repr(bid)}")
+    current_frame = page
+    # dive into each nested frame, to the frame where the element is located
+    i = 0
+    while bid[i:] and not bid[i:].isnumeric():
+        i += 1
+        # allow multi-character frame ids such as aA, bCD etc.
+        while bid[i:] and bid[i].isalpha() and bid[i].isupper():
+            i += 1
+        frame_bid = bid[:i]  # bid of the next frame to select
+        frame_elem = current_frame.get_by_test_id(frame_bid)
+        if not frame_elem.count():
+            raise ValueError(f'Could not find element with bid "{bid}"')
+        if scroll_into_view:
+            frame_elem.scroll_into_view_if_needed(timeout=500)
+        current_frame = frame_elem.frame_locator(":scope")
+    # finally, we should have selected the frame where the target element is
+    elem = current_frame.get_by_test_id(bid)
+    if not elem.count():
+        raise ValueError(f'Could not find element with bid "{bid}"')
+    if scroll_into_view:
+        elem.scroll_into_view_if_needed(timeout=500)
+    return elem
+def highlight_by_box(
+    page: playwright.sync_api.Page, box: dict, color: Literal["blue", "red"] = "blue"
+):
+    """Highlights the target element based on its bounding box attributes."""
+    assert color in ("blue", "red")
+    if box:
+        left, top, width, height = box["x"], box["y"], box["width"], box["height"]
+        page.evaluate(
+            f"""\
+const overlay = document.createElement('div');
+document.body.appendChild(overlay);
+overlay.setAttribute('style', `
+    all: initial;
+    position: fixed;
+    border: 2px solid transparent;  /* Start with transparent border */
+    borderRadius: 10px;  /* Add rounded corners */
+    boxShadow: 0 0 0px {color};  /* Initial boxShadow with 0px spread */
+    left: {left - 2}px;  /* Adjust left position to accommodate initial shadow spread */
+    top: {top - 2}px;  /* Adjust top position likewise */
+    width: {width}px;
+    height: {height}px;
+    z-index: 2147483646; /* Maximum value - 1 */
+    pointerEvents: none; /* Ensure the overlay does not interfere with user interaction */
+`);
+// Animate the boxShadow to create a "wave" effect
+let spread = 0;  // Initial spread radius of the boxShadow
+const waveInterval = setInterval(() => {{
+    spread += 10;  // Increase the spread radius to simulate the wave moving outward
+    overlay.style.boxShadow = `0 0 40px ${{spread}}px {color}`;  // Update boxShadow to new spread radius
+    overlay.style.opacity = 1 - spread / 38;  // Gradually decrease opacity to fade out the wave
+    if (spread >= 38) {{  // Assuming 76px ~ 2cm spread radius
+        clearInterval(waveInterval);  // Stop the animation once the spread radius reaches 2cm
+        document.body.removeChild(overlay);  // Remove the overlay from the document
+    }}
+}}, 200);  // Adjust the interval as needed to control the speed of the wave animation
+"""
+        )
+        # Wait a bit to let users see the highlight
+        page.wait_for_timeout(1000)  # Adjust delay as needed
+def smooth_move_visual_cursor_to(
+    page: playwright.sync_api.Page, x: float, y: float, speed: float = 400
+):
+    """
+    Smoothly moves the visual cursor to a specific point, with constant
+    movement speed.
+    The cursor is a page element created on first use (initially at the viewport center)
+    and kept in `window.browsergym_visual_cursor`. The animation runs inside the page on a
+    timer; this function blocks for the nominal movement time (distance / speed) before
+    returning. The cursor stays still for about one more second in the page, then is
+    detached from the document once no other pending movement owns it.
+    Args:
+        page: the page in which the visual cursor is displayed.
+        x: target location X coordinate (in viewport pixels)
+        y: target location Y coordinate (in viewport pixels)
+        speed: cursor speed (in pixels per second)
+    """
+    # the script starts the animation and returns the nominal movement time (milliseconds)
+    movement_time = page.evaluate(
+        """\
+    ([targetX, targetY, speed]) => {
+        // create cursor if needed
+        if (!("browsergym_visual_cursor" in window)) {
+            if (window.trustedTypes && window.trustedTypes.createPolicy) {
+                window.trustedTypes.createPolicy('default', {
+                    createHTML: (string, sink) => string
+                });
+            }
+            let cursor = document.createElement('div');
+            cursor.setAttribute('id', 'browsergym-visual-cursor');
+            cursor.innerHTML = `
+                <svg width="50px" height="50px" viewBox="213 106 713 706" fill="none" xmlns="http://www.w3.org/2000/svg">
+                <path d="M213.333 106.667L426.667 853.333 512 512 853.333 426.667 213.333 106.667z" fill="blue"/>
+                </svg>
+`;
+            cursor.setAttribute('style', `
+                all: initial;
+                position: fixed;
+                opacity: 0.7; /* Slightly transparent */
+                z-index: 2147483647; /* Maximum value */
+                pointer-events: none; /* Ensures the SVG doesn't interfere with page interactions */
+            `);
+            // Calculate center position within the viewport
+            const centerX = window.innerWidth / 2;
+            const centerY = window.innerHeight / 2;
+            cursor.style.left = `${centerX}px`;
+            cursor.style.top = `${centerY}px`;
+            // save cursor element
+            window.browsergym_visual_cursor = cursor;
+            window.browsergym_visual_cursor_n_owners = 0;
+        }
+        // recover cursor
+        let cursor = window.browsergym_visual_cursor;
+        // attach cursor to document
+        document.body.appendChild(cursor);
+        window.browsergym_visual_cursor_n_owners += 1;
+        x = parseFloat(cursor.style.left);
+        y = parseFloat(cursor.style.top);
+        dx = targetX - x;
+        dy = targetY - y;
+        dist = Math.hypot(dx, dy);
+        movement_time = (dist / speed) * 1000;  // seconds to milliseconds
+        still_wait_time = 1000;
+        // Adjust steps based on distance to keep movement speed consistent
+        // 1 step per 10 pixels of distance, adjust as needed
+        steps = Math.max(1, Math.trunc(dist / 10));
+        step_dx = dx / steps;
+        step_dy = dy / steps;
+        step_dist = dist / steps;
+        step_wait_time = Math.max(10, movement_time / steps);
+        let step = 0;
+        let time_still = 0;
+        const cursorInterval = setInterval(() => {
+            // move cursor
+            if (step < steps) {
+                x += step_dx;
+                y += step_dy;
+                cursor.style.left = `${x}px`;
+                cursor.style.top = `${y}px`;
+            }
+            // still cursor (wait a bit)
+            else if (time_still < still_wait_time) {
+                time_still += step_wait_time;
+            }
+            // stop and detach cursor
+            else {
+                clearInterval(cursorInterval);
+                window.browsergym_visual_cursor_n_owners -= 1;
+                if (window.browsergym_visual_cursor_n_owners <= 0) {
+                    document.body.removeChild(cursor);
+                }
+            }
+            step += 1;
+        }, step_wait_time);
+        return movement_time;
+    }""",
+        [x, y, speed],
+    )
+    # block until the cursor is expected to have reached its target
+    # NOTE(review): each step waits at least 10ms in the page, so the in-page animation may
+    # last longer than this nominal time when speed is high - confirm this is acceptable
+    page.wait_for_timeout(movement_time)
+def check_for_overlay(
+    page: playwright.sync_api.Page, bid: str, element: playwright.sync_api.ElementHandle, box: dict
+):
+    if not element:
+        return False
+    visibility = element.get_attribute("browsergym_visibility_ratio")
+    if visibility is not None:
+        return float(visibility) >= 0.5
+    """Checks if a given element is the topmost element at its center position by default.
+    If check_corners is True, it checks if any of the corners is visible."""
+    if box:
+        # corners
+        points_to_check = [
+            (box["x"], box["y"]),
+            (box["x"] + box["width"], box["y"]),
+            (box["x"], box["y"] + box["height"]),
+            (box["x"] + box["width"], box["y"] + box["height"]),
+        ]
+        for x, y in points_to_check:
+            # Execute JavaScript to find the topmost element at the point.
+            top_element = page.evaluate(
+                f"""() => {{
+                const el = document.elementFromPoint({x}, {y});
+                return el ? el.outerHTML : '';
+            }}"""
+            )
+            # Check if the topmost element is the element we're interested in.
+            if top_element and bid in top_element:
+                return True
+    return False
+def add_demo_mode_effects(
+    page: playwright.sync_api.Page,
+    elem: playwright.sync_api.ElementHandle,
+    bid: str,
+    demo_mode: Literal["off", "default", "all_blue", "only_visible_elements"],
+    move_cursor: bool = True,
+    highlight_box: bool = True,
+):
+    if demo_mode == "off":
+        return
+    """Adds visual effects to the target element"""
+    box = elem.bounding_box()
+    # box = extract_bounds_cdp(page, bid)
+    if box:
+        center_x, center_y = box["x"] + box["width"] / 2, box["y"] + box["height"] / 2
+        is_top_element = check_for_overlay(page, bid, elem, box)
+        if demo_mode == "only_visible_elements":
+            if not is_top_element:
+                return
+            else:
+                color = "blue"
+        elif demo_mode == "default":
+            if is_top_element:
+                color = "blue"
+            else:
+                color = "red"
+        elif demo_mode == "all_blue":
+            color = "blue"
+        if move_cursor:
+            smooth_move_visual_cursor_to(page, center_x, center_y)
+        if highlight_box:
+            highlight_by_box(page, box, color=color)
+def call_fun(fun: callable, retry_with_force: bool):
+    try:
+        fun(force=False)
+    except playwright.sync_api.TimeoutError as e:
+        if retry_with_force:
+            fun(force=True)
+        else:
+            raise e

BrowserGym/browsergym/core/src/browsergym/core/chat.py ADDED Viewed

	@@ -0,0 +1,95 @@

+import base64
+from pathlib import Path
+from typing import Literal
+import logging
+import playwright.sync_api
+import re
+import time
+from importlib import resources
+from . import _get_global_playwright, chat_files
+CHATBOX_DIR = resources.files(chat_files)
+logger = logging.getLogger(__name__)
+class Chat:
+    def __init__(
+        self, headless: bool, chat_size=(500, 800), record_video_dir=None, modern=True
+    ) -> None:
+        self.messages = []
+        # create a new browser, browser context and page for the chat
+        pw: playwright.sync_api.Playwright = _get_global_playwright()
+        self.browser = pw.chromium.launch(
+            headless=headless, args=[f"--window-size={chat_size[0]},{chat_size[1]}"]
+        )
+        self.context = self.browser.new_context(
+            no_viewport=True,
+            record_video_dir=Path(record_video_dir) / "chat_video" if record_video_dir else None,
+            record_video_size=dict(width=chat_size[0], height=chat_size[1]),
+        )
+        self.page = self.context.new_page()
+        self.recording_start_time = time.time() if record_video_dir else None
+        # setup the chat page
+        self.page.expose_function(
+            "send_user_message", lambda msg: self._js_user_message_received_callback(msg=msg)
+        )
+        if modern:
+            self.page.set_content(get_chatbox_modern(CHATBOX_DIR))
+        else:
+            self.page.set_content(get_chatbox_classic(CHATBOX_DIR))
+    def _js_user_message_received_callback(self, msg: str):
+        """Callback function for when a user message is received in the chatbox"""
+        utc_time = time.time()
+        self.messages.append({"role": "user", "timestamp": utc_time, "message": msg})
+        # returning a list as JS doesnt like tuples
+        return ["user", time.strftime("%H:%M", time.localtime(utc_time)), msg]
+    def add_message(
+        self, role: Literal["user", "user_image", "assistant", "info", "infeasible"], msg: str
+    ):
+        """Add a message to the chatbox and update the page accordingly."""
+        utc_time = time.time()
+        if role not in ("user", "user_image", "assistant", "info", "infeasible"):
+            raise ValueError(f"Invalid role: {role}")
+        if role in ("user", "user_image", "assistant", "infeasible"):
+            self.messages.append({"role": role, "timestamp": utc_time, "message": msg})
+        timestamp = time.strftime("%H:%M:%S", time.localtime(utc_time))
+        self.page.evaluate(f"addChatMessage({repr(role)}, {repr(timestamp)}, {repr(msg)});")
+    def wait_for_user_message(self):
+        logger.info("Waiting for message from user...")
+        # reset flag
+        self.page.evaluate("USER_MESSAGE_RECEIVED = false;")
+        # wait for flag to be raised
+        self.page.wait_for_function("USER_MESSAGE_RECEIVED", polling=100, timeout=0)
+        logger.info("Message received.")
+    def close(self):
+        self.context.close()
+        self.browser.close()
+def get_chatbox_modern(chatbox_dir) -> str:
+    with open(chatbox_dir / "chatbox_modern.html", "r") as file:
+        chatbox_html = file.read()
+    return chatbox_html
+def get_chatbox_classic(chatbox_dir) -> str:
+    with open(chatbox_dir / "chatbox.html", "r") as file:
+        chatbox_html = file.read()
+    with open(chatbox_dir / "assistant.png", "rb") as f:
+        image_base64 = base64.b64encode(f.read()).decode("utf-8")
+    assistant_image_url = f"data:image/png;base64,{image_base64}"
+    chatbox_html = re.sub("<ASSISTANT_IMAGE_URL>", assistant_image_url, chatbox_html)
+    return chatbox_html

BrowserGym/browsergym/core/src/browsergym/core/chat_files/chatbox.html ADDED Viewed

	@@ -0,0 +1,243 @@

+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>UI Assistant Chat</title>
+    <style>
+        .chat-container {
+            display: flex;
+            flex-flow: column;
+            position: fixed;
+            bottom: 0;
+            right: 0;
+            height: 100%;
+            width: 100%;
+            border: 1px solid black;
+            background-color: white;
+            padding: 0;
+            overflow: hidden;
+            box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
+            font-family: 'Source Sans Pro', Arial, Helvetica, sans-serif;
+        }
+        .chat-header {
+            background-color: #032D42;
+            color: white;
+            padding: 5px;
+            padding-left: 15px;
+            text-align: center;
+            flex: 0 1 auto;
+        }
+        .chat-body {
+            padding: 10px;
+            overflow-y: auto;
+            display: flex;
+            flex-direction: column;
+            flex: 1 1 auto;
+        }
+        .chat-debug {
+            padding: 10px;
+            max-height: 30%;
+            overflow-y: auto;
+            display: flex;
+            flex-direction: column;
+            flex: 0 0 auto;
+        }
+        /* Bottom row holding the message form */
+        .chat-input-area {
+            display: flex;
+            flex-flow: row;
+            margin-top: 5px;
+            padding: 10px;
+            border-top: 1px solid #ddd;
+            flex: 0 1 50px;
+        }
+        .chat-input-area form {
+            display: flex;
+            width: 100%;
+            height: 100%;
+        }
+        .input-box {
+            padding: 5px;
+            margin-right: 10px;
+            border-radius: 5px;
+            border: 1px solid #ccc;
+            width: 100%;
+        }
+        .submit-button {
+            padding: 5px 10px;
+            border-radius: 5px;
+            background-color: #4CAF50;
+            color: white;
+            border: none;
+            align-self: center;
+        }
+        .message {
+            display: flex;
+            align-items: center;
+            margin: 0px;
+            padding: 0px;
+        }
+        .message p {
+            padding: 10px;
+            /* Added padding inside the bubble */
+            border-radius: 15px;
+            flex-grow: 1;
+            /* A non-zero length needs a unit, a bare "10" is dropped as invalid */
+            margin-top: 10px;
+            margin-bottom: 0;
+        }
+        .chat-debug .message p {
+            padding: 0;
+            border-radius: 0;
+            flex-grow: 1;
+            margin-top: 0;
+            margin-bottom: 0;
+        }
+        .user-message {
+            background-color: #d1f4d1;
+        }
+        .assistant-message {
+            background-color: #e0e0e0;
+        }
+        .info-message {
+            background-color: #f0f0f0;
+            color: #707070;
+            font-size: 13px;
+        }
+        .assistant-image {
+            margin: 0px;
+            padding: 10px;
+            width: 40px;
+        }
+    </style>
+</head>
+<body>
+    <div class="chat-container">
+        <div class="chat-header">
+            <h2>BrowserGym</h2>
+        </div>
+        <div class="chat-body" id="chatBody"></div>
+        <div class="chat-debug" id="chatDebug"></div>
+        <div class="chat-input-area">
+            <form id="chatForm">
+                <textarea class="input-box" rows="2" id="inputBox"></textarea>
+                <input type="submit" class="submit-button" value="Send">
+            </form>
+        </div>
+    </div>
+    <script>
+        const assistant_image_data = "<ASSISTANT_IMAGE_URL>";
+        var USER_MESSAGE_RECEIVED = false;
+        function escapeHtml(unsafe) {
+            return unsafe
+                .replace(/&/g, "&amp;")
+                .replace(/</g, "&lt;")
+                .replace(/>/g, "&gt;")
+                .replace(/"/g, "&quot;")
+                .replace(/'/g, "&#039;");
+        }
+        function addChatMessage(role, msg) {
+            const chatBody = document.getElementById('chatBody');
+            const chatDebug = document.getElementById('chatDebug');
+            const msgContainer = document.createElement('div');
+            msgContainer.className = 'message';
+            const text = document.createElement('p');
+            text.innerHTML = escapeHtml(msg);
+            const assistant_img = document.createElement('img');
+            assistant_img.src = assistant_image_data;
+            assistant_img.alt = 'Assistant';
+            assistant_img.className = 'assistant-image';
+            switch (role) {
+                case "user":
+                    text.className = 'user-message';
+                    msgContainer.appendChild(text);
+                    chatBody.appendChild(msgContainer);
+                    break;
+                case "assistant":
+                    text.className = 'assistant-message';
+                    msgContainer.appendChild(assistant_img); // Add the image to the message container
+                    msgContainer.appendChild(text);
+                    chatBody.appendChild(msgContainer);
+                    break;
+                case "info":
+                    text.className = 'info-message';
+                    text.innerHTML = msg;
+                    msgContainer.appendChild(text);
+                    // hide previous debug messages
+                    for (const msg of chatDebug.children) {
+                        msg.style.display = 'none';
+                    }
+                    chatDebug.appendChild(msgContainer);
+                    break;
+                default:
+                    throw new TypeError(`Illegal role "${role}".`);
+            }
+            chatBody.scrollTop = chatBody.scrollHeight;
+            if (role === "user") {
+                USER_MESSAGE_RECEIVED = true;
+            }
+        }
+        if (typeof send_user_message !== 'function') {
+            function send_user_message(msg) {
+                // This will be overloaded by playwright
+            }
+        }
+        const inputBox = document.getElementById('inputBox');
+        function send_msg(msg) {
+            if (msg.trim()) {
+                send_user_message(msg);
+                addChatMessage('user', msg);
+                inputBox.value = '';
+            }
+        }
+        inputBox.onkeypress = (e) => {
+            if (e.key === 'Enter' && !e.shiftKey) {
+                e.preventDefault();
+                send_msg(inputBox.value);
+            }
+        };
+        document.getElementById('chatForm').onsubmit = function (event) {
+            event.preventDefault();
+            send_msg(inputBox.value);
+            return false;
+        }
+    </script>
+</body>
+</html>

BrowserGym/browsergym/core/src/browsergym/core/chat_files/chatbox_modern.html ADDED Viewed

	@@ -0,0 +1,379 @@

+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>UI Assistant Chat</title>
+    <style>
+        body {
+            font-family: 'Gilroy', sans-serif;
+        }
+        textarea {
+            font-family: 'Gilroy', sans-serif;
+        }
+        .chat-container {
+            position: fixed;
+            bottom: 0;
+            right: 0;
+            height: 100%;
+            width: 100%;
+            border: 1px solid black;
+            background-color: black;
+            padding: 0;
+            overflow: hidden;
+            box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
+            display: flex;
+            flex-direction: column;
+            min-height: 0;
+        }
+        .gradient {
+            height: 100%;
+            width: 100%;
+            position: absolute;
+            top: 0;
+            background: linear-gradient(180deg, #002239 0%, rgba(0, 34, 57, 0) 100%);
+            z-index: -1;
+        }
+        #chatui-generating-indicator {
+            position: absolute;
+            height: 100vh;
+            width: 8px;
+        }
+        #chatui-generating-indicator-gradient {
+            height: 100%;
+            width: 100%;
+            animation: 1.5s ease alternate infinite thinking;
+            background: linear-gradient(0deg, #032D42 0%, #50CED8 100%);
+            background-size: 400% 400%;
+        }
+        @keyframes thinking {
+            0% {
+                background-position: 0% 0%;
+            }
+            100% {
+                background-position: 0% 100%;
+            }
+        }
+        .spacer {
+            flex: 1;
+        }
+        .chat-wrapper {
+            padding: 0px 48px 48px 48px;
+            display: flex;
+            flex-flow: column;
+            flex: 1;
+            min-height: 0;
+        }
+        .chat-body {
+            padding: 10px;
+            overflow-y: auto;
+            display: flex;
+            flex-direction: column;
+            flex: 1 1 auto;
+        }
+        /* Hide scrollbar for Chrome, Safari and Opera */
+        .chat-body::-webkit-scrollbar {
+            display: none;
+        }
+        .chat-debug {
+            padding: 10px;
+            max-height: 45%;
+            overflow-y: auto;
+            display: flex;
+            flex-direction: column;
+            flex: 0 0 auto;
+        }
+        /* Hide scrollbar for Chrome, Safari and Opera */
+        .chat-debug::-webkit-scrollbar {
+            display: none;
+        }
+        .chat-input-area {
+            display: flex;
+            flex-flow: row;
+            margin-top: 48px;
+            padding: 10px;
+            padding-left: 18px;
+            flex: 0 1 50px;
+            background-color: #022435;
+            border-radius: 12px;
+        }
+        .chat-input-area form {
+            display: flex;
+            width: 100%;
+            height: 100%;
+        }
+        .input-box {
+            padding: 5px;
+            margin-right: 10px;
+            border-radius: 5px;
+            width: 100%;
+            background-color: transparent;
+            color: white;
+            border: none;
+            outline: none;
+            resize: none;
+            font-size: 18px;
+            min-height: 100px;
+            /* Minimum starting height */
+            max-height: 300px;
+            /* Maximum height */
+            overflow-y: auto;
+            /* Allows scrolling within the input box if content exceeds max height */
+            height: auto;
+            /* Automatically adjust height, but limited by other CSS properties */
+        }
+        /* Hide scrollbar for Chrome, Safari and Opera */
+        .input-box::-webkit-scrollbar {
+            display: none;
+        }
+        .submit-button {
+            margin-left: 10px;
+            background-color: #022435;
+            color: #9AABB3;
+            font-weight: bold;
+            cursor: pointer;
+            background-image: url('data:image/svg+xml,<svg width="14" height="13" viewBox="0 0 14 13" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M13.7038 6.04336L0.709549 0.0460291C0.528509 -0.0375275 0.315131 -0.00553368 0.166559 0.127445C0.0179865 0.260423 -0.0373753 0.468963 0.0256778 0.658123L1.97297 6.50001L0.0256778 12.3419C-0.0373753 12.5311 0.0179865 12.7396 0.166559 12.8726C0.315131 13.0056 0.528509 13.0375 0.709549 12.954L13.7038 6.95666C13.8817 6.87718 14.0001 6.69465 14.0001 6.50001C14.0001 6.3048 13.8819 6.12293 13.7038 6.04336ZM2.8604 6.00001L1.33983 1.4383L11.2235 6.00001H2.8604ZM11.2235 7.00001L1.33983 11.5617L2.8604 7.00001H11.2235Z" fill="%234F6C7B"/></svg>');
+            background-repeat: no-repeat;
+            background-position: center;
+            width: 60px;
+            background-repeat: no-repeat;
+            background-position: center;
+            background-size: 20px 20px;
+            border: none;
+            border-radius: 4px;
+        }
+        .submit-button:hover {
+            background-color: #03334a;
+        }
+        .message {
+            display: flex;
+            align-items: center;
+            margin: 0px;
+            padding: 0px;
+            margin-bottom: 10px;
+        }
+        .message p {
+            margin-bottom: 0;
+        }
+        .user-message {
+            background-color: transparent;
+            color: white;
+            font-size: 20px;
+        }
+        .user-message::before {
+            content: var(--before-content, "You");
+            color: #09A2BF;
+            display: block;
+            margin-bottom: 4px;
+            font-size: 10px;
+            text-transform: uppercase;
+        }
+        .assistant-message {
+            background-color: transparent;
+            color: #7ACA87;
+            font-size: 20px;
+        }
+        .assistant-message::before {
+            content: var(--before-content, "Bot");
+            color: #29A93E;
+            display: block;
+            margin-bottom: 4px;
+            font-size: 10px;
+            text-transform: uppercase;
+        }
+        .info-message {
+            color: #afadad;
+            font-size: 22px;
+            background: #04334b;
+            padding: 10px;
+            border-radius: 4px;
+            width: 100%;
+        }
+        .assistant-image {
+            margin: 0px;
+            padding: 10px;
+            width: 40px;
+        }
+    </style>
+</head>
+<body>
+    <div class="chat-container">
+        <div class="chat-debug" id="chatDebug"></div>
+        <div class="gradient">
+        </div>
+        <div id="chatui-generating-indicator" style="display: none;">
+            <div id="chatui-generating-indicator-gradient"></div>
+        </div>
+        <div class="chat-wrapper">
+            <div class="chat-body" id="chatBody">
+                <div class="spacer"></div>
+            </div>
+            <div class="chat-input-area">
+                <form id="chatForm">
+                    <textarea class="input-box" id="inputBox" placeholder="How can I help you?"
+                        title="Ask any question or type exit to quit."></textarea>
+                    <input type="submit" class="submit-button" value="">
+                    </input>
+                </form>
+            </div>
+        </div>
+    </div>
+    <script>
+        const assistant_image_data = "<ASSISTANT_IMAGE_URL>";
+        var USER_MESSAGE_RECEIVED = false;
+        function escapeHtml(unsafe) {
+            return unsafe
+                .replace(/&/g, "&amp;")
+                .replace(/</g, "&lt;")
+                .replace(/>/g, "&gt;")
+                .replace(/"/g, "&quot;")
+                .replace(/'/g, "&#039;");
+        }
+        function addHtmlLineBreaks(text) {
+            return text.replace(/\n/g, "<br>");
+        }
+        function toHtmlImage(url) {
+            return `<img src="${url}">`
+        }
+        function addChatMessage(role, timeString, msg) {
+            const chatBody = document.getElementById('chatBody');
+            const chatDebug = document.getElementById('chatDebug');
+            const msgContainer = document.createElement('div');
+            msgContainer.className = 'message';
+            const text = document.createElement('div');
+            // const assistant_img = document.createElement('img');
+            // assistant_img.src = assistant_image_data;
+            // assistant_img.alt = 'Assistant';
+            // assistant_img.className = 'assistant-image';
+            switch (role) {
+                case "user":
+                    text.className = 'user-message';
+                    text.innerHTML = addHtmlLineBreaks(msg);
+                    text.style.setProperty('--before-content', `"${timeString} - You"`);
+                    msgContainer.appendChild(text);
+                    chatBody.appendChild(msgContainer);
+                    break;
+                case "user_image":
+                    text.className = 'user-message';
+                    text.innerHTML = toHtmlImage(msg);
+                    text.style.setProperty('--before-content', `"${timeString} - You"`);
+                    msgContainer.appendChild(text);
+                    chatBody.appendChild(msgContainer);
+                    break;
+                case "assistant":
+                    text.className = 'assistant-message';
+                    text.innerHTML = addHtmlLineBreaks(escapeHtml(msg));
+                    text.style.setProperty('--before-content', `"${timeString} - Bot"`);
+                    // msgContainer.appendChild(assistant_img); // Add the image to the message container
+                    msgContainer.appendChild(text);
+                    chatBody.appendChild(msgContainer);
+                    break;
+                case "infeasible":
+                    text.className = 'assistant-message';
+                    text.innerHTML = addHtmlLineBreaks(escapeHtml(msg));
+                    text.style.setProperty('--before-content', `"${timeString} - Bot (abort)"`);
+                    msgContainer.appendChild(text);
+                    chatBody.appendChild(msgContainer);
+                    break;
+                case "info":
+                    text.className = 'info-message';
+                    text.innerHTML = addHtmlLineBreaks(escapeHtml(msg));
+                    msgContainer.appendChild(text);
+                    // hide previous debug messages
+                    for (const msg of chatDebug.children) {
+                        msg.style.display = 'none';
+                    }
+                    chatDebug.appendChild(msgContainer);
+                    break;
+                default:
+                    throw new TypeError(`Illegal role "${role}".`);
+            }
+            chatBody.scrollTop = chatBody.scrollHeight;
+            if (role === "user") {
+                USER_MESSAGE_RECEIVED = true;
+            }
+        }
+        if (typeof send_user_message !== 'function') {
+            function send_user_message(msg) {
+                // This will be overloaded by playwright
+            }
+        }
+        const inputBox = document.getElementById('inputBox');
+        async function send_msg(msg) {
+            if (msg.trim()) {
+                const strings = await send_user_message(msg);
+                addChatMessage(strings[0], strings[1], strings[2]);
+                inputBox.value = '';
+            }
+        }
+        inputBox.onkeypress = (e) => {
+            if (e.key === 'Enter' && !e.shiftKey) {
+                e.preventDefault();
+                send_msg(inputBox.value);
+            }
+        };
+        document.getElementById('chatForm').onsubmit = function (event) {
+            event.preventDefault();
+            send_msg(inputBox.value);
+            return false;
+        }
+        // addChatMessage('info', 'Hello World');
+        // addChatMessage('assistant', 'Hello assistant');
+        // addChatMessage('user', 'Hello user');
+    </script>
+</body>
+</html>

BrowserGym/browsergym/core/src/browsergym/core/chat_files/img/send.svg ADDED Viewed

BrowserGym/browsergym/core/src/browsergym/core/constants.py ADDED Viewed

	@@ -0,0 +1,5 @@

+BROWSERGYM_ID_ATTRIBUTE = "bid"  # Playwright's default is "data-testid"
+BROWSERGYM_VISIBILITY_ATTRIBUTE = "browsergym_visibility_ratio"  # same attribute name that frame_mark_elements.js sets to the rounded intersection ratio
+BROWSERGYM_SETOFMARKS_ATTRIBUTE = "browsergym_set_of_marks"  # presumably the attribute flagging set-of-marks elements; TODO confirm against the marking script
+EXTRACT_OBS_MAX_TRIES = 5  # number of attempts made by BrowserEnv._get_obs() when extracting the observation

BrowserGym/browsergym/core/src/browsergym/core/env.py ADDED Viewed

	@@ -0,0 +1,625 @@

+import copy
+import logging
+import re
+import time
+from abc import ABC
+from pathlib import Path
+from typing import Any, Callable, Literal, Optional
+import gymnasium as gym
+import numpy as np
+import playwright.sync_api
+from . import _get_global_playwright
+from .action.base import execute_python_code
+from .action.highlevel import HighLevelActionSet
+from .chat import Chat
+from .constants import BROWSERGYM_ID_ATTRIBUTE, EXTRACT_OBS_MAX_TRIES
+from .observation import (
+    MarkingError,
+    _post_extract,
+    _pre_extract,
+    extract_dom_extra_properties,
+    extract_dom_snapshot,
+    extract_focused_element_bid,
+    extract_merged_axtree,
+    extract_screenshot,
+)
+from .spaces import AnyBox, AnyDict, Float, Unicode
+from .task import AbstractBrowserTask
+logger = logging.getLogger(__name__)
+def _try_to_extract_legacy_goal(goal: list):
+    legacy_goal_strings = []
+    for message in goal:
+        if message["type"] == "text":
+            legacy_goal_strings.append(message["text"])
+        else:
+            logger.debug(
+                f"Message type {repr(message['type'])} present in the goal, cannot be converted to legacy text-only format."
+            )
+            legacy_goal_strings.append(
+                'WARNING: This goal cannot be converted to a text-only goal format. Use the new goal format instead ("goal_object" field). Any agent reading this should abort immediately.'
+            )
+            break
+    legacy_goal = "\n".join(legacy_goal_strings)
+    return legacy_goal
+class BrowserEnv(gym.Env, ABC):
+    """The main BrowserGym class, which encapsulates instruction-following Web browsing into a Gymnasium environment."""
+    # gym metadata
+    metadata = {"render_modes": None}
+    def __init__(
+        self,
+        # task-related arguments
+        task_entrypoint: type[AbstractBrowserTask],
+        task_kwargs: dict = {},
+        viewport: Optional[dict] = None,  # will override the task's viewport
+        slow_mo: Optional[int] = None,  # will override the task's slow_mo
+        timeout: Optional[int] = None,  # will override the task's timeout
+        locale: Optional[str] = None,  # will override the task's locale
+        timezone_id: Optional[str] = None,  # will override the task's timezone_id
+        tags_to_mark: Literal["all", "standard_html"] = "standard_html",
+        # interactive / debugging arguments
+        headless: bool = True,
+        wait_for_user_message: bool = False,
+        terminate_on_infeasible: bool = True,
+        resizeable_window: bool = False,
+        record_video_dir: Optional[str] = None,
+        pw_chromium_kwargs: dict = {},
+        pw_context_kwargs: dict = {},
+        # agent-related arguments
+        action_mapping: Optional[callable] = HighLevelActionSet().to_python_code,
+    ):
+        """
+        Instantiate a ready to use BrowserEnv gym environment.
+        Args:
+            task_entrypoint: a callable that returns a new task object from a seed. Used for creating a new task during `reset()`.
+            task_kwargs: additional arguments passed to `task_entrypoint`.
+            viewport: desired viewport size. This will override the value defined by the task, which might change its behaviour and difficulty. Should only be set for debugging/testing.
+            slow_mo: desired slow_mo value for Playwright. This will override the value defined by the task, which might change its behaviour and difficulty. Should only be set for debugging/testing.
+            timeout: desired timeout value for Playwright. This will override the value defined by the task, which might change its behaviour and difficulty. Should only be set for debugging/testing.
+            locale: desired user locale for Playwright, for example en-GB, de-DE, etc. This will override the value defined by the task, which might change its behaviour and difficulty. Should only be set for debugging/testing.
+            timezone_id. desired timezone for Playwright, for example "Pacific/Tahiti". This will override the value defined by the task, which might change its behaviour and difficulty. Should only be set for debugging/testing.
+            tags_to_mark: which HTML tags should be marked by BrowserGym and receive a bid. Value "all" will mark every element in the page, while "standard_html" (default) will only mark standard html tags.
+            headless: whether the browser should run in headless mode or not. This will affect the viewport size, which might change the behaviour and difficulty of the task. Headless mode should only be disabled for debugging/testing.
+            wait_for_user_message: whether the environment should pause and wait for a user message in the chat after a new message is sent by the agent. Useful for running agents in interactive mode.
+            resizeable_window: whether the browser window should be resizeable or not. This will affect the viewport size, which might change the behaviour and difficulty of the task. Should only be set for debugging/testing.
+            record_video_dir: if set, indicates a directory to which viewport videos will be recorded.
+            pw_chromium_kwargs: extra parameters for the playwright Browser. Should only be used for debugging/testing.
+            pw_context_kwargs: extra parameters for the playwright BrowserContext. Should only be used for debugging/testing.
+            action_mapping: if set, the environment will use this function to map every received action to executable Python code.
+        """
+        super().__init__()
+        self.task_entrypoint = task_entrypoint
+        self.task_kwargs = dict(**task_kwargs)
+        self.viewport = viewport
+        self.slow_mo = slow_mo
+        self.timeout = timeout
+        self.locale = locale
+        self.timezone_id = timezone_id
+        self.tags_to_mark = tags_to_mark
+        self.headless = headless
+        self.wait_for_user_message = wait_for_user_message
+        self.terminate_on_infeasible = terminate_on_infeasible
+        self.resizeable_window = resizeable_window
+        self.record_video_dir = record_video_dir
+        self.pw_chromium_kwargs = pw_chromium_kwargs
+        self.pw_context_kwargs = pw_context_kwargs
+        self.action_mapping = action_mapping
+        # check argument values
+        assert tags_to_mark in ("all", "standard_html")
+        # task
+        self.task = None
+        # playwright
+        self.browser: playwright.sync_api.Browser = None
+        self.context: playwright.sync_api.BrowserContext = None
+        self.page: playwright.sync_api.Page = None
+        self.page_history: dict = {}
+        # chat
+        self.chat: Chat = None
+        # observation space
+        self.observation_space = gym.spaces.Dict(
+            {
+                "chat_messages": gym.spaces.Sequence(
+                    gym.spaces.Dict(
+                        {
+                            "role": Unicode(),
+                            "timestamp": Float(),
+                            "message": Unicode(),
+                        }
+                    )
+                ),
+                "goal": Unicode(),
+                "goal_object": gym.spaces.Sequence(AnyDict()),
+                "open_pages_urls": gym.spaces.Sequence(Unicode()),
+                "open_pages_titles": gym.spaces.Sequence(Unicode()),
+                "active_page_index": gym.spaces.Box(
+                    low=0, high=255, dtype=int
+                ),  # TODO: change to an Integer (breaking change for users)
+                "url": Unicode(),
+                "screenshot": AnyBox(
+                    low=0,
+                    high=255,
+                    shape=(-1, -1, 3),
+                    dtype=np.uint8,
+                ),  # swapped axes (height, width, RGB)
+                "dom_object": AnyDict(),
+                "axtree_object": AnyDict(),
+                "extra_element_properties": AnyDict(),
+                "focused_element_bid": Unicode(),
+                "last_action": Unicode(),
+                "last_action_error": Unicode(),
+                "elapsed_time": gym.spaces.Box(
+                    low=0, high=np.inf, dtype=float
+                ),  # TODO: change to a Float (breaking change for users)
+            }
+        )
+        # action space
+        self.action_space = Unicode()
+    def close(self):
+        # stop the task
+        if self.task:
+            self.task.teardown()
+            self.task = None
+        # close the chat
+        if self.chat:
+            self.chat.close()
+            self.chat = None
+        # close the browser context
+        if self.context:
+            self.context.close()
+            self.context = None
+        # close the browser
+        if self.browser:
+            self.browser.close()
+            self.browser = None
+    def reset(self, seed=None, *args, **kwargs):
+        super().reset(seed=seed, *args, **kwargs)
+        self.np_random = None  # make sure all randomness is handled by the task
+        if self.task:
+            self.task.teardown()
+            self.context.close()
+            self.chat.close()
+            self.browser.close()
+        # create a new task
+        self.task = self.task_entrypoint(seed=seed, **self.task_kwargs)
+        def override_property(task, env, property):
+            """Extract property value from env if not None, otherwise from task."""
+            env_value = getattr(env, property)
+            task_value = getattr(task, property)
+            if env_value is None:
+                return task_value
+            else:
+                if task_value is not None:
+                    logger.warning(
+                        f"Overriding the task's {property} parameter ({repr(task_value)} => {repr(env_value)}). This might change the task's behaviour and difficulty."
+                    )
+                return env_value
+        # fetch task's desired parameters for browser setup
+        viewport = override_property(self.task, self, "viewport")
+        slow_mo = override_property(self.task, self, "slow_mo")
+        timeout = override_property(self.task, self, "timeout")
+        locale = override_property(self.task, self, "locale")
+        timezone_id = override_property(self.task, self, "timezone_id")
+        # use the global Playwright instance
+        pw: playwright.sync_api.Playwright = _get_global_playwright()
+        # important: change playwright's test id attribute from "data-testid" to "bid"
+        pw.selectors.set_test_id_attribute(BROWSERGYM_ID_ATTRIBUTE)
+        # create a new browser
+        self.browser = pw.chromium.launch(
+            headless=self.headless,
+            slow_mo=slow_mo,
+            args=(
+                [f"--window-size={viewport['width']},{viewport['height']}"]
+                if self.resizeable_window
+                else None
+            ),
+            # will raise an Exception if above args are overriden
+            **self.pw_chromium_kwargs,
+        )
+        # create a new browser context for pages
+        self.context = self.browser.new_context(
+            no_viewport=True if self.resizeable_window else None,
+            viewport=viewport if not self.resizeable_window else None,
+            record_video_dir=(
+                Path(self.record_video_dir) / "task_video" if self.record_video_dir else None
+            ),
+            record_video_size=viewport,
+            locale=locale,
+            timezone_id=timezone_id,
+            # will raise an Exception if above args are overriden
+            **self.pw_context_kwargs,
+        )
+        # set default timeout
+        self.context.set_default_timeout(timeout)
+        # hack: keep track of the active page with a javascript callback
+        # there is no concept of active page in playwright
+        # https://github.com/microsoft/playwright/issues/2603
+        self.context.expose_binding(
+            "browsergym_page_activated", lambda source: self._activate_page_from_js(source["page"])
+        )
+        self.context.add_init_script(
+            r"""
+window.browsergym_page_activated();
+window.addEventListener("focus", () => {window.browsergym_page_activated();}, {capture: true});
+window.addEventListener("focusin", () => {window.browsergym_page_activated();}, {capture: true});
+window.addEventListener("load", () => {window.browsergym_page_activated();}, {capture: true});
+window.addEventListener("pageshow", () => {window.browsergym_page_activated();}, {capture: true});
+window.addEventListener("mousemove", () => {window.browsergym_page_activated();}, {capture: true});
+window.addEventListener("mouseup", () => {window.browsergym_page_activated();}, {capture: true});
+window.addEventListener("mousedown", () => {window.browsergym_page_activated();}, {capture: true});
+window.addEventListener("wheel", () => {window.browsergym_page_activated();}, {capture: true});
+window.addEventListener("keyup", () => {window.browsergym_page_activated();}, {capture: true});
+window.addEventListener("keydown", () => {window.browsergym_page_activated();}, {capture: true});
+window.addEventListener("input", () => {window.browsergym_page_activated();}, {capture: true});
+window.addEventListener("touchstart", () => {window.browsergym_page_activated();}, {capture: true});
+window.addEventListener("touchend", () => {window.browsergym_page_activated();}, {capture: true});
+document.addEventListener("visibilitychange", () => {
+    if (document.visibilityState === "visible") {
+        window.browsergym_page_activated();
+    }
+}, {capture: true});
+"""
+        )
+        # create the chat
+        self.chat = Chat(
+            headless=self.headless,
+            chat_size=(500, max(viewport["height"], 800)),
+            record_video_dir=self.record_video_dir,
+        )
+        # create a new page
+        self.page = self.context.new_page()
+        recording_start_time = time.time()
+        # setup the task
+        task_goal, task_info = self.task.setup(page=self.page)
+        # process the task goal
+        # no goal specified
+        if task_goal is None:
+            self.goal_object = []
+        # convert text-only goal (legacy) to new format
+        elif isinstance(task_goal, str):
+            self.goal_object = [{"type": "text", "text": task_goal}]
+        # new format goal with multiple texts and images (OpenAI style)
+        elif isinstance(task_goal, list):
+            self.goal_object = task_goal
+        else:
+            raise ValueError(f"task_goal should be of type str or list, got {task_goal.__class__}")
+        # initialize the chat
+        self.chat.add_message(
+            role="assistant",
+            msg="Hi! I am your UI assistant, I can perform web tasks for you. What can I help you with?",
+        )
+        # send task goal (if any) to the chat
+        for message in self.goal_object:
+            match message["type"]:
+                case "text":
+                    self.chat.add_message(role="user", msg=message["text"])
+                case "image_url":
+                    image_src = message["image_url"]
+                    if isinstance(image_src, dict):
+                        image_src = image_src["url"]
+                    self.chat.add_message(role="user_image", msg=image_src)
+                case _:
+                    raise ValueError(
+                        f"Unknown message type {repr(message['type'])} in the task goal."
+                    )
+        self._wait_dom_loaded()
+        # after the task's setup, the active page might have changed
+        # perform a safety check
+        self._active_page_check()
+        # init start time
+        self.start_time = time.time()
+        # no action yet
+        self.last_action = ""
+        self.last_action_error = ""
+        self.infeasible_message_received = False
+        # if asked, wait for user message
+        self._wait_for_user_message()
+        # extract obs and info from environment
+        obs = self._get_obs()
+        info = {}
+        info["task_info"] = task_info
+        # TODO this is a bit hacky, find a better solution to record videos
+        if self.record_video_dir:
+            info["recording_start_time"] = recording_start_time
+            info["recording_file"] = str(self.page.video.path())
+            info["chat"] = {
+                "recording_start_time": self.chat.recording_start_time,
+                "recording_file": str(self.chat.page.video.path()),
+            }
+        return obs, info
+    def pre_step(self) -> tuple[dict[str, Any], Callable, Callable]:
+        info = {}
+        info["action_exec_start"] = time.time()
+        info["action_exec_timeout"] = 0
+        def send_message_to_user(text: str):
+            if not isinstance(text, str):
+                raise ValueError(f"Forbidden value: {text} is not a string")
+            self.chat.add_message(role="assistant", msg=text)
+        def report_infeasible_instructions(reason: str):
+            if not isinstance(reason, str):
+                raise ValueError(f"Forbidden value: {reason} is not a string")
+            self.chat.add_message(role="infeasible", msg=reason)
+            self.infeasible_message_received = True
+        # try to execute the action
+        logger.debug("Executing action")
+        return info, send_message_to_user, report_infeasible_instructions
+    def step(self, action: str) -> tuple[dict[str, Any], float, bool, bool, dict[str, Any]]:
+        """
+        Execute the action in the environment.
+        Args:
+            action: the action to execute. This should be a string with code or a function call
+        Returns:
+            obs: the observation after executing the action
+            reward: the reward received after executing the action
+            terminated: whether the episode is terminated or not
+            truncated: whether the episode is truncated or not
+            info: additional information about the step
+        """
+        self.last_action = action
+        info, send_message_to_user, report_infeasible_instructions = self.pre_step()
+        try:
+            if self.action_mapping:
+                code = self.action_mapping(action)
+            else:
+                code = action
+            execute_python_code(
+                code,
+                self.page,
+                send_message_to_user=send_message_to_user,
+                report_infeasible_instructions=report_infeasible_instructions,
+            )
+            self.last_action_error = ""
+        except Exception as e:
+            self.last_action_error = f"{type(e).__name__}: {e}"
+            match = re.match("TimeoutError: Timeout ([0-9]+)ms exceeded.", self.last_action_error)
+            if match:
+                info["action_exec_timeout"] = float(match.groups()[0]) / 1000  # ms to sec
+        return self.post_step(info)
+    def post_step(
+        self, info: dict[str, Any], validate: bool = True
+    ) -> tuple[dict[str, Any], float, bool, bool, dict[str, Any]]:
+        """
+        Post step method, called after executing the action.
+        This method is responsible for extracting the observation after the action.
+        It also prepares reward, task status, user message and other step info.
+        Args:
+            info: dictionary containing information about the step
+        Returns:
+            obs: the observation after executing the action
+            reward: the reward received after executing the action
+            terminated: whether the episode is terminated or not
+            truncated: whether the episode is truncated or not
+            info: additional information about the step
+        """
+        logger.debug("Action executed")
+        info["action_exec_stop"] = time.time()
+        # wait a bit (for the JavaScript callback to set the active page)
+        time.sleep(0.5)  # wait for JS events to be fired (half a second)
+        self.context.cookies()  # trigger all waiting Playwright callbacks on the stack (hack, see https://playwright.dev/java/docs/multithreading)
+        # wait for the network to idle before extracting the observation, reward etc.
+        self._wait_dom_loaded()
+        if validate:
+            # after the action is executed, the active page might have changed
+            # perform a safety check
+            self._active_page_check()
+            logger.debug("Active page checked")
+            # if asked, wait for user message
+            self._wait_for_user_message()
+            logger.debug("User message done")
+            logger.debug("Initiating task validation")
+            # extract reward, done, user_message, info (task-specific)
+            reward, done, user_message, task_info = self._task_validate()
+            info["task_info"] = task_info
+            logger.debug("Task validation done")
+        else:
+            reward = 0
+            done = False
+            user_message = None
+            info["task_info"] = {}
+            logger.debug("Task validation skipped")
+        # add any user message sent by the task to the chat
+        if user_message:
+            self.chat.add_message(role="user", msg=user_message)
+        # extract observation (generic)
+        obs = self._get_obs()
+        logger.debug("Observation extracted")
+        # new step API wants a 5-tuple (gymnasium)
+        terminated = done or (
+            self.terminate_on_infeasible and self.infeasible_message_received
+        )  # task or agent can terminate the episode
+        truncated: bool = False
+        return obs, reward, terminated, truncated, info
+    def _task_validate(self):
+        # back-up these in case validate() navigates pages and messes the history
+        prev_active_page = self.page
+        prev_page_history = self.page_history.copy()
+        # call validate
+        reward, done, user_message, info = self.task.validate(self.page, self.chat.messages)
+        # safety fix, in case validate() did mess up the active page and/or page history
+        if prev_active_page != self.page or prev_page_history != self.page_history:
+            logger.debug(
+                "The active page and / or page history has changed during task.validate(). A recovery fix will be applied."
+            )
+            self.page = prev_active_page
+            self.page_history = prev_page_history
+        return reward, done, user_message, info
+    def _wait_for_user_message(self):
+        # if last message is from the assistant, wait for a user message to continue
+        # TODO: be smarter about when to wait for a user message (different action from the assistant?)
+        if self.chat.messages[-1]["role"] == "assistant" and self.wait_for_user_message:
+            self.chat.wait_for_user_message()
+    def _wait_dom_loaded(self):
+        for page in self.context.pages:
+            try:
+                page.wait_for_load_state("domcontentloaded", timeout=3000)
+            except playwright.sync_api.Error:
+                pass
+            for frame in page.frames:
+                try:
+                    frame.wait_for_load_state("domcontentloaded", timeout=3000)
+                except playwright.sync_api.Error:
+                    pass
+    def _activate_page_from_js(self, page: playwright.sync_api.Page):
+        logger.debug(f"_activate_page_from_js(page) called, page={str(page)}")
+        if not page.context == self.context:
+            raise RuntimeError(
+                f"Unexpected: activating a page that belongs to a different browser context ({page})."
+            )
+        # add the activated page to the page history (or move it to last which is the most recent)
+        if page in self.page_history:
+            self.page_history[page] = self.page_history.pop(
+                page
+            )  # move page to the end of dictionnary
+        else:
+            self.page_history[page] = None  # add page to the end of dictionnary
+        self.page = page
+    def _active_page_check(self):
+        # make sure there is always a page open
+        # if all pages have been closed, create a new page
+        if len(self.context.pages) == 0:
+            logger.warning("All pages are closed, opening a new page.")
+            self.page = self.context.new_page()
+        # if the active page got closed, get the last active page from the history
+        while self.page_history and (self.page.is_closed() or self.page not in self.context.pages):
+            self.page_history.pop(self.page)  # remove active page from history
+            self.page = list(self.page_history.keys())[
+                -1
+            ]  # set last active page as the active page (most recent)
+        # active page should share the same browser context with the environment
+        if self.page not in self.context.pages:
+            raise RuntimeError(
+                f"Unexpected: active page is not part of the browser context's open pages ({self.page})."
+            )
+        # active page should not be closed
+        if self.page.is_closed():
+            raise RuntimeError(f"Unexpected: active page has been closed ({self.page}).")
+    def _get_obs(self):
+        for retries_left in reversed(range(EXTRACT_OBS_MAX_TRIES)):
+            try:
+                # pre-extraction, mark dom elements (set bid, set dynamic attributes like value and checked)
+                _pre_extract(self.page, tags_to_mark=self.tags_to_mark, lenient=(retries_left == 0))
+                dom = extract_dom_snapshot(self.page)
+                axtree = extract_merged_axtree(self.page)
+                focused_element_bid = extract_focused_element_bid(self.page)
+                extra_properties = extract_dom_extra_properties(dom)
+            except (playwright.sync_api.Error, MarkingError) as e:
+                err_msg = str(e)
+                # try to add robustness to async events (detached / deleted frames)
+                if retries_left > 0 and (
+                    "Frame was detached" in err_msg
+                    or "Frame with the given frameId is not found" in err_msg
+                    or "Execution context was destroyed" in err_msg
+                    or "Frame has been detached" in err_msg
+                    or "Cannot mark a child frame without a bid" in err_msg
+                    or "Cannot read properties of undefined" in err_msg
+                ):
+                    logger.warning(
+                        f"An error occurred while extracting the dom and axtree. Retrying ({retries_left}/{EXTRACT_OBS_MAX_TRIES} tries left).\n{repr(e)}"
+                    )
+                    # post-extract cleanup (ARIA attributes)
+                    _post_extract(self.page)
+                    time.sleep(0.5)
+                    continue
+                else:
+                    raise e
+            break
+        # post-extraction cleanup of temporary info in dom
+        _post_extract(self.page)
+        # obs is generic to all tasks
+        obs = {
+            "chat_messages": tuple(copy.deepcopy(self.chat.messages)),
+            "goal": _try_to_extract_legacy_goal(self.goal_object),  # legacy goal, deprecated
+            "goal_object": tuple(
+                copy.deepcopy(self.goal_object)
+            ),  # new goal format, list of messages openai style
+            "open_pages_urls": tuple(page.url for page in self.context.pages),
+            "open_pages_titles": tuple(page.title() for page in self.context.pages),
+            "active_page_index": np.asarray([self.context.pages.index(self.page)]),
+            "url": self.page.url,  # redundant with "open_pages_urls" and "active_page_index"
+            "screenshot": extract_screenshot(self.page),
+            "dom_object": dom,
+            "axtree_object": axtree,
+            "extra_element_properties": extra_properties,
+            "focused_element_bid": focused_element_bid,
+            "last_action": self.last_action,
+            "last_action_error": self.last_action_error,
+            "elapsed_time": np.asarray([time.time() - self.start_time]),
+        }
+        return obs

BrowserGym/browsergym/core/src/browsergym/core/javascript/frame_mark_elements.js ADDED Viewed

	@@ -0,0 +1,295 @@

+/**
+ * Go through all DOM elements in the frame (including shadowDOMs), give them unique browsergym
+ * identifiers (bid), and store custom data in ARIA attributes.
+ */
+async ([parent_bid, bid_attr_name, tags_to_mark]) => {
+    // standard html tags
+    // https://www.w3schools.com/tags/
+    const html_tags = new Set([
+        "a", "abbr", "acronym", "address", "applet", "area", "article", "aside", "audio",
+        "b", "base", "basefont", "bdi", "bdo", "big", "blockquote", "body", "br", "button",
+        "canvas", "caption", "center", "cite", "code", "col", "colgroup", "data", "datalist",
+        "dd", "del", "details", "dfn", "dialog", "dir", "div", "dl", "dt", "em", "embed",
+        "fieldset", "figcaption", "figure", "font", "footer", "form", "frame", "frameset",
+        "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hgroup", "hr", "html", "i",
+        "iframe", "img", "input", "ins", "kbd", "label", "legend", "li", "link", "main",
+        "map", "mark", "menu", "meta", "meter", "nav", "noframes", "noscript", "object",
+        "ol", "optgroup", "option", "output", "p", "param", "picture", "pre", "progress",
+        "q", "rp", "rt", "ruby", "s", "samp", "script", "search", "section", "select",
+        "small", "source", "span", "strike", "strong", "style", "sub", "summary", "sup",
+        "svg", "table", "tbody", "td", "template", "textarea", "tfoot", "th", "thead",
+        "time", "title", "tr", "track", "tt", "u", "ul", "var", "video", "wbr"
+    ]);
+    const set_of_marks_tags = new Set([
+        "input", "textarea", "select", "button", "a", "iframe", "video", "li", "td", "option"
+    ]);
+    let browsergym_first_visit = false;
+    // if no yet set, set the frame (local) element counter to 0
+    if (!("browsergym_elem_counter" in window)) {
+        window.browsergym_elem_counter = 0;
+        window.browsergym_frame_id_generator = new IFrameIdGenerator();
+        browsergym_first_visit = true;
+    }
+    // mechanism for computing all element's visibility
+    // the intersection observer will set the visibility ratio of elements entering / exiting the viewport
+    // a set is used to keep track of not-yet-visited elements
+    let elems_to_be_visited = new Set();
+    let intersection_observer = new IntersectionObserver(
+        entries => {
+          entries.forEach(entry => {
+            let elem = entry.target;
+            elem.setAttribute('browsergym_visibility_ratio', Math.round(entry.intersectionRatio * 100) / 100);
+            if (elems_to_be_visited.has(elem)) {
+                elems_to_be_visited.delete(elem);
+            }
+          })
+        },
+        {
+            threshold: [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
+        }
+    )
+    let all_bids = new Set();
+    // get all DOM elements in the current frame (does not include elements in shadowDOMs)
+    let elements = Array.from(document.querySelectorAll('*'));
+    let som_buttons = [];
+    i = 0;
+    while (i < elements.length) {
+        const elem = elements[i];
+        // add shadowDOM elements to the elements array, in such a way that order is preserved
+        // TODO: do we really need the order preserved?
+        if (elem.shadowRoot !== null) {
+            elements = new Array(
+                ...Array.prototype.slice.call(elements, 0, i + 1),
+                ...Array.from(elem.shadowRoot.querySelectorAll("*")),
+                ...Array.prototype.slice.call(elements, i + 1)
+            );
+        }
+        i++;
+        // decide if the current element should be marked or not
+        switch (tags_to_mark) {
+            // mark all elements
+            case "all":
+                break;
+            // mark only standard HTML tags
+            case "standard_html":
+                if (!elem.tagName || !html_tags.has(elem.tagName.toLowerCase())) {
+                    // continue the loop, i.e., move on to the next element
+                    continue;
+                }
+                break;
+            // non-recognized argument
+            default:
+                throw new Error(`Invalid value for parameter \"tags_to_mark\": ${JSON.stringify(tags_to_mark)}`);
+        }
+        // Processing element
+        // register intersection callback on element, and keep track of element for waiting later
+        elem.setAttribute('browsergym_visibility_ratio', 0);
+        elems_to_be_visited.add(elem);
+        intersection_observer.observe(elem);
+        // write dynamic element values to the DOM
+        if (typeof elem.value !== 'undefined') {
+            elem.setAttribute("value", elem.value);
+        }
+        // write dynamic checked properties to the DOM
+        if (typeof elem.checked !== 'undefined') {
+            if (elem.checked === true) {
+                elem.setAttribute("checked", "");
+            }
+            else {
+                elem.removeAttribute("checked");
+            }
+        }
+        // add the element global id (browsergym id) to a custom HTML attribute
+        // https://playwright.dev/docs/locators#locate-by-test-id
+        // recover the element id if it has one already, else compute a new element id
+        let elem_global_bid = null;
+        if (elem.hasAttribute(bid_attr_name)) {
+            // throw an error if the attribute is already set while this is the first visit of the page
+            if (browsergym_first_visit) {
+                throw new Error(`Attribute ${bid_attr_name} already used in element ${elem.outerHTML}`);
+            }
+            elem_global_bid = elem.getAttribute(bid_attr_name);
+            // if the bid has already been encountered, then this is a duplicate and a new bid should be set
+            if (all_bids.has(elem_global_bid)) {
+                console.log(`BrowserGym: duplicate bid ${elem_global_bid} detected, generating a new one`);
+                elem_global_bid = null;
+            }
+        }
+        if (elem_global_bid === null) {
+            let elem_local_id = null;
+            // iFrames get alphabetical ids: 'a', 'b', ..., 'z', 'aA', 'aB' etc.
+            if (['iframe', 'frame'].includes(elem.tagName.toLowerCase())) {
+                elem_local_id = `${window.browsergym_frame_id_generator.next()}`;
+            }
+            // other elements get numerical ids: '0', '1', '2', ...
+            else {
+                elem_local_id = `${window.browsergym_elem_counter++}`;
+            }
+            if (parent_bid == "") {
+                elem_global_bid = `${elem_local_id}`;
+            }
+            else {
+                elem_global_bid = `${parent_bid}${elem_local_id}`;
+            }
+            elem.setAttribute(bid_attr_name, `${elem_global_bid}`);
+        }
+        all_bids.add(elem_global_bid);
+        // Hack: store custom data inside ARIA attributes (will be available in DOM and AXTree)
+        //  - elem_global_bid: global element identifier (unique over multiple frames)
+        // TODO: add more data if needed (x, y coordinates, bounding box, is_visible, is_clickable etc.)
+        push_bid_to_attribute(elem_global_bid, elem, "aria-roledescription");
+        push_bid_to_attribute(elem_global_bid, elem, "aria-description");  // fallback for generic nodes
+        // set-of-marks flag (He et al. 2024)
+        // https://github.com/MinorJerry/WebVoyager/blob/main/utils.py
+        elem.setAttribute("browsergym_set_of_marks", "0");
+        // click at center activates self or a child
+        if (["self", "child"].includes(whoCapturesCenterClick(elem))) {
+            // has valid tag name, or has click event, or triggers a pointer cursor
+            if (set_of_marks_tags.has(elem.tagName.toLowerCase()) || (elem.onclick != null) || (window.getComputedStyle(elem).cursor == "pointer")) {
+                let rect = elem.getBoundingClientRect();
+                let area = (rect.right - rect.left) * (rect.bottom - rect.top);
+                // area is large enough
+                if (area >= 20) {
+                    // is not a child of a button (role, type, tag) set to be marked
+                    if (som_buttons.every(button => !button.contains(elem))) {
+                        // is not the sole child of span that has a role and is set to be marked
+                        let parent = elem.parentElement;
+                        if (!(parent && parent.tagName.toLowerCase() == "span" && parent.children.length === 1 && parent.getAttribute("role") && parent.getAttribute("browsergym_set_of_marks") === "1")) {
+                            // all checks have passed, flag the element for inclusion in set-of-marks
+                            elem.setAttribute("browsergym_set_of_marks", "1");
+                            if (elem.matches('button, a, input[type="button"], div[role="button"]')) {
+                                som_buttons.push(elem)
+                            }
+                            // lastly, remove the set-of-marks flag from all parents, if any
+                            while (parent) {
+                                if (parent.getAttribute("browsergym_set_of_marks") === "1") {
+                                    parent.setAttribute("browsergym_set_of_marks", "0")
+                                }
+                                parent = parent.parentElement;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+    warning_msgs = new Array();
+    // wait for all elements to be visited for visibility
+    let visibility_marking_timeout = 1000;  // ms
+    try {
+        await until(() => elems_to_be_visited.size == 0, visibility_marking_timeout);
+    } catch {
+        warning_msgs.push(`Frame marking: not all elements have been visited by the intersection_observer after ${visibility_marking_timeout} ms`);
+    }
+    // disconnect intersection observer
+    intersection_observer.disconnect();
+    return warning_msgs;
+}
+async function until(f, timeout, interval=40) {
+    return new Promise((resolve, reject) => {
+        const start_time = Date.now();
+        // immediate check
+        if (f()) {
+            resolve();
+        }
+        // loop check
+        const wait = setInterval(() => {
+            if (f()) {
+                clearInterval(wait);
+                resolve();
+            } else if (Date.now() - start_time > timeout) {
+                clearInterval(wait);
+                reject();
+            }
+        }, interval);
+    });
+}
+function whoCapturesCenterClick(element){
+    var rect = element.getBoundingClientRect();
+    var x = (rect.left + rect.right) / 2 ;
+    var y = (rect.top + rect.bottom) / 2 ;
+    var element_at_center = elementFromPoint(x, y); // return the element in the foreground at position (x,y)
+    if (!element_at_center) {
+        return "nobody";
+    } else if (element_at_center === element) {
+        return "self";
+    } else if (element.contains(element_at_center)) {
+        return "child";
+    } else {
+        return "non-descendant";
+    }
+}
+function push_bid_to_attribute(bid, elem, attr){
+    let original_content = "";
+    if (elem.hasAttribute(attr)) {
+        original_content = elem.getAttribute(attr);
+    }
+    let new_content = `browsergym_id_${bid} ${original_content}`
+    elem.setAttribute(attr, new_content);
+}
+/**
+ * Return the element in the foreground at position (x, y), looking inside shadow DOMs:
+ * the lookup starts from the document, and is repeated inside the shadow root of the
+ * element found, as long as that element has one.
+ */
+function elementFromPoint(x, y) {
+    let dom = document;
+    let last_elem = null;
+    let elem = null;
+    do {
+        last_elem = elem;
+        elem = dom.elementFromPoint(x, y);
+        // next lookup scope: the shadow root of the element found (undefined / null if none)
+        dom = elem?.shadowRoot;
+    // stop when there is no shadow root to explore, or when the lookup returns the same element again
+    } while(dom && elem !== last_elem);
+    return elem;
+}
+// https://stackoverflow.com/questions/12504042/what-is-a-method-that-can-be-used-to-increment-letters#answer-12504061
+class IFrameIdGenerator {
+    constructor(chars = 'abcdefghijklmnopqrstuvwxyz') {
+      this._chars = chars;
+      this._nextId = [0];
+    }
+    next() {
+      const r = [];
+      for (let i = 0; i < this._nextId.length; i++) {
+        let char = this._chars[this._nextId[i]];
+        // all but first character must be upper-cased (a, aA, bCD)
+        if (i < this._nextId.length - 1) {
+            char = char.toUpperCase();
+        }
+        r.unshift(char);
+      }
+      this._increment();
+      return r.join('');
+    }
+    _increment() {
+      for (let i = 0; i < this._nextId.length; i++) {
+        const val = ++this._nextId[i];
+        if (val < this._chars.length) {
+          return;
+        }
+        this._nextId[i] = 0;
+      }
+      this._nextId.push(0);
+    }
+    *[Symbol.iterator]() {
+      while (true) {
+        yield this.next();
+      }
+    }
+  }

BrowserGym/browsergym/core/src/browsergym/core/javascript/frame_unmark_elements.js ADDED Viewed

	@@ -0,0 +1,40 @@

+/**
+ * Go through all DOM elements in the frame (including shadowDOMs),
+ * and cleanup previously stored data in ARIA attributes.
+ */
+() => {
+    // get all DOM elements in the current frame (does not include elements in shadowDOMs)
+    let elements = Array.from(document.querySelectorAll('*'));
+    let i = 0;
+    while (i < elements.length) {
+        const elem = elements[i];
+        // add shadowDOM elements to the elements array, in such a way that order is preserved
+        // TODO: do we really need the order preserved?
+        if (elem.shadowRoot !== null) {
+            elements = new Array(
+                ...Array.prototype.slice.call(elements, 0, i + 1),
+                ...Array.from(elem.shadowRoot.querySelectorAll("*")),
+                ...Array.prototype.slice.call(elements, i + 1)
+            );
+        }
+        i++;
+        // Hack: remove custom data stored in ARIA attributes
+        //  - elem_global_id: global browsergym identifier
+        pop_bid_from_attribute(elem, "aria-description");
+        pop_bid_from_attribute(elem, "aria-roledescription");  // fallback for generic nodes
+    }
+}
+function pop_bid_from_attribute(elem, attr) {
+    let bid_regex = /^browsergym_id[^\s]*\s/;
+    if (elem.hasAttribute(attr)) {
+        let content = elem.getAttribute(attr);
+        let original_content = content.replace(bid_regex, '');
+        if (original_content) {
+            elem.setAttribute(attr, original_content);
+        }
+        else {
+            elem.removeAttribute(attr);
+        }
+    }
+}

BrowserGym/browsergym/core/src/browsergym/core/observation.py ADDED Viewed

	@@ -0,0 +1,575 @@

+import base64
+import io
+import logging
+import pkgutil
+import re
+from typing import Literal
+import numpy as np
+import PIL.Image
+import playwright.sync_api
+from .constants import BROWSERGYM_ID_ATTRIBUTE as BID_ATTR
+from .constants import BROWSERGYM_SETOFMARKS_ATTRIBUTE as SOM_ATTR
+from .constants import BROWSERGYM_VISIBILITY_ATTRIBUTE as VIS_ATTR
+MARK_FRAMES_MAX_TRIES = 3
+logger = logging.getLogger(__name__)
+class MarkingError(Exception):
+    pass
+def _pre_extract(
+    page: playwright.sync_api.Page,
+    tags_to_mark: Literal["all", "standard_html"] = "standard_html",
+    lenient: bool = False,
+):
+    """
+    pre-extraction routine, marks dom elements (set bid and dynamic attributes like value and checked)
+    Args:
+        page: the playwright page whose frames are to be marked.
+        tags_to_mark: which elements to mark, forwarded as-is to the marking script.
+        lenient: if True, a child frame whose frame element has no bid is skipped with a warning instead of raising.
+    Raises:
+        MarkingError: if a child frame element has no bid and lenient is False.
+    """
+    js_frame_mark_elements = pkgutil.get_data(__name__, "javascript/frame_mark_elements.js").decode(
+        "utf-8"
+    )
+    # we can't run this loop in JS due to Same-Origin Policy
+    # (can't access the content of an iframe from another one)
+    def mark_frames_recursive(frame, frame_bid: str):
+        # frame bids are either empty (main frame) or purely alphabetical, starting with a lower-case letter
+        assert frame_bid == "" or re.match(r"^[a-z][a-zA-Z]*$", frame_bid)
+        logger.debug(f"Marking frame {repr(frame_bid)}")
+        # mark all DOM elements in the frame (it will use the parent frame element's bid as a prefix)
+        warning_msgs = frame.evaluate(
+            js_frame_mark_elements,
+            [frame_bid, BID_ATTR, tags_to_mark],
+        )
+        # print warning messages if any
+        for msg in warning_msgs:
+            logger.warning(msg)
+        # recursively mark all descendant frames
+        for child_frame in frame.child_frames:
+            # deal with detached frames
+            if child_frame.is_detached():
+                continue
+            # deal with weird frames (pdf viewer in <embed>)
+            child_frame_elem = child_frame.frame_element()
+            if not child_frame_elem.content_frame() == child_frame:
+                logger.warning(
+                    f"Skipping frame '{child_frame.name}' for marking, seems problematic."
+                )
+                continue
+            # deal with sandboxed frames with blocked script execution
+            sandbox_attr = child_frame_elem.get_attribute("sandbox")
+            if sandbox_attr is not None and "allow-scripts" not in sandbox_attr.split():
+                continue
+            # the bid of the frame element (set when marking the parent frame) prefixes the bids of the child frame
+            child_frame_bid = child_frame_elem.get_attribute(BID_ATTR)
+            if child_frame_bid is None:
+                if lenient:
+                    logger.warning("Cannot mark a child frame without a bid. Skipping frame.")
+                    continue
+                else:
+                    raise MarkingError("Cannot mark a child frame without a bid.")
+            mark_frames_recursive(child_frame, frame_bid=child_frame_bid)
+    # mark all frames recursively
+    mark_frames_recursive(page.main_frame, frame_bid="")
+def _post_extract(page: playwright.sync_api.Page):
+    js_frame_unmark_elements = pkgutil.get_data(
+        __name__, "javascript/frame_unmark_elements.js"
+    ).decode("utf-8")
+    # we can't run this loop in JS due to Same-Origin Policy
+    # (can't access the content of an iframe from a another one)
+    for frame in page.frames:
+        try:
+            if not frame == page.main_frame:
+                # deal with weird frames (pdf viewer in <embed>)
+                if not frame.frame_element().content_frame() == frame:
+                    logger.warning(
+                        f"Skipping frame '{frame.name}' for unmarking, seems problematic."
+                    )
+                    continue
+                # deal with sandboxed frames with blocked script execution
+                sandbox_attr = frame.frame_element().get_attribute("sandbox")
+                if sandbox_attr is not None and "allow-scripts" not in sandbox_attr.split():
+                    continue
+                # deal with frames without a BID
+                bid = frame.frame_element().get_attribute(BID_ATTR)
+                if bid is None:
+                    continue
+            frame.evaluate(js_frame_unmark_elements)
+        except playwright.sync_api.Error as e:
+            if any(msg in str(e) for msg in ("Frame was detached", "Frame has been detached")):
+                pass
+            else:
+                raise e
+def extract_screenshot(page: playwright.sync_api.Page):
+    """
+    Extracts the screenshot image of a Playwright page using Chrome DevTools Protocol.
+    Args:
+        page: the playwright page of which to extract the screenshot.
+    Returns:
+        A screenshot of the page, in the form of a 3D array (height, width, rgb).
+    """
+    cdp = page.context.new_cdp_session(page)
+    cdp_answer = cdp.send(
+        "Page.captureScreenshot",
+        {
+            "format": "png",
+        },
+    )
+    cdp.detach()
+    # bytes of a png file
+    png_base64 = cdp_answer["data"]
+    png_bytes = base64.b64decode(png_base64)
+    with io.BytesIO(png_bytes) as f:
+        # load png as a PIL image
+        img = PIL.Image.open(f)
+        # convert to RGB (3 channels)
+        img = img.convert(mode="RGB")
+        # convert to a numpy array
+        img = np.array(img)
+    return img
+# we could handle more data items here if needed
+__BID_EXPR = r"([a-zA-Z0-9]+)"
+__DATA_REGEXP = re.compile(r"^browsergym_id_" + __BID_EXPR + r"\s?" + r"(.*)")
+def extract_data_items_from_aria(string: str, log_level: int = logging.NOTSET):
+    """
+    Utility function to extract temporary data stored in the ARIA attributes of a node
+    """
+    match = __DATA_REGEXP.fullmatch(string)
+    if not match:
+        logger.log(
+            level=log_level,
+            msg=f"Failed to extract BrowserGym data from ARIA string: {repr(string)}",
+        )
+        return [], string
+    groups = match.groups()
+    data_items = groups[:-1]
+    original_aria = groups[-1]
+    return data_items, original_aria
+def extract_dom_snapshot(
+    page: playwright.sync_api.Page,
+    computed_styles=[],
+    include_dom_rects: bool = True,
+    include_paint_order: bool = True,
+    temp_data_cleanup: bool = True,
+):
+    """
+    Extracts the DOM snapshot of a Playwright page using Chrome DevTools Protocol.
+    Args:
+        page: the playwright page of which to extract the screenshot.
+        computed_styles: whitelist of computed styles to return.
+        include_dom_rects: whether to include DOM rectangles (offsetRects, clientRects, scrollRects) in the snapshot.
+        include_paint_order: whether to include paint orders in the snapshot.
+        temp_data_cleanup: whether to clean up the temporary data stored in the ARIA attributes.
+    Returns:
+        A document snapshot, including the full DOM tree of the root node (including iframes,
+        template contents, and imported documents) in a flattened array, as well as layout
+        and white-listed computed style information for the nodes. Shadow DOM in the returned
+        DOM tree is flattened.
+    """
+    cdp = page.context.new_cdp_session(page)
+    dom_snapshot = cdp.send(
+        "DOMSnapshot.captureSnapshot",
+        {
+            "computedStyles": computed_styles,
+            "includeDOMRects": include_dom_rects,
+            "includePaintOrder": include_paint_order,
+        },
+    )
+    cdp.detach()
+    # if requested, remove temporary data stored in the ARIA attributes of each node
+    if temp_data_cleanup:
+        pop_bids_from_attribute(dom_snapshot, "aria-roledescription")
+        pop_bids_from_attribute(dom_snapshot, "aria-description")
+    return dom_snapshot
+def pop_bids_from_attribute(dom_snapshot, attr: str):
+    try:
+        target_attr_name_id = dom_snapshot["strings"].index(attr)
+    except ValueError:
+        target_attr_name_id = -1
+    # run the cleanup only if the target attribute string is present
+    if target_attr_name_id > -1:
+        processed_string_ids = set()
+        for document in dom_snapshot["documents"]:
+            for node_attributes in document["nodes"]["attributes"]:
+                i = 0
+                # find the target attribute, if any
+                for i in range(0, len(node_attributes), 2):
+                    attr_name_id = node_attributes[i]
+                    attr_value_id = node_attributes[i + 1]
+                    if attr_name_id == target_attr_name_id:
+                        attr_value = dom_snapshot["strings"][attr_value_id]
+                        # remove any data stored in the target attribute
+                        if attr_value_id not in processed_string_ids:
+                            _, new_attr_value = extract_data_items_from_aria(attr_value)
+                            dom_snapshot["strings"][
+                                attr_value_id
+                            ] = new_attr_value  # update the string in the metadata
+                            processed_string_ids.add(
+                                attr_value_id
+                            )  # mark string as processed (in case several nodes share the same target attribute string value)
+                            attr_value = new_attr_value
+                        # remove target attribute (name and value) if empty
+                        if attr_value == "":
+                            del node_attributes[i : i + 2]
+                        # once target attribute is found, exit the search
+                        break
+def extract_dom_extra_properties(dom_snapshot):
+    """
+    Extracts extra properties of the marked nodes (nodes with a bid attribute) from a DOM snapshot.
+    Args:
+        dom_snapshot: a DOM snapshot. Layout "bounds" and "clientRects" are read, so presumably
+            the snapshot must have been captured with DOM rects included (to be confirmed).
+    Returns:
+        A dictionary indexed by bid, where each value is a dictionary with the keys
+        "visibility", "bbox", "clickable" and "set_of_marks".
+    """
+    def to_string(idx):
+        # string table lookup, where -1 stands for "no string"
+        if idx == -1:
+            return None
+        else:
+            return dom_snapshot["strings"][idx]
+    # pre-locate important string ids
+    try:
+        bid_string_id = dom_snapshot["strings"].index(BID_ATTR)
+    except ValueError:
+        bid_string_id = -1
+    try:
+        vis_string_id = dom_snapshot["strings"].index(VIS_ATTR)
+    except ValueError:
+        vis_string_id = -1
+    try:
+        som_string_id = dom_snapshot["strings"].index(SOM_ATTR)
+    except ValueError:
+        som_string_id = -1
+    # build the iframe tree (DFS from the first frame)
+    doc_properties = {
+        0: {
+            "parent": None,
+        }
+    }
+    docs_to_process = [0]
+    while docs_to_process:
+        doc = docs_to_process.pop(-1)  # DFS
+        # register the child documents (frames) of this document, along with the node that hosts each of them
+        children = dom_snapshot["documents"][doc]["nodes"]["contentDocumentIndex"]
+        for node, child_doc in zip(children["index"], children["value"]):
+            doc_properties[child_doc] = {
+                "parent": {
+                    "doc": doc,  # parent frame index
+                    "node": node,  # node index within the parent frame
+                }
+            }
+            docs_to_process.append(child_doc)
+        # recover the absolute x and y position of the frame node in the parent (if any)
+        # (a parent is always processed before its children, so its "abs_pos" is available here)
+        parent = doc_properties[doc]["parent"]
+        if parent:
+            parent_doc = parent["doc"]
+            parent_node = parent["node"]
+            try:
+                node_layout_idx = dom_snapshot["documents"][parent_doc]["layout"][
+                    "nodeIndex"
+                ].index(parent_node)
+            except ValueError:
+                node_layout_idx = -1
+            if node_layout_idx >= 0:
+                node_bounds = dom_snapshot["documents"][parent_doc]["layout"]["bounds"][
+                    node_layout_idx
+                ]  # can be empty?
+                # absolute position of parent + relative position of frame node within parent
+                parent_node_abs_x = doc_properties[parent_doc]["abs_pos"]["x"] + node_bounds[0]
+                parent_node_abs_y = doc_properties[parent_doc]["abs_pos"]["y"] + node_bounds[1]
+            else:
+                # frame node has no layout entry in the parent, fall back to the origin
+                parent_node_abs_x = 0
+                parent_node_abs_y = 0
+        else:
+            parent_node_abs_x = 0
+            parent_node_abs_y = 0
+        # get the frame's absolute position, by adding any scrolling offset if any
+        doc_properties[doc]["abs_pos"] = {
+            "x": parent_node_abs_x - dom_snapshot["documents"][doc]["scrollOffsetX"],
+            "y": parent_node_abs_y - dom_snapshot["documents"][doc]["scrollOffsetY"],
+        }
+        document = dom_snapshot["documents"][doc]
+        # one property dict per node of the document (same length as "parentIndex")
+        doc_properties[doc]["nodes"] = [
+            {
+                "bid": None,  # default value, to be filled (str)
+                "visibility": None,  # default value, to be filled (float)
+                "bbox": None,  # default value, to be filled (list)
+                "clickable": False,  # default value, to be filled (bool)
+                "set_of_marks": None,  # default value, to be filled (bool)
+            }
+            for _ in enumerate(document["nodes"]["parentIndex"])
+        ]  # all nodes in document
+        # extract clickable property
+        for node_idx in document["nodes"]["isClickable"]["index"]:
+            doc_properties[doc]["nodes"][node_idx]["clickable"] = True
+        # extract bid and visibility properties (attribute-based)
+        for node_idx, node_attrs in enumerate(document["nodes"]["attributes"]):
+            i = 0
+            # loop over all attributes
+            # (flat list of string ids: name, value, name, value, ...)
+            for i in range(0, len(node_attrs), 2):
+                name_string_id = node_attrs[i]
+                value_string_id = node_attrs[i + 1]
+                if name_string_id == bid_string_id:
+                    doc_properties[doc]["nodes"][node_idx]["bid"] = to_string(value_string_id)
+                if name_string_id == vis_string_id:
+                    doc_properties[doc]["nodes"][node_idx]["visibility"] = float(
+                        to_string(value_string_id)
+                    )
+                if name_string_id == som_string_id:
+                    doc_properties[doc]["nodes"][node_idx]["set_of_marks"] = (
+                        to_string(value_string_id) == "1"
+                    )
+        # extract bbox property (in absolute coordinates)
+        for node_idx, bounds, client_rect in zip(
+            document["layout"]["nodeIndex"],
+            document["layout"]["bounds"],
+            document["layout"]["clientRects"],
+        ):
+            # empty clientRect means element is not actually rendered
+            if not client_rect:
+                doc_properties[doc]["nodes"][node_idx]["bbox"] = None
+            else:
+                # bounds gives the relative position within the document
+                doc_properties[doc]["nodes"][node_idx]["bbox"] = bounds.copy()
+                # adjust for absolute document position
+                doc_properties[doc]["nodes"][node_idx]["bbox"][0] += doc_properties[doc]["abs_pos"][
+                    "x"
+                ]
+                doc_properties[doc]["nodes"][node_idx]["bbox"][1] += doc_properties[doc]["abs_pos"][
+                    "y"
+                ]
+        # Note: other interesting fields
+        # document["nodes"]["parentIndex"]  # parent node
+        # document["nodes"]["nodeType"]
+        # document["nodes"]["nodeName"]
+        # document["nodes"]["nodeValue"]
+        # document["nodes"]["textValue"]
+        # document["nodes"]["inputValue"]
+        # document["nodes"]["inputChecked"]
+        # document["nodes"]["optionSelected"]
+        # document["nodes"]["pseudoType"]
+        # document["nodes"]["pseudoIdentifier"]
+        # document["nodes"]["isClickable"]
+        # document["textBoxes"]
+        # document["layout"]["nodeIndex"]
+        # document["layout"]["bounds"]
+        # document["layout"]["offsetRects"]
+        # document["layout"]["scrollRects"]
+        # document["layout"]["clientRects"]
+        # document["layout"]["paintOrders"]
+    # collect the extra properties of all nodes with a browsergym_id attribute
+    extra_properties = {}
+    for doc in doc_properties.keys():
+        for node in doc_properties[doc]["nodes"]:
+            bid = node["bid"]
+            if bid:
+                # on duplicate bids a warning is logged, and the last node encountered wins
+                if bid in extra_properties:
+                    logger.warning(f"duplicate {BID_ATTR}={repr(bid)} attribute detected")
+                extra_properties[bid] = {
+                    extra_prop: node[extra_prop]
+                    for extra_prop in ("visibility", "bbox", "clickable", "set_of_marks")
+                }
+    return extra_properties
+def extract_all_frame_axtrees(page: playwright.sync_api.Page):
+    """
+    Extracts the AXTree of all frames (main document and iframes) of a Playwright page using Chrome DevTools Protocol.
+    Args:
+        page: the playwright page of which to extract the frame AXTrees.
+    Returns:
+        A dictionnary of AXTrees (as returned by Chrome DevTools Protocol) indexed by frame IDs.
+    """
+    cdp = page.context.new_cdp_session(page)
+    # extract the frame tree
+    frame_tree = cdp.send(
+        "Page.getFrameTree",
+        {},
+    )
+    # extract all frame IDs into a list
+    # (breadth-first-search through the frame tree)
+    frame_ids = []
+    root_frame = frame_tree["frameTree"]
+    frames_to_process = [root_frame]
+    while frames_to_process:
+        frame = frames_to_process.pop()
+        frames_to_process.extend(frame.get("childFrames", []))
+        # extract the frame ID
+        frame_id = frame["frame"]["id"]
+        frame_ids.append(frame_id)
+    # extract the AXTree of each frame
+    frame_axtrees = {
+        frame_id: cdp.send(
+            "Accessibility.getFullAXTree",
+            {"frameId": frame_id},
+        )
+        for frame_id in frame_ids
+    }
+    cdp.detach()
+    # extract browsergym data from ARIA attributes
+    for ax_tree in frame_axtrees.values():
+        for node in ax_tree["nodes"]:
+            data_items = []
+            # look for data in the node's "roledescription" property
+            if "properties" in node:
+                for i, prop in enumerate(node["properties"]):
+                    if prop["name"] == "roledescription":
+                        data_items, new_value = extract_data_items_from_aria(prop["value"]["value"])
+                        prop["value"]["value"] = new_value
+                        # remove the "description" property if empty
+                        if new_value == "":
+                            del node["properties"][i]
+                        break
+            # look for data in the node's "description" (fallback plan)
+            if "description" in node:
+                data_items_bis, new_value = extract_data_items_from_aria(
+                    node["description"]["value"]
+                )
+                node["description"]["value"] = new_value
+                if new_value == "":
+                    del node["description"]
+                if not data_items:
+                    data_items = data_items_bis
+            # add the extracted "browsergym" data to the AXTree
+            if data_items:
+                (browsergym_id,) = data_items
+                node["browsergym_id"] = browsergym_id
+    return frame_axtrees
+def extract_merged_axtree(page: playwright.sync_api.Page):
+    """
+    Extracts the merged AXTree of a Playwright page (main document and iframes AXTrees merged) using Chrome DevTools Protocol.
+    Args:
+        page: the playwright page of which to extract the merged AXTree.
+    Returns:
+        A merged AXTree (same format as those returned by Chrome DevTools Protocol).
+    """
+    frame_axtrees = extract_all_frame_axtrees(page)
+    cdp = page.context.new_cdp_session(page)
+    # merge all AXTrees into one
+    merged_axtree = {"nodes": []}
+    for ax_tree in frame_axtrees.values():
+        merged_axtree["nodes"].extend(ax_tree["nodes"])
+        # connect each iframe node to the corresponding AXTree root node
+        for node in ax_tree["nodes"]:
+            if node["role"]["value"] == "Iframe":
+                frame_id = (
+                    cdp.send("DOM.describeNode", {"backendNodeId": node["backendDOMNodeId"]})
+                    .get("node", {})
+                    .get("frameId", None)
+                )
+                if not frame_id:
+                    logger.warning(
+                        f"AXTree merging: unable to recover frameId of node with backendDOMNodeId {repr(node['backendDOMNodeId'])}, skipping"
+                    )
+                # it seems Page.getFrameTree() from CDP omits certain Frames (empty frames?)
+                # if a frame is not found in the extracted AXTrees, we just ignore it
+                elif frame_id in frame_axtrees:
+                    # root node should always be the first node in the AXTree
+                    frame_root_node = frame_axtrees[frame_id]["nodes"][0]
+                    assert frame_root_node["frameId"] == frame_id
+                    node["childIds"].append(frame_root_node["nodeId"])
+                else:
+                    logger.warning(
+                        f"AXTree merging: extracted AXTree does not contain frameId '{frame_id}', skipping"
+                    )
+    cdp.detach()
+    return merged_axtree
+def extract_focused_element_bid(page: playwright.sync_api.Page):
+    # this JS code will dive through ShadowDOMs
+    extract_focused_element_with_bid_script = """\
+() => {
+    // This recursive function traverses shadow DOMs
+    function getActiveElement(root) {
+        const active_element = root.activeElement;
+        if (!active_element) {
+            return null;
+        }
+        if (active_element.shadowRoot) {
+            return getActiveElement(active_element.shadowRoot);
+        } else {
+            return active_element;
+        }
+    }
+    return getActiveElement(document);
+}"""
+    # this playwright code will dive through iFrames
+    frame = page
+    focused_bid = ""
+    try:
+        while frame:
+            focused_element = frame.evaluate_handle(
+                extract_focused_element_with_bid_script, BID_ATTR
+            ).as_element()
+            if focused_element:
+                frame = focused_element.content_frame()
+                focused_bid = focused_element.get_attribute(BID_ATTR)
+            else:
+                frame = None
+    except playwright.sync_api.TimeoutError:
+        focused_bid = ""
+    # convert null / None to empty string
+    if not focused_bid:
+        focused_bid = ""
+    return focused_bid

BrowserGym/browsergym/core/src/browsergym/core/registration.py ADDED Viewed

	@@ -0,0 +1,76 @@

+from functools import partial
+from typing import Type
+import gymnasium as gym
+from .env import BrowserEnv
+from .task import AbstractBrowserTask
+class frozen_partial:
+    """
+    Freeze some keyword arguments of a function.
+    """
+    def __init__(self, func, **frozen_kwargs):
+        self.func = func
+        self.frozen_kwargs = frozen_kwargs
+    def __call__(self, *args, **kwargs):
+        # check overlap between kwargs and frozen_kwargs
+        clashing_kwargs = set(self.frozen_kwargs) & set(kwargs)  # key set intersection
+        if clashing_kwargs:
+            raise ValueError(f"Illegal attempt to override frozen parameters {clashing_kwargs}.")
+        # merge the two dicts
+        kwargs = kwargs | self.frozen_kwargs
+        return self.func(*args, **kwargs)
+def register_task(
+    id: str,
+    task_class: Type[AbstractBrowserTask],
+    task_kwargs: dict = {},
+    default_task_kwargs: dict = {},
+    nondeterministic: bool = True,
+    *args,
+    **kwargs,
+):
+    """
+    Registers a browser task as a gym environment with its unique id.
+    Args:
+        id: the id of the task to register (will be prepended by "browsergym/").
+        task_class: the task class to register.
+        task_kwargs: frozen task arguments (can not be overloaded at environment creation time).
+        task_kwargs_default: default task arguments (can be overloaded at environment creation time).
+        nondeterministic: whether the task cannot be guaranteed deterministic transitions.
+        *args: additional sequential arguments for either the gym or the browsergym environment.
+        *kwargs: additional keyword arguments for either the gym or the browsergym environment.
+    """
+    if task_kwargs and default_task_kwargs:
+        # check overlap between frozen and default task_kwargs
+        clashing_kwargs = set(task_kwargs) & set(default_task_kwargs)  # key set intersection
+        if clashing_kwargs:
+            raise ValueError(
+                f"Illegal attempt to register Browsergym environment {id} with both frozen and default values for task parameters {clashing_kwargs}."
+            )
+    task_entrypoint = task_class
+    # freeze task_kwargs (cannot be overriden at environment creation)
+    task_entrypoint = frozen_partial(task_class, **task_kwargs)
+    # pre-set default_task_kwargs (can be overriden at environment creation)
+    task_entrypoint = partial(task_entrypoint, **default_task_kwargs)
+    gym.register(
+        id=f"browsergym/{id}",
+        entry_point=lambda *env_args, **env_kwargs: BrowserEnv(
+            task_entrypoint, *env_args, **env_kwargs
+        ),
+        nondeterministic=nondeterministic,
+        *args,
+        **kwargs,
+    )

BrowserGym/browsergym/core/src/browsergym/core/spaces.py ADDED Viewed

	@@ -0,0 +1,140 @@

+"""Borrowed from https://github.com/Farama-Foundation/miniwob-plusplus/blob/553daee55ea0b2cc32b181a474083ab4cad782a1/miniwob/spaces.py"""
+from typing import Any
+import numpy as np
+from gymnasium.spaces import Space
+from numpy.typing import NDArray
+class Unicode(Space):
+    """
+    A space representing a unicode string.
+    """
+    def __init__(self):
+        super().__init__()
+    def contains(self, x: Any) -> bool:
+        """Return boolean specifying if x is a valid member of this space."""
+        # Do not check the character set.
+        return isinstance(x, str)
+    def __repr__(self) -> str:
+        """Gives a string representation of this space."""
+        return f"Unicode()"
+    def __eq__(self, other: Any) -> bool:
+        """Check whether ``other`` is equivalent to this instance."""
+        return isinstance(other, Unicode)
+class Float(Space):
+    """
+    A space representing a float.
+    """
+    def __init__(self):
+        super().__init__()
+    def contains(self, x: Any) -> bool:
+        """Return boolean specifying if x is a valid member of this space."""
+        return isinstance(x, float)
+    def __repr__(self) -> str:
+        """Gives a string representation of this space."""
+        return f"Float()"
+    def __eq__(self, other: Any) -> bool:
+        """Check whether ``other`` is equivalent to this instance."""
+        return isinstance(other, Float)
+class Integer(Space):
+    """
+    A space representing an integer.
+    """
+    def __init__(self):
+        super().__init__()
+    def contains(self, x: Any) -> bool:
+        """Return boolean specifying if x is a valid member of this space."""
+        return isinstance(x, int)
+    def __repr__(self) -> str:
+        """Gives a string representation of this space."""
+        return f"Integer()"
+    def __eq__(self, other: Any) -> bool:
+        """Check whether ``other`` is equivalent to this instance."""
+        return isinstance(other, Integer)
+class AnyDict(Space):
+    """A space representing an arbitrary dictionary object."""
+    def contains(self, x: Any) -> bool:
+        """Return boolean specifying if x is a valid member of this space."""
+        # Do not check anything specific.
+        return isinstance(x, dict)
+    def __repr__(self) -> str:
+        """Gives a string representation of this space."""
+        return f"AnyDict()"
+    def __eq__(self, other: Any) -> bool:
+        """Check whether ``other`` is equivalent to this instance."""
+        return isinstance(other, AnyDict)
+class Anything(Space):
+    """A space representing an arbitrary dictionary object."""
+    def contains(self, x: Any) -> bool:
+        return True
+    def __repr__(self) -> str:
+        return f"Anything()"
+    def __eq__(self, other: Any) -> bool:
+        return isinstance(other, Anything)
+class AnyBox(Space[NDArray[Any]]):
+    """A space representing an arbitrary dictionary object."""
+    def __init__(self, low, high, shape, dtype):
+        super().__init__(shape, dtype)
+        self.low = low
+        self.high = high
+    def contains(self, x: Any) -> bool:
+        """Return boolean specifying if x is a valid member of this space."""
+        if not isinstance(x, np.ndarray):
+            try:
+                x = np.asarray(x, dtype=self.dtype)
+            except (ValueError, TypeError):
+                return False
+        return bool(
+            np.can_cast(x.dtype, self.dtype)
+            and len(x.shape) == len(self.shape)
+            and all([dim in (xdim, -1) for xdim, dim in zip(x.shape, self.shape)])
+            and np.all(x >= self.low)
+            and np.all(x <= self.high)
+        )
+    def __repr__(self) -> str:
+        """Gives a string representation of this space."""
+        return f"AnyBox(low={repr(self.low)}, high={repr(self.high)}, shape={repr(self.shape)}, dtype={repr(self.dtype)})"
+    def __eq__(self, other: Any) -> bool:
+        """Check whether ``other`` is equivalent to this instance."""
+        return (
+            isinstance(other, AnyBox)
+            and self.low == other.low
+            and self.high == other.high
+            and self.shape == other.shape
+            and self.dtype == other.dtype
+        )

BrowserGym/browsergym/core/src/browsergym/core/task.py ADDED Viewed

	@@ -0,0 +1,111 @@

+from abc import ABC, abstractmethod
+from typing import Tuple
+import numpy as np
+import playwright.sync_api
+class AbstractBrowserTask(ABC):
+    """
+    Abstract class for browsergym tasks.
+    """
+    @classmethod
+    def get_task_id(cls):
+        raise NotImplementedError
+    def __init__(self, seed: int) -> None:
+        # initiate a random number generator
+        self.random = np.random.RandomState(seed)
+        # task properties, will be used to set up the browsergym environment
+        # default values, can be overriden in children classes
+        self.viewport = {"width": 1280, "height": 720}
+        self.slow_mo = 1000  # ms
+        self.timeout = 5000  # ms
+        self.locale = None  # see https://playwright.dev/python/docs/api/class-browser#browser-new-context-option-locale
+        self.timezone_id = None  # see https://playwright.dev/python/docs/api/class-browser#browser-new-context-option-timezone-id
+    @abstractmethod
+    def setup(self, page: playwright.sync_api.Page) -> tuple[str, dict]:
+        """
+        Set up everything needed to execute the task.
+        Args:
+            page: the active playwright page.
+        Returns:
+            goal: str, goal of the task.
+            info: dict, custom information from the task.
+        """
+    @abstractmethod
+    def validate(
+        self, page: playwright.sync_api.Page, chat_messages: list[str]
+    ) -> Tuple[float, bool, str, dict]:
+        """
+        Validate the task was completed successfully
+        Args:
+            page: the active playwright page.
+            chat_messages: the chat messages.
+        Returns:
+            reward: float, the reward obtained since last call to validate().
+            done: boolean flag, indicates if the task has finished or not (be it success or fail).
+            message: string, a new user message for the chat.
+            info: dictionnary, custom information from the task.
+        """
+    def cheat(self, page: playwright.sync_api.Page, chat_messages: list[str]) -> None:
+        """
+        Solve the task using a pre-defined solution (optional).
+        """
+        raise NotImplementedError
+    def teardown(self) -> None:
+        """
+        Tear down the task and clean up any resource / data created by the task (optional).
+        """
+        pass
+class OpenEndedTask(AbstractBrowserTask):
+    @classmethod
+    def get_task_id(cls):
+        return "openended"
+    def __init__(self, seed: int, start_url: str, goal: str = None) -> None:
+        """
+        Args:
+            seed: random seed.
+            start_url: str, the url for the starting page.
+            goal: str, the initial goal.
+        """
+        super().__init__(seed)
+        self.start_url = start_url
+        self.goal = goal
+    def setup(self, page: playwright.sync_api.Page) -> tuple[str, dict]:
+        page.goto(self.start_url, timeout=10000)
+        return self.goal, {}
+    def teardown(self) -> None:
+        pass
+    def validate(
+        self, page: playwright.sync_api.Page, chat_messages: list[str]
+    ) -> Tuple[float, bool, str, dict]:
+        reward, done, msg, info = 0, False, "", {}
+        for message in chat_messages:
+            if message["role"] == "user" and message["message"] == "exit":
+                done = True
+                break
+        return reward, done, msg, info

BrowserGym/browsergym/core/src/browsergym/utils/mcp_server.py ADDED Viewed

	@@ -0,0 +1,192 @@

+# MCP server for BrowserGym
+import argparse
+import asyncio
+import re
+from collections.abc import AsyncIterator
+from contextlib import asynccontextmanager
+from dataclasses import dataclass, field
+from typing import Callable
+import gymnasium as gym
+from mcp.server.fastmcp import FastMCP
+from browsergym.core.action.highlevel import ACTION_SUBSETS, HighLevelActionSet
+from browsergym.core.env import BrowserEnv
+@dataclass
+class BgymConfig:
+    """Settings of the BrowserGym MCP server."""
+    # passed to gym.make() as `headless` when the environment is created
+    headless: bool = True
+    # passed to gym.make() as `timeout`, in milliseconds
+    timeout_ms: int = 10000
+    # passed to gym.make() as `record_video_dir`, None by default
+    record_video_dir: str | None = None
+    # demo mode, given to the action set and to the action functions module
+    demo_mode: HighLevelActionSet.DemoMode = "default"
+    # names of the actions whose tool is registered with validate=True
+    validate_actions: list[str] = field(default_factory=list)
+@dataclass
+class AppContext:
+    """Context object yielded by app_lifespan for the lifetime of the MCP server."""
+    # the environment returned by gym.make()
+    gym: BrowserEnv
+    # the server settings
+    config: BgymConfig
+    # id of the task given to gym.make()
+    task_id: str
+    # the action set whose to_python_code is used as the environment's action mapping
+    actions: HighLevelActionSet
+def get_cli_args():
+    parser = argparse.ArgumentParser(
+        description="BrowserGym MCP server",
+        usage="python browsergym/core/src/browsergym/utils/%(prog)s [options]",
+        epilog="To run Dev UI: mcp dev browsergym/core/src/browsergym/utils/mcp_server.py -e browsergym/core/",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    parser.add_argument(
+        "-t",
+        "--task_id",
+        type=str,
+        default="browsergym/openended",
+        help="Task ID to run",
+    )
+    parser.add_argument(
+        "-l",
+        "--headless",
+        action="store_true",
+        help="Run in headless mode",
+    )
+    parser.add_argument(
+        "-r",
+        "--record_video_dir",
+        type=str,
+        default=None,
+        help="Directory to save recorded videos",
+    )
+    parser.add_argument(
+        "--demo_mode",
+        type=str,
+        default="off",
+        choices=["off", "default", "all_blue", "only_visible_elements"],
+        help="Demo mode for action set",
+    )
+    parser.add_argument(
+        "--timeout_ms",
+        type=int,
+        default=10000,
+        help="Timeout in milliseconds for each step",
+    )
+    parser.add_argument(
+        "--subset",
+        type=str,
+        default="workarena++",
+        choices=ACTION_SUBSETS.keys(),
+        help="Subset of actions to use",
+    )
+    parser.add_argument(
+        "--validate_actions",
+        type=str,
+        nargs="+",
+        default=["click", "goto"],
+        help="Names of actions for which validation should be performed",
+    )
+    args, _ = parser.parse_known_args()
+    return args
+# module-level configuration, read from the command line when the module is loaded
+args = get_cli_args()
+task_id = args.task_id
+# server settings built from the command line options
+config = BgymConfig(
+    headless=args.headless,
+    timeout_ms=args.timeout_ms,
+    record_video_dir=args.record_video_dir,
+    demo_mode=args.demo_mode,
+    validate_actions=args.validate_actions,
+)
+@asynccontextmanager
+async def app_lifespan(server: FastMCP) -> AsyncIterator[AppContext]:
+    """Manage application lifecycle with type-safe context"""
+    # Initialize on startup
+    actions = HighLevelActionSet(demo_mode=config.demo_mode, subsets=args.subset)
+    _gym: BrowserEnv = await asyncio.to_thread(
+        gym.make,
+        task_id,
+        headless=config.headless,
+        record_video_dir=config.record_video_dir,
+        action_mapping=actions.to_python_code,
+        timeout=config.timeout_ms,
+        task_kwargs={"start_url": "about:blank"},
+    )  # type: ignore
+    await asyncio.to_thread(_gym.reset)
+    try:
+        yield AppContext(gym=_gym, config=config, task_id=task_id, actions=actions)
+    finally:
+        # Cleanup on shutdown
+        await asyncio.to_thread(_gym.close)
+# the MCP server instance, app_lifespan is given as its lifespan
+mcp = FastMCP("BrowserGym", lifespan=app_lifespan)
+def format_func_call(func: Callable, args, kwargs) -> str:
+    args_str = ", ".join(repr(arg) for arg in args)
+    kwargs_str = ", ".join(f"{k}={repr(v)}" for k, v in kwargs.items())
+    all_args_str = ", ".join(filter(None, [args_str, kwargs_str]))
+    return f"{func.__name__}({all_args_str})"
+def fn_wrapper(func: Callable, validate: bool = True):
+    """
+    Wraps an action function into an async callable running it within the gym of the MCP server.
+    Args:
+        func: the action function, its name is looked up in browsergym.core.action.functions.
+        validate: passed to the gym's post_step method after the action was executed.
+    Returns:
+        The async wrapper, which carries the name and the docstring of func.
+    """
+    async def decorator(*args, **kwargs):
+        """
+        Decorator to execute function from the action space in the context of the gym.
+        1. Loads the parent module of the function to use as function context
+        2. Executes the pre_step method of the gym
+        3. Sets up the module vars from the current state of the gym
+        4. Executes the function from this module and handles any exceptions
+        5. Executes the post_step method of the gym
+        """
+        # NOTE: within this function, the local name "gym" hides the gymnasium module imported by the file
+        gym: BrowserEnv = mcp.get_context().request_context.lifespan_context.gym  # type: ignore
+        while not isinstance(gym, BrowserEnv):
+            gym = (
+                gym.env
+            )  # gym library wraps the BrowserEnv in a few layers (usually 2) of wrappers, this loop unwraps them
+        # Load the parent module of the function to use as function context
+        import browsergym.core.action.functions as fn_context
+        fn = getattr(fn_context, func.__name__)
+        # record the textual form of the call as the last action of the gym
+        gym.last_action = format_func_call(fn, args, kwargs)
+        # pre_step is run in a worker thread
+        info, send_message_to_user, report_infeasible_instructions = await asyncio.to_thread(
+            gym.pre_step
+        )
+        # Set up the module vars from the current state of the gym
+        fn_context.send_message_to_user = send_message_to_user
+        fn_context.report_infeasible_instructions = report_infeasible_instructions
+        fn_context.page = gym.page
+        fn_context.demo_mode = config.demo_mode
+        try:
+            # the action itself is called directly, not through asyncio.to_thread
+            fn(*args, **kwargs)
+            gym.last_action_error = ""
+        except Exception as e:
+            # a failing action is not propagated, it is recorded as the last action error
+            gym.last_action_error = f"{type(e).__name__}: {e}"
+            # NOTE(review): the pattern is not a raw string and its dots are unescaped (they match any character)
+            match = re.match("TimeoutError: Timeout ([0-9]+)ms exceeded.", gym.last_action_error)
+            if match:
+                # timeout reported in the error message, converted from milliseconds to seconds
+                info["action_exec_timeout"] = float(match.groups()[0]) / 1000
+        # post_step is run in a worker thread, its results are the results of the call
+        results = await asyncio.to_thread(gym.post_step, info, validate)
+        return results
+    # expose the wrapped function's identity (this replaces the docstring written above)
+    decorator.__wrapped__ = func  # type: ignore
+    decorator.__name__ = func.__name__
+    decorator.__doc__ = func.__doc__
+    return decorator
+# register each action function of the selected subset as a tool of the MCP server
+for fn in ACTION_SUBSETS[args.subset]:
+    # validation is enabled only for the actions listed in the settings
+    validate = fn.__name__ in config.validate_actions
+    mcp.add_tool(fn_wrapper(fn, validate))
+# when run as a script, serve over the stdio transport
+if __name__ == "__main__":
+    mcp.run(transport="stdio")

BrowserGym/browsergym/core/src/browsergym/utils/obs.py ADDED Viewed

	@@ -0,0 +1,554 @@

+import ast
+import logging
+import math
+import re
+from collections import defaultdict
+import numpy as np
+import PIL.Image
+import PIL.ImageDraw
+import PIL.ImageFont
+from bs4 import BeautifulSoup
+from browsergym.core.constants import BROWSERGYM_ID_ATTRIBUTE as BID_ATTR
+from browsergym.core.constants import BROWSERGYM_SETOFMARKS_ATTRIBUTE as SOM_ATTR
+from browsergym.core.constants import BROWSERGYM_VISIBILITY_ATTRIBUTE as VIS_ATTR
+logger = logging.getLogger(__name__)
+# default value of flatten_axtree_to_str's `ignored_roles`: nodes with these roles are not printed
+IGNORED_AXTREE_ROLES = ["LineBreak"]
+# default value of flatten_axtree_to_str's `ignored_properties`
+# presumably the node properties left out of the printed attributes -- TODO confirm
+IGNORED_AXTREE_PROPERTIES = (
+    "editable",
+    "readonly",
+    "level",
+    "settable",
+    "multiline",
+    "invalid",
+    "focusable",
+)
+def flatten_dom_to_str(
+    dom_snapshot,
+    extra_properties: dict = None,
+    with_visible: bool = False,
+    with_clickable: bool = False,
+    with_center_coords: bool = False,
+    with_bounding_box_coords: bool = False,
+    with_som: bool = False,
+    filter_visible_only: bool = False,
+    filter_with_bid_only: bool = False,
+    filter_som_only: bool = False,
+    coord_decimals: int = 0,
+    hide_bid_if_invisible: int = False,
+    hide_all_bids: bool = False,
+) -> str:
+    """Formats a DOM snapshot into a string text"""
+    def to_string(idx):
+        if idx == -1:
+            return None
+        else:
+            return dom_snapshot["strings"][idx]
+    def parse_document(document_idx) -> str:
+        # adapted from [natbot](https://github.com/nat/natbot)
+        nodes = dom_snapshot["documents"][document_idx]["nodes"]
+        node_children = defaultdict(lambda: [])
+        for node_idx in range(len(nodes["nodeName"])):
+            parent_idx = nodes["parentIndex"][node_idx]
+            if parent_idx != -1:
+                node_children[parent_idx].append(node_idx)
+        def dfs(node_idx: int, parent_node_skipped: bool) -> str:
+            # https://developer.mozilla.org/en-US/docs/Web/API/Node/nodeType
+            # https://developer.mozilla.org/en-US/docs/Web/API/Node/nodeName
+            # https://developer.mozilla.org/en-US/docs/Web/API/Node/nodeValue
+            node_type = nodes["nodeType"][node_idx]
+            node_name = to_string(nodes["nodeName"][node_idx])
+            node_value = to_string(nodes["nodeValue"][node_idx])
+            html_before = ""
+            html_after = ""
+            skip_node = False
+            # text nodes: print text content only if parent was not skipped
+            if node_type == 3:  # node_name == "#text"
+                if not parent_node_skipped and node_value is not None:
+                    html_before += node_value
+            # CData nodes: print content only if parent was not skipped
+            elif node_type == 4:  # node_name == "#cdata-section":
+                if not parent_node_skipped and node_value is not None:
+                    html_before += f"<!CDATA[[{node_value}]]>"
+            # processing instructions, comments, documents, doctypes, document fragments: don't print
+            elif node_type in (7, 8, 9, 10, 11):
+                skip_node = True
+            # now we should have an element node
+            else:
+                assert node_type == 1
+                tag_name = node_name.lower().strip()
+                attributes = []  # to be printed as attributes with the tag
+                bid = None
+                # parse node attributes
+                node_attr_idxs = nodes["attributes"][node_idx]
+                for i in range(0, len(node_attr_idxs), 2):
+                    attr_name = to_string(node_attr_idxs[i])
+                    attr_value = to_string(node_attr_idxs[i + 1])
+                    # extract and print bid
+                    if attr_name == BID_ATTR:
+                        bid = attr_value
+                    # ignore browsergym attributes
+                    elif attr_name in (VIS_ATTR, SOM_ATTR):
+                        pass
+                    # print other attributes
+                    else:
+                        if attr_value is None:
+                            # attribute value missing
+                            attributes.append(f"{attr_name}")
+                        else:
+                            # attribute value present
+                            attributes.append(f'{attr_name}="{attr_value}"')
+                skip_node, extra_attributes_to_print = _process_bid(
+                    bid,
+                    extra_properties=extra_properties,
+                    with_visible=with_visible,
+                    with_clickable=with_clickable,
+                    with_center_coords=with_center_coords,
+                    with_bounding_box_coords=with_bounding_box_coords,
+                    with_som=with_som,
+                    filter_visible_only=filter_visible_only,
+                    filter_with_bid_only=filter_with_bid_only,
+                    filter_som_only=filter_som_only,
+                    coord_decimals=coord_decimals,
+                )
+                # insert extra attributes before regular attributes
+                attributes = extra_attributes_to_print + attributes
+                # insert bid as first attribute
+                if not (
+                    hide_all_bids
+                    or bid is None
+                    or (
+                        hide_bid_if_invisible
+                        and extra_properties.get(bid, {}).get("visibility", 0) < 0.5
+                    )
+                ):
+                    attributes.insert(0, f'bid="{bid}"')
+                if not skip_node:
+                    # print node opening tag, with its attributes
+                    html_before += f"<{tag_name}" + " ".join([""] + attributes) + ">"
+                    # print node closing tag
+                    html_after += f"</{tag_name}>"
+            html = ""
+            html += html_before
+            # recursively print iframe nodes if any
+            if node_idx in nodes["contentDocumentIndex"]["index"]:
+                sub_document_idx = nodes["contentDocumentIndex"]["value"][
+                    nodes["contentDocumentIndex"]["index"].index(node_idx)
+                ]
+                html += parse_document(document_idx=sub_document_idx)
+            # recursively print children nodes if any
+            for child_idx in node_children[node_idx]:
+                html += dfs(node_idx=child_idx, parent_node_skipped=skip_node)
+            html += html_after
+            return html
+        html = dfs(node_idx=0, parent_node_skipped=False)
+        # Format the HTML document with indentation
+        soup = BeautifulSoup(html, "lxml")
+        html = soup.prettify()
+        return html
+    html = parse_document(document_idx=0)
+    return html
+def _get_coord_str(coord, decimals):
+    if isinstance(coord, str):
+        coord = list(map(float, ast.literal_eval(coord)))
+    coord_format = f".{decimals}f"
+    coord_str = ",".join([f"{c:{coord_format}}" for c in coord])
+    return f"({coord_str})"
+def _process_bid(
+    bid,
+    extra_properties: dict = None,
+    with_visible: bool = False,
+    with_clickable: bool = False,
+    with_center_coords: bool = False,
+    with_bounding_box_coords: bool = False,
+    with_som: bool = False,
+    filter_visible_only: bool = False,
+    filter_with_bid_only: bool = False,
+    filter_som_only: bool = False,
+    coord_decimals: int = 0,
+):
+    """
+    Process extra attributes and attribute-based filters, for the element with the given bid.
+    Returns:
+        A flag indicating if the element should be skipped or not (due to filters).
+        Attributes to be printed, as a list of "x=y" strings.
+    """
+    if extra_properties is None:
+        if any(
+            (
+                with_visible,
+                with_clickable,
+                with_center_coords,
+                with_bounding_box_coords,
+                with_som,
+                filter_visible_only,
+                filter_with_bid_only,
+                filter_som_only,
+            )
+        ):
+            raise ValueError("extra_properties argument required")
+        else:
+            extra_properties = {}
+    skip_element = False
+    attributes_to_print = []
+    if bid is None:
+        # skip nodes without a bid (if requested)
+        if filter_with_bid_only:
+            skip_element = True
+        if filter_som_only:
+            skip_element = True
+        if filter_visible_only:
+            # element without bid have no visibility mark, they could be visible or non-visible
+            # TODO we consider them as visible. Is this what we want? Now that duplicate bids are handled, should we mark all non-html elements?
+            pass  # keep elements without visible property
+            # skip_element = True  # filter elements without visible property
+    # parse extra browsergym properties, if node has a bid
+    else:
+        if bid in extra_properties:
+            node_vis = extra_properties[bid]["visibility"]
+            node_bbox = extra_properties[bid]["bbox"]
+            node_is_clickable = extra_properties[bid]["clickable"]
+            node_in_som = extra_properties[bid]["set_of_marks"]
+            node_is_visible = node_vis >= 0.5
+            # skip non-visible nodes (if requested)
+            if filter_visible_only and not node_is_visible:
+                skip_element = True
+            if filter_som_only and not node_in_som:
+                skip_element = True
+            # print extra attributes if requested (with new names)
+            if with_som and node_in_som:
+                attributes_to_print.insert(0, f"som")
+            if with_visible and node_is_visible:
+                attributes_to_print.insert(0, f"visible")
+            if with_clickable and node_is_clickable:
+                attributes_to_print.insert(0, f"clickable")
+            if with_center_coords and node_bbox is not None:
+                x, y, width, height = node_bbox
+                center = (x + width / 2, y + height / 2)
+                attributes_to_print.insert(0, f'center="{_get_coord_str(center, coord_decimals)}"')
+            if with_bounding_box_coords and node_bbox is not None:
+                x, y, width, height = node_bbox
+                box = (x, y, x + width, y + height)
+                attributes_to_print.insert(0, f'box="{_get_coord_str(box, coord_decimals)}"')
+    return skip_element, attributes_to_print
+def flatten_axtree_to_str(
+    AX_tree,
+    extra_properties: dict = None,
+    with_visible: bool = False,
+    with_clickable: bool = False,
+    with_center_coords: bool = False,
+    with_bounding_box_coords: bool = False,
+    with_som: bool = False,
+    skip_generic: bool = True,
+    filter_visible_only: bool = False,
+    filter_with_bid_only: bool = False,
+    filter_som_only: bool = False,
+    coord_decimals: int = 0,
+    ignored_roles=IGNORED_AXTREE_ROLES,
+    ignored_properties=IGNORED_AXTREE_PROPERTIES,
+    remove_redundant_static_text: bool = True,
+    hide_bid_if_invisible: bool = False,
+    hide_all_children: bool = False,
+    hide_all_bids: bool = False,
+) -> str:
+    """Formats the accessibility tree into a string text"""
+    node_id_to_idx = {}
+    for idx, node in enumerate(AX_tree["nodes"]):
+        node_id_to_idx[node["nodeId"]] = idx
+    def dfs(node_idx: int, depth: int, parent_node_filtered: bool, parent_node_name: str) -> str:
+        tree_str = ""
+        node = AX_tree["nodes"][node_idx]
+        indent = "\t" * depth
+        skip_node = False  # node will not be printed, with no effect on children nodes
+        filter_node = False  # node will not be printed, possibly along with its children nodes
+        node_role = node["role"]["value"]
+        node_name = ""
+        if node_role in ignored_roles:
+            skip_node = True
+            pass
+        elif "name" not in node:
+            skip_node = True
+            pass
+        else:
+            node_name = node["name"]["value"]
+            if "value" in node and "value" in node["value"]:
+                node_value = node["value"]["value"]
+            else:
+                node_value = None
+            # extract bid
+            bid = node.get("browsergym_id", None)
+            # extract node attributes
+            attributes = []
+            for property in node.get("properties", []):
+                if not "value" in property:
+                    continue
+                if not "value" in property["value"]:
+                    continue
+                prop_name = property["name"]
+                prop_value = property["value"]["value"]
+                if prop_name in ignored_properties:
+                    continue
+                elif prop_name in ("required", "focused", "atomic"):
+                    if prop_value:
+                        attributes.append(prop_name)
+                else:
+                    attributes.append(f"{prop_name}={repr(prop_value)}")
+            if skip_generic and node_role == "generic" and not attributes:
+                skip_node = True
+            if hide_all_children and parent_node_filtered:
+                skip_node = True
+            if node_role == "StaticText":
+                if parent_node_filtered:
+                    skip_node = True
+                elif remove_redundant_static_text and node_name in parent_node_name:
+                    skip_node = True
+            else:
+                filter_node, extra_attributes_to_print = _process_bid(
+                    bid,
+                    extra_properties=extra_properties,
+                    with_visible=with_visible,
+                    with_clickable=with_clickable,
+                    with_center_coords=with_center_coords,
+                    with_bounding_box_coords=with_bounding_box_coords,
+                    with_som=with_som,
+                    filter_visible_only=filter_visible_only,
+                    filter_with_bid_only=filter_with_bid_only,
+                    filter_som_only=filter_som_only,
+                    coord_decimals=coord_decimals,
+                )
+                # if either is True, skip the node
+                skip_node = skip_node or filter_node
+                # insert extra attributes before regular attributes
+                attributes = extra_attributes_to_print + attributes
+            # actually print the node string
+            if not skip_node:
+                if node_role == "generic" and not node_name:
+                    node_str = f"{node_role}"
+                else:
+                    node_str = f"{node_role} {repr(node_name.strip())}"
+                if not (
+                    hide_all_bids
+                    or bid is None
+                    or (
+                        hide_bid_if_invisible
+                        and extra_properties.get(bid, {}).get("visibility", 0) < 0.5
+                    )
+                ):
+                    node_str = f"[{bid}] " + node_str
+                if node_value is not None:
+                    node_str += f' value={repr(node["value"]["value"])}'
+                if attributes:
+                    node_str += ", ".join([""] + attributes)
+                tree_str += f"{indent}{node_str}"
+        for child_node_id in node["childIds"]:
+            if child_node_id not in node_id_to_idx or child_node_id == node["nodeId"]:
+                continue
+            # mark this to save some tokens
+            child_depth = depth if skip_node else (depth + 1)
+            child_str = dfs(
+                node_id_to_idx[child_node_id],
+                child_depth,
+                parent_node_filtered=filter_node,
+                parent_node_name=node_name,
+            )
+            if child_str:
+                if tree_str:
+                    tree_str += "\n"
+                tree_str += child_str
+        return tree_str
+    tree_str = dfs(0, 0, False, "")
+    return tree_str
+def overlay_som(
+    screenshot: np.typing.ArrayLike,
+    extra_properties: dict,
+    fontsize: int = 12,
+    linewidth: int = 2,
+    tag_margin: int = 2,
+):
+    img = PIL.Image.fromarray(screenshot).copy()  # make a copy
+    img = img.convert(mode="RGBA")
+    draw = PIL.ImageDraw.Draw(img)
+    font = PIL.ImageFont.load_default(size=fontsize)
+    # Adapted from https://stackoverflow.com/questions/51908563/dotted-or-dashed-line-with-python-pillow/58885306#58885306
+    def linedashed(
+        draw: PIL.ImageDraw.Draw, x0, y0, x1, y1, fill, width, dash_length=4, nodash_length=8
+    ):
+        line_dx = x1 - x0  # delta x (can be negative)
+        line_dy = y1 - y0  # delta y (can be negative)
+        line_length = math.hypot(line_dx, line_dy)  # line length (positive)
+        if line_length == 0:
+            return  # Avoid division by zero in case the line length is 0
+        pixel_dx = line_dx / line_length  # x add for 1px line length
+        pixel_dy = line_dy / line_length  # y add for 1px line length
+        dash_start = 0
+        while dash_start < line_length:
+            dash_end = dash_start + dash_length
+            if dash_end > line_length:
+                dash_end = line_length
+            draw.line(
+                (
+                    round(x0 + pixel_dx * dash_start),
+                    round(y0 + pixel_dy * dash_start),
+                    round(x0 + pixel_dx * dash_end),
+                    round(y0 + pixel_dy * dash_end),
+                ),
+                fill=fill,
+                width=width,
+            )
+            dash_start += dash_length + nodash_length
+    for bid, properties in extra_properties.items():
+        if properties["set_of_marks"] and properties["bbox"]:
+            x, y, width, height = properties["bbox"]
+            x0, y0 = x, y
+            x1, y1 = x + width, y + height
+            # skip small boxes
+            area = (x1 - x0) * (y1 - y0)
+            if area < 20:
+                logger.warning(
+                    f'som overlay: skipping bid "{bid}" due to bbox too small (area={area})'
+                )
+                continue
+            # draw bounding box with dashed lines
+            linedashed(draw, x0, y0, x1, y0, fill=(0, 0, 0, 255), width=linewidth)
+            linedashed(draw, x1, y0, x1, y1, fill=(0, 0, 0, 255), width=linewidth)
+            linedashed(draw, x1, y1, x0, y1, fill=(0, 0, 0, 255), width=linewidth)
+            linedashed(draw, x0, y1, x0, y0, fill=(0, 0, 0, 255), width=linewidth)
+            # get text box size (left, top, right, bottom)
+            tag_box = font.getbbox(
+                bid,
+            )
+            # set tag size, including margins
+            tag_size = (
+                (tag_box[2] - tag_box[0] + 2 * (tag_margin + 1)),
+                (tag_box[3] - tag_box[1] + 2 * (tag_margin + 1)),
+            )
+            # create tag image with correct size and black background
+            tag_img = PIL.Image.new("RGBA", tag_size, "black")
+            tag_draw = PIL.ImageDraw.Draw(tag_img)
+            # write text with 1px horizontal margin
+            tag_draw.text(
+                (-tag_box[0] + tag_margin + 1, -tag_box[1] + tag_margin + 1),
+                bid,
+                font=font,
+                fill=(255, 255, 255, 255),
+                spacing=0,
+            )
+            tag_draw.rectangle(
+                (0, 0, tag_size[0] - 1, tag_size[1] - 1),
+                fill=None,
+                outline=(255, 255, 255, 255),
+                width=1,
+            )
+            # draw tag in the source image, upper left of the bounding box
+            tag_pos = (x + 0, y - tag_size[1] / 2 + 4)
+            tag_pos = list(map(round, tag_pos))
+            img.paste(tag_img, tag_pos)
+    # convert to RGB (3 channels)
+    img = img.convert(mode="RGB")
+    # convert to a numpy array
+    img = np.array(img)
+    return img
+def prune_html(html):
+    html = re.sub(r"\n", " ", html)
+    # remove html comments
+    html = re.sub(r"<!--(.*?)-->", "", html, flags=re.MULTILINE)
+    soup = BeautifulSoup(html, "lxml")
+    for tag in reversed(soup.find_all()):
+        # remove body and html tags (not their content)
+        if tag.name in ("html", "body"):
+            tag.unwrap()
+        # remove useless tags
+        elif tag.name in ("style", "link", "script", "br"):
+            tag.decompose()
+        # remove / unwrap structural tags
+        elif tag.name in ("div", "span", "i", "p") and len(tag.attrs) == 1 and tag.has_attr("bid"):
+            if not tag.contents:
+                tag.decompose()
+            else:
+                tag.unwrap()
+    html = soup.prettify()
+    return html

BrowserGym/browsergym/experiments/README.md ADDED Viewed

	@@ -0,0 +1,12 @@

+# BrowserGym experiments
+This package provides `browsergym.experiments`, a suite of experimentation tools for [BrowserGym](https://github.com/ServiceNow/BrowserGym).
+As a convenience namespace, it also provides `bgym`.
+## Setup
+1. Install the package
+```sh
+pip install browsergym-experiments
+```

BrowserGym/browsergym/experiments/pyproject.toml ADDED Viewed

	@@ -0,0 +1,65 @@

+[build-system]
+requires = ["hatchling", "hatch-requirements-txt"]
+build-backend = "hatchling.build"
+[project]
+name = "browsergym-experiments"
+description = "Experimentation tools for BrowserGym"
+authors = [
+    {name = "Massimo Caccia"},
+    {name = "Alex Lacoste"},
+    {name = "Thibault Le Sellier De Chezelles"},
+    {name = "Maxime Gasse"},
+]
+readme = "README.md"
+requires-python = ">3.7"
+license = {text = "Apache-2.0"}
+classifiers = [
+    "Development Status :: 3 - Alpha",
+    "Programming Language :: Python :: 3",
+    "Operating System :: OS Independent",
+    "Intended Audience :: Science/Research",
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
+    "License :: OSI Approved :: Apache Software License",
+]
+dynamic = ["dependencies", "version"]
+[project.optional-dependencies]
+miniwob = [
+  "browsergym-miniwob",
+]
+workarena = [
+  "browsergym-workarena",
+]
+webarena = [
+  "browsergym-webarena",
+]
+visualwebarena = [
+  "browsergym-visualwebarena",
+]
+assistantbench = [
+  "browsergym-assistantbench",
+]
+weblinx = [
+  "weblinx_browsergym",
+]
+# self-referential extras must use this project's own name ("browsergym-experiments")
+all = [
+  "browsergym-experiments[miniwob]",
+  "browsergym-experiments[workarena]",
+  "browsergym-experiments[webarena]",
+  "browsergym-experiments[visualwebarena]",
+  "browsergym-experiments[assistantbench]",
+  "browsergym-experiments[weblinx]",
+]
+[project.urls]
+homepage = "https://github.com/ServiceNow/BrowserGym"
+[tool.hatch.version]
+path = "../core/src/browsergym/core/__init__.py"
+[tool.hatch.metadata.hooks.requirements_txt]
+files = ["requirements.txt"]
+[tool.hatch.build.targets.wheel]
+packages = ["src/browsergym", "src/bgym"]

BrowserGym/browsergym/experiments/requirements.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+browsergym-core==0.13.4
+tiktoken>=0.4
+dataclasses-json

BrowserGym/browsergym/experiments/src/bgym/__init__.py ADDED Viewed

	@@ -0,0 +1,17 @@

+from browsergym.core.action.base import AbstractActionSet
+from browsergym.core.action.highlevel import HighLevelActionSet
+from browsergym.core.action.python import PythonActionSet
+from browsergym.experiments.agent import Agent, AgentInfo
+from browsergym.experiments.benchmark import (
+    DEFAULT_BENCHMARKS,
+    Benchmark,
+    HighLevelActionSetArgs,
+)
+from browsergym.experiments.loop import (
+    AbstractAgentArgs,
+    EnvArgs,
+    ExpArgs,
+    ExpResult,
+    StepInfo,
+    StepTimestamps,
+)

BrowserGym/browsergym/experiments/src/browsergym/experiments/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ from .agent import Agent, AgentInfo
2	+ from .loop import AbstractAgentArgs, EnvArgs, ExpArgs, get_exp_result

BrowserGym/browsergym/experiments/src/browsergym/experiments/agent.py ADDED Viewed

	@@ -0,0 +1,112 @@

+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from typing import Any
+from browsergym.core.action.base import AbstractActionSet
+from browsergym.core.action.highlevel import HighLevelActionSet
+from browsergym.utils.obs import flatten_axtree_to_str, flatten_dom_to_str, prune_html
+def default_obs_preprocessor(obs: dict) -> dict:
+    obs = obs.copy()  # shallow copy to avoid modifying the original dict
+    # augment the observation with text versions of the DOM and AXTree
+    obs["dom_txt"] = flatten_dom_to_str(obs["dom_object"])
+    obs["axtree_txt"] = flatten_axtree_to_str(obs["axtree_object"])
+    obs["pruned_html"] = prune_html(obs["dom_txt"])
+    # remove raw entries that the agent won't use, and we don't want to record
+    del obs["dom_object"]
+    del obs["axtree_object"]
+    return obs
+# Action set used as the class-level default of `Agent.action_set`.
+DEFAULT_ACTION_SET: AbstractActionSet = HighLevelActionSet()
+# Observation preprocessor that the base `Agent.obs_preprocessor` delegates to.
+DEFAULT_OBS_PREPROCESSOR: callable = default_obs_preprocessor
+@dataclass
+class AgentInfo:
+    think: str = None
+    chat_messages: list = None
+    stats: dict = field(default_factory=dict)
+    markdown_page: str = ""
+    html_page: str = ""
+    extra_info: dict = None
+    def __getitem__(self, key):
+        return getattr(self, key)
+    def __contains__(self, key):
+        return hasattr(self, key)
+    def pop(self, key, default=None):
+        return getattr(self, key, default)
+    def get(self, key, default=None):
+        return getattr(self, key, default)
+class Agent(ABC):
+    """
+    A template class that defines the required signature of an agent interacting
+    with a browsergym environment
+    Attributes:
+        action_set: AbstractActionSet
+            Defines the set of actions that the agent can take in the environment.
+            This property is meant to be overloaded by your agent (optional).
+            By default, uses BrowserGym's high-level action set.
+    """
+    action_set: AbstractActionSet = DEFAULT_ACTION_SET
+    def obs_preprocessor(self, obs: dict) -> Any:
+        """
+        Function that pre-processes observations before feeding them to `get_action()`.
+        This property is meant to be overloaded by your agent (optional).
+        By default, the base observation is augmented with text versions of the DOM and AXTREE.
+        Why this mapping? This mapping will happen within the experiment loop, so that the
+        resulting observation gets recorded in the execution traces, and statistics can be computed from it.
+        """
+        return DEFAULT_OBS_PREPROCESSOR(obs)
+    @abstractmethod
+    def get_action(self, obs: Any) -> tuple[str, AgentInfo]:
+        """
+        Updates the agent with the current observation, and returns its next action (plus an info dict, optional).
+        Parameters:
+        -----------
+        obs:
+            The current observation of the environment, after it has been processed by `obs_preprocessor()`.
+            By default, a BrowserGym observation is a dict with the following entries:
+            - "chat_messages": list[str], messages between the agent and the user.
+            - "goal": str, the current goal.
+            - "open_pages_urls": list[str], open pages.
+            - "active_page_index": int, the index of the active page.
+            - "url": str, the current URL.
+            - "screenshot": 3D np.array, the current screenshot.
+            - "dom_object": dict, the current DOM object. See DOMSnapshot from chrome devtools.
+            - "axtree_object": dict, the current AXTREE object. See Accessibility Tree from chrome devtools.
+            - "extra_element_properties": dict[bid, dict[name, value]] extra
+            properties of elements in the DOM.
+            - "focused_element_bid": str, the bid of the focused element.
+            - "last_action": str, the last action executed.
+            - "last_action_error": str, the error of the last action.
+            - "elapsed_time": float, the time elapsed since the start of the episode.
+        Returns:
+        --------
+        action: str
+            The action to be processed by `action_mapping()` (if any), and executed in the environment.
+        info: AgentInfo
+            Additional information about the action, with the following entries
+            being handled by BrowserGym:
+                - "think": optional chain of thought
+                - "chat_messages": list of messages with the LLM
+                - "stats": dict of extra statistics that will be saved and
+                  aggregated.
+                - "markdown_page": str, string that will be displayed by agentlab's xray tool.
+                - "extra_info": dict, additional information that will be saved
+                  and aggregated.
+            `AgentInfo` also declares an "html_page": str field; how it is consumed
+            is not visible from this module.
+        """

BrowserGym/browsergym/experiments/src/browsergym/experiments/benchmark/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ from .base import Benchmark, HighLevelActionSetArgs
2	+ from .configs import DEFAULT_BENCHMARKS