diff --git a/app.py b/app.py index 6535dde0d99e6c6cb22bc7c2c47361d2e138b56e..017017def98deddc2ba5309a97180f6859a9e9a1 100644 --- a/app.py +++ b/app.py @@ -9,7 +9,6 @@ import plotly.graph_objs as go from huggingface_hub import HfApi from huggingface_hub.utils import RepositoryNotFoundError, RevisionNotFoundError import streamlit.components.v1 as components -from datetime import datetime from urllib.parse import quote from pathlib import Path @@ -17,7 +16,7 @@ import re import html from typing import Dict, Any -BENCHMARKS = ["WebArena", "WorkArena-L1", "WorkArena-L2", "WorkArena-L3", "MiniWoB", "WebLINX", "VisualWebArena", "AssistantBench"] +BENCHMARKS = ["WebArena", "WorkArena-L1", "WorkArena-L2", "WorkArena-L3", "MiniWoB",] def sanitize_agent_name(agent_name): # Only allow alphanumeric chars, hyphen, underscore @@ -44,34 +43,12 @@ def sanitize_column_name(col: str) -> str: return html.escape(str(col)) def sanitize_cell_value(value: Any) -> str: + """Sanitize cell values for HTML display""" if isinstance(value, (int, float)): return str(value) - if isinstance(value, str) and '±' in value: - score, std_err = value.split('±') - return f'{score.strip()} ±{std_err.strip()}' return html.escape(str(value)) def create_html_table_main(df): - col1, col2 = st.columns([2,6]) - with col1: - sort_column = st.selectbox("Sort by", df.columns.tolist(), index=df.columns.tolist().index("WebArena"), key="main_sort_column") - with col2: - sort_order = st.radio("Order", ["Ascending", "Descending"], index=1, horizontal=True, key="main_sort_order") - - def get_sort_value(row): - if row == "-": - return float('-inf') - else: - try: - return float(row) - except ValueError: - return row - - # Sort dataframe - if sort_order == "Ascending": - df = df.sort_values(by=sort_column, key=lambda x: x.apply(get_sort_value)) - else: - df = df.sort_values(by=sort_column, ascending=False, key=lambda x: x.apply(get_sort_value)) html = ''' - """, unsafe_allow_html=True) - st.markdown("""
{benchmark}", unsafe_allow_html=True) + # st.dataframe( + # df_, + # use_container_width=True, + # column_config={benchmark: {'alignment': 'center'}}, + # hide_index=True, + # ) + html_table = create_html_table_benchmark(df_) st.markdown(html_table, unsafe_allow_html=True) diff --git a/results/Bgym-GPT-3.5/README.md b/results/Bgym-GPT-3.5/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f15589f8889e87445e98f51746ac426bcb81b9c0 --- /dev/null +++ b/results/Bgym-GPT-3.5/README.md @@ -0,0 +1 @@ +## GPT-3.5 model \ No newline at end of file diff --git a/results/Bgym-GPT-3.5/config.json b/results/Bgym-GPT-3.5/config.json new file mode 100644 index 0000000000000000000000000000000000000000..3ea3825d4f9cc8568d9cdfb93eac5fcb48a07f86 --- /dev/null +++ b/results/Bgym-GPT-3.5/config.json @@ -0,0 +1,4 @@ +{ + "agent_name": "GPT-3.5", + "backend_llm": "GPT-3.5" +} \ No newline at end of file diff --git a/results/GenericAgent-GPT-4o/miniwob.json b/results/Bgym-GPT-3.5/miniwob.json similarity index 69% rename from results/GenericAgent-GPT-4o/miniwob.json rename to results/Bgym-GPT-3.5/miniwob.json index b3aa2029432187b2737e378d5a7127aae52230b1..b4d117fa36474116ffc4e8392cd375ccfeb52e6d 100644 --- a/results/GenericAgent-GPT-4o/miniwob.json +++ b/results/Bgym-GPT-3.5/miniwob.json @@ -1,11 +1,11 @@ [ { - "agent_name": "GenericAgent-GPT-4o", - "study_id": "2024-10-25_06-08-16", + "agent_name": "Bgym-GPT-3.5", + "study_id": "study_id", "date_time": "2021-01-01 12:00:00", "benchmark": "MiniWoB", - "score": 63.8, - "std_err": 1.9, + "score": 43.4, + "std_err": 0.1, "benchmark_specific": "No", "benchmark_tuned": "No", "followed_evaluation_protocol": "Yes", diff --git a/results/Bgym-GPT-3.5/webarena.json b/results/Bgym-GPT-3.5/webarena.json new file mode 100644 index 0000000000000000000000000000000000000000..7b352122f2d3056156d07f11f086e85753b090e4 --- /dev/null +++ b/results/Bgym-GPT-3.5/webarena.json @@ -0,0 +1,16 @@ +[ + { + "agent_name": "Bgym-GPT-3.5", + "study_id": "study_id", + "date_time": "2021-01-01 12:00:00", + "benchmark": "WebArena", + "score": 6.7, + "std_err": 0.2, + "benchmark_specific": "No", + "benchmark_tuned": "No", + "followed_evaluation_protocol": "Yes", + "reproducible": "Yes", + "comments": "NA", + "original_or_reproduced": "Original" + } +] \ No newline at end of file diff --git a/results/Bgym-GPT-3.5/workarena-l1.json b/results/Bgym-GPT-3.5/workarena-l1.json new file mode 100644 index 0000000000000000000000000000000000000000..81d2975b3eef0a9f509633ca1bd68d19ac7a689a --- /dev/null +++ b/results/Bgym-GPT-3.5/workarena-l1.json @@ -0,0 +1,44 @@ +[ + { + "agent_name": "Bgym-GPT-3.5", + "study_id": "study_id", + "date_time": "2021-01-01 12:00:00", + "benchmark": "WorkArena-L1", + "score": 6.1, + "std_err": 0.3, + "benchmark_specific": "No", + "benchmark_tuned": "No", + "followed_evaluation_protocol": "Yes", + "reproducible": "Yes", + "comments": "NA", + "original_or_reproduced": "Original" + }, + { + "agent_name": "Bgym-GPT-3.5", + "study_id": "study_id", + "benchmark": "WorkArena-L1", + "score": 5.7, + "std_err": 0.3, + "benchmark_specific": "No", + "benchmark_tuned": "No", + "followed_evaluation_protocol": "Yes", + "reproducible": "Yes", + "comments": "NA", + "original_or_reproduced": "Reproduced", + "date_time": "2021-01-04 12:06:00" + }, + { + "benchmark": "WorkArena-L1", + "agent_name": "Bgym-GPT-3.5", + "study_id": "study_id", + "score": 5.1, + "std_err": 0.3, + "benchmark_specific": "No", + "benchmark_tuned": "No", + "followed_evaluation_protocol": "Yes", + "reproducible": "Yes", + "comments": "NA", + "original_or_reproduced": "Reproduced", + "date_time": "2021-01-04 12:06:00" + } +] \ No newline at end of file diff --git a/results/GenericAgent-GPT-4o/workarena-l2.json b/results/Bgym-GPT-3.5/workarena-l2.json similarity index 70% rename from results/GenericAgent-GPT-4o/workarena-l2.json rename to results/Bgym-GPT-3.5/workarena-l2.json index 9c8f1c07bef2ac72f54b9243f71427ea5bd65a04..ad6ab82a380e20dd6ffad919d2d9860703bce2ee 100644 --- a/results/GenericAgent-GPT-4o/workarena-l2.json +++ b/results/Bgym-GPT-3.5/workarena-l2.json @@ -1,11 +1,11 @@ [ { - "agent_name": "GenericAgent-GPT-4o", - "study_id": "2024-10-23_17-10-46", + "agent_name": "Bgym-GPT-3.5", + "study_id": "study_id", "date_time": "2021-01-01 12:00:00", "benchmark": "WorkArena-L2", - "score": 8.5, - "std_err": 1.8, + "score": 0.0, + "std_err": 0.0, "benchmark_specific": "No", "benchmark_tuned": "No", "followed_evaluation_protocol": "Yes", diff --git a/results/GenericAgent-GPT-4o/workarena-l3.json b/results/Bgym-GPT-3.5/workarena-l3.json similarity index 73% rename from results/GenericAgent-GPT-4o/workarena-l3.json rename to results/Bgym-GPT-3.5/workarena-l3.json index 01650347b957bcca9ab6fe3d587f2f4f56ad4858..40093a485842f340d16d25af5768e8d066377a05 100644 --- a/results/GenericAgent-GPT-4o/workarena-l3.json +++ b/results/Bgym-GPT-3.5/workarena-l3.json @@ -1,8 +1,8 @@ [ { - "agent_name": "GenericAgent-GPT-4o", - "study_id": "-", - "date_time": "2024-10-24 23:03:30", + "agent_name": "Bgym-GPT-3.5", + "study_id": "study_id", + "date_time": "2021-01-01 12:00:00", "benchmark": "WorkArena-L3", "score": 0.0, "std_err": 0.0, diff --git a/results/Bgym-GPT-4o-V/README.md b/results/Bgym-GPT-4o-V/README.md new file mode 100644 index 0000000000000000000000000000000000000000..065c2f2bbfe5c0845debe1baa0a086f2dd2c019a --- /dev/null +++ b/results/Bgym-GPT-4o-V/README.md @@ -0,0 +1 @@ +## GPT-4o-V model \ No newline at end of file diff --git a/results/Bgym-GPT-4o-V/config.json b/results/Bgym-GPT-4o-V/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ebdbb1db77cb11d5a01b6f7568f6c8196e881818 --- /dev/null +++ b/results/Bgym-GPT-4o-V/config.json @@ -0,0 +1,4 @@ +{ + "agent_name": "GPT-4o-V", + "backend_llm": "GPT-4o-V" +} \ No newline at end of file diff --git a/results/GenericAgent-GPT-4o-mini/miniwob.json b/results/Bgym-GPT-4o-V/miniwob.json similarity index 68% rename from results/GenericAgent-GPT-4o-mini/miniwob.json rename to results/Bgym-GPT-4o-V/miniwob.json index 8fd1b099c7968b9f41faad3dfd46fbe340ab535a..1090c29ca8017fddd7eb43d7de424c4ef5115f7c 100644 --- a/results/GenericAgent-GPT-4o-mini/miniwob.json +++ b/results/Bgym-GPT-4o-V/miniwob.json @@ -1,11 +1,11 @@ [ { - "agent_name": "GenericAgent-GPT-4o-mini", - "study_id": "2024-10-25_06-08-16", + "agent_name": "Bgym-GPT-4o-V", + "study_id": "study_id", "date_time": "2021-01-01 12:00:00", "benchmark": "MiniWoB", - "score": 56.6, - "std_err": 2.0, + "score": 72.5, + "std_err": 0.5, "benchmark_specific": "No", "benchmark_tuned": "No", "followed_evaluation_protocol": "Yes", diff --git a/results/Bgym-GPT-4o-V/webarena.json b/results/Bgym-GPT-4o-V/webarena.json new file mode 100644 index 0000000000000000000000000000000000000000..4908982e7d053542eeaaf8f2410aa794dc05d52b --- /dev/null +++ b/results/Bgym-GPT-4o-V/webarena.json @@ -0,0 +1,16 @@ +[ + { + "agent_name": "Bgym-GPT-4o-V", + "study_id": "study_id", + "date_time": "2021-01-01 12:00:00", + "benchmark": "WebArena", + "score": 24.0, + "std_err": 0.4, + "benchmark_specific": "No", + "benchmark_tuned": "No", + "followed_evaluation_protocol": "Yes", + "reproducible": "Yes", + "comments": "NA", + "original_or_reproduced": "Original" + } +] \ No newline at end of file diff --git a/results/GenericAgent-GPT-4o/workarena-l1.json b/results/Bgym-GPT-4o-V/workarena-l1.json similarity index 69% rename from results/GenericAgent-GPT-4o/workarena-l1.json rename to results/Bgym-GPT-4o-V/workarena-l1.json index 17de417aa4cacdc9f4bfde75c3d22b26820324b3..ed6776d3ca134d76b4e69886ba372792b5ea212a 100644 --- a/results/GenericAgent-GPT-4o/workarena-l1.json +++ b/results/Bgym-GPT-4o-V/workarena-l1.json @@ -1,11 +1,11 @@ [ { - "agent_name": "GenericAgent-GPT-4o", - "study_id": "2024-10-23_14-17-40", + "agent_name": "Bgym-GPT-4o-V", + "study_id": "study_id", "date_time": "2021-01-01 12:00:00", "benchmark": "WorkArena-L1", - "score": 45.5, - "std_err": 2.7, + "score": 41.8, + "std_err": 0.4, "benchmark_specific": "No", "benchmark_tuned": "No", "followed_evaluation_protocol": "Yes", diff --git a/results/GenericAgent-GPT-4o-mini/workarena-l2.json b/results/Bgym-GPT-4o-V/workarena-l2.json similarity index 69% rename from results/GenericAgent-GPT-4o-mini/workarena-l2.json rename to results/Bgym-GPT-4o-V/workarena-l2.json index ba423ad1010b93ecf159e2db44e428376b66a90c..25e2c312fd03d6b61211943add71430d7bbf1003 100644 --- a/results/GenericAgent-GPT-4o-mini/workarena-l2.json +++ b/results/Bgym-GPT-4o-V/workarena-l2.json @@ -1,11 +1,11 @@ [ { - "agent_name": "GenericAgent-GPT-4o-mini", - "study_id": "2024-10-23_17-10-46", + "agent_name": "Bgym-GPT-4o-V", + "study_id": "study_id", "date_time": "2021-01-01 12:00:00", "benchmark": "WorkArena-L2", - "score": 1.3, - "std_err": 0.7, + "score": 3.8, + "std_err": 0.6, "benchmark_specific": "No", "benchmark_tuned": "No", "followed_evaluation_protocol": "Yes", diff --git a/results/GenericAgent-GPT-4o-mini/workarena-l3.json b/results/Bgym-GPT-4o-V/workarena-l3.json similarity index 72% rename from results/GenericAgent-GPT-4o-mini/workarena-l3.json rename to results/Bgym-GPT-4o-V/workarena-l3.json index 43aab700e44b6fc39160b306b7bbbcf56c507b05..e9b990349435d7c131cee59d0dc559d4dafbd377 100644 --- a/results/GenericAgent-GPT-4o-mini/workarena-l3.json +++ b/results/Bgym-GPT-4o-V/workarena-l3.json @@ -1,8 +1,8 @@ [ { - "agent_name": "GenericAgent-GPT-4o-mini", - "study_id": "-", - "date_time": "2024-10-24 23:03:30", + "agent_name": "Bgym-GPT-4o-V", + "study_id": "study_id", + "date_time": "2021-01-01 12:00:00", "benchmark": "WorkArena-L3", "score": 0.0, "std_err": 0.0, diff --git a/results/Bgym-GPT-4o/README.md b/results/Bgym-GPT-4o/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f521ba2aaa82109c0f67a1c19ff0811ffb0e1085 --- /dev/null +++ b/results/Bgym-GPT-4o/README.md @@ -0,0 +1 @@ +## GPT-4o model \ No newline at end of file diff --git a/results/Bgym-GPT-4o/config.json b/results/Bgym-GPT-4o/config.json new file mode 100644 index 0000000000000000000000000000000000000000..1a08adf507dc482df79c852e11b393c756a6fd71 --- /dev/null +++ b/results/Bgym-GPT-4o/config.json @@ -0,0 +1,4 @@ +{ + "agent_name": "GPT-4o", + "backend_llm": "GPT-4o" +} \ No newline at end of file diff --git a/results/Bgym-GPT-4o/miniwob.json b/results/Bgym-GPT-4o/miniwob.json new file mode 100644 index 0000000000000000000000000000000000000000..400f137bef1d0fdceb662fafbeebe73a743934b5 --- /dev/null +++ b/results/Bgym-GPT-4o/miniwob.json @@ -0,0 +1,16 @@ +[ + { + "agent_name": "Bgym-GPT-4o", + "study_id": "study_id", + "date_time": "2021-01-01 12:00:00", + "benchmark": "MiniWoB", + "score": 71.3, + "std_err": 0.5, + "benchmark_specific": "No", + "benchmark_tuned": "No", + "followed_evaluation_protocol": "Yes", + "reproducible": "Yes", + "comments": "NA", + "original_or_reproduced": "Original" + } +] \ No newline at end of file diff --git a/results/Bgym-GPT-4o/webarena.json b/results/Bgym-GPT-4o/webarena.json new file mode 100644 index 0000000000000000000000000000000000000000..388494e96508a4273f5974e83c45fad8f84cf172 --- /dev/null +++ b/results/Bgym-GPT-4o/webarena.json @@ -0,0 +1,16 @@ +[ + { + "agent_name": "Bgym-GPT-4o", + "study_id": "study_id", + "date_time": "2021-01-01 12:00:00", + "benchmark": "WebArena", + "score": 23.5, + "std_err": 0.4, + "benchmark_specific": "No", + "benchmark_tuned": "No", + "followed_evaluation_protocol": "Yes", + "reproducible": "Yes", + "comments": "NA", + "original_or_reproduced": "Original" + } +] \ No newline at end of file diff --git a/results/GenericAgent-GPT-4o-mini/workarena-l1.json b/results/Bgym-GPT-4o/workarena-l1.json similarity index 69% rename from results/GenericAgent-GPT-4o-mini/workarena-l1.json rename to results/Bgym-GPT-4o/workarena-l1.json index ca27a541ef9ffe198860ad51a9db327b9928adbd..20cc70c2a512e602e3c3fa4781ca0f9d635002ab 100644 --- a/results/GenericAgent-GPT-4o-mini/workarena-l1.json +++ b/results/Bgym-GPT-4o/workarena-l1.json @@ -1,11 +1,11 @@ [ { - "agent_name": "GenericAgent-GPT-4o-mini", - "study_id": "2024-10-23_14-17-40", + "agent_name": "Bgym-GPT-4o", + "study_id": "study_id", "date_time": "2021-01-01 12:00:00", "benchmark": "WorkArena-L1", - "score": 27, - "std_err": 2.4, + "score": 42.7, + "std_err": 0.4, "benchmark_specific": "No", "benchmark_tuned": "No", "followed_evaluation_protocol": "Yes", diff --git a/results/GenericAgent-Claude-3.5-Sonnet/workarena-l2.json b/results/Bgym-GPT-4o/workarena-l2.json similarity index 68% rename from results/GenericAgent-Claude-3.5-Sonnet/workarena-l2.json rename to results/Bgym-GPT-4o/workarena-l2.json index a0927edab449cd654a9ab66b6fb4524d3fd6bc6d..26495ed99ecb04d7d68dbcec2228cdb0f72596a3 100644 --- a/results/GenericAgent-Claude-3.5-Sonnet/workarena-l2.json +++ b/results/Bgym-GPT-4o/workarena-l2.json @@ -1,11 +1,11 @@ [ { - "agent_name": "GenericAgent-Claude-3.5-Sonnet", - "study_id": "2024-10-23_17-10-46", + "agent_name": "Bgym-GPT-4o", + "study_id": "study_id", "date_time": "2021-01-01 12:00:00", "benchmark": "WorkArena-L2", - "score": 39.1, - "std_err": 3.2, + "score": 3.0, + "std_err": 0.6, "benchmark_specific": "No", "benchmark_tuned": "No", "followed_evaluation_protocol": "Yes", diff --git a/results/GenericAgent-GPT-o1-mini/workarena-l3.json b/results/Bgym-GPT-4o/workarena-l3.json similarity index 72% rename from results/GenericAgent-GPT-o1-mini/workarena-l3.json rename to results/Bgym-GPT-4o/workarena-l3.json index 4f824ed4a043f2b39890ce4c9eb2ea4fd98008e7..a5ec6780ea9ca807b4a8ef4d401ca14d33f7e3d6 100644 --- a/results/GenericAgent-GPT-o1-mini/workarena-l3.json +++ b/results/Bgym-GPT-4o/workarena-l3.json @@ -1,8 +1,8 @@ [ { - "agent_name": "GenericAgent-GPT-o1-mini", - "study_id": "-", - "date_time": "2024-10-24 23:03:30", + "agent_name": "Bgym-GPT-4o", + "study_id": "study_id", + "date_time": "2021-01-01 12:00:00", "benchmark": "WorkArena-L3", "score": 0.0, "std_err": 0.0, diff --git a/results/Bgym-Llama-3-70b/README.md b/results/Bgym-Llama-3-70b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..8798ff4c72825c5049ef24cf16b818faa9ae5d2b --- /dev/null +++ b/results/Bgym-Llama-3-70b/README.md @@ -0,0 +1 @@ +### Llama-3-70B \ No newline at end of file diff --git a/results/Bgym-Llama-3-70b/config.json b/results/Bgym-Llama-3-70b/config.json new file mode 100644 index 0000000000000000000000000000000000000000..6aa02bf47d3ad80e2616b77e62c40f0103d722c6 --- /dev/null +++ b/results/Bgym-Llama-3-70b/config.json @@ -0,0 +1,4 @@ +{ + "agent_name": "Llama-3-70B", + "backend_llm": "Llama-3-70B" +} \ No newline at end of file diff --git a/results/Bgym-Llama-3-70b/miniwob.json b/results/Bgym-Llama-3-70b/miniwob.json new file mode 100644 index 0000000000000000000000000000000000000000..5dadad99d077784d8a2c662c6262722c34e834f8 --- /dev/null +++ b/results/Bgym-Llama-3-70b/miniwob.json @@ -0,0 +1,16 @@ +[ + { + "agent_name": "Bgym-Llama-3-70b", + "study_id": "study_id", + "date_time": "2021-01-01 12:00:00", + "benchmark": "MiniWoB", + "score": 68.2, + "std_err": 0.7, + "benchmark_specific": "No", + "benchmark_tuned": "No", + "followed_evaluation_protocol": "Yes", + "reproducible": "Yes", + "comments": "NA", + "original_or_reproduced": "Original" + } +] \ No newline at end of file diff --git a/results/Bgym-Llama-3-70b/webarena.json b/results/Bgym-Llama-3-70b/webarena.json new file mode 100644 index 0000000000000000000000000000000000000000..6c229ed5d2d97ac3623759b8f7f7131a3830abea --- /dev/null +++ b/results/Bgym-Llama-3-70b/webarena.json @@ -0,0 +1,16 @@ +[ + { + "agent_name": "Bgym-Llama-3-70b", + "study_id": "study_id", + "date_time": "2021-01-01 12:00:00", + "benchmark": "WebArena", + "score": 11.0, + "std_err": 0.3, + "benchmark_specific": "No", + "benchmark_tuned": "No", + "followed_evaluation_protocol": "Yes", + "reproducible": "Yes", + "comments": "NA", + "original_or_reproduced": "Original" + } +] \ No newline at end of file diff --git a/results/Bgym-Llama-3-70b/workarena-l1.json b/results/Bgym-Llama-3-70b/workarena-l1.json new file mode 100644 index 0000000000000000000000000000000000000000..59c5280fdc8de77cdb38d2c73398a50addaa5220 --- /dev/null +++ b/results/Bgym-Llama-3-70b/workarena-l1.json @@ -0,0 +1,58 @@ +[ + { + "agent_name": "Bgym-Llama-3-70b", + "study_id": "study_id", + "benchmark": "WorkArena-L1", + "score": 17.9, + "std_err": 0.6, + "benchmark_specific": "No", + "benchmark_tuned": "No", + "followed_evaluation_protocol": "Yes", + "reproducible": "Yes", + "comments": "NA", + "original_or_reproduced": "Original", + "date_time": "2021-01-01 12:00:00" + }, + { + "agent_name": "Bgym-Llama-3-70b", + "study_id": "study_id", + "benchmark": "WorkArena-L1", + "score": 15.9, + "std_err": 0.6, + "benchmark_specific": "No", + "benchmark_tuned": "No", + "followed_evaluation_protocol": "Yes", + "reproducible": "Yes", + "comments": "NA", + "original_or_reproduced": "Reproduced", + "date_time": "2021-01-04 12:06:00" + }, + { + "agent_name": "Bgym-Llama-3-70b", + "study_id": "study_id", + "benchmark": "WorkArena-L1", + "score": 19.9, + "std_err": 0.6, + "benchmark_specific": "No", + "benchmark_tuned": "No", + "followed_evaluation_protocol": "Yes", + "reproducible": "Yes", + "comments": "NA", + "original_or_reproduced": "Reproduced", + "date_time": "2021-01-05 2:07:00" + }, + { + "agent_name": "Bgym-Llama-3-70b", + "study_id": "study_id", + "benchmark": "WorkArena-L1", + "score": 17.9, + "std_err": 0.6, + "benchmark_specific": "No", + "benchmark_tuned": "No", + "followed_evaluation_protocol": "Yes", + "reproducible": "Yes", + "comments": "NA", + "original_or_reproduced": "Reproduced", + "date_time": "2021-01-12 12:00:00" + } +] \ No newline at end of file diff --git a/results/Bgym-Llama-3-70b/workarena-l2.json b/results/Bgym-Llama-3-70b/workarena-l2.json new file mode 100644 index 0000000000000000000000000000000000000000..0f0f8451f47f7022fe3b85923f9356c2f1617c15 --- /dev/null +++ b/results/Bgym-Llama-3-70b/workarena-l2.json @@ -0,0 +1,16 @@ +[ + { + "agent_name": "Bgym-Llama-3-70b", + "study_id": "study_id", + "date_time": "2021-01-01 12:00:00", + "benchmark": "WorkArena-L2", + "score": 0.0, + "std_err": 0.0, + "benchmark_specific": "No", + "benchmark_tuned": "No", + "followed_evaluation_protocol": "Yes", + "reproducible": "Yes", + "comments": "NA", + "original_or_reproduced": "Original" + } +] \ No newline at end of file diff --git a/results/GenericAgent-Claude-3.5-Sonnet/workarena-l3.json b/results/Bgym-Llama-3-70b/workarena-l3.json similarity index 68% rename from results/GenericAgent-Claude-3.5-Sonnet/workarena-l3.json rename to results/Bgym-Llama-3-70b/workarena-l3.json index dc85da14be3c4f1722f11f2c30e28db8ceeaf670..acf0a81f58b3ab0e03178ad4f7ae27cb9fec4e97 100644 --- a/results/GenericAgent-Claude-3.5-Sonnet/workarena-l3.json +++ b/results/Bgym-Llama-3-70b/workarena-l3.json @@ -1,11 +1,11 @@ [ { - "agent_name": "GenericAgent-Claude-3.5-Sonnet", - "study_id": "2024-10-24_18-06-57", + "agent_name": "Bgym-Llama-3-70b", + "study_id": "study_id", "date_time": "2021-01-01 12:00:00", "benchmark": "WorkArena-L3", - "score": 0.4, - "std_err": 0.4, + "score": 0.0, + "std_err": 0.0, "benchmark_specific": "No", "benchmark_tuned": "No", "followed_evaluation_protocol": "Yes", diff --git a/results/Bgym-Mixtral-8x22b/README.md b/results/Bgym-Mixtral-8x22b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..25b17de698790810b5a434228a08691aa048e4ca --- /dev/null +++ b/results/Bgym-Mixtral-8x22b/README.md @@ -0,0 +1 @@ +## Mixtral 8x22B \ No newline at end of file diff --git a/results/Bgym-Mixtral-8x22b/config.json b/results/Bgym-Mixtral-8x22b/config.json new file mode 100644 index 0000000000000000000000000000000000000000..77a92a7ee77cc1a96eeb37a6923290294b7a3fdb --- /dev/null +++ b/results/Bgym-Mixtral-8x22b/config.json @@ -0,0 +1,4 @@ +{ + "agent_name": "Mixtral-8x22B", + "backend_llm": "Mixtral-8x22B" +} \ No newline at end of file diff --git a/results/Bgym-Mixtral-8x22b/miniwob.json b/results/Bgym-Mixtral-8x22b/miniwob.json new file mode 100644 index 0000000000000000000000000000000000000000..0b6ea125b66d8032f19d3922284cadcfe7e5b957 --- /dev/null +++ b/results/Bgym-Mixtral-8x22b/miniwob.json @@ -0,0 +1,16 @@ +[ + { + "agent_name": "Bgym-Mixtral-8x22b", + "study_id": "study_id", + "date_time": "2021-01-01 12:00:00", + "benchmark": "MiniWoB", + "score": 62.4, + "std_err": 0.5, + "benchmark_specific": "No", + "benchmark_tuned": "No", + "followed_evaluation_protocol": "Yes", + "reproducible": "Yes", + "comments": "NA", + "original_or_reproduced": "Original" + } +] \ No newline at end of file diff --git a/results/Bgym-Mixtral-8x22b/webarena.json b/results/Bgym-Mixtral-8x22b/webarena.json new file mode 100644 index 0000000000000000000000000000000000000000..823344e109e805942efd8e34caa90d2e4a0c4d33 --- /dev/null +++ b/results/Bgym-Mixtral-8x22b/webarena.json @@ -0,0 +1,16 @@ +[ + { + "agent_name": "Bgym-Mixtral-8x22b", + "study_id": "study_id", + "date_time": "2021-01-01 12:00:00", + "benchmark": "WebArena", + "score": 12.6, + "std_err": 0.9, + "benchmark_specific": "No", + "benchmark_tuned": "No", + "followed_evaluation_protocol": "Yes", + "reproducible": "Yes", + "comments": "NA", + "original_or_reproduced": "Original" + } +] \ No newline at end of file diff --git a/results/Bgym-Mixtral-8x22b/workarena-l1.json b/results/Bgym-Mixtral-8x22b/workarena-l1.json new file mode 100644 index 0000000000000000000000000000000000000000..eccaa19d2a1980051186312bdf50a545fac60836 --- /dev/null +++ b/results/Bgym-Mixtral-8x22b/workarena-l1.json @@ -0,0 +1,44 @@ +[ + { + "agent_name": "Bgym-Mixtral-8x22b", + "study_id": "study_id", + "benchmark": "WorkArena-L1", + "score": 12.4, + "std_err": 0.7, + "benchmark_specific": "No", + "benchmark_tuned": "No", + "followed_evaluation_protocol": "Yes", + "reproducible": "Yes", + "comments": "NA", + "original_or_reproduced": "Original", + "date_time": "2021-01-04 12:06:00" + }, + { + "agent_name": "Bgym-Mixtral-8x22b", + "study_id": "study_id", + "benchmark": "WorkArena-L1", + "score": 11.4, + "std_err": 0.7, + "benchmark_specific": "No", + "benchmark_tuned": "No", + "followed_evaluation_protocol": "Yes", + "reproducible": "Yes", + "comments": "NA", + "original_or_reproduced": "Reproduced", + "date_time": "2021-01-04 12:06:00" + }, + { + "agent_name": "Bgym-Mixtral-8x22b", + "study_id": "study_id", + "benchmark": "WorkArena-L1", + "score": 13.4, + "std_err": 0.7, + "benchmark_specific": "No", + "benchmark_tuned": "No", + "followed_evaluation_protocol": "Yes", + "reproducible": "Yes", + "comments": "NA", + "original_or_reproduced": "Reproduced", + "date_time": "2021-01-04 12:06:00" + } +] \ No newline at end of file diff --git a/results/Bgym-Mixtral-8x22b/workarena-l2.json b/results/Bgym-Mixtral-8x22b/workarena-l2.json new file mode 100644 index 0000000000000000000000000000000000000000..fbc2324755d166804d3586bf0f54766c3c940232 --- /dev/null +++ b/results/Bgym-Mixtral-8x22b/workarena-l2.json @@ -0,0 +1,16 @@ +[ + { + "agent_name": "Bgym-Mixtral-8x22b", + "study_id": "study_id", + "date_time": "2021-01-01 12:00:00", + "benchmark": "WorkArena-L2", + "score": 0.0, + "std_err": 0.0, + "benchmark_specific": "No", + "benchmark_tuned": "No", + "followed_evaluation_protocol": "Yes", + "reproducible": "Yes", + "comments": "NA", + "original_or_reproduced": "Original" + } +] \ No newline at end of file diff --git a/results/Bgym-Mixtral-8x22b/workarena-l3.json b/results/Bgym-Mixtral-8x22b/workarena-l3.json new file mode 100644 index 0000000000000000000000000000000000000000..2cfe04fa51a97f69d7172f239484939e0716102a --- /dev/null +++ b/results/Bgym-Mixtral-8x22b/workarena-l3.json @@ -0,0 +1,16 @@ +[ + { + "agent_name": "Bgym-Mixtral-8x22b", + "study_id": "study_id", + "date_time": "2021-01-01 12:00:00", + "benchmark": "WorkArena-L3", + "score": 0.0, + "std_err": 0.0, + "benchmark_specific": "No", + "benchmark_tuned": "No", + "followed_evaluation_protocol": "Yes", + "reproducible": "Yes", + "comments": "NA", + "original_or_reproduced": "Original" + } +] \ No newline at end of file diff --git a/results/GenericAgent-AgentTrek-1.0-32b/README.md b/results/GenericAgent-AgentTrek-1.0-32b/README.md deleted file mode 100644 index 572348c9ec90c9c9c37366a60916c3c3f76b3bd3..0000000000000000000000000000000000000000 --- a/results/GenericAgent-AgentTrek-1.0-32b/README.md +++ /dev/null @@ -1,85 +0,0 @@ -### GenericAgent-AgentTrek-1.0-32b - -this agent is GenericAgent from Agentlab - -- **Base Model:** - - - Qwen/Qwen2.5-32B-Instruct -- **Architecture:** - - - Type: Causal Language Models - - Training Stage: Pretraining & Post-training - - Architecture: transformers with RoPE, SwiGLU, RMSNorm, and Attention QKV bias - - Number of Parameters: 32.5B - - Number of Paramaters (Non-Embedding): 31.0B - - Number of Layers: 64 - - Number of Attention Heads (GQA): 40 for Q and 8 for KV -- Input/Output Format: - - - with the following flags: - ```txt - flags=GenericPromptFlags( - obs=ObsFlags( - use_html=True, - use_ax_tree=True, - use_tabs=False, - use_focused_element=False, - use_error_logs=True, - use_history=True, - use_past_error_logs=False, - use_action_history=True, - use_think_history=False, - use_diff=False, - html_type='pruned_html', - use_screenshot=False, - use_som=False, - extract_visible_tag=False, - extract_clickable_tag=False, - extract_coords='False', - filter_visible_elements_only=False, - openai_vision_detail='auto', - filter_with_bid_only=False, - filter_som_only=False - ), - action=ActionFlags( - action_set=HighLevelActionSetArgs( - subsets=('miniwob_all',), - multiaction=False, - strict=False, - retry_with_force=True, - demo_mode='off' - ), - long_description=False, - individual_examples=False, - multi_actions=None, - is_strict=None - ), - use_plan=False, - use_criticise=False, - use_thinking=True, - use_memory=True, - use_concrete_example=True, - use_abstract_example=True, - use_hints=False, - enable_chat=False, - max_prompt_tokens=40000, - be_cautious=True, - extra_instructions=None, - add_missparsed_messages=True, - max_trunc_itr=20, - flag_group=None - ) - ``` -- Training Details - - - Dataset used: [AgentTrek-6K](https://agenttrek.github.io) - - Number of training steps: 3 Epochs -- Paper Link: - - - https://arxiv.org/abs/2412.09605 -- Code Repository: - - - https://agenttrek.github.io -- Lisense: - - - apache2.0 diff --git a/results/GenericAgent-AgentTrek-1.0-32b/miniwob.json b/results/GenericAgent-AgentTrek-1.0-32b/miniwob.json deleted file mode 100644 index 0877890bd9296bb436a8afbd6cc5ae2614408df2..0000000000000000000000000000000000000000 --- a/results/GenericAgent-AgentTrek-1.0-32b/miniwob.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "agent_name": "GenericAgent-AgentTrek-1.0-32b", - "study_id": "4c636aa0-ea52-429d-9d7e-301b7bf0ac74", - "date_time": "2025-01-22 04:27:37", - "benchmark": "MiniWoB", - "score": 60.0, - "std_err": 2.0, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "Additional details", - "original_or_reproduced": "Original" - } -] \ No newline at end of file diff --git a/results/GenericAgent-AgentTrek-1.0-32b/webarena.json b/results/GenericAgent-AgentTrek-1.0-32b/webarena.json deleted file mode 100644 index 0634bdfc8304a9148264ced6498c59868388910b..0000000000000000000000000000000000000000 --- a/results/GenericAgent-AgentTrek-1.0-32b/webarena.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "agent_name": "GenericAgent-AgentTrek-1.0-32b", - "study_id": "ac309635-f3fd-417e-ac16-1e0fc943a54f", - "date_time": "2025-01-25 10:16:41", - "benchmark": "WebArena", - "score": 22.4, - "std_err": 1.5, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "Additional details", - "original_or_reproduced": "Original" - } -] \ No newline at end of file diff --git a/results/GenericAgent-AgentTrek-1.0-32b/workarena-l1.json b/results/GenericAgent-AgentTrek-1.0-32b/workarena-l1.json deleted file mode 100644 index 9c5a3ff367959ebb56b9f606bc9da0576606399d..0000000000000000000000000000000000000000 --- a/results/GenericAgent-AgentTrek-1.0-32b/workarena-l1.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "agent_name": "GenericAgent-AgentTrek-1.0-32b", - "study_id": "ed14232c-cd7e-4708-b334-ebaf1f220000", - "date_time": "2025-01-12 00:37:04", - "benchmark": "WorkArena-L1", - "score": 38.29, - "std_err": 2.70, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "Additional details", - "original_or_reproduced": "Original" - } -] \ No newline at end of file diff --git a/results/GenericAgent-AgentTrek-1.0-32b/workarena-l2.json b/results/GenericAgent-AgentTrek-1.0-32b/workarena-l2.json deleted file mode 100644 index 15e30b9ae065a131118a164a22008fc6f7e3e578..0000000000000000000000000000000000000000 --- a/results/GenericAgent-AgentTrek-1.0-32b/workarena-l2.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "agent_name": "GenericAgent-AgentTrek-1.0-32b", - "study_id": "957fb895-8548-46f4-92f0-5de6be7ceb61", - "date_time": "2025-01-12 09:39:21", - "benchmark": "WorkArena-L2", - "score": 2.98, - "std_err": 1.10, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "Additional details", - "original_or_reproduced": "Original" - } -] \ No newline at end of file diff --git a/results/GenericAgent-AgentTrek-1.0-32b/workarena-l3.json b/results/GenericAgent-AgentTrek-1.0-32b/workarena-l3.json deleted file mode 100644 index fca096b12f2212917f97efd35769d470244be488..0000000000000000000000000000000000000000 --- a/results/GenericAgent-AgentTrek-1.0-32b/workarena-l3.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "agent_name": "GenericAgent-AgentTrek-1.0-32b", - "study_id": "a951b33f-d118-4cf4-a2ef-cc2ef204eeb0", - "date_time": "2025-01-13 12:11:45", - "benchmark": "WorkArena-L3", - "score": 0.0, - "std_err": 0.0, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "Additional details", - "original_or_reproduced": "Original" - } -] \ No newline at end of file diff --git a/results/GenericAgent-Claude-3.5-Sonnet/README.md b/results/GenericAgent-Claude-3.5-Sonnet/README.md deleted file mode 100644 index a247c51570651697a0165c50ba57d11ed643652a..0000000000000000000000000000000000000000 --- a/results/GenericAgent-Claude-3.5-Sonnet/README.md +++ /dev/null @@ -1,46 +0,0 @@ -### GenericAgent-Claude-3.5-Sonnet - -This agent is [GenericAgent](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/generic_agent.py) from [AgentLab](https://github.com/ServiceNow/AgentLab) - -It uses Claude-3.5-sonnet as a backend, with the following [flags](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/tmlr_config.py): -```python -BASE_FLAGS = GenericPromptFlags( - obs=dp.ObsFlags( - use_html=False, - use_ax_tree=True, - use_focused_element=True, - use_error_logs=True, - use_history=True, - use_past_error_logs=False, - use_action_history=True, - use_think_history=True, # gpt-4o config except for this line - use_diff=False, - html_type="pruned_html", - use_screenshot=False, - use_som=False, - extract_visible_tag=True, - extract_clickable_tag=True, - extract_coords="False", - filter_visible_elements_only=False, - ), - action=dp.ActionFlags( - multi_actions=False, - action_set="bid", - long_description=False, - individual_examples=False, - ), - use_plan=False, - use_criticise=False, - use_thinking=True, - use_memory=False, - use_concrete_example=True, - use_abstract_example=True, - use_hints=True, - enable_chat=False, - max_prompt_tokens=40_000, - be_cautious=True, - extra_instructions=None, -) -``` - -Note: Agents don't use vision except for VisualWebArena, where the vision flag is turned on (and the LLM suports it). \ No newline at end of file diff --git a/results/GenericAgent-Claude-3.5-Sonnet/assistantbench.json b/results/GenericAgent-Claude-3.5-Sonnet/assistantbench.json deleted file mode 100644 index 25e6e505abf6edc37e4d4205842b41af8eefe9a6..0000000000000000000000000000000000000000 --- a/results/GenericAgent-Claude-3.5-Sonnet/assistantbench.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "agent_name": "GenericAgent-Claude-3.5-Sonnet", - "study_id": "d93a2398-2b70-41ce-b989-364fed988d73", - "benchmark": "AssistantBench", - "score": 5.2, - "std_err": 1.5, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "Intersection of finished tasks across agents.", - "original_or_reproduced": "Original", - "date_time": "2024-11-28 19:34:58" - } -] \ No newline at end of file diff --git a/results/GenericAgent-Claude-3.5-Sonnet/miniwob.json b/results/GenericAgent-Claude-3.5-Sonnet/miniwob.json deleted file mode 100644 index 3d92a3b23ef10422e12b47f1697bd47799f2f5b5..0000000000000000000000000000000000000000 --- a/results/GenericAgent-Claude-3.5-Sonnet/miniwob.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "agent_name": "GenericAgent-Claude-3.5-Sonnet", - "study_id": "2024-10-25_06-08-16", - "benchmark": "MiniWoB", - "score": 69.8, - "std_err": 1.8, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "NA", - "original_or_reproduced": "Original", - "date_time": "2021-01-01 12:00:00" - } -] \ No newline at end of file diff --git a/results/GenericAgent-Claude-3.5-Sonnet/visualwebarena.json b/results/GenericAgent-Claude-3.5-Sonnet/visualwebarena.json deleted file mode 100644 index 2314e53cd7e52a492b298e2d417ca83c8e99ad4e..0000000000000000000000000000000000000000 --- a/results/GenericAgent-Claude-3.5-Sonnet/visualwebarena.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "agent_name": "GenericAgent-Claude-3.5-Sonnet", - "study_id": "22f0611d-aeea-4ee9-a533-b45442b5e080", - "benchmark": "VisualWebArena", - "score": 21.0, - "std_err": 1.3, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "NA", - "original_or_reproduced": "Original", - "date_time": "2024-12-02 09:11:35" - } -] \ No newline at end of file diff --git a/results/GenericAgent-Claude-3.5-Sonnet/webarena.json b/results/GenericAgent-Claude-3.5-Sonnet/webarena.json deleted file mode 100644 index 0f54ccd322f68c971e4182db57a9313cb9cf7c6a..0000000000000000000000000000000000000000 --- a/results/GenericAgent-Claude-3.5-Sonnet/webarena.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "agent_name": "GenericAgent-Claude-3.5-Sonnet", - "study_id": "b5fc5be7-54cc-4fc1-a9ee-73447b9c3eae", - "benchmark": "WebArena", - "score": 36.2, - "std_err": 1.7, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "NA", - "original_or_reproduced": "Original", - "date_time": "2024-11-29 22:37:46" - } -] \ No newline at end of file diff --git a/results/GenericAgent-Claude-3.5-Sonnet/weblinx.json b/results/GenericAgent-Claude-3.5-Sonnet/weblinx.json deleted file mode 100644 index 4be0689c02e5e8e9dee2dd09966fc9cf8577f0d5..0000000000000000000000000000000000000000 --- a/results/GenericAgent-Claude-3.5-Sonnet/weblinx.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "agent_name": "GenericAgent-Claude-3.5-Sonnet", - "study_id": "b9451759-4f0e-492c-a3c8-fa5109d2d9b1", - "benchmark": "WebLINX", - "score": 13.7, - "std_err": 0.6, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "NA", - "original_or_reproduced": "Original", - "date_time": "2024-11-07 21:42:30" - } -] \ No newline at end of file diff --git a/results/GenericAgent-Claude-3.5-Sonnet/workarena-l1.json b/results/GenericAgent-Claude-3.5-Sonnet/workarena-l1.json deleted file mode 100644 index 1eecc0e8f9ea7aa11252e6b280ea14e358c5cb0d..0000000000000000000000000000000000000000 --- a/results/GenericAgent-Claude-3.5-Sonnet/workarena-l1.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "agent_name": "GenericAgent-Claude-3.5-Sonnet", - "study_id": "2024-10-23_14-17-40", - "benchmark": "WorkArena-L1", - "score": 56.4, - "std_err": 2.7, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "NA", - "original_or_reproduced": "Original", - "date_time": "2021-01-01 12:00:00" - } -] \ No newline at end of file diff --git a/results/GenericAgent-GPT-4o-mini/README.md b/results/GenericAgent-GPT-4o-mini/README.md deleted file mode 100644 index 24cd7f221ec1011212d0a5366d13fc1f7c5275d3..0000000000000000000000000000000000000000 --- a/results/GenericAgent-GPT-4o-mini/README.md +++ /dev/null @@ -1,54 +0,0 @@ -### GenericAgent-GPT-4o-mini - -This agent is [GenericAgent](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/generic_agent.py) from [AgentLab](https://github.com/ServiceNow/AgentLab) - -It uses GPT-4o-mini as a backend, with the following [flags](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/tmlr_config.py): -```python -BASE_FLAGS = GenericPromptFlags( - obs=dp.ObsFlags( - use_html=False, - use_ax_tree=True, - use_focused_element=True, - use_error_logs=True, - use_history=True, - use_past_error_logs=False, - use_action_history=True, - use_think_history=True, # gpt-4o config except for this line - use_diff=False, - html_type="pruned_html", - use_screenshot=False, - use_som=False, - extract_visible_tag=True, - extract_clickable_tag=True, - extract_coords="False", - filter_visible_elements_only=False, - ), - action=dp.ActionFlags( - multi_actions=False, - action_set="bid", - long_description=False, - individual_examples=False, - ), - use_plan=False, - use_criticise=False, - use_thinking=True, - use_memory=False, - use_concrete_example=True, - use_abstract_example=True, - use_hints=True, - enable_chat=False, - max_prompt_tokens=40_000, - be_cautious=True, - extra_instructions=None, -) -``` -© Hugging Face -TOS -Privacy -About -Jobs -Models -Datasets -Spaces -Pricing -Docs \ No newline at end of file diff --git a/results/GenericAgent-GPT-4o-mini/assistantbench.json b/results/GenericAgent-GPT-4o-mini/assistantbench.json deleted file mode 100644 index edfcd630cb3884c4b7902ad14f1595dfff2a092f..0000000000000000000000000000000000000000 --- a/results/GenericAgent-GPT-4o-mini/assistantbench.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "agent_name": "GenericAgent-GPT-4o-mini", - "study_id": "d93a2398-2b70-41ce-b989-364fed988d73", - "date_time": "2024-11-28 19:34:58", - "benchmark": "AssistantBench", - "score": 2.1, - "std_err": 1.0, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "Intersection of finished tasks across agents.", - "original_or_reproduced": "Original" - } -] \ No newline at end of file diff --git a/results/GenericAgent-GPT-4o-mini/visualwebarena.json b/results/GenericAgent-GPT-4o-mini/visualwebarena.json deleted file mode 100644 index 73d01f9c23c56b48eee3901be1bc0b9a8788b1a2..0000000000000000000000000000000000000000 --- a/results/GenericAgent-GPT-4o-mini/visualwebarena.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "agent_name": "GenericAgent-GPT-4o-mini", - "study_id": "8d8642d3-757a-4346-ba45-01398f85b1f4", - "date_time": "2024-12-02 02:54:33", - "benchmark": "VisualWebArena", - "score": 16.9, - "std_err": 1.2, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "NA", - "original_or_reproduced": "Original" - } -] \ No newline at end of file diff --git a/results/GenericAgent-GPT-4o-mini/webarena.json b/results/GenericAgent-GPT-4o-mini/webarena.json deleted file mode 100644 index 3958a8c47219cf2856a1e73a1309d1acc3b295b4..0000000000000000000000000000000000000000 --- a/results/GenericAgent-GPT-4o-mini/webarena.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "agent_name": "GenericAgent-GPT-4o-mini", - "study_id": "c6bdeb87-9879-4c06-aa70-00d895001156", - "date_time": "2024-11-29 19:25:49", - "benchmark": "WebArena", - "score": 17.4, - "std_err": 1.3, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "NA", - "original_or_reproduced": "Original" - } -] \ No newline at end of file diff --git a/results/GenericAgent-GPT-4o-mini/weblinx.json b/results/GenericAgent-GPT-4o-mini/weblinx.json deleted file mode 100644 index 20293812e70717a0c19c9e9efe5c9a9052d62153..0000000000000000000000000000000000000000 --- a/results/GenericAgent-GPT-4o-mini/weblinx.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "agent_name": "GenericAgent-GPT-4o-mini", - "study_id": "b9451759-4f0e-492c-a3c8-fa5109d2d9b1", - "date_time": "2024-11-07 21:42:30", - "benchmark": "WebLINX", - "score": 11.6, - "std_err": 0.6, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "NA", - "original_or_reproduced": "Original" - } -] \ No newline at end of file diff --git a/results/GenericAgent-GPT-4o/README.md b/results/GenericAgent-GPT-4o/README.md deleted file mode 100644 index dac984577c9e5a75cb3f614aa43c17841382f298..0000000000000000000000000000000000000000 --- a/results/GenericAgent-GPT-4o/README.md +++ /dev/null @@ -1,46 +0,0 @@ -### GenericAgent-GPT-4o - -This agent is [GenericAgent](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/generic_agent.py) from [AgentLab](https://github.com/ServiceNow/AgentLab) - -It uses GPT-4o as a backend, with the following [flags](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/tmlr_config.py): -```python -BASE_FLAGS = GenericPromptFlags( - obs=dp.ObsFlags( - use_html=False, - use_ax_tree=True, - use_focused_element=True, - use_error_logs=True, - use_history=True, - use_past_error_logs=False, - use_action_history=True, - use_think_history=True, # gpt-4o config except for this line - use_diff=False, - html_type="pruned_html", - use_screenshot=False, - use_som=False, - extract_visible_tag=True, - extract_clickable_tag=True, - extract_coords="False", - filter_visible_elements_only=False, - ), - action=dp.ActionFlags( - multi_actions=False, - action_set="bid", - long_description=False, - individual_examples=False, - ), - use_plan=False, - use_criticise=False, - use_thinking=True, - use_memory=False, - use_concrete_example=True, - use_abstract_example=True, - use_hints=True, - enable_chat=False, - max_prompt_tokens=40_000, - be_cautious=True, - extra_instructions=None, -) -``` - -Note: Agents don't use vision except for VisualWebArena, where the vision flag is turned on (and the LLM suports it). \ No newline at end of file diff --git a/results/GenericAgent-GPT-4o/assistantbench.json b/results/GenericAgent-GPT-4o/assistantbench.json deleted file mode 100644 index b6a57ca0ad20cd092f5260ebb168a8e058fc4e40..0000000000000000000000000000000000000000 --- a/results/GenericAgent-GPT-4o/assistantbench.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "agent_name": "GenericAgent-GPT-4o", - "study_id": "d93a2398-2b70-41ce-b989-364fed988d73", - "date_time": "2024-11-28 19:34:58", - "benchmark": "AssistantBench", - "score": 4.8, - "std_err": 2.4, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "Intersection of finished tasks across agents.", - "original_or_reproduced": "Original" - } -] \ No newline at end of file diff --git a/results/GenericAgent-GPT-4o/visualwebarena.json b/results/GenericAgent-GPT-4o/visualwebarena.json deleted file mode 100644 index 730550b0d16f9421b0372671f1a17b93fd17dc7a..0000000000000000000000000000000000000000 --- a/results/GenericAgent-GPT-4o/visualwebarena.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "agent_name": "GenericAgent-GPT-4o", - "study_id": "7fb7eac8-4bbd-4ebe-be32-15901a7678f2", - "date_time": "2024-12-02 07:17:28", - "benchmark": "VisualWebArena", - "score": 26.7, - "std_err": 1.5, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "NA", - "original_or_reproduced": "Original" - } -] \ No newline at end of file diff --git a/results/GenericAgent-GPT-4o/webarena.json b/results/GenericAgent-GPT-4o/webarena.json deleted file mode 100644 index 53542859b5677c179c3edb195d17b5dd63d7db11..0000000000000000000000000000000000000000 --- a/results/GenericAgent-GPT-4o/webarena.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "agent_name": "GenericAgent-GPT-4o", - "study_id": "d2eed215-91bb-4603-b69c-8ef8f9d57f34", - "date_time": "2024-11-29 22:28:32", - "benchmark": "WebArena", - "score": 31.4, - "std_err": 1.6, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "NA", - "original_or_reproduced": "Original" - } -] \ No newline at end of file diff --git a/results/GenericAgent-GPT-4o/weblinx.json b/results/GenericAgent-GPT-4o/weblinx.json deleted file mode 100644 index 2b75c3a1e8a455277a5b15c1208bc3c10d31f369..0000000000000000000000000000000000000000 --- a/results/GenericAgent-GPT-4o/weblinx.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "agent_name": "GenericAgent-GPT-4o", - "study_id": "b9451759-4f0e-492c-a3c8-fa5109d2d9b1", - "date_time": "2024-11-07 21:42:30", - "benchmark": "WebLINX", - "score": 12.5, - "std_err": 0.6, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "NA", - "original_or_reproduced": "Original" - } -] \ No newline at end of file diff --git a/results/GenericAgent-GPT-o1-mini/README.md b/results/GenericAgent-GPT-o1-mini/README.md deleted file mode 100644 index 05753e4c73ce5b5c318dd57d7c6cde978ce8dbba..0000000000000000000000000000000000000000 --- a/results/GenericAgent-GPT-o1-mini/README.md +++ /dev/null @@ -1,46 +0,0 @@ -### GenericAgent-GPT-o1-mini - -This agent is [GenericAgent](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/generic_agent.py) from [AgentLab](https://github.com/ServiceNow/AgentLab) - -It uses o1-mini as a backend, with the following [flags](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/tmlr_config.py): -```python -BASE_FLAGS = GenericPromptFlags( - obs=dp.ObsFlags( - use_html=False, - use_ax_tree=True, - use_focused_element=True, - use_error_logs=True, - use_history=True, - use_past_error_logs=False, - use_action_history=True, - use_think_history=True, # gpt-4o config except for this line - use_diff=False, - html_type="pruned_html", - use_screenshot=False, - use_som=False, - extract_visible_tag=True, - extract_clickable_tag=True, - extract_coords="False", - filter_visible_elements_only=False, - ), - action=dp.ActionFlags( - multi_actions=False, - action_set="bid", - long_description=False, - individual_examples=False, - ), - use_plan=False, - use_criticise=False, - use_thinking=True, - use_memory=False, - use_concrete_example=True, - use_abstract_example=True, - use_hints=True, - enable_chat=False, - max_prompt_tokens=40_000, - be_cautious=True, - extra_instructions=None, -) -``` - -Note: Agents don't use vision except for VisualWebArena, where the vision flag is turned on (and the LLM suports it). \ No newline at end of file diff --git a/results/GenericAgent-GPT-o1-mini/assistantbench.json b/results/GenericAgent-GPT-o1-mini/assistantbench.json deleted file mode 100644 index 60ef7f9ca466e4d11b00f04c652bb57ce685124d..0000000000000000000000000000000000000000 --- a/results/GenericAgent-GPT-o1-mini/assistantbench.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "agent_name": "GenericAgent-GPT-o1-mini", - "study_id": "d93a2398-2b70-41ce-b989-364fed988d73", - "date_time": "2024-11-28 19:34:58", - "benchmark": "AssistantBench", - "score": 6.9, - "std_err": 2.2, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "Intersection of finished tasks across agents.", - "original_or_reproduced": "Original" - } -] \ No newline at end of file diff --git a/results/GenericAgent-GPT-o1-mini/miniwob.json b/results/GenericAgent-GPT-o1-mini/miniwob.json deleted file mode 100644 index a0d53790c92d64ec24d58bd4cb35bf898df7b11d..0000000000000000000000000000000000000000 --- a/results/GenericAgent-GPT-o1-mini/miniwob.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "agent_name": "GenericAgent-GPT-o1-mini", - "study_id": "2024-10-25_06-08-16", - "date_time": "2024-10-25 17:16:23", - "benchmark": "MiniWoB", - "score": 67.8, - "std_err": 1.9, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "NA", - "original_or_reproduced": "Original" - } -] \ No newline at end of file diff --git a/results/GenericAgent-GPT-o1-mini/webarena.json b/results/GenericAgent-GPT-o1-mini/webarena.json deleted file mode 100644 index d094313542f88a8b827452d7f26f917b19211a22..0000000000000000000000000000000000000000 --- a/results/GenericAgent-GPT-o1-mini/webarena.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "agent_name": "GenericAgent-GPT-o1-mini", - "study_id": "1827983d-5e84-4b63-ad49-bf45ec2a6348", - "date_time": "2024-11-30 00:22:44", - "benchmark": "WebArena", - "score": 28.6, - "std_err": 1.6, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "NA", - "original_or_reproduced": "Original" - } -] \ No newline at end of file diff --git a/results/GenericAgent-GPT-o1-mini/weblinx.json b/results/GenericAgent-GPT-o1-mini/weblinx.json deleted file mode 100644 index f5f14748f819dfe37e65fd2a63d23c120cd00e8b..0000000000000000000000000000000000000000 --- a/results/GenericAgent-GPT-o1-mini/weblinx.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "agent_name": "GenericAgent-GPT-o1-mini", - "study_id": "b9451759-4f0e-492c-a3c8-fa5109d2d9b1", - "date_time": "2024-11-07 21:42:30", - "benchmark": "WebLINX", - "score": 12.5, - "std_err": 0.6, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "NA", - "original_or_reproduced": "Original" - } -] \ No newline at end of file diff --git a/results/GenericAgent-GPT-o1-mini/workarena-l1.json b/results/GenericAgent-GPT-o1-mini/workarena-l1.json deleted file mode 100644 index 1b9e8f224d4936436e9de1f6ef7342eddcb91e25..0000000000000000000000000000000000000000 --- a/results/GenericAgent-GPT-o1-mini/workarena-l1.json +++ /dev/null @@ -1,30 +0,0 @@ -[ - { - "agent_name": "GenericAgent-GPT-o1-mini", - "study_id": "2024-10-23_14-17-40", - "date_time": "2024-10-23 22:30:06", - "benchmark": "WorkArena-L1", - "score": 56.7, - "std_err": 2.7, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "NA", - "original_or_reproduced": "Original" - }, - { - "agent_name": "GenericAgent-GPT-o1-mini", - "study_id": "f3e1fcb8-5fc5-4115-9e00-27251508e2c7", - "date_time": "2025-02-07 14:00:00", - "benchmark": "WorkArena-L1", - "score": 51.8, - "std_err": 2.80, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "Additional details", - "original_or_reproduced": "Reproduced" - } -] \ No newline at end of file diff --git a/results/GenericAgent-GPT-o1-mini/workarena-l2.json b/results/GenericAgent-GPT-o1-mini/workarena-l2.json deleted file mode 100644 index 4132ca278aed23f95fb38659f044e3990f82e0fe..0000000000000000000000000000000000000000 --- a/results/GenericAgent-GPT-o1-mini/workarena-l2.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "agent_name": "GenericAgent-GPT-o1-mini", - "study_id": "2024-10-23_17-10-46", - "date_time": "2024-10-24 17:08:53", - "benchmark": "WorkArena-L2", - "score": 14.9, - "std_err": 2.3, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "NA", - "original_or_reproduced": "Original" - } -] \ No newline at end of file diff --git a/results/GenericAgent-Llama-3.1-405b/README.md b/results/GenericAgent-Llama-3.1-405b/README.md deleted file mode 100644 index 95c6107a0f831653d7cd0a5f848a2cc99754109c..0000000000000000000000000000000000000000 --- a/results/GenericAgent-Llama-3.1-405b/README.md +++ /dev/null @@ -1,46 +0,0 @@ -### GenericAgent-Llama-3.1-405b - -This agent is [GenericAgent](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/generic_agent.py) from [AgentLab](https://github.com/ServiceNow/AgentLab) - -It uses Llama-3.1-405b as a backend, with the following [flags](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/tmlr_config.py): -```python -BASE_FLAGS = GenericPromptFlags( - obs=dp.ObsFlags( - use_html=False, - use_ax_tree=True, - use_focused_element=True, - use_error_logs=True, - use_history=True, - use_past_error_logs=False, - use_action_history=True, - use_think_history=True, # gpt-4o config except for this line - use_diff=False, - html_type="pruned_html", - use_screenshot=False, - use_som=False, - extract_visible_tag=True, - extract_clickable_tag=True, - extract_coords="False", - filter_visible_elements_only=False, - ), - action=dp.ActionFlags( - multi_actions=False, - action_set="bid", - long_description=False, - individual_examples=False, - ), - use_plan=False, - use_criticise=False, - use_thinking=True, - use_memory=False, - use_concrete_example=True, - use_abstract_example=True, - use_hints=True, - enable_chat=False, - max_prompt_tokens=40_000, - be_cautious=True, - extra_instructions=None, -) -``` - -Note: Agents don't use vision except for VisualWebArena, where the vision flag is turned on (and the LLM suports it). \ No newline at end of file diff --git a/results/GenericAgent-Llama-3.1-405b/assistantbench.json b/results/GenericAgent-Llama-3.1-405b/assistantbench.json deleted file mode 100644 index 0a7c507210cf2e1b594136e43a5936407467f274..0000000000000000000000000000000000000000 --- a/results/GenericAgent-Llama-3.1-405b/assistantbench.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "agent_name": "GenericAgent-Llama-3.1-405b", - "study_id": "d93a2398-2b70-41ce-b989-364fed988d73", - "benchmark": "AssistantBench", - "score": 3.9, - "std_err": 1.0, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "Intersection of finished tasks across agents.", - "original_or_reproduced": "Original", - "date_time": "2024-11-28 19:34:58" - } -] \ No newline at end of file diff --git a/results/GenericAgent-Llama-3.1-405b/miniwob.json b/results/GenericAgent-Llama-3.1-405b/miniwob.json deleted file mode 100644 index 3f2c22229b7d1e67e3c2f63855bb8d2df78543eb..0000000000000000000000000000000000000000 --- a/results/GenericAgent-Llama-3.1-405b/miniwob.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "agent_name": "GenericAgent-Llama-3.1-405b", - "study_id": "4d748972-6d35-4489-a197-138b656a7db3", - "benchmark": "MiniWoB", - "score": 64.6, - "std_err": 1.9, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "NA", - "original_or_reproduced": "Original", - "date_time": "2024-11-29 16:14:00" - } -] \ No newline at end of file diff --git a/results/GenericAgent-Llama-3.1-405b/webarena.json b/results/GenericAgent-Llama-3.1-405b/webarena.json deleted file mode 100644 index ddffdb10c856b72a8896fe80b39aeb1e91b123b8..0000000000000000000000000000000000000000 --- a/results/GenericAgent-Llama-3.1-405b/webarena.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "agent_name": "GenericAgent-Llama-3.1-405b", - "study_id": "aaeca13d-0cf5-444f-8445-590350b54746", - "benchmark": "WebArena", - "score": 24.0, - "std_err": 1.5, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "NA", - "original_or_reproduced": "Original", - "date_time": "2024-12-01 00:04:43" - } -] \ No newline at end of file diff --git a/results/GenericAgent-Llama-3.1-405b/weblinx.json b/results/GenericAgent-Llama-3.1-405b/weblinx.json deleted file mode 100644 index bd6801536a4fc931b4fb77126bd635441f177c92..0000000000000000000000000000000000000000 --- a/results/GenericAgent-Llama-3.1-405b/weblinx.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "agent_name": "GenericAgent-Llama-3.1-405b", - "study_id": "b9451759-4f0e-492c-a3c8-fa5109d2d9b1", - "benchmark": "WebLINX", - "score": 7.9, - "std_err": 0.5, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "NA", - "original_or_reproduced": "Original", - "date_time": "2024-11-07 21:42:30" - } -] \ No newline at end of file diff --git a/results/GenericAgent-Llama-3.1-405b/workarena-l1.json b/results/GenericAgent-Llama-3.1-405b/workarena-l1.json deleted file mode 100644 index 95927554a7efacf122b2f553d72b593503d4b571..0000000000000000000000000000000000000000 --- a/results/GenericAgent-Llama-3.1-405b/workarena-l1.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "agent_name": "GenericAgent-Llama-3.1-405b", - "study_id": "2024-10-25_17-34-45", - "benchmark": "WorkArena-L1", - "score": 43.3, - "std_err": 2.7, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "NA", - "original_or_reproduced": "Original", - "date_time": "2024-10-25 20:32:26" - } -] \ No newline at end of file diff --git a/results/GenericAgent-Llama-3.1-405b/workarena-l2.json b/results/GenericAgent-Llama-3.1-405b/workarena-l2.json deleted file mode 100644 index 1209bfc5d44a82b0788f889992e3a46d99f62c6e..0000000000000000000000000000000000000000 --- a/results/GenericAgent-Llama-3.1-405b/workarena-l2.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "agent_name": "GenericAgent-Llama-3.1-405b", - "study_id": "528da1f2-1949-41dc-b988-85f19f435af2", - "date_time": "2024-11-29 14:28:47", - "benchmark": "WorkArena-L2", - "score": 7.2, - "std_err": 1.7, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "NA", - "original_or_reproduced": "Original" - } -] \ No newline at end of file diff --git a/results/GenericAgent-Llama-3.1-405b/workarena-l3.json b/results/GenericAgent-Llama-3.1-405b/workarena-l3.json deleted file mode 100644 index 7e6ca5fc0aadba1ac1ad6224eb36b3b7d36080b6..0000000000000000000000000000000000000000 --- a/results/GenericAgent-Llama-3.1-405b/workarena-l3.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "agent_name": "GenericAgent-Llama-3.1-405b", - "study_id": "-", - "benchmark": "WorkArena-L3", - "score": 0.0, - "std_err": 0.0, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "NA", - "original_or_reproduced": "Original", - "date_time": "2024-10-24 23:03:30" - } -] \ No newline at end of file diff --git a/results/GenericAgent-Llama-3.1-70b/README.md b/results/GenericAgent-Llama-3.1-70b/README.md deleted file mode 100644 index 9bd1ba56e9020df0717ff233c948a70650135b8d..0000000000000000000000000000000000000000 --- a/results/GenericAgent-Llama-3.1-70b/README.md +++ /dev/null @@ -1,46 +0,0 @@ -### GenericAgent-Llama-3.1-70b - -This agent is [GenericAgent](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/generic_agent.py) from [AgentLab](https://github.com/ServiceNow/AgentLab) - -It uses Llama-3.1-70b as a backend, with the following [flags](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/tmlr_config.py): -```python -BASE_FLAGS = GenericPromptFlags( - obs=dp.ObsFlags( - use_html=False, - use_ax_tree=True, - use_focused_element=True, - use_error_logs=True, - use_history=True, - use_past_error_logs=False, - use_action_history=True, - use_think_history=True, # gpt-4o config except for this line - use_diff=False, - html_type="pruned_html", - use_screenshot=False, - use_som=False, - extract_visible_tag=True, - extract_clickable_tag=True, - extract_coords="False", - filter_visible_elements_only=False, - ), - action=dp.ActionFlags( - multi_actions=False, - action_set="bid", - long_description=False, - individual_examples=False, - ), - use_plan=False, - use_criticise=False, - use_thinking=True, - use_memory=False, - use_concrete_example=True, - use_abstract_example=True, - use_hints=True, - enable_chat=False, - max_prompt_tokens=40_000, - be_cautious=True, - extra_instructions=None, -) -``` - -Note: Agents don't use vision except for VisualWebArena, where the vision flag is turned on (and the LLM suports it). \ No newline at end of file diff --git a/results/GenericAgent-Llama-3.1-70b/assistantbench.json b/results/GenericAgent-Llama-3.1-70b/assistantbench.json deleted file mode 100644 index 9b26a6c6c8b0cf81c1767f50f0e67ed3551a8304..0000000000000000000000000000000000000000 --- a/results/GenericAgent-Llama-3.1-70b/assistantbench.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "agent_name": "GenericAgent-Llama-3.1-70b", - "study_id": "d93a2398-2b70-41ce-b989-364fed988d73", - "benchmark": "AssistantBench", - "score": 2.8, - "std_err": 1.1, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "Intersection of finished tasks across agents.", - "original_or_reproduced": "Original", - "date_time": "2024-11-28 19:34:58" - } -] \ No newline at end of file diff --git a/results/GenericAgent-Llama-3.1-70b/miniwob.json b/results/GenericAgent-Llama-3.1-70b/miniwob.json deleted file mode 100644 index e5d35d7e28c7b14826373ef2bfc296e12be23d79..0000000000000000000000000000000000000000 --- a/results/GenericAgent-Llama-3.1-70b/miniwob.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "agent_name": "GenericAgent-Llama-3.1-70b", - "study_id": "2024-10-25_06-08-16", - "benchmark": "MiniWoB", - "score": 57.6, - "std_err": 2.0, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "NA", - "original_or_reproduced": "Original", - "date_time": "2024-10-25 17:16:23" - } -] \ No newline at end of file diff --git a/results/GenericAgent-Llama-3.1-70b/webarena.json b/results/GenericAgent-Llama-3.1-70b/webarena.json deleted file mode 100644 index cfe83f8c36849d49317072af5664e764877ea6a4..0000000000000000000000000000000000000000 --- a/results/GenericAgent-Llama-3.1-70b/webarena.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "agent_name": "GenericAgent-Llama-3.1-70b", - "study_id": "fc5747bc-d998-4942-a0eb-e55a3ccc1cb3", - "benchmark": "WebArena", - "score": 18.4, - "std_err": 1.4, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "NA", - "original_or_reproduced": "Original", - "date_time": "2024-12-02 23:18:38" - } -] \ No newline at end of file diff --git a/results/GenericAgent-Llama-3.1-70b/weblinx.json b/results/GenericAgent-Llama-3.1-70b/weblinx.json deleted file mode 100644 index f1787b30189e1ea0fdfbcfc1023ab55397016923..0000000000000000000000000000000000000000 --- a/results/GenericAgent-Llama-3.1-70b/weblinx.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "agent_name": "GenericAgent-Llama-3.1-70b", - "study_id": "b9451759-4f0e-492c-a3c8-fa5109d2d9b1", - "benchmark": "WebLINX", - "score": 8.9, - "std_err": 0.5, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "NA", - "original_or_reproduced": "Original", - "date_time": "2024-11-07 21:42:30" - } -] \ No newline at end of file diff --git a/results/GenericAgent-Llama-3.1-70b/workarena-l1.json b/results/GenericAgent-Llama-3.1-70b/workarena-l1.json deleted file mode 100644 index 58628881ea0cc3a127112cd2846be8104b3374df..0000000000000000000000000000000000000000 --- a/results/GenericAgent-Llama-3.1-70b/workarena-l1.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "agent_name": "GenericAgent-Llama-3.1-70b", - "study_id": "2024-10-23_14-17-40", - "benchmark": "WorkArena-L1", - "score": 27.9, - "std_err": 2.5, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "NA", - "original_or_reproduced": "Original", - "date_time": "2024-10-23 22:30:06" - } -] \ No newline at end of file diff --git a/results/GenericAgent-Llama-3.1-70b/workarena-l2.json b/results/GenericAgent-Llama-3.1-70b/workarena-l2.json deleted file mode 100644 index 6b8c5e0373cb06a95b49504ff09ff0caf81b14a2..0000000000000000000000000000000000000000 --- a/results/GenericAgent-Llama-3.1-70b/workarena-l2.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "agent_name": "GenericAgent-Llama-3.1-70b", - "study_id": "2024-10-23_17-10-46", - "date_time": "2024-10-24 17:08:53", - "benchmark": "WorkArena-L2", - "score": 2.1, - "std_err": 0.9, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "NA", - "original_or_reproduced": "Original" - } -] \ No newline at end of file diff --git a/results/GenericAgent-Llama-3.1-70b/workarena-l3.json b/results/GenericAgent-Llama-3.1-70b/workarena-l3.json deleted file mode 100644 index 7b3e635d0ceccda73484a603c7f679412645c86b..0000000000000000000000000000000000000000 --- a/results/GenericAgent-Llama-3.1-70b/workarena-l3.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "agent_name": "GenericAgent-Llama-3.1-70b", - "study_id": "-", - "benchmark": "WorkArena-L3", - "score": 0.0, - "std_err": 0.0, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "NA", - "original_or_reproduced": "Original", - "date_time": "2024-10-24 23:03:30" - } -] \ No newline at end of file diff --git a/results/GenericAgent-o3-mini/README.md b/results/GenericAgent-o3-mini/README.md deleted file mode 100644 index a9d14e4e4aef62c60e7ff233550db96c3fe301af..0000000000000000000000000000000000000000 --- a/results/GenericAgent-o3-mini/README.md +++ /dev/null @@ -1,46 +0,0 @@ -### GenericAgent-o3-mini - -This agent is [GenericAgent](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/generic_agent.py) from [AgentLab](https://github.com/ServiceNow/AgentLab) - -It uses o1-mini as a backend, with the following [flags](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/agent_configs.py): -```python -BASE_FLAGS = FLAGS_GPT_4o = GenericPromptFlags( - obs=dp.ObsFlags( - use_html=False, - use_ax_tree=True, - use_focused_element=True, - use_error_logs=True, - use_history=True, - use_past_error_logs=False, - use_action_history=True, - use_think_history=False, - use_diff=False, - html_type="pruned_html", - use_screenshot=False, - use_som=False, - extract_visible_tag=True, - extract_clickable_tag=True, - extract_coords="False", - filter_visible_elements_only=False, - ), - action=dp.ActionFlags( - action_set=bgym.HighLevelActionSetArgs( - subsets=["bid"], - multiaction=False, - ), - long_description=False, - individual_examples=False, - ), - use_plan=False, - use_criticise=False, - use_thinking=True, - use_memory=False, - use_concrete_example=True, - use_abstract_example=True, - use_hints=True, - enable_chat=False, - max_prompt_tokens=40_000, - be_cautious=True, - extra_instructions=None, -) -``` diff --git a/results/GenericAgent-o3-mini/workarena-l1.json b/results/GenericAgent-o3-mini/workarena-l1.json deleted file mode 100644 index 40fb1a60c37f04f0186948c72d63090fca80d3e5..0000000000000000000000000000000000000000 --- a/results/GenericAgent-o3-mini/workarena-l1.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "agent_name": "GenericAgent-o3-mini", - "study_id": "f3e1fcb8-5fc5-4115-9e00-27251508e2c7", - "date_time": "2025-02-07 14:00:00", - "benchmark": "WorkArena-L1", - "score": 48.2, - "std_err": 2.80, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "Additional details", - "original_or_reproduced": "Original" - } -] \ No newline at end of file diff --git a/results/OrbyAgent-ActIO-72b/README.md b/results/OrbyAgent-ActIO-72b/README.md deleted file mode 100644 index 5235a274088f5346c5c5125e6ebaca62fd9e27fa..0000000000000000000000000000000000000000 --- a/results/OrbyAgent-ActIO-72b/README.md +++ /dev/null @@ -1,7 +0,0 @@ -### OrbyAgent-ActIO-72b - -This agent is developed by [Orby AI](https://www.orby.ai/). - -The agent does not use any benchmark-specific information in the prompts. For WebArena benchmark, we use the original evaluator and task definitions for fair comparison. - -It uses the ActIO model of 72B parameters as a backend, with both screenshot and HTML as inputs. More details can be found in our [research blog](https://www.orby.ai/resources/elevating-automation-orby-ais-generic-agent-framework-and-self-adaptive-interface-learning-technique). diff --git a/results/OrbyAgent-ActIO-72b/miniwob.json b/results/OrbyAgent-ActIO-72b/miniwob.json deleted file mode 100644 index 3c1031b683ff5e324e3d20d4dc98719cff0183fd..0000000000000000000000000000000000000000 --- a/results/OrbyAgent-ActIO-72b/miniwob.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "agent_name": "OrbyAgent-ActIO-72b", - "study_id": "orby-agent-v0-actio-v0-miniwob", - "benchmark": "MiniWoB", - "score": 64.2, - "std_err": 1.4, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "NA", - "original_or_reproduced": "Original", - "date_time": "2025-02-21 15:03:35" - } -] \ No newline at end of file diff --git a/results/OrbyAgent-ActIO-72b/webarena.json b/results/OrbyAgent-ActIO-72b/webarena.json deleted file mode 100644 index 0aaf0d23fca4d6af41a7f7dfa983042e24cf0a08..0000000000000000000000000000000000000000 --- a/results/OrbyAgent-ActIO-72b/webarena.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "agent_name": "OrbyAgent-ActIO-72b", - "study_id": "b5fc5be7-54cc-4fc1-a9ee-73447b9c3eae", - "benchmark": "WebArena", - "score": 34.7, - "std_err": 0.25, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "Use original WebArena eval protocol and task definitions", - "original_or_reproduced": "Original", - "date_time": "2025-02-21 15:05:12" - } -] \ No newline at end of file diff --git a/results/OrbyAgent-Claude-3.5-Sonnet/README.md b/results/OrbyAgent-Claude-3.5-Sonnet/README.md deleted file mode 100644 index 4d6752f14c3116b4d4749aff95c3fbb00331f4c7..0000000000000000000000000000000000000000 --- a/results/OrbyAgent-Claude-3.5-Sonnet/README.md +++ /dev/null @@ -1,7 +0,0 @@ -### OrbyAgent-Claude-3.5-Sonnet - -This agent is developed by [Orby AI](https://www.orby.ai/). - -The agent does not use any benchmark-specific information in the prompts. For WebArena benchmark, we use the original evaluator and task definitions for fair comparison. - -It uses Claude-3.5-sonnet-20241022 as a backend, with both screenshot and HTML as inputs. More details can be found in our [research blog](https://www.orby.ai/resources/elevating-automation-orby-ais-generic-agent-framework-and-self-adaptive-interface-learning-technique). diff --git a/results/OrbyAgent-Claude-3.5-Sonnet/miniwob.json b/results/OrbyAgent-Claude-3.5-Sonnet/miniwob.json deleted file mode 100644 index 3b3f0b3a2df8e492f62c595cdbcb20b3dedc34bf..0000000000000000000000000000000000000000 --- a/results/OrbyAgent-Claude-3.5-Sonnet/miniwob.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "agent_name": "OrbyAgent-Claude-3.5-Sonnet", - "study_id": "orby-agent-v0-claude-3.5-miniwob", - "benchmark": "MiniWoB", - "score": 74.9, - "std_err": 1.2, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "NA", - "original_or_reproduced": "Original", - "date_time": "2025-02-21 14:54:16" - } -] \ No newline at end of file diff --git a/results/OrbyAgent-Claude-3.5-Sonnet/webarena.json b/results/OrbyAgent-Claude-3.5-Sonnet/webarena.json deleted file mode 100644 index 312bcfa3aea24fa0faa45e54aecc7b7a721381db..0000000000000000000000000000000000000000 --- a/results/OrbyAgent-Claude-3.5-Sonnet/webarena.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "agent_name": "OrbyAgent-Claude-3.5-Sonnet", - "study_id": "orby-agent-v0-claude-3.5-webarena", - "benchmark": "WebArena", - "score": 36.5, - "std_err": 0, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "Use original WebArena eval protocol and task definitions", - "original_or_reproduced": "Original", - "date_time": "2025-02-21 15:00:22" - } -] \ No newline at end of file diff --git a/results/test-agent/README.md b/results/test-agent/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ef031f56262cb73d52252e8886211670fb3a404f --- /dev/null +++ b/results/test-agent/README.md @@ -0,0 +1 @@ +### Test agent \ No newline at end of file diff --git a/results/test-agent/miniwob.json b/results/test-agent/miniwob.json new file mode 100644 index 0000000000000000000000000000000000000000..9b97057f9235e436d822ef77b46ceeeb7e7906c9 --- /dev/null +++ b/results/test-agent/miniwob.json @@ -0,0 +1,16 @@ +[ + { + "agent_name": "test-agent", + "study_id": "study_id", + "date_time": "2021-01-01 12:00:00", + "benchmark": "MiniWoB", + "score": 43.4, + "std_err": 0.1, + "benchmark_specific": "No", + "benchmark_tuned": "No", + "followed_evaluation_protocol": "Yes", + "reproducible": "Yes", + "comments": "NA", + "original_or_reproduced": "Original" + } +] \ No newline at end of file diff --git a/results/test-agent/webarena.json b/results/test-agent/webarena.json new file mode 100644 index 0000000000000000000000000000000000000000..794b67ba1052c2b689d83a899b44915d6083df2c --- /dev/null +++ b/results/test-agent/webarena.json @@ -0,0 +1,16 @@ +[ + { + "agent_name": "test-agent", + "study_id": "study_id", + "date_time": "2021-01-01 12:00:00", + "benchmark": "WebArena", + "score": 6.7, + "std_err": 0.2, + "benchmark_specific": "No", + "benchmark_tuned": "No", + "followed_evaluation_protocol": "Yes", + "reproducible": "Yes", + "comments": "NA", + "original_or_reproduced": "Original" + } +] \ No newline at end of file diff --git a/results/test-agent/workarena-l1.json b/results/test-agent/workarena-l1.json new file mode 100644 index 0000000000000000000000000000000000000000..3c1962b66be4242e529fbac430c7419dc3195bab --- /dev/null +++ b/results/test-agent/workarena-l1.json @@ -0,0 +1,16 @@ +[ + { + "agent_name": "test-agent", + "study_id": "study_id", + "date_time": "2021-01-01 12:00:00", + "benchmark": "WorkArena-L1", + "score": 6.1, + "std_err": 0.3, + "benchmark_specific": "No", + "benchmark_tuned": "No", + "followed_evaluation_protocol": "Yes", + "reproducible": "Yes", + "comments": "NA", + "original_or_reproduced": "Original" + } +] \ No newline at end of file diff --git a/results/test-agent/workarena-l2.json b/results/test-agent/workarena-l2.json new file mode 100644 index 0000000000000000000000000000000000000000..2995fd43638f3a04b99245a5856ef2b4133d7472 --- /dev/null +++ b/results/test-agent/workarena-l2.json @@ -0,0 +1,16 @@ +[ + { + "agent_name": "test-agent", + "study_id": "study_id", + "date_time": "2021-01-01 12:00:00", + "benchmark": "WorkArena-L2", + "score": 0.0, + "std_err": 0.0, + "benchmark_specific": "No", + "benchmark_tuned": "No", + "followed_evaluation_protocol": "Yes", + "reproducible": "Yes", + "comments": "NA", + "original_or_reproduced": "Original" + } +] \ No newline at end of file diff --git a/results/test-agent/workarena-l3.json b/results/test-agent/workarena-l3.json new file mode 100644 index 0000000000000000000000000000000000000000..c702e073fe98ef218dd665637ca315a69f9ff265 --- /dev/null +++ b/results/test-agent/workarena-l3.json @@ -0,0 +1,16 @@ +[ + { + "agent_name": "test-agent", + "study_id": "study_id", + "date_time": "2021-01-01 12:00:00", + "benchmark": "WorkArena-L3", + "score": 0.0, + "std_err": 0.0, + "benchmark_specific": "No", + "benchmark_tuned": "No", + "followed_evaluation_protocol": "Yes", + "reproducible": "Yes", + "comments": "NA", + "original_or_reproduced": "Original" + } +] \ No newline at end of file