diff --git a/README.md b/README.md
index cbf37025861706298dfabaa5fa2dd49af6c4d394..b2a401ca9600216b4414459fbcadda8ecd4575fc 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,6 @@ colorTo: green
 sdk: docker
 pinned: false
 license: mit
-short_description: Tracks perf of LLMs, VLMs and agents on web navigation tasks
 ---
 
 # BrowserGym Leaderboard
diff --git a/app.py b/app.py
index 6535dde0d99e6c6cb22bc7c2c47361d2e138b56e..017017def98deddc2ba5309a97180f6859a9e9a1 100644
--- a/app.py
+++ b/app.py
@@ -9,7 +9,6 @@ import plotly.graph_objs as go
 from huggingface_hub import HfApi
 from huggingface_hub.utils import RepositoryNotFoundError, RevisionNotFoundError
 import streamlit.components.v1 as components
-from datetime import datetime
 
 from urllib.parse import quote
 from pathlib import Path
@@ -17,7 +16,7 @@ import re
 import html
 from typing import Dict, Any
 
-BENCHMARKS = ["WebArena", "WorkArena-L1", "WorkArena-L2", "WorkArena-L3", "MiniWoB", "WebLINX", "VisualWebArena", "AssistantBench"]
+BENCHMARKS = ["WebArena", "WorkArena-L1", "WorkArena-L2", "WorkArena-L3", "MiniWoB",]
 
 def sanitize_agent_name(agent_name):
     # Only allow alphanumeric chars, hyphen, underscore
@@ -44,34 +43,12 @@ def sanitize_column_name(col: str) -> str:
     return html.escape(str(col))
 
 def sanitize_cell_value(value: Any) -> str:
+    """Sanitize cell values for HTML display"""
     if isinstance(value, (int, float)):
         return str(value)
-    if isinstance(value, str) and '±' in value:
-        score, std_err = value.split('±')
-        return f'{score.strip()} <span style="font-size: smaller; color: var(--lighter-color);">±{std_err.strip()}</span>'
     return html.escape(str(value))
 
 def create_html_table_main(df):
-    col1, col2 = st.columns([2,6])
-    with col1:
-        sort_column = st.selectbox("Sort by", df.columns.tolist(), index=df.columns.tolist().index("WebArena"), key="main_sort_column")
-    with col2:
-        sort_order = st.radio("Order", ["Ascending", "Descending"], index=1, horizontal=True, key="main_sort_order")
-    
-    def get_sort_value(row):
-            if row == "-":
-                return float('-inf')
-            else:
-                try:
-                    return float(row)
-                except ValueError:
-                    return row
-                
-    # Sort dataframe
-    if sort_order == "Ascending":
-        df = df.sort_values(by=sort_column, key=lambda x: x.apply(get_sort_value))
-    else:
-        df = df.sort_values(by=sort_column, ascending=False, key=lambda x: x.apply(get_sort_value))
     html = '''
     <style>
         table {
@@ -110,28 +87,7 @@ def create_html_table_main(df):
     html += '</div>'
     return html
 
-def create_html_table_benchmark(df, benchmark):
-    col1, col2 = st.columns([2,6])
-    with col1:
-        sort_column = st.selectbox("Sort by", df.columns.tolist(), index=df.columns.tolist().index("Score"), key=f"benchmark_sort_column_{benchmark}")
-    with col2:
-        sort_order = st.radio("Order", ["Ascending", "Descending"], index=1, horizontal=True, key=f"benchmark_sort_order_{benchmark}")
-    
-    def get_sort_value(row):
-            if row == "-":
-                return float('-inf')
-            else:
-                try:
-                    return float(row)
-                except ValueError:
-                    return row
-                
-    # Sort dataframe
-    if sort_order == "Ascending":
-        df = df.sort_values(by=sort_column, key=lambda x: x.apply(get_sort_value))
-    else:
-        df = df.sort_values(by=sort_column, ascending=False, key=lambda x: x.apply(get_sort_value))
-
+def create_html_table_benchmark(df):
     html = '''
     <style>
         table {
@@ -155,9 +111,8 @@ def create_html_table_benchmark(df, benchmark):
     html += '<table>'
     html += '<thead><tr>'
     for column in df.columns:
-        if column == "Reproduced_all" or column == "std_err":
-            continue
-        html += f'<th>{sanitize_column_name(column)}</th>'
+        if column != "Reproduced_all":
+            html += f'<th>{sanitize_column_name(column)}</th>'
     html += '</tr></thead>'
     html += '<tbody>'
     for _, row in df.iterrows():
@@ -170,11 +125,8 @@ def create_html_table_benchmark(df, benchmark):
                     summary = sanitize_cell_value(row[column])
                     details = "<br>".join(map(sanitize_cell_value, row["Reproduced_all"]))
                     html += f'<td><details><summary>{summary}</summary>{details}</details></td>'
-            elif column == "Reproduced_all" or column == "std_err":
+            elif column == "Reproduced_all":
                 continue
-            elif column == "Score":
-                score_with_std_err = f'{row[column]} ± {row["std_err"]}'
-                html += f'<td>{sanitize_cell_value(score_with_std_err)}</td>'
             else:
                 html += f'<td>{sanitize_cell_value(row[column])}</td>'
         html += '</tr>'
@@ -209,19 +161,6 @@ def check_sanity(agent):
 
 def main():
     st.set_page_config(page_title="BrowserGym Leaderboard", layout="wide", initial_sidebar_state="expanded")
-    st.markdown("""
-        <style>
-        :root {
-            --lighter-color: #888; /* Default for light theme */
-        }
-        @media (prefers-color-scheme: dark) {
-            :root {
-                --lighter-color: #ccc; /* Default for dark theme */
-            }
-        }
-        </style>
-    """, unsafe_allow_html=True)
-
     st.markdown("""
         <head>
             <meta http-equiv="Content-Security-Policy" 
@@ -244,10 +183,7 @@ def main():
             continue
         agent_results = []
         for benchmark in BENCHMARKS:
-            file_path = safe_path_join(agent, f"{benchmark.lower()}.json")
-            if not file_path.is_file():
-                continue
-            with open(file_path) as f:
+            with open(f"results/{agent}/{benchmark.lower()}.json") as f:
                 agent_results.extend(json.load(f))
         all_results[agent] = agent_results
 
@@ -281,9 +217,11 @@ def main():
         if dfs_to_concat:
             df = pd.concat(dfs_to_concat, ignore_index=True)
 
-        for benchmark in BENCHMARKS:
-            df[benchmark] = df[benchmark].apply(lambda x: f"{x:.2f}" if x != "-" else "-")
-            df[benchmark] = df[benchmark].astype(str)
+        # df['Average'] = sum(df[column] for column in BENCHMARKS)/len(BENCHMARKS)
+        # df['Average'] = df['Average'].round(2)
+        # Sort values
+        df = df.sort_values(by='WebArena', ascending=False)
+
         # Add a search bar
         search_query = st.text_input("Search agents", "", key="search_main")
 
@@ -302,6 +240,14 @@ def main():
                 return ""
         
         df['Agent'] = df['Agent'].apply(make_hyperlink)
+        # st.dataframe(
+        #     df[['Agent'] + BENCHMARKS],
+        #     use_container_width=True,
+        #     column_config={benchmark: {'alignment': 'center'} for benchmark in BENCHMARKS},
+        #     hide_index=True,
+        #     # height=int(len(df) * 36.2),
+        # )
+        # st.markdown(df.to_html(escape=False, index=False), unsafe_allow_html=True)
         html_table = create_html_table_main(df)
         st.markdown(html_table, unsafe_allow_html=True)
 
@@ -449,21 +395,18 @@ MIT
                     for value in values:
                         if value["benchmark"] == benchmark and value["original_or_reproduced"] == "Original":
                             result_dict["Score"] = value["score"]
-                            result_dict["std_err"] = value["std_err"]
                             result_dict["Benchmark Specific"] = value["benchmark_specific"]
                             result_dict["Benchmark Tuned"] = value["benchmark_tuned"]
                             result_dict["Followed Evaluation Protocol"] = value["followed_evaluation_protocol"]
                             result_dict["Reproducible"] = value["reproducible"]
                             result_dict["Comments"] = value["comments"]
                             result_dict["Study ID"] = value["study_id"]
-                            value["date_time"] = datetime.strptime(value["date_time"], "%Y-%m-%d %H:%M:%S").strftime("%B %d, %Y %I:%M %p")
                             result_dict["Date"] = value["date_time"]
                             result_dict["Reproduced"] = []
                             result_dict["Reproduced_all"] = []
                             flag = 1
                         if not flag:
                             result_dict["Score"] = "-"
-                            result_dict["std_err"] = "-"
                             result_dict["Benchmark Specific"] = "-"
                             result_dict["Benchmark Tuned"] = "-"
                             result_dict["Followed Evaluation Protocol"] = "-"
@@ -475,7 +418,6 @@ MIT
                             result_dict["Reproduced_all"] = []
                         if value["benchmark"] == benchmark and value["original_or_reproduced"] == "Reproduced":
                             result_dict["Reproduced"].append(value["score"])
-                            value["date_time"] = datetime.strptime(value["date_time"], "%Y-%m-%d %H:%M:%S").strftime("%B %d, %Y %I:%M %p")
                             result_dict["Reproduced_all"].append(", ".join([str(value["score"]), str(value["date_time"])]))
                     if result_dict["Reproduced"]:
                         result_dict["Reproduced"] = str(min(result_dict["Reproduced"])) + " - " + str(max(result_dict["Reproduced"]))
@@ -493,10 +435,14 @@ MIT
             # Concatenate the DataFrames
             if dfs_to_concat:
                 df_ = pd.concat(dfs_to_concat, ignore_index=True)
-            df_['Score'] = df_['Score'].apply(lambda x: f"{x:.2f}" if x != "-" else "-")
-            df_['std_err'] = df_['std_err'].apply(lambda x: f"{x:.1f}" if x != "-" else "-")
-            df_['Score'] = df_['Score'].astype(str)
-            html_table = create_html_table_benchmark(df_, benchmark)
+            # st.markdown(f"<h2 id='{benchmark.lower()}'>{benchmark}</h2>", unsafe_allow_html=True)
+            # st.dataframe(
+            #     df_,
+            #     use_container_width=True,
+            #     column_config={benchmark: {'alignment': 'center'}},
+            #     hide_index=True,
+            # )
+            html_table = create_html_table_benchmark(df_)
             st.markdown(html_table, unsafe_allow_html=True)
                 
         
diff --git a/results/Bgym-GPT-3.5/README.md b/results/Bgym-GPT-3.5/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..f15589f8889e87445e98f51746ac426bcb81b9c0
--- /dev/null
+++ b/results/Bgym-GPT-3.5/README.md
@@ -0,0 +1 @@
+## GPT-3.5 model
\ No newline at end of file
diff --git a/results/Bgym-GPT-3.5/config.json b/results/Bgym-GPT-3.5/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..3ea3825d4f9cc8568d9cdfb93eac5fcb48a07f86
--- /dev/null
+++ b/results/Bgym-GPT-3.5/config.json
@@ -0,0 +1,4 @@
+{
+    "agent_name": "GPT-3.5",
+    "backend_llm": "GPT-3.5"
+}
\ No newline at end of file
diff --git a/results/GenericAgent-GPT-4o/miniwob.json b/results/Bgym-GPT-3.5/miniwob.json
similarity index 69%
rename from results/GenericAgent-GPT-4o/miniwob.json
rename to results/Bgym-GPT-3.5/miniwob.json
index b3aa2029432187b2737e378d5a7127aae52230b1..b4d117fa36474116ffc4e8392cd375ccfeb52e6d 100644
--- a/results/GenericAgent-GPT-4o/miniwob.json
+++ b/results/Bgym-GPT-3.5/miniwob.json
@@ -1,11 +1,11 @@
 [
     {
-        "agent_name": "GenericAgent-GPT-4o",
-        "study_id": "2024-10-25_06-08-16",
+        "agent_name": "Bgym-GPT-3.5",
+        "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "MiniWoB",
-        "score": 63.8,
-        "std_err": 1.9,
+        "score": 43.4,
+        "std_err": 0.1,
         "benchmark_specific": "No",
         "benchmark_tuned": "No",
         "followed_evaluation_protocol": "Yes",
diff --git a/results/Bgym-GPT-3.5/webarena.json b/results/Bgym-GPT-3.5/webarena.json
new file mode 100644
index 0000000000000000000000000000000000000000..7b352122f2d3056156d07f11f086e85753b090e4
--- /dev/null
+++ b/results/Bgym-GPT-3.5/webarena.json
@@ -0,0 +1,16 @@
+[
+    {
+        "agent_name": "Bgym-GPT-3.5",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "WebArena",
+        "score": 6.7,
+        "std_err": 0.2,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]
\ No newline at end of file
diff --git a/results/Bgym-GPT-3.5/workarena-l1.json b/results/Bgym-GPT-3.5/workarena-l1.json
new file mode 100644
index 0000000000000000000000000000000000000000..81d2975b3eef0a9f509633ca1bd68d19ac7a689a
--- /dev/null
+++ b/results/Bgym-GPT-3.5/workarena-l1.json
@@ -0,0 +1,44 @@
+[
+    {
+        "agent_name": "Bgym-GPT-3.5",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "WorkArena-L1",
+        "score": 6.1,
+        "std_err": 0.3,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    },
+    {
+        "agent_name": "Bgym-GPT-3.5",
+        "study_id": "study_id",
+        "benchmark": "WorkArena-L1",
+        "score": 5.7,
+        "std_err": 0.3,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Reproduced",
+        "date_time": "2021-01-04 12:06:00"
+    },
+    {
+        "benchmark": "WorkArena-L1",
+        "agent_name": "Bgym-GPT-3.5",
+        "study_id": "study_id",
+        "score": 5.1,
+        "std_err": 0.3,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Reproduced",
+        "date_time": "2021-01-04 12:06:00"
+    }
+]
\ No newline at end of file
diff --git a/results/GenericAgent-GPT-4o/workarena-l2.json b/results/Bgym-GPT-3.5/workarena-l2.json
similarity index 70%
rename from results/GenericAgent-GPT-4o/workarena-l2.json
rename to results/Bgym-GPT-3.5/workarena-l2.json
index 9c8f1c07bef2ac72f54b9243f71427ea5bd65a04..ad6ab82a380e20dd6ffad919d2d9860703bce2ee 100644
--- a/results/GenericAgent-GPT-4o/workarena-l2.json
+++ b/results/Bgym-GPT-3.5/workarena-l2.json
@@ -1,11 +1,11 @@
 [
     {
-        "agent_name": "GenericAgent-GPT-4o",
-        "study_id": "2024-10-23_17-10-46",
+        "agent_name": "Bgym-GPT-3.5",
+        "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "WorkArena-L2",
-        "score": 8.5,
-        "std_err": 1.8,
+        "score": 0.0,
+        "std_err": 0.0,
         "benchmark_specific": "No",
         "benchmark_tuned": "No",
         "followed_evaluation_protocol": "Yes",
diff --git a/results/GenericAgent-GPT-4o/workarena-l3.json b/results/Bgym-GPT-3.5/workarena-l3.json
similarity index 73%
rename from results/GenericAgent-GPT-4o/workarena-l3.json
rename to results/Bgym-GPT-3.5/workarena-l3.json
index 01650347b957bcca9ab6fe3d587f2f4f56ad4858..40093a485842f340d16d25af5768e8d066377a05 100644
--- a/results/GenericAgent-GPT-4o/workarena-l3.json
+++ b/results/Bgym-GPT-3.5/workarena-l3.json
@@ -1,8 +1,8 @@
 [
     {
-        "agent_name": "GenericAgent-GPT-4o",
-        "study_id": "-",
-        "date_time": "2024-10-24 23:03:30",
+        "agent_name": "Bgym-GPT-3.5",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
         "benchmark": "WorkArena-L3",
         "score": 0.0,
         "std_err": 0.0,
diff --git a/results/Bgym-GPT-4o-V/README.md b/results/Bgym-GPT-4o-V/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..065c2f2bbfe5c0845debe1baa0a086f2dd2c019a
--- /dev/null
+++ b/results/Bgym-GPT-4o-V/README.md
@@ -0,0 +1 @@
+## GPT-4o-V model
\ No newline at end of file
diff --git a/results/Bgym-GPT-4o-V/config.json b/results/Bgym-GPT-4o-V/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ebdbb1db77cb11d5a01b6f7568f6c8196e881818
--- /dev/null
+++ b/results/Bgym-GPT-4o-V/config.json
@@ -0,0 +1,4 @@
+{
+    "agent_name": "GPT-4o-V",
+    "backend_llm": "GPT-4o-V"
+}
\ No newline at end of file
diff --git a/results/GenericAgent-GPT-4o-mini/miniwob.json b/results/Bgym-GPT-4o-V/miniwob.json
similarity index 68%
rename from results/GenericAgent-GPT-4o-mini/miniwob.json
rename to results/Bgym-GPT-4o-V/miniwob.json
index 8fd1b099c7968b9f41faad3dfd46fbe340ab535a..1090c29ca8017fddd7eb43d7de424c4ef5115f7c 100644
--- a/results/GenericAgent-GPT-4o-mini/miniwob.json
+++ b/results/Bgym-GPT-4o-V/miniwob.json
@@ -1,11 +1,11 @@
 [
     {
-        "agent_name": "GenericAgent-GPT-4o-mini",
-        "study_id": "2024-10-25_06-08-16",
+        "agent_name": "Bgym-GPT-4o-V",
+        "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "MiniWoB",
-        "score": 56.6,
-        "std_err": 2.0,
+        "score": 72.5,
+        "std_err": 0.5,
         "benchmark_specific": "No",
         "benchmark_tuned": "No",
         "followed_evaluation_protocol": "Yes",
diff --git a/results/Bgym-GPT-4o-V/webarena.json b/results/Bgym-GPT-4o-V/webarena.json
new file mode 100644
index 0000000000000000000000000000000000000000..4908982e7d053542eeaaf8f2410aa794dc05d52b
--- /dev/null
+++ b/results/Bgym-GPT-4o-V/webarena.json
@@ -0,0 +1,16 @@
+[
+    {
+        "agent_name": "Bgym-GPT-4o-V",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "WebArena",
+        "score": 24.0,
+        "std_err": 0.4,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]
\ No newline at end of file
diff --git a/results/GenericAgent-GPT-4o/workarena-l1.json b/results/Bgym-GPT-4o-V/workarena-l1.json
similarity index 69%
rename from results/GenericAgent-GPT-4o/workarena-l1.json
rename to results/Bgym-GPT-4o-V/workarena-l1.json
index 17de417aa4cacdc9f4bfde75c3d22b26820324b3..ed6776d3ca134d76b4e69886ba372792b5ea212a 100644
--- a/results/GenericAgent-GPT-4o/workarena-l1.json
+++ b/results/Bgym-GPT-4o-V/workarena-l1.json
@@ -1,11 +1,11 @@
 [
     {
-        "agent_name": "GenericAgent-GPT-4o",
-        "study_id": "2024-10-23_14-17-40",
+        "agent_name": "Bgym-GPT-4o-V",
+        "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "WorkArena-L1",
-        "score": 45.5,
-        "std_err": 2.7,
+        "score": 41.8,
+        "std_err": 0.4,
         "benchmark_specific": "No",
         "benchmark_tuned": "No",
         "followed_evaluation_protocol": "Yes",
diff --git a/results/GenericAgent-GPT-4o-mini/workarena-l2.json b/results/Bgym-GPT-4o-V/workarena-l2.json
similarity index 69%
rename from results/GenericAgent-GPT-4o-mini/workarena-l2.json
rename to results/Bgym-GPT-4o-V/workarena-l2.json
index ba423ad1010b93ecf159e2db44e428376b66a90c..25e2c312fd03d6b61211943add71430d7bbf1003 100644
--- a/results/GenericAgent-GPT-4o-mini/workarena-l2.json
+++ b/results/Bgym-GPT-4o-V/workarena-l2.json
@@ -1,11 +1,11 @@
 [
     {
-        "agent_name": "GenericAgent-GPT-4o-mini",
-        "study_id": "2024-10-23_17-10-46",
+        "agent_name": "Bgym-GPT-4o-V",
+        "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "WorkArena-L2",
-        "score": 1.3,
-        "std_err": 0.7,
+        "score": 3.8,
+        "std_err": 0.6,
         "benchmark_specific": "No",
         "benchmark_tuned": "No",
         "followed_evaluation_protocol": "Yes",
diff --git a/results/GenericAgent-GPT-4o-mini/workarena-l3.json b/results/Bgym-GPT-4o-V/workarena-l3.json
similarity index 72%
rename from results/GenericAgent-GPT-4o-mini/workarena-l3.json
rename to results/Bgym-GPT-4o-V/workarena-l3.json
index 43aab700e44b6fc39160b306b7bbbcf56c507b05..e9b990349435d7c131cee59d0dc559d4dafbd377 100644
--- a/results/GenericAgent-GPT-4o-mini/workarena-l3.json
+++ b/results/Bgym-GPT-4o-V/workarena-l3.json
@@ -1,8 +1,8 @@
 [
     {
-        "agent_name": "GenericAgent-GPT-4o-mini",
-        "study_id": "-",
-        "date_time": "2024-10-24 23:03:30",
+        "agent_name": "Bgym-GPT-4o-V",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
         "benchmark": "WorkArena-L3",
         "score": 0.0,
         "std_err": 0.0,
diff --git a/results/Bgym-GPT-4o/README.md b/results/Bgym-GPT-4o/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..f521ba2aaa82109c0f67a1c19ff0811ffb0e1085
--- /dev/null
+++ b/results/Bgym-GPT-4o/README.md
@@ -0,0 +1 @@
+## GPT-4o model
\ No newline at end of file
diff --git a/results/Bgym-GPT-4o/config.json b/results/Bgym-GPT-4o/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..1a08adf507dc482df79c852e11b393c756a6fd71
--- /dev/null
+++ b/results/Bgym-GPT-4o/config.json
@@ -0,0 +1,4 @@
+{
+    "agent_name": "GPT-4o",
+    "backend_llm": "GPT-4o"
+}
\ No newline at end of file
diff --git a/results/Bgym-GPT-4o/miniwob.json b/results/Bgym-GPT-4o/miniwob.json
new file mode 100644
index 0000000000000000000000000000000000000000..400f137bef1d0fdceb662fafbeebe73a743934b5
--- /dev/null
+++ b/results/Bgym-GPT-4o/miniwob.json
@@ -0,0 +1,16 @@
+[
+    {
+        "agent_name": "Bgym-GPT-4o",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "MiniWoB",
+        "score": 71.3,
+        "std_err": 0.5,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]
\ No newline at end of file
diff --git a/results/Bgym-GPT-4o/webarena.json b/results/Bgym-GPT-4o/webarena.json
new file mode 100644
index 0000000000000000000000000000000000000000..388494e96508a4273f5974e83c45fad8f84cf172
--- /dev/null
+++ b/results/Bgym-GPT-4o/webarena.json
@@ -0,0 +1,16 @@
+[
+    {
+        "agent_name": "Bgym-GPT-4o",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "WebArena",
+        "score": 23.5,
+        "std_err": 0.4,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]
\ No newline at end of file
diff --git a/results/GenericAgent-GPT-4o-mini/workarena-l1.json b/results/Bgym-GPT-4o/workarena-l1.json
similarity index 69%
rename from results/GenericAgent-GPT-4o-mini/workarena-l1.json
rename to results/Bgym-GPT-4o/workarena-l1.json
index ca27a541ef9ffe198860ad51a9db327b9928adbd..20cc70c2a512e602e3c3fa4781ca0f9d635002ab 100644
--- a/results/GenericAgent-GPT-4o-mini/workarena-l1.json
+++ b/results/Bgym-GPT-4o/workarena-l1.json
@@ -1,11 +1,11 @@
 [
     {
-        "agent_name": "GenericAgent-GPT-4o-mini",
-        "study_id": "2024-10-23_14-17-40",
+        "agent_name": "Bgym-GPT-4o",
+        "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "WorkArena-L1",
-        "score": 27,
-        "std_err": 2.4,
+        "score": 42.7,
+        "std_err": 0.4,
         "benchmark_specific": "No",
         "benchmark_tuned": "No",
         "followed_evaluation_protocol": "Yes",
diff --git a/results/GenericAgent-Claude-3.5-Sonnet/workarena-l2.json b/results/Bgym-GPT-4o/workarena-l2.json
similarity index 68%
rename from results/GenericAgent-Claude-3.5-Sonnet/workarena-l2.json
rename to results/Bgym-GPT-4o/workarena-l2.json
index a0927edab449cd654a9ab66b6fb4524d3fd6bc6d..26495ed99ecb04d7d68dbcec2228cdb0f72596a3 100644
--- a/results/GenericAgent-Claude-3.5-Sonnet/workarena-l2.json
+++ b/results/Bgym-GPT-4o/workarena-l2.json
@@ -1,11 +1,11 @@
 [
     {
-        "agent_name": "GenericAgent-Claude-3.5-Sonnet",
-        "study_id": "2024-10-23_17-10-46",
+        "agent_name": "Bgym-GPT-4o",
+        "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "WorkArena-L2",
-        "score": 39.1,
-        "std_err": 3.2,
+        "score": 3.0,
+        "std_err": 0.6,
         "benchmark_specific": "No",
         "benchmark_tuned": "No",
         "followed_evaluation_protocol": "Yes",
diff --git a/results/GenericAgent-GPT-o1-mini/workarena-l3.json b/results/Bgym-GPT-4o/workarena-l3.json
similarity index 72%
rename from results/GenericAgent-GPT-o1-mini/workarena-l3.json
rename to results/Bgym-GPT-4o/workarena-l3.json
index 4f824ed4a043f2b39890ce4c9eb2ea4fd98008e7..a5ec6780ea9ca807b4a8ef4d401ca14d33f7e3d6 100644
--- a/results/GenericAgent-GPT-o1-mini/workarena-l3.json
+++ b/results/Bgym-GPT-4o/workarena-l3.json
@@ -1,8 +1,8 @@
 [
     {
-        "agent_name": "GenericAgent-GPT-o1-mini",
-        "study_id": "-",
-        "date_time": "2024-10-24 23:03:30",
+        "agent_name": "Bgym-GPT-4o",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
         "benchmark": "WorkArena-L3",
         "score": 0.0,
         "std_err": 0.0,
diff --git a/results/Bgym-Llama-3-70b/README.md b/results/Bgym-Llama-3-70b/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..8798ff4c72825c5049ef24cf16b818faa9ae5d2b
--- /dev/null
+++ b/results/Bgym-Llama-3-70b/README.md
@@ -0,0 +1 @@
+## Llama-3-70B
\ No newline at end of file
diff --git a/results/Bgym-Llama-3-70b/config.json b/results/Bgym-Llama-3-70b/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..6aa02bf47d3ad80e2616b77e62c40f0103d722c6
--- /dev/null
+++ b/results/Bgym-Llama-3-70b/config.json
@@ -0,0 +1,4 @@
+{
+    "agent_name": "Llama-3-70B",
+    "backend_llm": "Llama-3-70B"
+}
\ No newline at end of file
diff --git a/results/Bgym-Llama-3-70b/miniwob.json b/results/Bgym-Llama-3-70b/miniwob.json
new file mode 100644
index 0000000000000000000000000000000000000000..5dadad99d077784d8a2c662c6262722c34e834f8
--- /dev/null
+++ b/results/Bgym-Llama-3-70b/miniwob.json
@@ -0,0 +1,16 @@
+[
+    {
+        "agent_name": "Bgym-Llama-3-70b",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "MiniWoB",
+        "score": 68.2,
+        "std_err": 0.7,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]
\ No newline at end of file
diff --git a/results/Bgym-Llama-3-70b/webarena.json b/results/Bgym-Llama-3-70b/webarena.json
new file mode 100644
index 0000000000000000000000000000000000000000..6c229ed5d2d97ac3623759b8f7f7131a3830abea
--- /dev/null
+++ b/results/Bgym-Llama-3-70b/webarena.json
@@ -0,0 +1,16 @@
+[
+    {
+        "agent_name": "Bgym-Llama-3-70b",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "WebArena",
+        "score": 11.0,
+        "std_err": 0.3,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]
\ No newline at end of file
diff --git a/results/Bgym-Llama-3-70b/workarena-l1.json b/results/Bgym-Llama-3-70b/workarena-l1.json
new file mode 100644
index 0000000000000000000000000000000000000000..59c5280fdc8de77cdb38d2c73398a50addaa5220
--- /dev/null
+++ b/results/Bgym-Llama-3-70b/workarena-l1.json
@@ -0,0 +1,58 @@
+[
+    {
+        "agent_name": "Bgym-Llama-3-70b",
+        "study_id": "study_id",
+        "benchmark": "WorkArena-L1",
+        "score": 17.9,
+        "std_err": 0.6,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original",
+        "date_time": "2021-01-01 12:00:00"
+    },
+    {
+        "agent_name": "Bgym-Llama-3-70b",
+        "study_id": "study_id",
+        "benchmark": "WorkArena-L1",
+        "score": 15.9,
+        "std_err": 0.6,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Reproduced",
+        "date_time": "2021-01-04 12:06:00"
+    },
+    {
+        "agent_name": "Bgym-Llama-3-70b",
+        "study_id": "study_id",
+        "benchmark": "WorkArena-L1",
+        "score": 19.9,
+        "std_err": 0.6,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Reproduced",
+        "date_time": "2021-01-05 02:07:00"
+    },
+    {
+        "agent_name": "Bgym-Llama-3-70b",
+        "study_id": "study_id",
+        "benchmark": "WorkArena-L1",
+        "score": 17.9,
+        "std_err": 0.6,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Reproduced",
+        "date_time": "2021-01-12 12:00:00"
+    }
+]
\ No newline at end of file
diff --git a/results/Bgym-Llama-3-70b/workarena-l2.json b/results/Bgym-Llama-3-70b/workarena-l2.json
new file mode 100644
index 0000000000000000000000000000000000000000..0f0f8451f47f7022fe3b85923f9356c2f1617c15
--- /dev/null
+++ b/results/Bgym-Llama-3-70b/workarena-l2.json
@@ -0,0 +1,16 @@
+[
+    {
+        "agent_name": "Bgym-Llama-3-70b",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "WorkArena-L2",
+        "score": 0.0,
+        "std_err": 0.0,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]
\ No newline at end of file
diff --git a/results/GenericAgent-Claude-3.5-Sonnet/workarena-l3.json b/results/Bgym-Llama-3-70b/workarena-l3.json
similarity index 68%
rename from results/GenericAgent-Claude-3.5-Sonnet/workarena-l3.json
rename to results/Bgym-Llama-3-70b/workarena-l3.json
index dc85da14be3c4f1722f11f2c30e28db8ceeaf670..acf0a81f58b3ab0e03178ad4f7ae27cb9fec4e97 100644
--- a/results/GenericAgent-Claude-3.5-Sonnet/workarena-l3.json
+++ b/results/Bgym-Llama-3-70b/workarena-l3.json
@@ -1,11 +1,11 @@
 [
     {
-        "agent_name": "GenericAgent-Claude-3.5-Sonnet",
-        "study_id": "2024-10-24_18-06-57",
+        "agent_name": "Bgym-Llama-3-70b",
+        "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "WorkArena-L3",
-        "score": 0.4,
-        "std_err": 0.4,
+        "score": 0.0,
+        "std_err": 0.0,
         "benchmark_specific": "No",
         "benchmark_tuned": "No",
         "followed_evaluation_protocol": "Yes",
diff --git a/results/Bgym-Mixtral-8x22b/README.md b/results/Bgym-Mixtral-8x22b/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..25b17de698790810b5a434228a08691aa048e4ca
--- /dev/null
+++ b/results/Bgym-Mixtral-8x22b/README.md
@@ -0,0 +1 @@
+## Mixtral 8x22B
\ No newline at end of file
diff --git a/results/Bgym-Mixtral-8x22b/config.json b/results/Bgym-Mixtral-8x22b/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..77a92a7ee77cc1a96eeb37a6923290294b7a3fdb
--- /dev/null
+++ b/results/Bgym-Mixtral-8x22b/config.json
@@ -0,0 +1,4 @@
+{
+    "agent_name": "Mixtral-8x22B",
+    "backend_llm": "Mixtral-8x22B"
+}
\ No newline at end of file
diff --git a/results/Bgym-Mixtral-8x22b/miniwob.json b/results/Bgym-Mixtral-8x22b/miniwob.json
new file mode 100644
index 0000000000000000000000000000000000000000..0b6ea125b66d8032f19d3922284cadcfe7e5b957
--- /dev/null
+++ b/results/Bgym-Mixtral-8x22b/miniwob.json
@@ -0,0 +1,16 @@
+[
+    {
+        "agent_name": "Bgym-Mixtral-8x22b",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "MiniWoB",
+        "score": 62.4,
+        "std_err": 0.5,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]
\ No newline at end of file
diff --git a/results/Bgym-Mixtral-8x22b/webarena.json b/results/Bgym-Mixtral-8x22b/webarena.json
new file mode 100644
index 0000000000000000000000000000000000000000..823344e109e805942efd8e34caa90d2e4a0c4d33
--- /dev/null
+++ b/results/Bgym-Mixtral-8x22b/webarena.json
@@ -0,0 +1,16 @@
+[
+    {
+        "agent_name": "Bgym-Mixtral-8x22b",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "WebArena",
+        "score": 12.6,
+        "std_err": 0.9,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]
\ No newline at end of file
diff --git a/results/Bgym-Mixtral-8x22b/workarena-l1.json b/results/Bgym-Mixtral-8x22b/workarena-l1.json
new file mode 100644
index 0000000000000000000000000000000000000000..eccaa19d2a1980051186312bdf50a545fac60836
--- /dev/null
+++ b/results/Bgym-Mixtral-8x22b/workarena-l1.json
@@ -0,0 +1,44 @@
+[
+    {
+        "agent_name": "Bgym-Mixtral-8x22b",
+        "study_id": "study_id",
+        "benchmark": "WorkArena-L1",
+        "score": 12.4,
+        "std_err": 0.7,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original",
+        "date_time": "2021-01-04 12:06:00"
+    },
+    {
+        "agent_name": "Bgym-Mixtral-8x22b",
+        "study_id": "study_id",
+        "benchmark": "WorkArena-L1",
+        "score": 11.4,
+        "std_err": 0.7,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Reproduced",
+        "date_time": "2021-01-04 12:06:00"
+    },
+    {
+        "agent_name": "Bgym-Mixtral-8x22b",
+        "study_id": "study_id",
+        "benchmark": "WorkArena-L1",
+        "score": 13.4,
+        "std_err": 0.7,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Reproduced",
+        "date_time": "2021-01-04 12:06:00"
+    }
+]
\ No newline at end of file
diff --git a/results/Bgym-Mixtral-8x22b/workarena-l2.json b/results/Bgym-Mixtral-8x22b/workarena-l2.json
new file mode 100644
index 0000000000000000000000000000000000000000..fbc2324755d166804d3586bf0f54766c3c940232
--- /dev/null
+++ b/results/Bgym-Mixtral-8x22b/workarena-l2.json
@@ -0,0 +1,16 @@
+[
+    {
+        "agent_name": "Bgym-Mixtral-8x22b",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "WorkArena-L2",
+        "score": 0.0,
+        "std_err": 0.0,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]
\ No newline at end of file
diff --git a/results/Bgym-Mixtral-8x22b/workarena-l3.json b/results/Bgym-Mixtral-8x22b/workarena-l3.json
new file mode 100644
index 0000000000000000000000000000000000000000..2cfe04fa51a97f69d7172f239484939e0716102a
--- /dev/null
+++ b/results/Bgym-Mixtral-8x22b/workarena-l3.json
@@ -0,0 +1,16 @@
+[
+    {
+        "agent_name": "Bgym-Mixtral-8x22b",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "WorkArena-L3",
+        "score": 0.0,
+        "std_err": 0.0,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]
\ No newline at end of file
diff --git a/results/GenericAgent-AgentTrek-1.0-32b/README.md b/results/GenericAgent-AgentTrek-1.0-32b/README.md
deleted file mode 100644
index 572348c9ec90c9c9c37366a60916c3c3f76b3bd3..0000000000000000000000000000000000000000
--- a/results/GenericAgent-AgentTrek-1.0-32b/README.md
+++ /dev/null
@@ -1,85 +0,0 @@
-### GenericAgent-AgentTrek-1.0-32b
-
-this agent is GenericAgent from Agentlab
-
-- **Base Model:**
-
-  - Qwen/Qwen2.5-32B-Instruct
-- **Architecture:**
-
-  - Type: Causal Language Models
-  - Training Stage: Pretraining & Post-training
-  - Architecture: transformers with RoPE, SwiGLU, RMSNorm, and Attention QKV bias
-  - Number of Parameters: 32.5B
-  - Number of Paramaters (Non-Embedding): 31.0B
-  - Number of Layers: 64
-  - Number of Attention Heads (GQA): 40 for Q and 8 for KV
-- Input/Output Format:
-
-  - with the following flags:
-    ```txt
-    flags=GenericPromptFlags(
-        obs=ObsFlags(
-            use_html=True,
-            use_ax_tree=True,
-            use_tabs=False,
-            use_focused_element=False,
-            use_error_logs=True,
-            use_history=True,
-            use_past_error_logs=False,
-            use_action_history=True,
-            use_think_history=False,
-            use_diff=False,
-            html_type='pruned_html',
-            use_screenshot=False,
-            use_som=False,
-            extract_visible_tag=False,
-            extract_clickable_tag=False,
-            extract_coords='False',
-            filter_visible_elements_only=False,
-            openai_vision_detail='auto',
-            filter_with_bid_only=False,
-            filter_som_only=False
-        ),
-        action=ActionFlags(
-            action_set=HighLevelActionSetArgs(
-                subsets=('miniwob_all',),
-                multiaction=False,
-                strict=False,
-                retry_with_force=True,
-                demo_mode='off'
-            ),
-            long_description=False,
-            individual_examples=False,
-            multi_actions=None,
-            is_strict=None
-        ),
-        use_plan=False,
-        use_criticise=False,
-        use_thinking=True,
-        use_memory=True,
-        use_concrete_example=True,
-        use_abstract_example=True,
-        use_hints=False,
-        enable_chat=False,
-        max_prompt_tokens=40000,
-        be_cautious=True,
-        extra_instructions=None,
-        add_missparsed_messages=True,
-        max_trunc_itr=20,
-        flag_group=None
-    )
-    ```
-- Training Details
-
-  - Dataset used: [AgentTrek-6K](https://agenttrek.github.io)
-  - Number of training steps: 3 Epochs
-- Paper Link:
-
-  - https://arxiv.org/abs/2412.09605
-- Code Repository:
-
-  - https://agenttrek.github.io
-- Lisense:
-
-  - apache2.0
diff --git a/results/GenericAgent-AgentTrek-1.0-32b/miniwob.json b/results/GenericAgent-AgentTrek-1.0-32b/miniwob.json
deleted file mode 100644
index 0877890bd9296bb436a8afbd6cc5ae2614408df2..0000000000000000000000000000000000000000
--- a/results/GenericAgent-AgentTrek-1.0-32b/miniwob.json
+++ /dev/null
@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "GenericAgent-AgentTrek-1.0-32b",
-        "study_id": "4c636aa0-ea52-429d-9d7e-301b7bf0ac74", 
-        "date_time": "2025-01-22 04:27:37",
-        "benchmark": "MiniWoB",
-        "score": 60.0,
-        "std_err": 2.0,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes", 
-        "reproducible": "Yes",
-        "comments": "Additional details",
-        "original_or_reproduced": "Original"
-    }
-]
\ No newline at end of file
diff --git a/results/GenericAgent-AgentTrek-1.0-32b/webarena.json b/results/GenericAgent-AgentTrek-1.0-32b/webarena.json
deleted file mode 100644
index 0634bdfc8304a9148264ced6498c59868388910b..0000000000000000000000000000000000000000
--- a/results/GenericAgent-AgentTrek-1.0-32b/webarena.json
+++ /dev/null
@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "GenericAgent-AgentTrek-1.0-32b",
-        "study_id": "ac309635-f3fd-417e-ac16-1e0fc943a54f", 
-        "date_time": "2025-01-25 10:16:41",
-        "benchmark": "WebArena",
-        "score": 22.4,
-        "std_err": 1.5,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes", 
-        "reproducible": "Yes",
-        "comments": "Additional details",
-        "original_or_reproduced": "Original"
-    }
-]
\ No newline at end of file
diff --git a/results/GenericAgent-AgentTrek-1.0-32b/workarena-l1.json b/results/GenericAgent-AgentTrek-1.0-32b/workarena-l1.json
deleted file mode 100644
index 9c5a3ff367959ebb56b9f606bc9da0576606399d..0000000000000000000000000000000000000000
--- a/results/GenericAgent-AgentTrek-1.0-32b/workarena-l1.json
+++ /dev/null
@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "GenericAgent-AgentTrek-1.0-32b",
-        "study_id": "ed14232c-cd7e-4708-b334-ebaf1f220000", 
-        "date_time": "2025-01-12 00:37:04",
-        "benchmark": "WorkArena-L1",
-        "score": 38.29,
-        "std_err": 2.70,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes", 
-        "reproducible": "Yes",
-        "comments": "Additional details",
-        "original_or_reproduced": "Original"
-    }
-]
\ No newline at end of file
diff --git a/results/GenericAgent-AgentTrek-1.0-32b/workarena-l2.json b/results/GenericAgent-AgentTrek-1.0-32b/workarena-l2.json
deleted file mode 100644
index 15e30b9ae065a131118a164a22008fc6f7e3e578..0000000000000000000000000000000000000000
--- a/results/GenericAgent-AgentTrek-1.0-32b/workarena-l2.json
+++ /dev/null
@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "GenericAgent-AgentTrek-1.0-32b",
-        "study_id": "957fb895-8548-46f4-92f0-5de6be7ceb61", 
-        "date_time": "2025-01-12 09:39:21",
-        "benchmark": "WorkArena-L2",
-        "score": 2.98,
-        "std_err": 1.10,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes", 
-        "reproducible": "Yes",
-        "comments": "Additional details",
-        "original_or_reproduced": "Original"
-    }
-]
\ No newline at end of file
diff --git a/results/GenericAgent-AgentTrek-1.0-32b/workarena-l3.json b/results/GenericAgent-AgentTrek-1.0-32b/workarena-l3.json
deleted file mode 100644
index fca096b12f2212917f97efd35769d470244be488..0000000000000000000000000000000000000000
--- a/results/GenericAgent-AgentTrek-1.0-32b/workarena-l3.json
+++ /dev/null
@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "GenericAgent-AgentTrek-1.0-32b",
-        "study_id": "a951b33f-d118-4cf4-a2ef-cc2ef204eeb0", 
-        "date_time": "2025-01-13 12:11:45",
-        "benchmark": "WorkArena-L3",
-        "score": 0.0,
-        "std_err": 0.0,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes", 
-        "reproducible": "Yes",
-        "comments": "Additional details",
-        "original_or_reproduced": "Original"
-    }
-]
\ No newline at end of file
diff --git a/results/GenericAgent-Claude-3.5-Sonnet/README.md b/results/GenericAgent-Claude-3.5-Sonnet/README.md
deleted file mode 100644
index a247c51570651697a0165c50ba57d11ed643652a..0000000000000000000000000000000000000000
--- a/results/GenericAgent-Claude-3.5-Sonnet/README.md
+++ /dev/null
@@ -1,46 +0,0 @@
-### GenericAgent-Claude-3.5-Sonnet
-
-This agent is [GenericAgent](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/generic_agent.py) from [AgentLab](https://github.com/ServiceNow/AgentLab)
-
-It uses Claude-3.5-sonnet as a backend, with the following [flags](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/tmlr_config.py):
-```python
-BASE_FLAGS = GenericPromptFlags(
-    obs=dp.ObsFlags(
-        use_html=False,
-        use_ax_tree=True,
-        use_focused_element=True,
-        use_error_logs=True,
-        use_history=True,
-        use_past_error_logs=False,
-        use_action_history=True,
-        use_think_history=True,  # gpt-4o config except for this line
-        use_diff=False,
-        html_type="pruned_html",
-        use_screenshot=False,
-        use_som=False,
-        extract_visible_tag=True,
-        extract_clickable_tag=True,
-        extract_coords="False",
-        filter_visible_elements_only=False,
-    ),
-    action=dp.ActionFlags(
-        multi_actions=False,
-        action_set="bid",
-        long_description=False,
-        individual_examples=False,
-    ),
-    use_plan=False,
-    use_criticise=False,
-    use_thinking=True,
-    use_memory=False,
-    use_concrete_example=True,
-    use_abstract_example=True,
-    use_hints=True,
-    enable_chat=False,
-    max_prompt_tokens=40_000,
-    be_cautious=True,
-    extra_instructions=None,
-)
-```
-
-Note: Agents don't use vision except for VisualWebArena, where the vision flag is turned on (and the LLM suports it).
\ No newline at end of file
diff --git a/results/GenericAgent-Claude-3.5-Sonnet/assistantbench.json b/results/GenericAgent-Claude-3.5-Sonnet/assistantbench.json
deleted file mode 100644
index 25e6e505abf6edc37e4d4205842b41af8eefe9a6..0000000000000000000000000000000000000000
--- a/results/GenericAgent-Claude-3.5-Sonnet/assistantbench.json
+++ /dev/null
@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "GenericAgent-Claude-3.5-Sonnet",
-        "study_id": "d93a2398-2b70-41ce-b989-364fed988d73",
-        "benchmark": "AssistantBench",
-        "score": 5.2,
-        "std_err": 1.5,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "Intersection of finished tasks across agents.",
-        "original_or_reproduced": "Original",
-        "date_time": "2024-11-28 19:34:58"
-    }
-]
\ No newline at end of file
diff --git a/results/GenericAgent-Claude-3.5-Sonnet/miniwob.json b/results/GenericAgent-Claude-3.5-Sonnet/miniwob.json
deleted file mode 100644
index 3d92a3b23ef10422e12b47f1697bd47799f2f5b5..0000000000000000000000000000000000000000
--- a/results/GenericAgent-Claude-3.5-Sonnet/miniwob.json
+++ /dev/null
@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "GenericAgent-Claude-3.5-Sonnet",
-        "study_id": "2024-10-25_06-08-16",
-        "benchmark": "MiniWoB",
-        "score": 69.8,
-        "std_err": 1.8,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Original",
-        "date_time": "2021-01-01 12:00:00"
-    }
-]
\ No newline at end of file
diff --git a/results/GenericAgent-Claude-3.5-Sonnet/visualwebarena.json b/results/GenericAgent-Claude-3.5-Sonnet/visualwebarena.json
deleted file mode 100644
index 2314e53cd7e52a492b298e2d417ca83c8e99ad4e..0000000000000000000000000000000000000000
--- a/results/GenericAgent-Claude-3.5-Sonnet/visualwebarena.json
+++ /dev/null
@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "GenericAgent-Claude-3.5-Sonnet",
-        "study_id": "22f0611d-aeea-4ee9-a533-b45442b5e080",
-        "benchmark": "VisualWebArena",
-        "score": 21.0,
-        "std_err": 1.3,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Original",
-        "date_time": "2024-12-02 09:11:35"
-    }
-]
\ No newline at end of file
diff --git a/results/GenericAgent-Claude-3.5-Sonnet/webarena.json b/results/GenericAgent-Claude-3.5-Sonnet/webarena.json
deleted file mode 100644
index 0f54ccd322f68c971e4182db57a9313cb9cf7c6a..0000000000000000000000000000000000000000
--- a/results/GenericAgent-Claude-3.5-Sonnet/webarena.json
+++ /dev/null
@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "GenericAgent-Claude-3.5-Sonnet",
-        "study_id": "b5fc5be7-54cc-4fc1-a9ee-73447b9c3eae",
-        "benchmark": "WebArena",
-        "score": 36.2,
-        "std_err": 1.7,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Original",
-        "date_time": "2024-11-29 22:37:46"
-    }
-]
\ No newline at end of file
diff --git a/results/GenericAgent-Claude-3.5-Sonnet/weblinx.json b/results/GenericAgent-Claude-3.5-Sonnet/weblinx.json
deleted file mode 100644
index 4be0689c02e5e8e9dee2dd09966fc9cf8577f0d5..0000000000000000000000000000000000000000
--- a/results/GenericAgent-Claude-3.5-Sonnet/weblinx.json
+++ /dev/null
@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "GenericAgent-Claude-3.5-Sonnet",
-        "study_id": "b9451759-4f0e-492c-a3c8-fa5109d2d9b1",
-        "benchmark": "WebLINX",
-        "score": 13.7,
-        "std_err": 0.6,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Original",
-        "date_time": "2024-11-07 21:42:30"
-    }
-]
\ No newline at end of file
diff --git a/results/GenericAgent-Claude-3.5-Sonnet/workarena-l1.json b/results/GenericAgent-Claude-3.5-Sonnet/workarena-l1.json
deleted file mode 100644
index 1eecc0e8f9ea7aa11252e6b280ea14e358c5cb0d..0000000000000000000000000000000000000000
--- a/results/GenericAgent-Claude-3.5-Sonnet/workarena-l1.json
+++ /dev/null
@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "GenericAgent-Claude-3.5-Sonnet",
-        "study_id": "2024-10-23_14-17-40",
-        "benchmark": "WorkArena-L1",
-        "score": 56.4,
-        "std_err": 2.7,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Original",
-        "date_time": "2021-01-01 12:00:00"
-    }
-]
\ No newline at end of file
diff --git a/results/GenericAgent-GPT-4o-mini/README.md b/results/GenericAgent-GPT-4o-mini/README.md
deleted file mode 100644
index 24cd7f221ec1011212d0a5366d13fc1f7c5275d3..0000000000000000000000000000000000000000
--- a/results/GenericAgent-GPT-4o-mini/README.md
+++ /dev/null
@@ -1,54 +0,0 @@
-### GenericAgent-GPT-4o-mini
-
-This agent is [GenericAgent](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/generic_agent.py) from [AgentLab](https://github.com/ServiceNow/AgentLab)
-
-It uses GPT-4o-mini as a backend, with the following [flags](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/tmlr_config.py):
-```python
-BASE_FLAGS = GenericPromptFlags(
-    obs=dp.ObsFlags(
-        use_html=False,
-        use_ax_tree=True,
-        use_focused_element=True,
-        use_error_logs=True,
-        use_history=True,
-        use_past_error_logs=False,
-        use_action_history=True,
-        use_think_history=True,  # gpt-4o config except for this line
-        use_diff=False,
-        html_type="pruned_html",
-        use_screenshot=False,
-        use_som=False,
-        extract_visible_tag=True,
-        extract_clickable_tag=True,
-        extract_coords="False",
-        filter_visible_elements_only=False,
-    ),
-    action=dp.ActionFlags(
-        multi_actions=False,
-        action_set="bid",
-        long_description=False,
-        individual_examples=False,
-    ),
-    use_plan=False,
-    use_criticise=False,
-    use_thinking=True,
-    use_memory=False,
-    use_concrete_example=True,
-    use_abstract_example=True,
-    use_hints=True,
-    enable_chat=False,
-    max_prompt_tokens=40_000,
-    be_cautious=True,
-    extra_instructions=None,
-)
-```
-© Hugging Face
-TOS
-Privacy
-About
-Jobs
-Models
-Datasets
-Spaces
-Pricing
-Docs
\ No newline at end of file
diff --git a/results/GenericAgent-GPT-4o-mini/assistantbench.json b/results/GenericAgent-GPT-4o-mini/assistantbench.json
deleted file mode 100644
index edfcd630cb3884c4b7902ad14f1595dfff2a092f..0000000000000000000000000000000000000000
--- a/results/GenericAgent-GPT-4o-mini/assistantbench.json
+++ /dev/null
@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "GenericAgent-GPT-4o-mini",
-        "study_id": "d93a2398-2b70-41ce-b989-364fed988d73",
-        "date_time": "2024-11-28 19:34:58",
-        "benchmark": "AssistantBench",
-        "score": 2.1,
-        "std_err": 1.0,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "Intersection of finished tasks across agents.",
-        "original_or_reproduced": "Original"
-    }
-]
\ No newline at end of file
diff --git a/results/GenericAgent-GPT-4o-mini/visualwebarena.json b/results/GenericAgent-GPT-4o-mini/visualwebarena.json
deleted file mode 100644
index 73d01f9c23c56b48eee3901be1bc0b9a8788b1a2..0000000000000000000000000000000000000000
--- a/results/GenericAgent-GPT-4o-mini/visualwebarena.json
+++ /dev/null
@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "GenericAgent-GPT-4o-mini",
-        "study_id": "8d8642d3-757a-4346-ba45-01398f85b1f4",
-        "date_time": "2024-12-02 02:54:33",
-        "benchmark": "VisualWebArena",
-        "score": 16.9,
-        "std_err": 1.2,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Original"
-    }
-]
\ No newline at end of file
diff --git a/results/GenericAgent-GPT-4o-mini/webarena.json b/results/GenericAgent-GPT-4o-mini/webarena.json
deleted file mode 100644
index 3958a8c47219cf2856a1e73a1309d1acc3b295b4..0000000000000000000000000000000000000000
--- a/results/GenericAgent-GPT-4o-mini/webarena.json
+++ /dev/null
@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "GenericAgent-GPT-4o-mini",
-        "study_id": "c6bdeb87-9879-4c06-aa70-00d895001156",
-        "date_time": "2024-11-29 19:25:49",
-        "benchmark": "WebArena",
-        "score": 17.4,
-        "std_err": 1.3,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Original"
-    }
-]
\ No newline at end of file
diff --git a/results/GenericAgent-GPT-4o-mini/weblinx.json b/results/GenericAgent-GPT-4o-mini/weblinx.json
deleted file mode 100644
index 20293812e70717a0c19c9e9efe5c9a9052d62153..0000000000000000000000000000000000000000
--- a/results/GenericAgent-GPT-4o-mini/weblinx.json
+++ /dev/null
@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "GenericAgent-GPT-4o-mini",
-        "study_id": "b9451759-4f0e-492c-a3c8-fa5109d2d9b1",
-        "date_time": "2024-11-07 21:42:30",
-        "benchmark": "WebLINX",
-        "score": 11.6,
-        "std_err": 0.6,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Original"
-    }
-]
\ No newline at end of file
diff --git a/results/GenericAgent-GPT-4o/README.md b/results/GenericAgent-GPT-4o/README.md
deleted file mode 100644
index dac984577c9e5a75cb3f614aa43c17841382f298..0000000000000000000000000000000000000000
--- a/results/GenericAgent-GPT-4o/README.md
+++ /dev/null
@@ -1,46 +0,0 @@
-### GenericAgent-GPT-4o
-
-This agent is [GenericAgent](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/generic_agent.py) from [AgentLab](https://github.com/ServiceNow/AgentLab)
-
-It uses GPT-4o as a backend, with the following [flags](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/tmlr_config.py):
-```python
-BASE_FLAGS = GenericPromptFlags(
-    obs=dp.ObsFlags(
-        use_html=False,
-        use_ax_tree=True,
-        use_focused_element=True,
-        use_error_logs=True,
-        use_history=True,
-        use_past_error_logs=False,
-        use_action_history=True,
-        use_think_history=True,  # gpt-4o config except for this line
-        use_diff=False,
-        html_type="pruned_html",
-        use_screenshot=False,
-        use_som=False,
-        extract_visible_tag=True,
-        extract_clickable_tag=True,
-        extract_coords="False",
-        filter_visible_elements_only=False,
-    ),
-    action=dp.ActionFlags(
-        multi_actions=False,
-        action_set="bid",
-        long_description=False,
-        individual_examples=False,
-    ),
-    use_plan=False,
-    use_criticise=False,
-    use_thinking=True,
-    use_memory=False,
-    use_concrete_example=True,
-    use_abstract_example=True,
-    use_hints=True,
-    enable_chat=False,
-    max_prompt_tokens=40_000,
-    be_cautious=True,
-    extra_instructions=None,
-)
-```
-
-Note: Agents don't use vision except for VisualWebArena, where the vision flag is turned on (and the LLM suports it).
\ No newline at end of file
diff --git a/results/GenericAgent-GPT-4o/assistantbench.json b/results/GenericAgent-GPT-4o/assistantbench.json
deleted file mode 100644
index b6a57ca0ad20cd092f5260ebb168a8e058fc4e40..0000000000000000000000000000000000000000
--- a/results/GenericAgent-GPT-4o/assistantbench.json
+++ /dev/null
@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "GenericAgent-GPT-4o",
-        "study_id": "d93a2398-2b70-41ce-b989-364fed988d73",
-        "date_time": "2024-11-28 19:34:58",
-        "benchmark": "AssistantBench",
-        "score": 4.8,
-        "std_err": 2.4,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "Intersection of finished tasks across agents.",
-        "original_or_reproduced": "Original"
-    }
-]
\ No newline at end of file
diff --git a/results/GenericAgent-GPT-4o/visualwebarena.json b/results/GenericAgent-GPT-4o/visualwebarena.json
deleted file mode 100644
index 730550b0d16f9421b0372671f1a17b93fd17dc7a..0000000000000000000000000000000000000000
--- a/results/GenericAgent-GPT-4o/visualwebarena.json
+++ /dev/null
@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "GenericAgent-GPT-4o",
-        "study_id": "7fb7eac8-4bbd-4ebe-be32-15901a7678f2",
-        "date_time": "2024-12-02 07:17:28",
-        "benchmark": "VisualWebArena",
-        "score": 26.7,
-        "std_err": 1.5,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Original"
-    }
-]
\ No newline at end of file
diff --git a/results/GenericAgent-GPT-4o/webarena.json b/results/GenericAgent-GPT-4o/webarena.json
deleted file mode 100644
index 53542859b5677c179c3edb195d17b5dd63d7db11..0000000000000000000000000000000000000000
--- a/results/GenericAgent-GPT-4o/webarena.json
+++ /dev/null
@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "GenericAgent-GPT-4o",
-        "study_id": "d2eed215-91bb-4603-b69c-8ef8f9d57f34",
-        "date_time": "2024-11-29 22:28:32",
-        "benchmark": "WebArena",
-        "score": 31.4,
-        "std_err": 1.6,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Original"
-    }
-]
\ No newline at end of file
diff --git a/results/GenericAgent-GPT-4o/weblinx.json b/results/GenericAgent-GPT-4o/weblinx.json
deleted file mode 100644
index 2b75c3a1e8a455277a5b15c1208bc3c10d31f369..0000000000000000000000000000000000000000
--- a/results/GenericAgent-GPT-4o/weblinx.json
+++ /dev/null
@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "GenericAgent-GPT-4o",
-        "study_id": "b9451759-4f0e-492c-a3c8-fa5109d2d9b1",
-        "date_time": "2024-11-07 21:42:30",
-        "benchmark": "WebLINX",
-        "score": 12.5,
-        "std_err": 0.6,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Original"
-    }
-]
\ No newline at end of file
diff --git a/results/GenericAgent-GPT-o1-mini/README.md b/results/GenericAgent-GPT-o1-mini/README.md
deleted file mode 100644
index 05753e4c73ce5b5c318dd57d7c6cde978ce8dbba..0000000000000000000000000000000000000000
--- a/results/GenericAgent-GPT-o1-mini/README.md
+++ /dev/null
@@ -1,46 +0,0 @@
-### GenericAgent-GPT-o1-mini
-
-This agent is [GenericAgent](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/generic_agent.py) from [AgentLab](https://github.com/ServiceNow/AgentLab)
-
-It uses o1-mini as a backend, with the following [flags](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/tmlr_config.py):
-```python
-BASE_FLAGS = GenericPromptFlags(
-    obs=dp.ObsFlags(
-        use_html=False,
-        use_ax_tree=True,
-        use_focused_element=True,
-        use_error_logs=True,
-        use_history=True,
-        use_past_error_logs=False,
-        use_action_history=True,
-        use_think_history=True,  # gpt-4o config except for this line
-        use_diff=False,
-        html_type="pruned_html",
-        use_screenshot=False,
-        use_som=False,
-        extract_visible_tag=True,
-        extract_clickable_tag=True,
-        extract_coords="False",
-        filter_visible_elements_only=False,
-    ),
-    action=dp.ActionFlags(
-        multi_actions=False,
-        action_set="bid",
-        long_description=False,
-        individual_examples=False,
-    ),
-    use_plan=False,
-    use_criticise=False,
-    use_thinking=True,
-    use_memory=False,
-    use_concrete_example=True,
-    use_abstract_example=True,
-    use_hints=True,
-    enable_chat=False,
-    max_prompt_tokens=40_000,
-    be_cautious=True,
-    extra_instructions=None,
-)
-```
-
-Note: Agents don't use vision except for VisualWebArena, where the vision flag is turned on (and the LLM suports it).
\ No newline at end of file
diff --git a/results/GenericAgent-GPT-o1-mini/assistantbench.json b/results/GenericAgent-GPT-o1-mini/assistantbench.json
deleted file mode 100644
index 60ef7f9ca466e4d11b00f04c652bb57ce685124d..0000000000000000000000000000000000000000
--- a/results/GenericAgent-GPT-o1-mini/assistantbench.json
+++ /dev/null
@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "GenericAgent-GPT-o1-mini",
-        "study_id": "d93a2398-2b70-41ce-b989-364fed988d73",
-        "date_time": "2024-11-28 19:34:58",
-        "benchmark": "AssistantBench",
-        "score": 6.9,
-        "std_err": 2.2,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "Intersection of finished tasks across agents.",
-        "original_or_reproduced": "Original"
-    }
-]
\ No newline at end of file
diff --git a/results/GenericAgent-GPT-o1-mini/miniwob.json b/results/GenericAgent-GPT-o1-mini/miniwob.json
deleted file mode 100644
index a0d53790c92d64ec24d58bd4cb35bf898df7b11d..0000000000000000000000000000000000000000
--- a/results/GenericAgent-GPT-o1-mini/miniwob.json
+++ /dev/null
@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "GenericAgent-GPT-o1-mini",
-        "study_id": "2024-10-25_06-08-16",
-        "date_time": "2024-10-25 17:16:23",
-        "benchmark": "MiniWoB",
-        "score": 67.8,
-        "std_err": 1.9,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Original"
-    }
-]
\ No newline at end of file
diff --git a/results/GenericAgent-GPT-o1-mini/webarena.json b/results/GenericAgent-GPT-o1-mini/webarena.json
deleted file mode 100644
index d094313542f88a8b827452d7f26f917b19211a22..0000000000000000000000000000000000000000
--- a/results/GenericAgent-GPT-o1-mini/webarena.json
+++ /dev/null
@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "GenericAgent-GPT-o1-mini",
-        "study_id": "1827983d-5e84-4b63-ad49-bf45ec2a6348",
-        "date_time": "2024-11-30 00:22:44",
-        "benchmark": "WebArena",
-        "score": 28.6,
-        "std_err": 1.6,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Original"
-    }
-]
\ No newline at end of file
diff --git a/results/GenericAgent-GPT-o1-mini/weblinx.json b/results/GenericAgent-GPT-o1-mini/weblinx.json
deleted file mode 100644
index f5f14748f819dfe37e65fd2a63d23c120cd00e8b..0000000000000000000000000000000000000000
--- a/results/GenericAgent-GPT-o1-mini/weblinx.json
+++ /dev/null
@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "GenericAgent-GPT-o1-mini",
-        "study_id": "b9451759-4f0e-492c-a3c8-fa5109d2d9b1",
-        "date_time": "2024-11-07 21:42:30",
-        "benchmark": "WebLINX",
-        "score": 12.5,
-        "std_err": 0.6,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Original"
-    }
-]
\ No newline at end of file
diff --git a/results/GenericAgent-GPT-o1-mini/workarena-l1.json b/results/GenericAgent-GPT-o1-mini/workarena-l1.json
deleted file mode 100644
index 1b9e8f224d4936436e9de1f6ef7342eddcb91e25..0000000000000000000000000000000000000000
--- a/results/GenericAgent-GPT-o1-mini/workarena-l1.json
+++ /dev/null
@@ -1,30 +0,0 @@
-[
-    {
-        "agent_name": "GenericAgent-GPT-o1-mini",
-        "study_id": "2024-10-23_14-17-40",
-        "date_time": "2024-10-23 22:30:06",
-        "benchmark": "WorkArena-L1",
-        "score": 56.7,
-        "std_err": 2.7,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Original"
-    },
-    {
-        "agent_name": "GenericAgent-GPT-o1-mini",
-        "study_id": "f3e1fcb8-5fc5-4115-9e00-27251508e2c7", 
-        "date_time": "2025-02-07 14:00:00",
-        "benchmark": "WorkArena-L1",
-        "score": 51.8,
-        "std_err": 2.80,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes", 
-        "reproducible": "Yes",
-        "comments": "Additional details",
-        "original_or_reproduced": "Reproduced"
-    }
-]
\ No newline at end of file
diff --git a/results/GenericAgent-GPT-o1-mini/workarena-l2.json b/results/GenericAgent-GPT-o1-mini/workarena-l2.json
deleted file mode 100644
index 4132ca278aed23f95fb38659f044e3990f82e0fe..0000000000000000000000000000000000000000
--- a/results/GenericAgent-GPT-o1-mini/workarena-l2.json
+++ /dev/null
@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "GenericAgent-GPT-o1-mini",
-        "study_id": "2024-10-23_17-10-46",
-        "date_time": "2024-10-24 17:08:53",
-        "benchmark": "WorkArena-L2",
-        "score": 14.9,
-        "std_err": 2.3,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Original"
-    }
-]
\ No newline at end of file
diff --git a/results/GenericAgent-Llama-3.1-405b/README.md b/results/GenericAgent-Llama-3.1-405b/README.md
deleted file mode 100644
index 95c6107a0f831653d7cd0a5f848a2cc99754109c..0000000000000000000000000000000000000000
--- a/results/GenericAgent-Llama-3.1-405b/README.md
+++ /dev/null
@@ -1,46 +0,0 @@
-### GenericAgent-Llama-3.1-405b
-
-This agent is [GenericAgent](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/generic_agent.py) from [AgentLab](https://github.com/ServiceNow/AgentLab)
-
-It uses Llama-3.1-405b as a backend, with the following [flags](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/tmlr_config.py):
-```python
-BASE_FLAGS = GenericPromptFlags(
-    obs=dp.ObsFlags(
-        use_html=False,
-        use_ax_tree=True,
-        use_focused_element=True,
-        use_error_logs=True,
-        use_history=True,
-        use_past_error_logs=False,
-        use_action_history=True,
-        use_think_history=True,  # gpt-4o config except for this line
-        use_diff=False,
-        html_type="pruned_html",
-        use_screenshot=False,
-        use_som=False,
-        extract_visible_tag=True,
-        extract_clickable_tag=True,
-        extract_coords="False",
-        filter_visible_elements_only=False,
-    ),
-    action=dp.ActionFlags(
-        multi_actions=False,
-        action_set="bid",
-        long_description=False,
-        individual_examples=False,
-    ),
-    use_plan=False,
-    use_criticise=False,
-    use_thinking=True,
-    use_memory=False,
-    use_concrete_example=True,
-    use_abstract_example=True,
-    use_hints=True,
-    enable_chat=False,
-    max_prompt_tokens=40_000,
-    be_cautious=True,
-    extra_instructions=None,
-)
-```
-
-Note: Agents don't use vision except for VisualWebArena, where the vision flag is turned on (and the LLM suports it).
\ No newline at end of file
diff --git a/results/GenericAgent-Llama-3.1-405b/assistantbench.json b/results/GenericAgent-Llama-3.1-405b/assistantbench.json
deleted file mode 100644
index 0a7c507210cf2e1b594136e43a5936407467f274..0000000000000000000000000000000000000000
--- a/results/GenericAgent-Llama-3.1-405b/assistantbench.json
+++ /dev/null
@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "GenericAgent-Llama-3.1-405b",
-        "study_id": "d93a2398-2b70-41ce-b989-364fed988d73",
-        "benchmark": "AssistantBench",
-        "score": 3.9,
-        "std_err": 1.0,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "Intersection of finished tasks across agents.",
-        "original_or_reproduced": "Original",
-        "date_time": "2024-11-28 19:34:58"
-    }
-]
\ No newline at end of file
diff --git a/results/GenericAgent-Llama-3.1-405b/miniwob.json b/results/GenericAgent-Llama-3.1-405b/miniwob.json
deleted file mode 100644
index 3f2c22229b7d1e67e3c2f63855bb8d2df78543eb..0000000000000000000000000000000000000000
--- a/results/GenericAgent-Llama-3.1-405b/miniwob.json
+++ /dev/null
@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "GenericAgent-Llama-3.1-405b",
-        "study_id": "4d748972-6d35-4489-a197-138b656a7db3",
-        "benchmark": "MiniWoB",
-        "score": 64.6,
-        "std_err": 1.9,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Original",
-        "date_time": "2024-11-29 16:14:00"
-    }
-]
\ No newline at end of file
diff --git a/results/GenericAgent-Llama-3.1-405b/webarena.json b/results/GenericAgent-Llama-3.1-405b/webarena.json
deleted file mode 100644
index ddffdb10c856b72a8896fe80b39aeb1e91b123b8..0000000000000000000000000000000000000000
--- a/results/GenericAgent-Llama-3.1-405b/webarena.json
+++ /dev/null
@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "GenericAgent-Llama-3.1-405b",
-        "study_id": "aaeca13d-0cf5-444f-8445-590350b54746",
-        "benchmark": "WebArena",
-        "score": 24.0,
-        "std_err": 1.5,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Original",
-        "date_time": "2024-12-01 00:04:43"
-    }
-]
\ No newline at end of file
diff --git a/results/GenericAgent-Llama-3.1-405b/weblinx.json b/results/GenericAgent-Llama-3.1-405b/weblinx.json
deleted file mode 100644
index bd6801536a4fc931b4fb77126bd635441f177c92..0000000000000000000000000000000000000000
--- a/results/GenericAgent-Llama-3.1-405b/weblinx.json
+++ /dev/null
@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "GenericAgent-Llama-3.1-405b",
-        "study_id": "b9451759-4f0e-492c-a3c8-fa5109d2d9b1",
-        "benchmark": "WebLINX",
-        "score": 7.9,
-        "std_err": 0.5,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Original",
-        "date_time": "2024-11-07 21:42:30"
-    }
-]
\ No newline at end of file
diff --git a/results/GenericAgent-Llama-3.1-405b/workarena-l1.json b/results/GenericAgent-Llama-3.1-405b/workarena-l1.json
deleted file mode 100644
index 95927554a7efacf122b2f553d72b593503d4b571..0000000000000000000000000000000000000000
--- a/results/GenericAgent-Llama-3.1-405b/workarena-l1.json
+++ /dev/null
@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "GenericAgent-Llama-3.1-405b",
-        "study_id": "2024-10-25_17-34-45",
-        "benchmark": "WorkArena-L1",
-        "score": 43.3,
-        "std_err": 2.7,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Original",
-        "date_time": "2024-10-25 20:32:26"
-    }
-]
\ No newline at end of file
diff --git a/results/GenericAgent-Llama-3.1-405b/workarena-l2.json b/results/GenericAgent-Llama-3.1-405b/workarena-l2.json
deleted file mode 100644
index 1209bfc5d44a82b0788f889992e3a46d99f62c6e..0000000000000000000000000000000000000000
--- a/results/GenericAgent-Llama-3.1-405b/workarena-l2.json
+++ /dev/null
@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "GenericAgent-Llama-3.1-405b",
-        "study_id": "528da1f2-1949-41dc-b988-85f19f435af2",
-        "date_time": "2024-11-29 14:28:47",
-        "benchmark": "WorkArena-L2",
-        "score": 7.2,
-        "std_err": 1.7,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Original"
-    }
-]
\ No newline at end of file
diff --git a/results/GenericAgent-Llama-3.1-405b/workarena-l3.json b/results/GenericAgent-Llama-3.1-405b/workarena-l3.json
deleted file mode 100644
index 7e6ca5fc0aadba1ac1ad6224eb36b3b7d36080b6..0000000000000000000000000000000000000000
--- a/results/GenericAgent-Llama-3.1-405b/workarena-l3.json
+++ /dev/null
@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "GenericAgent-Llama-3.1-405b",
-        "study_id": "-",
-        "benchmark": "WorkArena-L3",
-        "score": 0.0,
-        "std_err": 0.0,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Original",
-        "date_time": "2024-10-24 23:03:30"
-    }
-]
\ No newline at end of file
diff --git a/results/GenericAgent-Llama-3.1-70b/README.md b/results/GenericAgent-Llama-3.1-70b/README.md
deleted file mode 100644
index 9bd1ba56e9020df0717ff233c948a70650135b8d..0000000000000000000000000000000000000000
--- a/results/GenericAgent-Llama-3.1-70b/README.md
+++ /dev/null
@@ -1,46 +0,0 @@
-### GenericAgent-Llama-3.1-70b
-
-This agent is [GenericAgent](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/generic_agent.py) from [AgentLab](https://github.com/ServiceNow/AgentLab)
-
-It uses Llama-3.1-70b as a backend, with the following [flags](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/tmlr_config.py):
-```python
-BASE_FLAGS = GenericPromptFlags(
-    obs=dp.ObsFlags(
-        use_html=False,
-        use_ax_tree=True,
-        use_focused_element=True,
-        use_error_logs=True,
-        use_history=True,
-        use_past_error_logs=False,
-        use_action_history=True,
-        use_think_history=True,  # gpt-4o config except for this line
-        use_diff=False,
-        html_type="pruned_html",
-        use_screenshot=False,
-        use_som=False,
-        extract_visible_tag=True,
-        extract_clickable_tag=True,
-        extract_coords="False",
-        filter_visible_elements_only=False,
-    ),
-    action=dp.ActionFlags(
-        multi_actions=False,
-        action_set="bid",
-        long_description=False,
-        individual_examples=False,
-    ),
-    use_plan=False,
-    use_criticise=False,
-    use_thinking=True,
-    use_memory=False,
-    use_concrete_example=True,
-    use_abstract_example=True,
-    use_hints=True,
-    enable_chat=False,
-    max_prompt_tokens=40_000,
-    be_cautious=True,
-    extra_instructions=None,
-)
-```
-
-Note: Agents don't use vision except for VisualWebArena, where the vision flag is turned on (and the LLM suports it).
\ No newline at end of file
diff --git a/results/GenericAgent-Llama-3.1-70b/assistantbench.json b/results/GenericAgent-Llama-3.1-70b/assistantbench.json
deleted file mode 100644
index 9b26a6c6c8b0cf81c1767f50f0e67ed3551a8304..0000000000000000000000000000000000000000
--- a/results/GenericAgent-Llama-3.1-70b/assistantbench.json
+++ /dev/null
@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "GenericAgent-Llama-3.1-70b",
-        "study_id": "d93a2398-2b70-41ce-b989-364fed988d73",
-        "benchmark": "AssistantBench",
-        "score": 2.8,
-        "std_err": 1.1,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "Intersection of finished tasks across agents.",
-        "original_or_reproduced": "Original",
-        "date_time": "2024-11-28 19:34:58"
-    }
-]
\ No newline at end of file
diff --git a/results/GenericAgent-Llama-3.1-70b/miniwob.json b/results/GenericAgent-Llama-3.1-70b/miniwob.json
deleted file mode 100644
index e5d35d7e28c7b14826373ef2bfc296e12be23d79..0000000000000000000000000000000000000000
--- a/results/GenericAgent-Llama-3.1-70b/miniwob.json
+++ /dev/null
@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "GenericAgent-Llama-3.1-70b",
-        "study_id": "2024-10-25_06-08-16",
-        "benchmark": "MiniWoB",
-        "score": 57.6,
-        "std_err": 2.0,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Original",
-        "date_time": "2024-10-25 17:16:23"
-    }
-]
\ No newline at end of file
diff --git a/results/GenericAgent-Llama-3.1-70b/webarena.json b/results/GenericAgent-Llama-3.1-70b/webarena.json
deleted file mode 100644
index cfe83f8c36849d49317072af5664e764877ea6a4..0000000000000000000000000000000000000000
--- a/results/GenericAgent-Llama-3.1-70b/webarena.json
+++ /dev/null
@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "GenericAgent-Llama-3.1-70b",
-        "study_id": "fc5747bc-d998-4942-a0eb-e55a3ccc1cb3",
-        "benchmark": "WebArena",
-        "score": 18.4,
-        "std_err": 1.4,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Original",
-        "date_time": "2024-12-02 23:18:38"
-    }
-]
\ No newline at end of file
diff --git a/results/GenericAgent-Llama-3.1-70b/weblinx.json b/results/GenericAgent-Llama-3.1-70b/weblinx.json
deleted file mode 100644
index f1787b30189e1ea0fdfbcfc1023ab55397016923..0000000000000000000000000000000000000000
--- a/results/GenericAgent-Llama-3.1-70b/weblinx.json
+++ /dev/null
@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "GenericAgent-Llama-3.1-70b",
-        "study_id": "b9451759-4f0e-492c-a3c8-fa5109d2d9b1",
-        "benchmark": "WebLINX",
-        "score": 8.9,
-        "std_err": 0.5,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Original",
-        "date_time": "2024-11-07 21:42:30"
-    }
-]
\ No newline at end of file
diff --git a/results/GenericAgent-Llama-3.1-70b/workarena-l1.json b/results/GenericAgent-Llama-3.1-70b/workarena-l1.json
deleted file mode 100644
index 58628881ea0cc3a127112cd2846be8104b3374df..0000000000000000000000000000000000000000
--- a/results/GenericAgent-Llama-3.1-70b/workarena-l1.json
+++ /dev/null
@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "GenericAgent-Llama-3.1-70b",
-        "study_id": "2024-10-23_14-17-40",
-        "benchmark": "WorkArena-L1",
-        "score": 27.9,
-        "std_err": 2.5,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Original",
-        "date_time": "2024-10-23 22:30:06"
-    }
-]
\ No newline at end of file
diff --git a/results/GenericAgent-Llama-3.1-70b/workarena-l2.json b/results/GenericAgent-Llama-3.1-70b/workarena-l2.json
deleted file mode 100644
index 6b8c5e0373cb06a95b49504ff09ff0caf81b14a2..0000000000000000000000000000000000000000
--- a/results/GenericAgent-Llama-3.1-70b/workarena-l2.json
+++ /dev/null
@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "GenericAgent-Llama-3.1-70b",
-        "study_id": "2024-10-23_17-10-46",
-        "date_time": "2024-10-24 17:08:53",
-        "benchmark": "WorkArena-L2",
-        "score": 2.1,
-        "std_err": 0.9,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Original"
-    }
-]
\ No newline at end of file
diff --git a/results/GenericAgent-Llama-3.1-70b/workarena-l3.json b/results/GenericAgent-Llama-3.1-70b/workarena-l3.json
deleted file mode 100644
index 7b3e635d0ceccda73484a603c7f679412645c86b..0000000000000000000000000000000000000000
--- a/results/GenericAgent-Llama-3.1-70b/workarena-l3.json
+++ /dev/null
@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "GenericAgent-Llama-3.1-70b",
-        "study_id": "-",
-        "benchmark": "WorkArena-L3",
-        "score": 0.0,
-        "std_err": 0.0,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Original",
-        "date_time": "2024-10-24 23:03:30"
-    }
-]
\ No newline at end of file
diff --git a/results/GenericAgent-o3-mini/README.md b/results/GenericAgent-o3-mini/README.md
deleted file mode 100644
index a9d14e4e4aef62c60e7ff233550db96c3fe301af..0000000000000000000000000000000000000000
--- a/results/GenericAgent-o3-mini/README.md
+++ /dev/null
@@ -1,46 +0,0 @@
-### GenericAgent-o3-mini
-
-This agent is [GenericAgent](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/generic_agent.py) from [AgentLab](https://github.com/ServiceNow/AgentLab)
-
-It uses o1-mini as a backend, with the following [flags](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/agent_configs.py):
-```python
-BASE_FLAGS = FLAGS_GPT_4o = GenericPromptFlags(
-    obs=dp.ObsFlags(
-        use_html=False,
-        use_ax_tree=True,
-        use_focused_element=True,
-        use_error_logs=True,
-        use_history=True,
-        use_past_error_logs=False,
-        use_action_history=True,
-        use_think_history=False,
-        use_diff=False,
-        html_type="pruned_html",
-        use_screenshot=False,
-        use_som=False,
-        extract_visible_tag=True,
-        extract_clickable_tag=True,
-        extract_coords="False",
-        filter_visible_elements_only=False,
-    ),
-    action=dp.ActionFlags(
-        action_set=bgym.HighLevelActionSetArgs(
-            subsets=["bid"],
-            multiaction=False,
-        ),
-        long_description=False,
-        individual_examples=False,
-    ),
-    use_plan=False,
-    use_criticise=False,
-    use_thinking=True,
-    use_memory=False,
-    use_concrete_example=True,
-    use_abstract_example=True,
-    use_hints=True,
-    enable_chat=False,
-    max_prompt_tokens=40_000,
-    be_cautious=True,
-    extra_instructions=None,
-)
-```
diff --git a/results/GenericAgent-o3-mini/workarena-l1.json b/results/GenericAgent-o3-mini/workarena-l1.json
deleted file mode 100644
index 40fb1a60c37f04f0186948c72d63090fca80d3e5..0000000000000000000000000000000000000000
--- a/results/GenericAgent-o3-mini/workarena-l1.json
+++ /dev/null
@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "GenericAgent-o3-mini",
-        "study_id": "f3e1fcb8-5fc5-4115-9e00-27251508e2c7", 
-        "date_time": "2025-02-07 14:00:00",
-        "benchmark": "WorkArena-L1",
-        "score": 48.2,
-        "std_err": 2.80,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes", 
-        "reproducible": "Yes",
-        "comments": "Additional details",
-        "original_or_reproduced": "Original"
-    }
-]
\ No newline at end of file
diff --git a/results/OrbyAgent-ActIO-72b/README.md b/results/OrbyAgent-ActIO-72b/README.md
deleted file mode 100644
index 5235a274088f5346c5c5125e6ebaca62fd9e27fa..0000000000000000000000000000000000000000
--- a/results/OrbyAgent-ActIO-72b/README.md
+++ /dev/null
@@ -1,7 +0,0 @@
-### OrbyAgent-ActIO-72b
-
-This agent is developed by [Orby AI](https://www.orby.ai/).
-
-The agent does not use any benchmark-specific information in the prompts. For WebArena benchmark, we use the original evaluator and task definitions for fair comparison.
-
-It uses the ActIO model of 72B parameters as a backend, with both screenshot and HTML as inputs. More details can be found in our [research blog](https://www.orby.ai/resources/elevating-automation-orby-ais-generic-agent-framework-and-self-adaptive-interface-learning-technique).
diff --git a/results/OrbyAgent-ActIO-72b/miniwob.json b/results/OrbyAgent-ActIO-72b/miniwob.json
deleted file mode 100644
index 3c1031b683ff5e324e3d20d4dc98719cff0183fd..0000000000000000000000000000000000000000
--- a/results/OrbyAgent-ActIO-72b/miniwob.json
+++ /dev/null
@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "OrbyAgent-ActIO-72b",
-        "study_id": "orby-agent-v0-actio-v0-miniwob",
-        "benchmark": "MiniWoB",
-        "score": 64.2,
-        "std_err": 1.4,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Original",
-        "date_time": "2025-02-21 15:03:35"
-    }
-]
\ No newline at end of file
diff --git a/results/OrbyAgent-ActIO-72b/webarena.json b/results/OrbyAgent-ActIO-72b/webarena.json
deleted file mode 100644
index 0aaf0d23fca4d6af41a7f7dfa983042e24cf0a08..0000000000000000000000000000000000000000
--- a/results/OrbyAgent-ActIO-72b/webarena.json
+++ /dev/null
@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "OrbyAgent-ActIO-72b",
-        "study_id": "b5fc5be7-54cc-4fc1-a9ee-73447b9c3eae",
-        "benchmark": "WebArena",
-        "score": 34.7,
-        "std_err": 0.25,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "Use original WebArena eval protocol and task definitions",
-        "original_or_reproduced": "Original",
-        "date_time": "2025-02-21 15:05:12"
-    }
-]
\ No newline at end of file
diff --git a/results/OrbyAgent-Claude-3.5-Sonnet/README.md b/results/OrbyAgent-Claude-3.5-Sonnet/README.md
deleted file mode 100644
index 4d6752f14c3116b4d4749aff95c3fbb00331f4c7..0000000000000000000000000000000000000000
--- a/results/OrbyAgent-Claude-3.5-Sonnet/README.md
+++ /dev/null
@@ -1,7 +0,0 @@
-### OrbyAgent-Claude-3.5-Sonnet
-
-This agent is developed by [Orby AI](https://www.orby.ai/).
-
-The agent does not use any benchmark-specific information in the prompts. For WebArena benchmark, we use the original evaluator and task definitions for fair comparison.
-
-It uses Claude-3.5-sonnet-20241022 as a backend, with both screenshot and HTML as inputs. More details can be found in our [research blog](https://www.orby.ai/resources/elevating-automation-orby-ais-generic-agent-framework-and-self-adaptive-interface-learning-technique).
diff --git a/results/OrbyAgent-Claude-3.5-Sonnet/miniwob.json b/results/OrbyAgent-Claude-3.5-Sonnet/miniwob.json
deleted file mode 100644
index 3b3f0b3a2df8e492f62c595cdbcb20b3dedc34bf..0000000000000000000000000000000000000000
--- a/results/OrbyAgent-Claude-3.5-Sonnet/miniwob.json
+++ /dev/null
@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "OrbyAgent-Claude-3.5-Sonnet",
-        "study_id": "orby-agent-v0-claude-3.5-miniwob",
-        "benchmark": "MiniWoB",
-        "score": 74.9,
-        "std_err": 1.2,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Original",
-        "date_time": "2025-02-21 14:54:16"
-    }
-]
\ No newline at end of file
diff --git a/results/OrbyAgent-Claude-3.5-Sonnet/webarena.json b/results/OrbyAgent-Claude-3.5-Sonnet/webarena.json
deleted file mode 100644
index 312bcfa3aea24fa0faa45e54aecc7b7a721381db..0000000000000000000000000000000000000000
--- a/results/OrbyAgent-Claude-3.5-Sonnet/webarena.json
+++ /dev/null
@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "OrbyAgent-Claude-3.5-Sonnet",
-        "study_id": "orby-agent-v0-claude-3.5-webarena",
-        "benchmark": "WebArena",
-        "score": 36.5,
-        "std_err": 0,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "Use original WebArena eval protocol and task definitions",
-        "original_or_reproduced": "Original",
-        "date_time": "2025-02-21 15:00:22"
-    }
-]
\ No newline at end of file
diff --git a/results/test-agent/README.md b/results/test-agent/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..ef031f56262cb73d52252e8886211670fb3a404f
--- /dev/null
+++ b/results/test-agent/README.md
@@ -0,0 +1 @@
+### Test agent
\ No newline at end of file
diff --git a/results/test-agent/miniwob.json b/results/test-agent/miniwob.json
new file mode 100644
index 0000000000000000000000000000000000000000..9b97057f9235e436d822ef77b46ceeeb7e7906c9
--- /dev/null
+++ b/results/test-agent/miniwob.json
@@ -0,0 +1,16 @@
+[
+    {
+        "agent_name": "test-agent",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "MiniWoB",
+        "score": 43.4,
+        "std_err": 0.1,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]
\ No newline at end of file
diff --git a/results/test-agent/webarena.json b/results/test-agent/webarena.json
new file mode 100644
index 0000000000000000000000000000000000000000..794b67ba1052c2b689d83a899b44915d6083df2c
--- /dev/null
+++ b/results/test-agent/webarena.json
@@ -0,0 +1,16 @@
+[
+    {
+        "agent_name": "test-agent",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "WebArena",
+        "score": 6.7,
+        "std_err": 0.2,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]
\ No newline at end of file
diff --git a/results/test-agent/workarena-l1.json b/results/test-agent/workarena-l1.json
new file mode 100644
index 0000000000000000000000000000000000000000..3c1962b66be4242e529fbac430c7419dc3195bab
--- /dev/null
+++ b/results/test-agent/workarena-l1.json
@@ -0,0 +1,16 @@
+[
+    {
+        "agent_name": "test-agent",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "WorkArena-L1",
+        "score": 6.1,
+        "std_err": 0.3,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]
\ No newline at end of file
diff --git a/results/test-agent/workarena-l2.json b/results/test-agent/workarena-l2.json
new file mode 100644
index 0000000000000000000000000000000000000000..2995fd43638f3a04b99245a5856ef2b4133d7472
--- /dev/null
+++ b/results/test-agent/workarena-l2.json
@@ -0,0 +1,16 @@
+[
+    {
+        "agent_name": "test-agent",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "WorkArena-L2",
+        "score": 0.0,
+        "std_err": 0.0,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]
\ No newline at end of file
diff --git a/results/test-agent/workarena-l3.json b/results/test-agent/workarena-l3.json
new file mode 100644
index 0000000000000000000000000000000000000000..c702e073fe98ef218dd665637ca315a69f9ff265
--- /dev/null
+++ b/results/test-agent/workarena-l3.json
@@ -0,0 +1,16 @@
+[
+    {
+        "agent_name": "test-agent",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "WorkArena-L3",
+        "score": 0.0,
+        "std_err": 0.0,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]
\ No newline at end of file