import html
import json
import os
import re
from io import StringIO

import pandas as pd
import plotly.graph_objs as go
import requests
import streamlit as st
import streamlit.components.v1 as components
from huggingface_hub import HfApi
from huggingface_hub.utils import RepositoryNotFoundError, RevisionNotFoundError


# Benchmarks shown on the leaderboard, in tab order. Each name is compared
# against the "benchmark" field of a result entry and, lower-cased, is the stem
# of the per-agent results file (results/<agent>/<benchmark>.json).
# Kept as a list because it is concatenated with other lists to build the tabs.
BENCHMARKS = [
    "WebArena",
    "WorkArena-L1",
    "WorkArena++-L2",
    "WorkArena++-L3",
    "MiniWoB",
]


def _numeric_aware_key(series):
    """Sort key that compares numerically when the column holds any numbers.

    Benchmark columns mix float scores with the "-" placeholder; comparing
    those directly raises ``TypeError``. Placeholders become NaN so they can be
    pushed to the end, while purely textual columns are returned untouched.
    """
    numeric = pd.to_numeric(series, errors="coerce")
    return numeric if numeric.notna().any() else series


def create_html_table_main(df, benchmarks):
    """Render the main leaderboard as an HTML table with user-selected sorting.

    Draws a "Sort by" select box and an "Order" radio control with Streamlit,
    sorts ``df`` by the chosen column and returns the table markup.

    Args:
        df: DataFrame to render; one ``<th>`` per column, one ``<tr>`` per row.
            Cell values are inserted verbatim (not escaped) because the caller
            passes ready-made ``<a>`` markup in the "Agent" column.
        benchmarks: Unused; kept so existing callers keep working.

    Returns:
        str: A ``<style>`` block followed by the table wrapped in a
        ``<div class="table-container">``.
    """
    col1, col2 = st.columns([2, 6])
    with col1:
        sort_column = st.selectbox("Sort by", df.columns.tolist())
    with col2:
        sort_order = st.radio("Order", ["Ascending", "Descending"], horizontal=True)

    # selectbox yields None when there are no columns to choose from.
    if sort_column is not None:
        df = df.sort_values(
            by=sort_column,
            ascending=(sort_order == "Ascending"),
            key=_numeric_aware_key,
            na_position="last",  # "-" placeholders stay at the bottom either way
        )

    style_block = "\n".join([
        "",
        "<style>",
        "table {",
        "width: 100%;",
        "border-collapse: collapse;",
        "}",
        "th, td {",
        "border: 1px solid #ddd;",
        "padding: 8px;",
        "text-align: center;",
        "}",
        "th {",
        "font-weight: bold;",
        "}",
        ".table-container {",
        "padding-bottom: 20px;",
        "}",
        "</style>",
        "",
    ])

    columns = list(df.columns)
    header_cells = "".join(f"<th>{column}</th>" for column in columns)
    body_rows = "".join(
        "<tr>" + "".join(f"<td>{row[col]}</td>" for col in columns) + "</tr>"
        for _, row in df.iterrows()
    )

    return "".join([
        style_block,
        '<div class="table-container">',
        "<table>",
        f"<thead><tr>{header_cells}</tr></thead>",
        f"<tbody>{body_rows}</tbody></table>",
        "</div>",
    ])


def create_html_table_benchmark(df, benchmarks):
    """Render one benchmark's results as an HTML table.

    Every column except "Reproduced_all" becomes a table column. A
    "Reproduced" cell other than "-" is rendered as a collapsible
    ``<details>`` element whose summary is the cell value and whose body lists
    that row's "Reproduced_all" items separated by ``<br>``.

    Cell text is HTML-escaped: it comes from result files (e.g. free-form
    comments) and the markup is displayed with ``unsafe_allow_html=True``, so a
    stray ``<`` or ``&`` would otherwise corrupt the table or inject markup.

    Args:
        df: DataFrame with one row per agent.
        benchmarks: Unused; kept so existing callers keep working.

    Returns:
        str: A ``<style>`` block followed by the table wrapped in a
        ``<div class="table-container">``.
    """
    style_block = "\n".join([
        "",
        "<style>",
        "table {",
        "width: 100%;",
        "border-collapse: collapse;",
        "}",
        "th, td {",
        "border: 1px solid #ddd;",
        "padding: 8px;",
        "text-align: center;",
        "}",
        "th {",
        "font-weight: bold;",
        "}",
        ".table-container {",
        "padding-bottom: 20px;",
        "}",
        "</style>",
        "",
    ])

    visible_columns = [column for column in df.columns if column != "Reproduced_all"]
    has_details = "Reproduced_all" in df.columns

    def render_cell(row, column):
        value = row[column]
        text = html.escape(str(value))
        # Only a real reproduced range gets the expandable list of runs.
        if column == "Reproduced" and has_details and not (value == "-"):
            runs = "<br>".join(html.escape(str(item)) for item in row["Reproduced_all"])
            return f"<td><details><summary>{text}</summary>{runs}</details></td>"
        return f"<td>{text}</td>"

    header_cells = "".join(
        f"<th>{html.escape(str(column))}</th>" for column in visible_columns
    )
    body_rows = "".join(
        "<tr>" + "".join(render_cell(row, column) for column in visible_columns) + "</tr>"
        for _, row in df.iterrows()
    )

    return "".join([
        style_block,
        '<div class="table-container">',
        "<table>",
        f"<thead><tr>{header_cells}</tr></thead>",
        f"<tbody>{body_rows}</tbody></table>",
        "</div>",
    ])


# Fields every result entry must carry to be accepted by check_sanity().
_REQUIRED_RESULT_KEYS = (
    "agent_name",
    "benchmark",
    "original_or_reproduced",
    "score",
    "std_err",
    "benchmark_specific",
    "benchmark_tuned",
    "followed_evaluation_protocol",
    "reproducible",
    "comments",
    "study_id",
    "date_time",
)


def check_sanity(agent, benchmarks=None, results_dir="results"):
    """Validate the result files of one agent.

    For each benchmark whose file ``<results_dir>/<agent>/<benchmark>.json``
    (benchmark name lower-cased) exists, the file must hold a JSON list of
    objects where every object has all required keys, names this agent and
    this benchmark, and exactly one object is marked "Original". Benchmarks
    without a file are skipped.

    Args:
        agent: Agent directory name; also the expected "agent_name" value.
        benchmarks: Benchmark names to check. Defaults to ``BENCHMARKS``.
        results_dir: Directory containing one sub-directory per agent.

    Returns:
        bool: True when every existing file passes, False otherwise. Unreadable
        or malformed files count as a failure instead of raising, so the caller
        can report the agent and carry on.
    """
    if benchmarks is None:
        benchmarks = BENCHMARKS

    for benchmark in benchmarks:
        file_path = os.path.join(results_dir, agent, f"{benchmark.lower()}.json")
        if not os.path.exists(file_path):
            continue

        try:
            with open(file_path, encoding="utf-8") as f:
                results = json.load(f)
        except (OSError, ValueError):
            # ValueError covers json.JSONDecodeError and bad encodings.
            return False

        if not isinstance(results, list):
            return False

        original_count = 0
        for result in results:
            if not isinstance(result, dict):
                return False
            if any(key not in result for key in _REQUIRED_RESULT_KEYS):
                return False
            if result["agent_name"] != agent or result["benchmark"] != benchmark:
                return False
            if result["original_or_reproduced"] == "Original":
                original_count += 1

        # Exactly one reference ("Original") score per benchmark file.
        if original_count != 1:
            return False

    return True


# Column order of the per-benchmark tables. "Reproduced_all" carries the
# individual reproduced runs and is folded into the "Reproduced" cell when the
# table is rendered.
_BENCHMARK_COLUMNS = [
    "Agent",
    "Score",
    "Benchmark Specific",
    "Benchmark Tuned",
    "Followed Evaluation Protocol",
    "Reproducible",
    "Comments",
    "Study ID",
    "Date",
    "Reproduced",
    "Reproduced_all",
]


def _load_agent_results(agent):
    """Return all result entries found in the agent's benchmark files."""
    agent_results = []
    for benchmark in BENCHMARKS:
        file_path = f"results/{agent}/{benchmark.lower()}.json"
        # The sanity check tolerates a missing benchmark file, so loading must
        # as well instead of failing with FileNotFoundError.
        if not os.path.exists(file_path):
            continue
        with open(file_path, encoding="utf-8") as f:
            agent_results.extend(json.load(f))
    return agent_results


def _find_original(entries, benchmark):
    """Return the first "Original" entry for ``benchmark``, or None."""
    for entry in entries:
        if entry["benchmark"] == benchmark and entry["original_or_reproduced"] == "Original":
            return entry
    return None


def _build_leaderboard_rows(results):
    """Build one row per agent holding its original score on every benchmark."""
    rows = []
    for agent, entries in results.items():
        row = {"Agent": agent}
        for benchmark in BENCHMARKS:
            original = _find_original(entries, benchmark)
            row[benchmark] = "-" if original is None else original["score"]
        rows.append(row)
    return rows


def _build_benchmark_rows(results, benchmark):
    """Build one detail row per agent for a single benchmark.

    The row shows the fields of the "Original" entry ("-" when the agent has
    none) plus a summary of every "Reproduced" entry: the "min - max" score
    range in "Reproduced" and one "score, date" string per run in
    "Reproduced_all".
    """
    rows = []
    for agent, entries in results.items():
        original = _find_original(entries, benchmark)
        if original is None:
            row = {column: "-" for column in _BENCHMARK_COLUMNS}
            row["Agent"] = agent
        else:
            row = {
                "Agent": agent,
                "Score": original["score"],
                "Benchmark Specific": original["benchmark_specific"],
                "Benchmark Tuned": original["benchmark_tuned"],
                "Followed Evaluation Protocol": original["followed_evaluation_protocol"],
                "Reproducible": original["reproducible"],
                "Comments": original["comments"],
                "Study ID": original["study_id"],
                "Date": original["date_time"],
            }

        # Collect reproduced runs independently of where the original entry
        # sits in the list, so no run is dropped or reset.
        reproduced = [
            entry
            for entry in entries
            if entry["benchmark"] == benchmark
            and entry["original_or_reproduced"] == "Reproduced"
        ]
        scores = [entry["score"] for entry in reproduced]
        row["Reproduced"] = f"{min(scores)} - {max(scores)}" if scores else "-"
        row["Reproduced_all"] = [
            ", ".join([str(entry["score"]), str(entry["date_time"])])
            for entry in reproduced
        ]
        rows.append(row)
    return rows


def _agent_hyperlink(agent_name):
    """Return an ``<a>`` tag linking to the agent's README in the results repo."""
    url = f"https://huggingface.co/spaces/meghsn/WebAgent-Leaderboard/blob/main/results/{agent_name}/README.md"
    return f'<a href="{url}" target="_blank">{agent_name}</a>'


def main():
    """Build the Streamlit leaderboard page.

    Loads and validates the results of every agent directory under
    ``results/``, then renders three kinds of tabs: the main leaderboard
    (search box, sortable table, CSV export), one detail table per benchmark,
    and an "About" tab. Agents whose files fail the sanity check are reported
    with ``st.error`` and left out.
    """
    st.set_page_config(page_title="WebAgent Leaderboard", layout="wide")

    # Only directories are agents; stray files in results/ are ignored. Sorting
    # keeps the row order stable regardless of the directory listing order.
    all_agents = sorted(
        name for name in os.listdir("results")
        if os.path.isdir(os.path.join("results", name))
    )
    all_results = {}
    for agent in all_agents:
        if not check_sanity(agent):
            st.error(f"Results for {agent} are not in the correct format.")
            continue
        all_results[agent] = _load_agent_results(agent)

    st.title("π BrowserGym Leaderboard")
    st.markdown("Leaderboard to evaluate LLMs, VLMs, and agents on web navigation tasks.")
    tabs = st.tabs(["π WebAgent Leaderboard"] + BENCHMARKS + ["π About"])

    with tabs[0]:
        # Explicit columns keep the frame usable when no agent was loaded.
        df = pd.DataFrame(
            _build_leaderboard_rows(all_results), columns=["Agent"] + BENCHMARKS
        )
        # Scores are mixed with "-" placeholders; compare numerically and keep
        # agents without a WebArena score at the bottom.
        df = df.sort_values(
            by="WebArena",
            ascending=False,
            key=lambda column: pd.to_numeric(column, errors="coerce"),
            na_position="last",
        )

        search_query = st.text_input("Search agents", "", key="search_main")
        if search_query:
            # Literal substring match: user input must not be parsed as a regex.
            df = df[df["Agent"].str.contains(search_query, case=False, regex=False)]

        # The table shows linked agent names; the CSV export keeps plain names.
        display_df = df.assign(Agent=df["Agent"].map(_agent_hyperlink))
        html_table = create_html_table_main(display_df, BENCHMARKS)
        st.markdown(html_table, unsafe_allow_html=True)

        if st.button("Export to CSV", key="export_main"):
            st.download_button(
                label="Download CSV",
                data=df.to_csv(index=False),
                file_name="leaderboard.csv",
                key="download-csv",
                help="Click to download the CSV file",
            )

    with tabs[-1]:
        st.markdown(
            "\n### Leaderboard to evaluate LLMs, VLMs, and agents on web navigation tasks.\n"
        )

    # tabs[1:-1] line up one-to-one with BENCHMARKS.
    for tab, benchmark in zip(tabs[1:-1], BENCHMARKS):
        with tab:
            df_ = pd.DataFrame(
                _build_benchmark_rows(all_results, benchmark),
                columns=_BENCHMARK_COLUMNS,
            )
            html_table = create_html_table_benchmark(df_, BENCHMARKS)
            st.markdown(html_table, unsafe_allow_html=True)


# Build the page only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()