# Source: Hugging Face Space "ServiceNow/browsergym-leaderboard" (status: Running).
# File size: 11,739 bytes.

import json
import re
import os
import streamlit as st
import requests
import pandas as pd
from io import StringIO
import plotly.graph_objs as go
from huggingface_hub import HfApi
from huggingface_hub.utils import RepositoryNotFoundError, RevisionNotFoundError
import streamlit.components.v1 as components

# Benchmarks shown on the leaderboard, in display order: one tab is created per
# entry, and the lower-cased name is the per-agent result file name.
BENCHMARKS = [
    "WebArena",
    "WorkArena-L1",
    "WorkArena++-L2",
    "WorkArena++-L3",
    "MiniWoB",
]

def create_html_table_main(df, benchmarks):
    """Render the main leaderboard as a user-sortable static HTML table.

    Draws a "Sort by" select box and an "Order" radio control with Streamlit,
    sorts ``df`` by the chosen column and returns the table markup (an inline
    ``<style>`` block followed by a ``<table>``).

    Args:
        df: Leaderboard DataFrame; every column becomes a table column. Cell
            values are interpolated into the markup without escaping, so
            cells may carry HTML such as anchor tags.
        benchmarks: Unused; kept so existing callers keep working.

    Returns:
        str: The HTML markup for the table.
    """
    col1, col2 = st.columns([2, 6])
    with col1:
        sort_column = st.selectbox("Sort by", df.columns.tolist())
    with col2:
        sort_order = st.radio("Order", ["Ascending", "Descending"], horizontal=True)

    def _sort_key(series):
        # Score columns can mix numbers with a "-" placeholder for missing
        # results; comparing str with float makes sort_values raise TypeError.
        # Sort such columns numerically (placeholders become NaN and are put
        # last); columns without any numeric value keep their plain ordering.
        numeric = pd.to_numeric(series, errors="coerce")
        return numeric if numeric.notna().any() else series

    # The select box yields None when the frame has no columns to choose from.
    if sort_column is not None:
        df = df.sort_values(
            by=sort_column,
            ascending=(sort_order == "Ascending"),
            key=_sort_key,
        )

    # Static table: sorting is done above in pandas, not with JavaScript.
    parts = ['''
    <style>
        table {
            width: 100%;
            border-collapse: collapse;
        }
        th, td {
            border: 1px solid #ddd;
            padding: 8px;
            text-align: center;
        }
        th {
            font-weight: bold;
        }
        .table-container {
            padding-bottom: 20px;
        }
    </style>
    ''']
    parts.append('<div class="table-container">')
    parts.append('<table>')
    parts.append('<thead><tr>')
    parts.extend(f'<th>{column}</th>' for column in df.columns)
    parts.append('</tr></thead>')
    parts.append('<tbody>')
    for _, row in df.iterrows():
        parts.append('<tr>')
        parts.extend(f'<td>{row[col]}</td>' for col in df.columns)
        parts.append('</tr>')
    parts.append('</tbody></table>')
    parts.append('</div>')
    return ''.join(parts)

def create_html_table_benchmark(df, benchmarks):
    """Render one benchmark's results DataFrame as a static HTML table.

    The ``Reproduced_all`` column is never shown as a column of its own.
    Instead, when a row's ``Reproduced`` cell is anything other than ``"-"``,
    that cell becomes a collapsible ``<details>`` element whose summary is the
    ``Reproduced`` value and whose body lists the row's ``Reproduced_all``
    items separated by ``<br>``.

    Args:
        df: Per-benchmark DataFrame; cell values are interpolated as-is.
        benchmarks: Not used by this function.

    Returns:
        str: An inline ``<style>`` block followed by the table markup.
    """
    hidden_column = "Reproduced_all"

    # Static table: no JavaScript sorting is attached.
    chunks = ['''
    <style>
        table {
            width: 100%;
            border-collapse: collapse;
        }
        th, td {
            border: 1px solid #ddd;
            padding: 8px;
            text-align: center;
        }
        th {
            font-weight: bold;
        }
        .table-container {
            padding-bottom: 20px;
        }
    </style>
    ''']
    chunks.append('<div class="table-container">')
    chunks.append('<table>')

    # Header row: every column except the hidden detail column.
    chunks.append('<thead><tr>')
    chunks.extend(
        f'<th>{name}</th>' for name in df.columns if name != hidden_column
    )
    chunks.append('</tr></thead>')

    # Body rows.
    chunks.append('<tbody>')
    for _, record in df.iterrows():
        chunks.append('<tr>')
        for name in df.columns:
            if name == hidden_column:
                continue
            cell = record[name]
            if name != "Reproduced" or cell == "-":
                chunks.append(f'<td>{cell}</td>')
            else:
                details = "<br>".join(str(item) for item in record[hidden_column])
                chunks.append(
                    f'<td><details><summary>{cell}</summary>{details}</details></td>'
                )
        chunks.append('</tr>')
    chunks.append('</tbody></table>')
    chunks.append('</div>')
    return ''.join(chunks)

def check_sanity(agent):
    """Validate the result files of one agent.

    For every benchmark in ``BENCHMARKS`` whose file
    ``results/<agent>/<benchmark lower-cased>.json`` exists, the file must:

    * be readable JSON holding a list of objects,
    * have all required keys on every entry,
    * carry the expected ``agent_name`` and ``benchmark`` on every entry,
    * contain exactly one entry with ``original_or_reproduced == "Original"``.

    Benchmarks without a file are skipped.

    Args:
        agent: Name of the agent's directory under ``results``.

    Returns:
        bool: True when every existing file passes, False otherwise
        (including unreadable or malformed files).
    """
    required_keys = (
        "agent_name",
        "benchmark",
        "original_or_reproduced",
        "score",
        "std_err",
        "benchmark_specific",
        "benchmark_tuned",
        "followed_evaluation_protocol",
        "reproducible",
        "comments",
        "study_id",
        "date_time",
    )
    for benchmark in BENCHMARKS:
        file_path = f"results/{agent}/{benchmark.lower()}.json"
        if not os.path.exists(file_path):
            continue
        try:
            with open(file_path, encoding="utf-8") as f:
                results = json.load(f)
        except (OSError, ValueError):
            # A broken file is a format failure to report, not a reason to
            # crash. JSONDecodeError and UnicodeDecodeError are ValueErrors.
            return False
        if not isinstance(results, list):
            return False

        original_count = 0
        for result in results:
            if not isinstance(result, dict):
                return False
            if not all(key in result for key in required_keys):
                return False
            if result["agent_name"] != agent:
                return False
            if result["benchmark"] != benchmark:
                return False
            if result["original_or_reproduced"] == "Original":
                original_count += 1
        if original_count != 1:
            return False
    return True

# (column label in the per-benchmark table, key in a result JSON entry)
_BENCHMARK_FIELDS = (
    ("Score", "score"),
    ("Benchmark Specific", "benchmark_specific"),
    ("Benchmark Tuned", "benchmark_tuned"),
    ("Followed Evaluation Protocol", "followed_evaluation_protocol"),
    ("Reproducible", "reproducible"),
    ("Comments", "comments"),
    ("Study ID", "study_id"),
    ("Date", "date_time"),
)


def _load_agent_results(agent):
    """Return the result entries of every benchmark file present for *agent*."""
    entries = []
    for benchmark in BENCHMARKS:
        file_path = f"results/{agent}/{benchmark.lower()}.json"
        # check_sanity accepts agents that lack some benchmark files, so a
        # missing file must be skipped here instead of raising on open().
        if not os.path.exists(file_path):
            continue
        with open(file_path, encoding="utf-8") as f:
            entries.extend(json.load(f))
    return entries


def _leaderboard_rows(results):
    """Build one row per agent holding its "Original" score per benchmark ("-" if none)."""
    rows = []
    for agent, entries in results.items():
        row = {"Agent": agent}
        for benchmark in BENCHMARKS:
            row[benchmark] = next(
                (
                    entry["score"]
                    for entry in entries
                    if entry["benchmark"] == benchmark
                    and entry["original_or_reproduced"] == "Original"
                ),
                "-",
            )
        rows.append(row)
    return rows


def _benchmark_rows(results, benchmark):
    """Build one row per agent with the details of its runs on *benchmark*."""
    rows = []
    for agent, entries in results.items():
        # Defaults are set once, before looking at the entries, so an agent
        # without any entry still gets a complete row and the "Original"
        # entry cannot wipe reproduced runs listed before it.
        row = {"Agent": agent}
        row.update({label: "-" for label, _ in _BENCHMARK_FIELDS})
        reproduced_scores = []
        reproduced_details = []
        for entry in entries:
            if entry["benchmark"] != benchmark:
                continue
            kind = entry["original_or_reproduced"]
            if kind == "Original":
                for label, key in _BENCHMARK_FIELDS:
                    row[label] = entry[key]
            elif kind == "Reproduced":
                reproduced_scores.append(entry["score"])
                reproduced_details.append(
                    ", ".join([str(entry["score"]), str(entry["date_time"])])
                )
        if reproduced_scores:
            row["Reproduced"] = f"{min(reproduced_scores)} - {max(reproduced_scores)}"
        else:
            row["Reproduced"] = "-"
        row["Reproduced_all"] = reproduced_details
        rows.append(row)
    return rows


def _agent_hyperlink(agent_name):
    """Return an HTML anchor linking the agent name to its README."""
    # NOTE(review): the URL targets the meghsn/WebAgent-Leaderboard space;
    # confirm this is still the right location for agent READMEs.
    url = f"https://huggingface.co/spaces/meghsn/WebAgent-Leaderboard/blob/main/results/{agent_name}/README.md"
    return f'<a href="{url}" target="_blank">{agent_name}</a>'


def _scores_as_numbers(series):
    """Sort key: map scores to numbers, turning the "-" placeholder into NaN."""
    return pd.to_numeric(series, errors="coerce")


def main():
    """Build the Streamlit leaderboard page.

    Loads the result files of every agent directory under ``results`` that
    passes ``check_sanity`` and renders: a main tab with each agent's
    "Original" score per benchmark (searchable, exportable to CSV), one tab
    per benchmark with run details, and an "About" tab.
    """
    st.set_page_config(page_title="WebAgent Leaderboard", layout="wide")

    all_results = {}
    for agent in os.listdir("results"):
        # Only directories can hold per-benchmark result files.
        if not os.path.isdir(os.path.join("results", agent)):
            continue
        if not check_sanity(agent):
            st.error(f"Results for {agent} are not in the correct format.")
            continue
        all_results[agent] = _load_agent_results(agent)

    st.title("🏆 BrowserGym Leaderboard")
    st.markdown("Leaderboard to evaluate LLMs, VLMs, and agents on web navigation tasks.")
    tabs = st.tabs(["🏆 WebAgent Leaderboard",] + BENCHMARKS + ["📝 About"])

    with tabs[0]:
        # Explicit columns keep the frame well-formed when no agent loaded.
        df = pd.DataFrame(
            _leaderboard_rows(all_results), columns=["Agent"] + BENCHMARKS
        )

        # Missing scores are "-" strings; a numeric key avoids the TypeError
        # raised when sorting a column that mixes str and numbers, and puts
        # agents without a score last.
        df = df.sort_values(by='WebArena', ascending=False, key=_scores_as_numbers)

        # Add a search bar
        search_query = st.text_input("Search agents", "", key="search_main")

        # Filter the DataFrame based on the search query
        if search_query:
            try:
                mask = df['Agent'].str.contains(search_query, case=False)
            except re.error:
                # The query is not a valid regular expression (e.g. "(");
                # fall back to a literal substring match.
                mask = df['Agent'].str.contains(search_query, case=False, regex=False)
            df = df[mask]

        # Keep plain agent names for the CSV export; only the displayed table
        # gets HTML anchors.
        export_df = df
        df = df.assign(Agent=df['Agent'].apply(_agent_hyperlink))

        html_table = create_html_table_main(df, BENCHMARKS)
        st.markdown(html_table, unsafe_allow_html=True)

        if st.button("Export to CSV", key="export_main"):
            # Export the DataFrame to CSV
            csv_data = export_df.to_csv(index=False)

            # Create a link to download the CSV file
            st.download_button(
                label="Download CSV",
                data=csv_data,
                file_name="leaderboard.csv",
                key="download-csv",
                help="Click to download the CSV file",
            )

    with tabs[-1]:
        st.markdown('''
                    ### Leaderboard to evaluate LLMs, VLMs, and agents on web navigation tasks.
                ''')

    benchmark_columns = (
        ["Agent"]
        + [label for label, _ in _BENCHMARK_FIELDS]
        + ["Reproduced", "Reproduced_all"]
    )
    for i, benchmark in enumerate(BENCHMARKS, start=1):
        with tabs[i]:
            df_ = pd.DataFrame(
                _benchmark_rows(all_results, benchmark), columns=benchmark_columns
            )
            html_table = create_html_table_benchmark(df_, BENCHMARKS)
            st.markdown(html_table, unsafe_allow_html=True)
                
        
# Script entry point: build the page only when this file is run directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()