leaderboard

Runtime error

File size: 6,189 Bytes

import glob
import json
import os
import pprint

import gradio as gr
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download

from src.css_html_js import dark_mode_gradio_js
from src.envs import API, RESULTS_PATH, RESULTS_REPO, TOKEN
from src.evaluation import ALL_ENV_IDS, evaluate
from src.logging import configure_root_logger, setup_logger

# Logging is set up at import time, before any other module-level code runs.
# Both helpers come from the project's `src.logging` module; presumably the
# first configures the root logger and the second returns a named logger.
configure_root_logger()
logger = setup_logger(__name__)

# Pretty-printer limited to 80 columns. NOTE(review): `pp` is not referenced
# anywhere in the visible part of this module.
pp = pprint.PrettyPrinter(width=80)


def model_hyperlink(link, model_id):
    """Build an HTML anchor that opens ``link`` in a new tab, labelled ``model_id``.

    Both arguments are interpolated verbatim (no HTML escaping is applied).
    """
    anchor_style = "color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"
    opening_tag = f'<a target="_blank" href="{link}" style="{anchor_style}">'
    return f"{opening_tag}{model_id}</a>"


def make_clickable_model(model_id):
    """Return an HTML link to the Hugging Face Hub page of ``model_id``."""
    return model_hyperlink(f"https://huggingface.co/{model_id}", model_id)


def _backend_routine():
    """Evaluate every compatible Hub model that has no stored report yet.

    The routine:

    1. lists the Hub models tagged ``reinforcement-learning`` and keeps those
       whose repository contains an ``agent.pt`` file;
    2. downloads the results dataset into ``RESULTS_PATH`` and collects the
       ``(model_id, model_sha)`` pairs that already have a report;
    3. for every remaining pair, runs ``evaluate``, writes a JSON report
       (status ``DONE`` with results, or ``FAILED``) and uploads it to
       ``RESULTS_REPO``.

    Errors raised by ``evaluate`` are caught per model and recorded as
    ``FAILED``. Errors from the Hub calls (listing, download, upload)
    propagate to the caller.
    """
    # List only the reinforcement-learning models
    rl_models = list(API.list_models(filter="reinforcement-learning"))
    logger.info(f"Found {len(rl_models)} RL models")
    compatible_models = [
        (model.modelId, model.sha)
        for model in rl_models
        # `siblings` is guarded with `or []` in case the file listing is absent.
        if any(sib.rfilename == "agent.pt" for sib in (model.siblings or []))
    ]

    logger.info(f"Found {len(compatible_models)} compatible models")

    # Get the results
    snapshot_download(
        repo_id=RESULTS_REPO,
        revision="main",
        local_dir=RESULTS_PATH,
        repo_type="dataset",
        max_workers=60,
        token=TOKEN,
    )
    json_files = glob.glob(f"{RESULTS_PATH}/**/*.json", recursive=True)

    evaluated_models = set()
    for json_filepath in json_files:
        # A single malformed or unrelated JSON file must not abort the whole
        # routine: skip it (its model, if any, will simply be re-evaluated).
        try:
            with open(json_filepath, encoding="utf-8") as fp:
                config = json.load(fp)["config"]
            evaluated_models.add((config["model_id"], config["model_sha"]))
        except (OSError, ValueError, KeyError, TypeError) as e:
            logger.warning(f"Skipping unreadable report {json_filepath}: {e}")

    # Find the models that are not associated with any results
    pending_models = set(compatible_models) - evaluated_models
    logger.info(f"Found {len(pending_models)} pending models")

    # Run an evaluation on the models
    for model_id, sha in pending_models:
        logger.info(f"Running evaluation on {model_id}")
        report = {"config": {"model_id": model_id, "model_sha": sha}}
        try:
            evaluations = evaluate(model_id, revision=sha)
        except Exception as e:
            logger.error(f"Error evaluating {model_id}: {e}")
            evaluations = None

        if evaluations is not None:
            report["results"] = evaluations
            report["status"] = "DONE"
        else:
            report["status"] = "FAILED"

        # Update the results
        dumped = json.dumps(report, indent=2)
        output_path = os.path.join(RESULTS_PATH, model_id, f"results_{sha}.json")
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(dumped)

        # Upload the results to the results repo
        try:
            API.upload_file(
                path_or_fileobj=output_path,
                path_in_repo=f"{model_id}/results_{sha}.json",
                repo_id=RESULTS_REPO,
                repo_type="dataset",
            )
        except Exception:
            # A report that exists only locally would be found by the glob on
            # the next run and the model would be treated as evaluated even
            # though nothing was published. Remove it so the model is retried.
            if os.path.exists(output_path):
                os.remove(output_path)
            raise


def backend_routine():
    """Run ``_backend_routine`` and log any exception instead of propagating it."""
    try:
        _backend_routine()
    except Exception as exc:
        error_name = exc.__class__.__name__
        logger.error(f"{error_name}: {exc}")


def get_leaderboard_df():
    """Download the results dataset and build the leaderboard table.

    Every ``*.json`` report found under ``RESULTS_PATH`` becomes one row with
    the columns ``Agent`` (the model id) and ``Status``. Reports whose status
    is ``DONE`` also contribute one column per environment id, holding that
    environment's ``episodic_return_mean``. Missing values are replaced by
    empty strings.

    Reports that cannot be read or lack the expected keys are skipped with a
    warning. When no usable report exists, an empty frame that still has the
    ``Agent`` and ``Status`` columns is returned.

    Returns:
        pd.DataFrame: one row per usable report.
    """
    snapshot_download(
        repo_id=RESULTS_REPO,
        revision="main",
        local_dir=RESULTS_PATH,
        repo_type="dataset",
        max_workers=60,
        token=TOKEN,
    )

    json_files = glob.glob(f"{RESULTS_PATH}/**/*.json", recursive=True)
    data = []

    for json_filepath in json_files:
        # One broken report must not take the whole leaderboard down.
        try:
            with open(json_filepath, encoding="utf-8") as fp:
                report = json.load(fp)
            row = {"Agent": report["config"]["model_id"], "Status": report["status"]}
            if report["status"] == "DONE":
                row.update(
                    {env_id: result["episodic_return_mean"] for env_id, result in report["results"].items()}
                )
        except (OSError, ValueError, KeyError, TypeError, AttributeError) as e:
            logger.warning(f"Skipping unreadable report {json_filepath}: {e}")
            continue
        data.append(row)

    # Create DataFrame; keep the base columns even when there is no row so
    # that consumers selecting "Agent" do not hit a KeyError.
    df = pd.DataFrame(data) if data else pd.DataFrame(columns=["Agent", "Status"])
    # Replace NaN values with empty strings
    df = df.fillna("")
    return df


# Page heading; rendered through `gr.HTML` at the top of the app.
TITLE = """
🚀 Open RL Leaderboard
"""

# Short introduction; rendered as Markdown right below the title.
INTRODUCTION_TEXT = """
Welcome to the Open RL Leaderboard! This is a community-driven benchmark for reinforcement learning models.
"""

# Body of the "About" tab; rendered as Markdown.
ABOUT_TEXT = """
The Open RL Leaderboard is a community-driven benchmark for reinforcement learning models.
"""


def select_column(column_name: str, data: pd.DataFrame):
    """Return the ranking of agents for a single environment.

    Args:
        column_name: Name of the environment column to rank on.
        data: Leaderboard table with an ``Agent`` column and one column per
            environment; cells without a score hold an empty string.

    Returns:
        pd.DataFrame: the ``Agent`` and ``column_name`` columns, restricted to
        rows that have a numeric score and sorted by score, best first. If
        ``data`` lacks either column (for instance no report covers this
        environment yet), an empty frame with these two columns is returned
        instead of raising ``KeyError``.
    """
    column_names = ["Agent", column_name]  # add model name column
    if any(col not in data.columns for col in column_names):
        return pd.DataFrame(columns=column_names)

    # Work on a copy so the caller's frame is never modified.
    df = data[column_names].copy()
    # Empty strings (and anything else non-numeric) become NaN, which both
    # filters out agents without a score and guarantees a sortable column.
    df[column_name] = pd.to_numeric(df[column_name], errors="coerce")
    df = df.dropna(subset=[column_name])
    return df.sort_values(by=column_name, ascending=False)


# Gradio UI definition. Everything in this block runs at import time,
# including the call to `get_leaderboard_df()` that builds the first table.
with gr.Blocks(js=dark_mode_gradio_js) as demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        # NOTE(review): both tabs below share the same `elem_id`, and the tab
        # ids are 0 and 2 - confirm this is intentional.
        with gr.TabItem("🏅 Leaderboard", elem_id="llm-benchmark-tab-table", id=0):
            # Invisible copy of the full leaderboard. The callable is passed
            # as the value together with `every=5 * 60` (300 seconds);
            # presumably Gradio re-runs it at that interval - TODO confirm.
            hidden_df = gr.components.Dataframe(get_leaderboard_df, visible=False, every=5 * 60)  # hidden dataframe

            # Environment picker; starts on the first known environment.
            env_selector = gr.components.Dropdown(
                label="Environments",
                choices=ALL_ENV_IDS,
                value=ALL_ENV_IDS[0],
                # interactive=True,
            )
            # Visible table, built once here for the first environment. This
            # calls `get_leaderboard_df()` again, separately from `hidden_df`.
            leaderboard = gr.components.Dataframe(select_column(ALL_ENV_IDS[0], get_leaderboard_df()))

            # Events
            # Changing the dropdown recomputes the visible table from the
            # hidden dataframe. No handler here updates `leaderboard` when
            # `hidden_df` itself changes.
            env_selector.change(select_column, [env_selector, hidden_df], leaderboard)
            # Update hidden dataframe
            # hidden_df.change(get_leaderboard_df, [], hidden_df, every=10)

        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
            gr.Markdown(ABOUT_TEXT)


# Background evaluation loop: `backend_routine` is scheduled on an interval of
# 0.5 * 60 = 30 seconds. The scheduler is created and started at import time,
# outside the `__main__` guard, so it also runs when this module is imported.
scheduler = BackgroundScheduler()
scheduler.add_job(func=backend_routine, trigger="interval", seconds=0.5 * 60)
scheduler.start()


# Launch the Gradio app, with its queue enabled, when executed as a script.
if __name__ == "__main__":
    demo.queue().launch()  # server_name="0.0.0.0", show_error=True, server_port=7860)