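"""Gradio app for the Open RL Leaderboard Space.

A background job periodically lists reinforcement-learning models on the
Hugging Face Hub, evaluates the ones that have no results yet, and uploads the
reports to a results dataset. The Gradio UI reads those reports and displays a
per-environment leaderboard.
"""
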
import glob
import json
import os
import pprint

import gradio as gr
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download

from src.css_html_js import dark_mode_gradio_js
from src.envs import API, RESULTS_PATH, RESULTS_REPO, TOKEN
from src.evaluation import ALL_ENV_IDS, evaluate
from src.logging import configure_root_logger, setup_logger

configure_root_logger()
logger = setup_logger(__name__)

pp = pprint.PrettyPrinter(width=80)

def model_hyperlink(link, model_id):
    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_id}</a>'


def make_clickable_model(model_id):
    link = f"https://huggingface.co/{model_id}"
    return model_hyperlink(link, model_id)

def _backend_routine():
    # List the reinforcement-learning models on the Hub
    rl_models = list(API.list_models(filter="reinforcement-learning"))
    logger.info(f"Found {len(rl_models)} RL models")

    # Keep only the models that ship an agent checkpoint
    compatible_models = []
    for model in rl_models:
        filenames = [sib.rfilename for sib in model.siblings]
        if "agent.pt" in filenames:
            compatible_models.append((model.modelId, model.sha))
    logger.info(f"Found {len(compatible_models)} compatible models")
    # Get the existing results
    snapshot_download(
        repo_id=RESULTS_REPO,
        revision="main",
        local_dir=RESULTS_PATH,
        repo_type="dataset",
        max_workers=60,
        token=TOKEN,
    )
    json_files = glob.glob(f"{RESULTS_PATH}/**/*.json", recursive=True)

    evaluated_models = set()
    for json_filepath in json_files:
        with open(json_filepath) as fp:
            data = json.load(fp)
        evaluated_models.add((data["config"]["model_id"], data["config"]["model_sha"]))

    # Find the models that are not associated with any results
    pending_models = set(compatible_models) - evaluated_models
    logger.info(f"Found {len(pending_models)} pending models")

    # Run an evaluation on the pending models
    for model_id, sha in pending_models:
        logger.info(f"Running evaluation on {model_id}")
        report = {"config": {"model_id": model_id, "model_sha": sha}}
        try:
            evaluations = evaluate(model_id, revision=sha)
        except Exception as e:
            logger.error(f"Error evaluating {model_id}: {e}")
            evaluations = None

        if evaluations is not None:
            report["results"] = evaluations
            report["status"] = "DONE"
        else:
            report["status"] = "FAILED"

        # Write the report locally
        dumped = json.dumps(report, indent=2)
        output_path = os.path.join(RESULTS_PATH, model_id, f"results_{sha}.json")
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        with open(output_path, "w") as f:
            f.write(dumped)

        # Upload the report to the results repo
        API.upload_file(
            path_or_fileobj=output_path,
            path_in_repo=f"{model_id}/results_{sha}.json",
            repo_id=RESULTS_REPO,
            repo_type="dataset",
        )

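# Illustrative shape of a report written by _backend_routine (the env id and
# numbers below are made up; the keys mirror what get_leaderboard_df reads back):
# {
#   "config": {"model_id": "user/model", "model_sha": "abc123"},
#   "results": {"SomeEnv-v1": {"episodic_return_mean": 100.0}},
#   "status": "DONE"
# }
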
def backend_routine():
    try:
        _backend_routine()
    except Exception as e:
        logger.error(f"{e.__class__.__name__}: {str(e)}")

def get_leaderboard_df():
    snapshot_download(
        repo_id=RESULTS_REPO,
        revision="main",
        local_dir=RESULTS_PATH,
        repo_type="dataset",
        max_workers=60,
        token=TOKEN,
    )
    json_files = glob.glob(f"{RESULTS_PATH}/**/*.json", recursive=True)

    data = []
    for json_filepath in json_files:
        with open(json_filepath) as fp:
            report = json.load(fp)
        model_id = report["config"]["model_id"]
        row = {"Agent": model_id, "Status": report["status"]}
        if report["status"] == "DONE":
            results = {env_id: result["episodic_return_mean"] for env_id, result in report["results"].items()}
            row.update(results)
        data.append(row)

    # Create the DataFrame
    df = pd.DataFrame(data)
    # Replace NaN values with empty strings
    df = df.fillna("")
    return df

TITLE = """ | |
🚀 Open RL Leaderboard | |
""" | |
INTRODUCTION_TEXT = """ | |
Welcome to the Open RL Leaderboard! This is a community-driven benchmark for reinforcement learning models. | |
""" | |
ABOUT_TEXT = """ | |
The Open RL Leaderboard is a community-driven benchmark for reinforcement learning models. | |
""" | |
def select_column(column_name: str, data: pd.DataFrame):
    column_names = ["Agent", column_name]  # keep the model name column alongside the selected env
    df = data[column_names]

    # Drop agents that have no result for the selected environment
    def check_row(row):
        return not (row.drop("Agent") == "").all()

    mask = df.apply(check_row, axis=1)
    df = df[mask]
    df = df.sort_values(by=column_name, ascending=False)
    return df

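# Example (assuming "SomeEnv-v1" is one of ALL_ENV_IDS):
# select_column("SomeEnv-v1", get_leaderboard_df()) returns a two-column table
# (Agent, SomeEnv-v1) containing only the agents that have a score for that
# environment, sorted from highest to lowest mean episodic return.
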
with gr.Blocks(js=dark_mode_gradio_js) as demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 Leaderboard", elem_id="llm-benchmark-tab-table", id=0):
            # Hidden dataframe, refreshed every 5 minutes
            hidden_df = gr.components.Dataframe(get_leaderboard_df, visible=False, every=5 * 60)
            env_selector = gr.components.Dropdown(
                label="Environments",
                choices=ALL_ENV_IDS,
                value=ALL_ENV_IDS[0],
            )
            leaderboard = gr.components.Dataframe(select_column(ALL_ENV_IDS[0], get_leaderboard_df()))

            # Events
            env_selector.change(select_column, [env_selector, hidden_df], leaderboard)

        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
            gr.Markdown(ABOUT_TEXT)

# Run the backend routine every 30 seconds
scheduler = BackgroundScheduler()
scheduler.add_job(func=backend_routine, trigger="interval", seconds=0.5 * 60)
scheduler.start()

if __name__ == "__main__":
    demo.queue().launch()  # server_name="0.0.0.0", show_error=True, server_port=7860