|
import gradio as gr |
|
from apscheduler.schedulers.background import BackgroundScheduler |
|
from huggingface_hub import snapshot_download |
|
import pandas as pd |
|
|
|
from src.about import ( |
|
REPRODUCIBILITY_TEXT, |
|
INTRODUCTION_TEXT, |
|
ABOUT_TEXT, |
|
TITLE, |
|
) |
|
from src.display.css_html_js import custom_css, custom_js |
|
from src.display.utils import ( |
|
COLS, |
|
ST_BENCHMARK_COLS, |
|
AGENTIC_BENCHMARK_COLS, |
|
EVAL_COLS, |
|
AutoEvalColumn, |
|
fields, |
|
) |
|
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN |
|
from src.populate import get_evaluation_queue_df, get_leaderboard_df, TASK_NAME_INVERSE_MAP |
|
from src.submission.submit import add_new_eval |
|
|
|
|
|
def restart_space():
    """Request a restart of the Space identified by ``REPO_ID``.

    Delegates to ``API.restart_space``; the result of that call is discarded,
    so this function always returns ``None``.
    """
    API.restart_space(repo_id=REPO_ID)
|
|
|
|
|
def _sync_dataset(repo_id, local_dir):
    """Download the dataset repo ``repo_id`` into ``local_dir``.

    Args:
        repo_id: Hub identifier of the dataset repository to mirror.
        local_dir: Local directory that receives the snapshot.

    On any download error the exception is printed and a restart of the
    Space is requested through ``restart_space()``. Nothing here stops the
    script, so execution continues after the restart request.
    """
    print(local_dir)
    try:
        snapshot_download(
            repo_id=repo_id,
            local_dir=local_dir,
            repo_type="dataset",
            tqdm_class=None,
            etag_timeout=30,
            token=TOKEN,
        )
    except Exception as exc:
        # Surface the cause before restarting; otherwise a failing download
        # produces a restart with nothing in the logs to explain it.
        print(f"Could not download {repo_id} into {local_dir}: {exc!r}")
        restart_space()


# Mirror the evaluation request queue first, then the results dataset.
_sync_dataset(QUEUE_REPO, EVAL_REQUESTS_PATH)
_sync_dataset(RESULTS_REPO, EVAL_RESULTS_PATH)
|
|
|
|
|
# One leaderboard table per benchmark family. Both calls read the same results
# and requests paths and differ only in the benchmark column list passed last.
ST_LEADERBOARD_DF = get_leaderboard_df(
    EVAL_RESULTS_PATH,
    EVAL_REQUESTS_PATH,
    COLS,
    ST_BENCHMARK_COLS,
)
AGENTIC_LEADERBOARD_DF = get_leaderboard_df(
    EVAL_RESULTS_PATH,
    EVAL_REQUESTS_PATH,
    COLS,
    AGENTIC_BENCHMARK_COLS,
)

# get_evaluation_queue_df returns three values, unpacked in this order.
finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(
    EVAL_REQUESTS_PATH, EVAL_COLS
)
|
|
|
def bold_max(s):
    """Return one CSS declaration per element of ``s``, bolding the maximum.

    Args:
        s: Object exposing ``.max()`` and element-wise ``==`` (for example a
            pandas Series).

    Returns:
        A list the same length as ``s`` holding ``'font-weight: bold'`` where
        the element equals ``s.max()`` and ``''`` elsewhere. Tied maxima are
        all marked.
    """
    peak = s.max()
    return ["font-weight: bold" if hit else "" for hit in (s == peak)]
|
|
|
def init_leaderboard(dataframe, benchmark_type):
    """Build the Gradio table component for one benchmark family.

    Args:
        dataframe: Leaderboard data to display; must be non-empty.
        benchmark_type: Value compared against the ``"type"`` entry that
            ``TASK_NAME_INVERSE_MAP`` holds for each column name.

    Returns:
        A ``gr.components.Dataframe`` whose datatypes and column widths are
        derived from the selected ``AutoEvalColumn`` fields.

    Raises:
        ValueError: If ``dataframe`` is ``None`` or empty.
    """
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    # Keep the "Model" column plus every column whose mapped task type matches
    # the requested benchmark family.
    selected_columns = []
    for column in fields(AutoEvalColumn):
        if column.name == "Model":
            selected_columns.append(column)
            continue
        task_info = TASK_NAME_INVERSE_MAP.get(column.name, dict())
        if task_info.get("type", "") == benchmark_type:
            selected_columns.append(column)

    datatypes = [column.type for column in selected_columns]
    # The model-name column is wider than the score columns.
    widths = ["250px" if column.name == "Model" else "150px" for column in selected_columns]

    return gr.components.Dataframe(
        value=dataframe,
        datatype=datatypes,
        column_widths=widths,
        wrap=False,
    )
|
|
|
# Logo files referenced by the page header and passed to launch() as allowed paths.
black_logo_path = "src/assets/logo-icon-black.png"
white_logo_path = "src/assets/logo-icon-white.png"

# Root Blocks container: custom CSS/JS, pink primary hue, and a layout that is
# asked to fill both the height and the width available to it.
demo = gr.Blocks(
    theme=gr.themes.Default(primary_hue=gr.themes.colors.pink),
    css=custom_css,
    js=custom_js,
    fill_height=True,
    fill_width=True,
)
|
with demo:
    # Page header: both logo variants are emitted; which one is visible is
    # presumably decided by custom_css / custom_js — confirm there.
    gr.HTML(f"""
    <div id="page-header">
        <div id="header-container">
            <div id="left-container">
                <img id="black-logo" src="/gradio_api/file={black_logo_path}">
                <img id="white-logo" src="/gradio_api/file={white_logo_path}">
            </div>
            <div id="centre-container">
                <h1 style="margin-bottom: 0.25rem;">{TITLE}</h1>
                <p style="color:#eb088a; margin:0; font-size:1.2rem;">Explore Interactive Results & Traces</p>
            </div>
            <div id="right-container">
            </div>
        </div>
    </div>
    """)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="intro-text", sanitize_html=False)

    with gr.Tabs(elem_classes=["leaderboard-table", "tab-buttons"]) as tabs:
        # Tab 0: table built from the base-benchmark leaderboard frame.
        with gr.TabItem("Base Benchmarks", elem_classes="llm-benchmark-tab-table", id=0):
            leaderboard = init_leaderboard(ST_LEADERBOARD_DF, "base")

        # Tab 1: table built from the agentic-benchmark leaderboard frame.
        # NOTE: `leaderboard` is rebound here; neither component is referenced
        # again in the visible code.
        with gr.TabItem("Agentic Benchmarks", elem_classes="llm-benchmark-tab-table", id=1):
            leaderboard = init_leaderboard(AGENTIC_LEADERBOARD_DF, "agentic")

        # Tabs 2 and 3: static Markdown pages, passed sanitize_html=False.
        with gr.TabItem("About", elem_classes="llm-benchmark-tab-table", id=2):
            gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text", sanitize_html=False)

        with gr.TabItem("Reproducibility", elem_classes="llm-benchmark-tab-table", id=3):
            gr.Markdown(REPRODUCIBILITY_TEXT, elem_classes="markdown-text", sanitize_html=False)
|
|
|
# Files the server is allowed to serve: the two header logos.
assets = [black_logo_path, white_logo_path]

# Call restart_space every 30 minutes from a background scheduler.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=30 * 60)
scheduler.start()

# Enable the request queue, then start the app.
demo.queue(default_concurrency_limit=40)
demo.launch(allowed_paths=assets)