# Hugging Face file-viewer residue (not program code), kept for reference:
#   jwilles's picture
#   Add right container
#   f4ed2d4
#   raw
#   history blame
#   4.41 kB
import gradio as gr
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download
import pandas as pd
from src.about import (
REPRODUCIBILITY_TEXT,
INTRODUCTION_TEXT,
ABOUT_TEXT,
TITLE,
)
from src.display.css_html_js import custom_css, custom_js
from src.display.utils import (
COLS,
ST_BENCHMARK_COLS,
AGENTIC_BENCHMARK_COLS,
EVAL_COLS,
AutoEvalColumn,
fields,
)
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
from src.populate import get_evaluation_queue_df, get_leaderboard_df, TASK_NAME_INVERSE_MAP
from src.submission.submit import add_new_eval
def restart_space():
    """Ask the Hub API client to restart the Space identified by ``REPO_ID``.

    Takes no arguments and returns ``None``; any exception raised by
    ``API.restart_space`` propagates to the caller.
    """
    space_id = REPO_ID
    API.restart_space(repo_id=space_id)
### Space initialisation
def _download_dataset_snapshot(repo_id, local_dir):
    """Download a Hub dataset repo into ``local_dir``, restarting the Space on failure.

    Args:
        repo_id: Identifier of the dataset repository to fetch.
        local_dir: Local directory the snapshot is written to.

    Any exception raised by the download is reported on stdout and followed by
    a call to ``restart_space()``; it is not re-raised, so execution continues
    once ``restart_space()`` returns.
    """
    print(local_dir)
    try:
        snapshot_download(
            repo_id=repo_id,
            local_dir=local_dir,
            repo_type="dataset",
            tqdm_class=None,
            etag_timeout=30,
            token=TOKEN,
        )
    except Exception as err:
        # Surface the cause before restarting; otherwise a restart loop
        # leaves no hint about what went wrong.
        print(f"Failed to download dataset {repo_id!r} into {local_dir!r}: {err!r}")
        restart_space()


# Fetch the evaluation-request queue first, then the evaluation results.
_download_dataset_snapshot(QUEUE_REPO, EVAL_REQUESTS_PATH)
_download_dataset_snapshot(RESULTS_REPO, EVAL_RESULTS_PATH)

# One leaderboard table per benchmark family; both are later handed to
# init_leaderboard() when the tabs are built.
ST_LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, ST_BENCHMARK_COLS)
AGENTIC_LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, AGENTIC_BENCHMARK_COLS)

# Evaluation-queue tables; loaded here but not referenced by the rest of the
# visible file.
(
    finished_eval_queue_df,
    running_eval_queue_df,
    pending_eval_queue_df,
) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
def bold_max(s):
    """Return one CSS declaration per element of ``s``, bolding the maximum.

    Args:
        s: A pandas Series (or similar) supporting ``.max()`` and element-wise
            ``==`` against a scalar.

    Returns:
        A list as long as ``s`` holding ``'font-weight: bold'`` where the
        element equals ``s.max()`` (ties included) and ``''`` elsewhere.
    """
    peak = s.max()
    matches_peak = s == peak
    return ['font-weight: bold' if hit else '' for hit in matches_peak]
def init_leaderboard(dataframe, benchmark_type):
    """Build the Gradio ``Dataframe`` component that displays one leaderboard.

    Args:
        dataframe: Table to display; must be non-``None`` and non-empty.
        benchmark_type: Value compared against the ``"type"`` entry that
            ``TASK_NAME_INVERSE_MAP`` holds for each column name
            (the visible callers pass ``"base"`` and ``"agentic"``).

    Returns:
        A ``gr.components.Dataframe`` whose ``datatype`` and ``column_widths``
        are derived from the selected ``AutoEvalColumn`` fields.

    Raises:
        ValueError: If ``dataframe`` is ``None`` or empty.
    """
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    def _is_selected(column):
        # "Model" is always kept; any other column must be mapped to the
        # requested benchmark type.
        if column.name == "Model":
            return True
        task_info = TASK_NAME_INVERSE_MAP.get(column.name, dict())
        return task_info.get("type", "") == benchmark_type

    selected_columns = [column for column in fields(AutoEvalColumn) if _is_selected(column)]
    column_types = [column.type for column in selected_columns]
    # The "Model" column gets extra room; every other column is 150px wide.
    widths = ["250px" if column.name == "Model" else "150px" for column in selected_columns]

    # NOTE: highlighting the per-column maximum through a pandas Styler
    # (bold_max) is not applied; the raw DataFrame is passed as the value.
    return gr.components.Dataframe(
        value=dataframe,
        datatype=column_types,
        column_widths=widths,
        wrap=False,
    )
# Logo image files; the page header references both through
# /gradio_api/file= URLs and they are passed to launch() as allowed paths.
black_logo_path = "src/assets/logo-icon-black.png"
white_logo_path = "src/assets/logo-icon-white.png"

# Root Blocks container for the whole app: project CSS/JS, pink primary hue,
# and a layout that fills the available width and height.
demo = gr.Blocks(
    theme=gr.themes.Default(primary_hue=gr.themes.colors.pink),
    css=custom_css,
    js=custom_js,
    fill_width=True,
    fill_height=True,
)
with demo:
    # Page header: both logo variants on the left, title and tagline in the
    # centre, and an empty right-hand container.
    _header_markup = f"""
<div id="page-header">
<div id="header-container">
<div id="left-container">
<img id="black-logo" src="/gradio_api/file={black_logo_path}">
<img id="white-logo" src="/gradio_api/file={white_logo_path}">
</div>
<div id="centre-container">
<h1 style="margin-bottom: 0.25rem;">{TITLE}</h1>
<p style="color:#eb088a; margin:0; font-size:1.2rem;">Explore Interactive Results &amp; Traces</p>
</div>
<div id="right-container">
</div>
</div>
</div>
"""
    gr.HTML(_header_markup)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="intro-text", sanitize_html=False)

    # (tab id, tab label, leaderboard table, benchmark type)
    _leaderboard_tabs = (
        (0, "Base Benchmarks", ST_LEADERBOARD_DF, "base"),
        (1, "Agentic Benchmarks", AGENTIC_LEADERBOARD_DF, "agentic"),
    )
    # (tab id, tab label, markdown text)
    _text_tabs = (
        (2, "About", ABOUT_TEXT),
        (3, "Reproducibility", REPRODUCIBILITY_TEXT),
    )

    with gr.Tabs(elem_classes=["leaderboard-table", "tab-buttons"]) as tabs:
        # Leaderboard tabs come first, followed by the text-only tabs.
        for _tab_id, _tab_label, _tab_df, _tab_kind in _leaderboard_tabs:
            with gr.TabItem(_tab_label, elem_classes="llm-benchmark-tab-table", id=_tab_id):
                leaderboard = init_leaderboard(_tab_df, _tab_kind)
        for _tab_id, _tab_label, _tab_text in _text_tabs:
            with gr.TabItem(_tab_label, elem_classes="llm-benchmark-tab-table", id=_tab_id):
                gr.Markdown(_tab_text, elem_classes="markdown-text", sanitize_html=False)
# Files handed to launch() as allowed paths: the two logos used in the header.
assets = [black_logo_path, white_logo_path]

# Restart the Space on a fixed 1800-second interval from a background thread.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, trigger="interval", seconds=1800)
scheduler.start()

# Enable the request queue (default concurrency limit of 40), then serve the app.
_queued_demo = demo.queue(default_concurrency_limit=40)
_queued_demo.launch(allowed_paths=assets)