|
import gradio as gr |
|
import pandas as pd |
|
import json |
|
|
|
from src.about import ( |
|
REPRODUCIBILITY_TEXT, |
|
INTRODUCTION_TEXT, |
|
ABOUT_TEXT, |
|
TITLE, |
|
) |
|
from src.display.css_html_js import custom_css, custom_js |
|
from src.display.formatting import make_clickable_field |
|
|
|
def build_leaderboard(type):
    """Build the leaderboard table for one benchmark category.

    Reads ``data/results.json`` (per-model results) and ``data/tasks.json``
    (task metadata), keeps the tasks whose ``type`` field equals *type*, and
    produces one row per model with one column per task ``display_name``.
    Scores are multiplied by 100 and rounded to two decimals; when the task
    entry carries a ``log_url`` the score is wrapped with
    ``make_clickable_field``.  Cells without a score are shown as ``--``.

    Args:
        type: Task category to show, compared against each task's ``type``
            field.  For ``"agentic"`` the models without any agentic score
            are skipped and a constant ``Agent`` column is inserted after
            ``Model``.  (The name shadows the builtin; it is kept so that
            existing callers keep working.)

    Returns:
        A ``gr.components.Dataframe`` whose columns are all rendered as HTML.
    """
    # JSON is UTF-8 by specification; do not depend on the platform locale.
    with open('data/results.json', 'r', encoding='utf-8') as f:
        results = json.load(f)

    with open('data/tasks.json', 'r', encoding='utf-8') as f:
        tasks = json.load(f)

    filtered_tasks = {k: v for k, v in tasks.items() if v['type'] == type}

    data = []
    for model_data in results.values():
        model_results = model_data['results']

        if type == "agentic":
            # Skip models that have no score for any agentic task.
            has_agentic_results = any(
                _pick_score(model_results.get(task), info.get('metric')) is not None
                for task, info in filtered_tasks.items()
            )
            if not has_agentic_results:
                continue

        config = model_data["config"]
        row = {
            'Model': make_clickable_field(config["model_name"], config["model_sha"])
        }

        # Iterate in the order stored for the model so that the column order
        # of the resulting table is unchanged.
        for dataset, metrics in model_results.items():
            if dataset not in filtered_tasks:
                continue

            task_info = filtered_tasks[dataset]
            score = _pick_score(metrics, task_info.get('metric'))
            log_url = metrics.get('log_url') if metrics else None

            if isinstance(score, (int, float)):
                cell = round(score * 100, 2)
                if log_url:
                    cell = make_clickable_field(cell, log_url)
            else:
                # Missing or non-numeric score: leave the cell empty so it is
                # rendered as "--" below instead of raising.
                cell = None

            row[task_info['display_name']] = cell

        data.append(row)

    # With no rows pandas would create a frame without columns and the
    # positional insert of the 'Agent' column below would fail.
    results_df = pd.DataFrame(data) if data else pd.DataFrame(columns=['Model'])

    numeric_cols = results_df.select_dtypes(include=['float64', 'float32']).columns
    if len(numeric_cols):
        results_df[numeric_cols] = results_df[numeric_cols].round(2)

    results_df = results_df.fillna("--")

    if type == "agentic":
        results_df.insert(1, 'Agent', make_clickable_field('Basic Agent', 'https://inspect.ai-safety-institute.org.uk/agents.html#sec-basic-agent'))

    return gr.components.Dataframe(
        value=results_df,
        datatype=["html" for _ in results_df.columns],
        column_widths=["250px" if c == "Model" else "150px" for c in results_df.columns],
        wrap=False,
    )


def _pick_score(metrics, metric_key):
    """Return the raw score stored in one task's *metrics* mapping, or None.

    The entry named by *metric_key* (the task's configured metric) is
    preferred.  When that key is absent the first entry other than
    ``log_url`` is used, which matches the previous "first value" behaviour
    without ever mistaking the log link for a score.
    """
    if not metrics:
        return None
    if metric_key in metrics:
        return metrics[metric_key]
    return next((v for k, v in metrics.items() if k != 'log_url'), None)
|
|
|
|
|
# Logo images referenced by the page header below; both paths are passed to
# ``demo.launch(allowed_paths=...)`` at the end of the script.
black_logo_path = "src/assets/logo-icon-black.png"
white_logo_path = "src/assets/logo-icon-white.png"
assets = [black_logo_path, white_logo_path]


def _leaderboard_tab(label, tab_id, benchmark_type):
    """Add a tab that contains the leaderboard table for *benchmark_type*."""
    with gr.TabItem(label, elem_classes="llm-benchmark-tab-table", id=tab_id):
        build_leaderboard(benchmark_type)


def _markdown_tab(label, tab_id, text):
    """Add a tab that renders *text* as unsanitized Markdown."""
    with gr.TabItem(label, elem_classes="llm-benchmark-tab-table", id=tab_id):
        gr.Markdown(text, elem_classes="markdown-text", sanitize_html=False)


# Top-level application container with the project's CSS/JS and a pink theme.
demo = gr.Blocks(
    theme=gr.themes.Default(primary_hue=gr.themes.colors.pink),
    css=custom_css,
    js=custom_js,
    fill_width=True,
    fill_height=True,
)

with demo:
    # Page header: both logo variants on the left, title and tagline centred.
    gr.HTML(f"""
    <div id="page-header">
        <div id="header-container">
            <div id="left-container">
                <img id="black-logo" src="/gradio_api/file={black_logo_path}">
                <img id="white-logo" src="/gradio_api/file={white_logo_path}">
            </div>
            <div id="centre-container">
                <h1 style="margin-bottom: 0.25rem;">{TITLE}</h1>
                <p style="color:#eb088a; margin:0; font-size:1.2rem;">Explore Interactive Results & Traces</p>
            </div>
            <div id="right-container">
            </div>
        </div>
    </div>
    """)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="intro-text", sanitize_html=False)

    # Tabs are created in id order: two leaderboards, then two text pages.
    with gr.Tabs(elem_classes=["leaderboard-table", "tab-buttons"]) as tabs:
        _leaderboard_tab("Base Benchmarks", 0, "base")
        _leaderboard_tab("Agentic Benchmarks", 1, "agentic")
        _markdown_tab("About", 2, ABOUT_TEXT)
        _markdown_tab("Reproducibility", 3, REPRODUCIBILITY_TEXT)

demo.launch(allowed_paths=assets)
|
|
|
|