File size: 4,541 Bytes
4a78d34 f4ed2d4 b0b7fbb 4a78d34 c8da037 4a78d34 c8da037 4a78d34 8471f6d b0b7fbb eba4aa7 b0b7fbb 4410a31 b0b7fbb 11c3aa7 4a78d34 b0b7fbb 853a51e 4a78d34 d5a5b95 8471f6d d5a5b95 4a78d34 6fa5c81 d5a5b95 c8da037 d5a5b95 66f1c0c 6fa5c81 5cff478 4a78d34 66f1c0c c8da037 b0b7fbb 8ad1a09 c8da037 b0b7fbb 4a78d34 d5a5b95 a2bb1ef c8da037 50ce699 4a78d34 853a51e e1c3a09 4a78d34 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 |
import gradio as gr
import pandas as pd
import json
from src.about import (
REPRODUCIBILITY_TEXT,
INTRODUCTION_TEXT,
ABOUT_TEXT,
TITLE,
)
from src.display.css_html_js import custom_css, custom_js
from src.display.formatting import make_clickable_field
def _load_json(path):
    """Read and parse the UTF-8 encoded JSON file at ``path``."""
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


def _pick_metric_value(metrics, metric_key):
    """Return the score under ``metric_key``, else the first non-``log_url`` entry, else None."""
    if metric_key in metrics:
        return metrics[metric_key]
    # Fall back to positional lookup for entries that do not use the configured
    # metric name, but never mistake the log URL for a score.
    return next((v for k, v in metrics.items() if k != 'log_url'), None)


def build_leaderboard(type):
    """Build the leaderboard table for one category of benchmark tasks.

    Reads ``data/results.json`` and ``data/tasks.json``. From the way they are
    accessed here, each entry of the results file carries a ``"config"``
    mapping (``model_name``, ``model_sha``) and a ``"results"`` mapping of
    dataset name to a metrics dict (optionally holding ``log_url``); each
    entry of the tasks file carries ``type``, ``display_name`` and ``metric``.

    Args:
        type: Task category to show; compared against each task's ``type``
            field. The value ``"agentic"`` additionally drops models without
            any agentic score and adds an ``Agent`` column. (The name shadows
            the builtin but is kept because it is part of the signature.)

    Returns:
        A ``gr.components.Dataframe`` with one row per model and one column
        per task of the requested category, every column rendered as HTML.
    """
    results = _load_json('data/results.json')
    tasks = _load_json('data/tasks.json')

    # Keep only the tasks that belong to the requested category.
    filtered_tasks = {k: v for k, v in tasks.items() if v['type'] == type}

    data = []
    for model_data in results.values():
        model_results = model_data['results']

        # For the agentic view, skip models whose agentic scores are all null.
        if type == "agentic":
            has_agentic_results = any(
                model_results.get(task, {}).get(info['metric']) is not None
                for task, info in filtered_tasks.items()
            )
            if not has_agentic_results:
                continue

        config = model_data["config"]
        row = {
            'Model': make_clickable_field(config["model_name"], config["model_sha"])
        }

        for dataset, metrics in model_results.items():
            task_info = filtered_tasks.get(dataset)
            if task_info is None:
                # Dataset belongs to another category (or is unknown).
                continue

            # Read the same metric the agentic filter above checks, so the
            # displayed score and the filter can never disagree.
            value = _pick_metric_value(metrics, task_info.get('metric'))

            # Scale by 100, round to 2 decimals, and link to the log if present.
            if value is not None:
                value = round(value * 100, 2)
                log_url = metrics.get('log_url')
                if log_url:
                    value = make_clickable_field(value, log_url)

            # Columns are labelled with the display name from tasks.json.
            row[task_info['display_name']] = value

        data.append(row)

    # With no rows, still create the 'Model' column so the positional insert
    # of the 'Agent' column below has a valid location.
    results_df = pd.DataFrame(data) if data else pd.DataFrame(columns=['Model'])

    # Round any remaining plain float columns to 2 decimal places.
    numeric_cols = results_df.select_dtypes(include=['float64', 'float32']).columns
    if len(numeric_cols):
        results_df[numeric_cols] = results_df[numeric_cols].round(2)

    # Show missing scores as "--".
    results_df = results_df.fillna("--")

    if type == "agentic":
        # Agent column goes second, right after Model.
        results_df.insert(
            1,
            'Agent',
            make_clickable_field(
                'Basic Agent',
                'https://inspect.ai-safety-institute.org.uk/agents.html#sec-basic-agent',
            ),
        )

    return gr.components.Dataframe(
        value=results_df,
        datatype=["html" for _ in results_df.columns],
        column_widths=["250px" if c == "Model" else "150px" for c in results_df.columns],
        wrap=False,
    )
# Logo images referenced by the header markup through /gradio_api/file= URLs
# and passed to launch() as allowed paths.
black_logo_path = "src/assets/logo-icon-black.png"
white_logo_path = "src/assets/logo-icon-white.png"

# Page header: both logo variants on the left, title and tagline in the centre.
header_html = f"""
<div id="page-header">
<div id="header-container">
<div id="left-container">
<img id="black-logo" src="/gradio_api/file={black_logo_path}">
<img id="white-logo" src="/gradio_api/file={white_logo_path}">
</div>
<div id="centre-container">
<h1 style="margin-bottom: 0.25rem;">{TITLE}</h1>
<p style="color:#eb088a; margin:0; font-size:1.2rem;">Explore Interactive Results & Traces</p>
</div>
<div id="right-container">
</div>
</div>
</div>
"""

pink_theme = gr.themes.Default(primary_hue=gr.themes.colors.pink)
demo = gr.Blocks(
    css=custom_css,
    js=custom_js,
    theme=pink_theme,
    fill_height=True,
    fill_width=True,
)

# CSS class shared by every tab panel.
tab_class = "llm-benchmark-tab-table"

with demo:
    gr.HTML(header_html)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="intro-text", sanitize_html=False)

    with gr.Tabs(elem_classes=["leaderboard-table", "tab-buttons"]) as tabs:
        # One leaderboard tab per task category; ids follow the listing order.
        leaderboard_tabs = (
            ("Base Benchmarks", "base"),
            ("Agentic Benchmarks", "agentic"),
        )
        for tab_id, (tab_label, task_type) in enumerate(leaderboard_tabs):
            with gr.TabItem(tab_label, elem_classes=tab_class, id=tab_id):
                build_leaderboard(task_type)

        with gr.TabItem("About", elem_classes=tab_class, id=2):
            gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text", sanitize_html=False)

        # Reproducibility tab is currently disabled:
        # with gr.TabItem("Reproducibility", elem_classes="llm-benchmark-tab-table", id=3):
        #     gr.Markdown(REPRODUCIBILITY_TEXT, elem_classes="markdown-text", sanitize_html=False)

# Only the two logo files are exposed through allowed_paths.
assets = [black_logo_path, white_logo_path]
demo.launch(allowed_paths=assets)
|