jwilles committed
Commit c8da037 · Parent(s): 0b8ba75

Update content and style

Files changed (3)
  1. app.py +14 -17
  2. src/about.py +54 -49
  3. src/display/css_html_js.py +33 -221
app.py CHANGED
@@ -1,15 +1,11 @@
 import gradio as gr
-from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
-import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import snapshot_download
 
 from src.about import (
-    CITATION_BUTTON_LABEL,
-    CITATION_BUTTON_TEXT,
-    EVALUATION_QUEUE_TEXT,
+    REPRODUCIBILITY_TEXT,
     INTRODUCTION_TEXT,
-    LLM_BENCHMARKS_TEXT,
+    ABOUT_TEXT,
     TITLE,
 )
 from src.display.css_html_js import custom_css, custom_js
@@ -18,12 +14,8 @@ from src.display.utils import (
     ST_BENCHMARK_COLS,
     AGENTIC_BENCHMARK_COLS,
     EVAL_COLS,
-    EVAL_TYPES,
     AutoEvalColumn,
-    ModelType,
     fields,
-    WeightType,
-    Precision
 )
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 from src.populate import get_evaluation_queue_df, get_leaderboard_df, TASK_NAME_INVERSE_MAP
@@ -92,26 +84,31 @@ with demo:
         </div>
         <div id="centre-container">
           <h1 style="margin-bottom: 0.25rem;">{TITLE}</h1>
-          <p style="color:#eb088a; margin:0; font-size:1.2rem;">Performance Insights &amp; Comparison</p>
+          <p style="color:#eb088a; margin:0; font-size:1.2rem;">Explore Interactive Results &amp; Traces</p>
+        </div>
+        <div id="right-container">
         </div>
-        <div id="right-container"></div>
       </div>
     </div>
     """)
     # gr.HTML(TITLE)
-    with gr.Group(elem_classes="intro-block"):
-        gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+    # with gr.Group(elem_classes="intro-block"):
+    #     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
     # gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+    gr.Markdown(INTRODUCTION_TEXT, elem_classes="intro-text")
 
     with gr.Tabs(elem_classes=["leaderboard-table", "tab-buttons"]) as tabs:
-        with gr.TabItem("Base Benchmark", elem_classes="llm-benchmark-tab-table", id=0):
+        with gr.TabItem("Base Benchmarks", elem_classes="llm-benchmark-tab-table", id=0):
            leaderboard = init_leaderboard(ST_LEADERBOARD_DF, "base")
 
-        with gr.TabItem("Agentic Benchmark", elem_classes="llm-benchmark-tab-table", id=1):
+        with gr.TabItem("Agentic Benchmarks", elem_classes="llm-benchmark-tab-table", id=1):
            leaderboard = init_leaderboard(AGENTIC_LEADERBOARD_DF, "agentic")
 
         with gr.TabItem("About", elem_classes="llm-benchmark-tab-table", id=2):
-            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+            gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")
+
+        with gr.TabItem("Reproducibility", elem_classes="llm-benchmark-tab-table", id=3):
+            gr.Markdown(REPRODUCIBILITY_TEXT, elem_classes="markdown-text")
 
     assets = [black_logo_path, white_logo_path]
 
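The reworked layout above is standard Gradio Blocks/Tabs composition. As a minimal, self-contained sketch of the same four-tab structure (the placeholder strings and the stubbed `init_leaderboard` below are illustrative stand-ins, not the Space's real implementations):

```python
import gradio as gr

# Illustrative stand-ins for the strings imported from src.about.
INTRODUCTION_TEXT = "Intro text"
ABOUT_TEXT = "About text"
REPRODUCIBILITY_TEXT = "Reproducibility text"

def init_leaderboard(df, kind: str):
    # Stub: the Space builds a richer, sortable leaderboard component here.
    return gr.Dataframe(value=df, label=f"{kind} leaderboard")

with gr.Blocks() as demo:
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="intro-text")
    with gr.Tabs(elem_classes=["leaderboard-table", "tab-buttons"]):
        with gr.TabItem("Base Benchmarks", id=0):
            init_leaderboard(None, "base")
        with gr.TabItem("Agentic Benchmarks", id=1):
            init_leaderboard(None, "agentic")
        with gr.TabItem("About", id=2):
            gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")
        with gr.TabItem("Reproducibility", id=3):
            gr.Markdown(REPRODUCIBILITY_TEXT, elem_classes="markdown-text")

if __name__ == "__main__":
    demo.launch()
```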
src/about.py CHANGED
@@ -46,66 +46,74 @@ NUM_FEWSHOT = 0 # Change with your few shot
 
 
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">Vector State of Evaluation Leaderboard</h1>"""
+TITLE = """<h1 align="center" id="space-title">State of Evaluation Leaderboard</h1>"""
 
 SINGLE_TURN_TASK_NAMES = ", ".join([f"[{task.value.col_name}]({task.value.source})" for task in Tasks if task.value.type == "base"])
 AGENTIC_TASK_NAMES = ", ".join([f"[{task.value.col_name}]({task.value.source})" for task in Tasks if task.value.type == "agentic"])
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = f"""
-This leaderboard presents the performance of selected LLM models on a set of tasks. The tasks are divided into two categories: base and agentic. The base tasks are: {SINGLE_TURN_TASK_NAMES}. The agentic tasks are: {AGENTIC_TASK_NAMES}."""
+Powered by **Inspect** and **Inspect Evals**, the **Vector State of Evaluation Leaderboard** presents an objective evaluation of leading frontier models across a comprehensive suite of benchmarks. Go beyond the summary metrics: click through to interactive reporting for each model and benchmark to explore sample-level performance and detailed traces."""
 
 # Which evaluations are you running? how can people reproduce what you have?
-LLM_BENCHMARKS_TEXT = f"""
-# Vector State of Evaluation Leaderboard
+ABOUT_TEXT = f"""
 
-## Overview
-The **Vector State of Evaluation Leaderboard** presents the performance of selected LLM models on a variety of tasks. These tasks are divided into two categories:
+## Vector Institute
+The **Vector Institute** is dedicated to advancing the fields of artificial intelligence and machine learning through cutting-edge research, collaborative projects, and open-source contributions. Our mission is to drive excellence and innovation in AI, fostering a vibrant community of researchers, developers, and industry partners.
 
-- **Base Tasks**: ARC-Easy, ARC-Challenge, DROP, WinoGrande, GSM8K, HellaSwag, HumanEval, IFEval, MATH, MMLU, MMLU-Pro, GPQA-Diamond, MMMU-Multiple-Choice, MMMU-Open-Ended
-- **Agentic Tasks**: GAIA, GDM-InterCode-CTF, GDM-In-House-CTF, AgentHarm, AgentHarm-Benign, SWE-Bench
+## 🎯 Benchmarks
 
-Users can compare models side by side to see how they perform on both base-level understanding tasks and more advanced, “agentic” tasks.
+This leaderboard showcases performance across a comprehensive suite of benchmarks, designed to rigorously evaluate different aspects of AI model capabilities. Let's explore the benchmarks we use:
 
-## Vector Institute
-The **Vector Institute** is dedicated to advancing the fields of artificial intelligence and machine learning through cutting-edge research, collaborative projects, and open-source contributions. This leaderboard is part of Vector’s broader effort to promote transparency and progress in AI research.
-
-## Model
-We evaluate a variety of **Large Language Models (LLMs)** across the included benchmarks. Each model:
-- Is tested on the same set of tasks.
-- Has standardized prompts or evaluation methodologies.
-- Generates performance metrics (accuracy, F1, etc.) for comparison.
-
-Our goal is to provide clear, reproducible metrics that shed light on how each model handles different task complexities and reasoning requirements.
-
-## Benchmarks
-Here is a closer look at each benchmark included in the leaderboard:
-
-### Base Benchmarks
-- **ARC-Easy / ARC-Challenge**: A set of multiple-choice science questions designed to measure a model’s scientific and commonsense reasoning.
-- **DROP**: A reading comprehension benchmark emphasizing discrete reasoning steps.
-- **WinoGrande**: A commonsense reasoning challenge focused on co-reference resolution.
-- **GSM8K**: Grade-school math word problems testing arithmetic and multi-step reasoning.
-- **HellaSwag**: A commonsense inference task centered on action completion.
-- **HumanEval**: Evaluates code generation and reasoning in a programming context.
-- **IFEval**: A specialized benchmark for incremental formal reasoning.
-- **MATH**: High school-level math questions requiring detailed solutions.
-- **MMLU / MMLU-Pro**: Multi-subject multiple-choice tests covering advanced high school and collegiate-level knowledge.
-- **GPQA-Diamond**: A question-answering benchmark that assesses deeper reasoning and knowledge linking.
-- **MMMU (Multiple-Choice / Open-Ended)**: A suite of multilingual and multi-domain tasks testing both structured and open-form responses.
-
-### Agentic Benchmarks
-- **GAIA**: Evaluates more autonomous or “agentic” reasoning, including planning and problem-solving.
-- **GDM-InterCode-CTF**: A capture-the-flag style challenge focusing on code interpretation and generative debugging strategies.
-- **GDM-In-House-CTF**: A capture-the-flag style challenge testing a variety of security skills pertaining to basic web application security.
-- **AgentHarm / AgentHarm-Benign**: A benchmark for measuring harmfulness of LLM agents.
-- **SWE-Bench**: A benchmark for testing the ability of AI agents to solve software engineering tasks.
-
----
+### Inspect Evals
+
+This leaderboard leverages [Inspect Evals](https://ukgovernmentbeis.github.io/inspect_evals/) to power evaluation. Inspect Evals is an open-source repository built upon the Inspect AI framework. Developed in collaboration between the Vector Institute, Arcadia Impact and the UK AI Safety Institute, Inspect Evals provides a comprehensive suite of high-quality benchmarks spanning diverse domains like coding, mathematics, cybersecurity, reasoning, and general knowledge.
+
+#### Transparent and Detailed Insights
+
+All evaluations presented on this leaderboard are run using Inspect Evals. To facilitate in-depth analysis and promote transparency, we provide [Inspect Logs](https://inspect.ai-safety-institute.org.uk/log-viewer.html) for every benchmark run. These logs offer sample and trace level reporting, allowing the community to explore the granular details of model performance.
+
+### ⚙️ Base Benchmarks
+
+These benchmarks assess fundamental reasoning and knowledge capabilities of models.
+
+<div class="benchmark-table-container">
+
+| Benchmark | Description | Key Skills |
+|-----------|-------------|------------|
+| **ARC-Easy** / **ARC-Challenge** | Multiple-choice science questions measuring scientific & commonsense reasoning. | Scientific Reasoning, Commonsense Reasoning |
+| **DROP** | Reading comprehension benchmark emphasizing discrete reasoning steps. | Reading Comprehension, Discrete Reasoning |
+| **WinoGrande** | Commonsense reasoning challenge focused on co-reference resolution. | Commonsense Reasoning, Co-reference Resolution |
+| **GSM8K** | Grade-school math word problems testing arithmetic & multi-step reasoning. | Arithmetic Reasoning, Multi-step Reasoning |
+| **HellaSwag** | Commonsense inference task centered on action completion. | Commonsense Inference, Action Completion |
+| **HumanEval** | Evaluates code generation and reasoning in a programming context. | Code Generation, Code Reasoning |
+| **IFEval** | Specialized benchmark for incremental formal reasoning. | Incremental Formal Reasoning |
+| **MATH** | High school-level math questions requiring detailed solutions. | Mathematical Reasoning, Problem Solving |
+| **MMLU** / **MMLU-Pro** | Multi-subject multiple-choice tests of advanced knowledge. | Broad Knowledge, Multi-domain Understanding |
+| **GPQA-Diamond** | Question-answering benchmark assessing deeper reasoning & knowledge linking. | Deep Reasoning, Knowledge Linking |
+| **MMMU** (Multi-Choice / Open-Ended) | Multilingual & multi-domain tasks testing structured & open responses. | Multilingual Understanding, Open-form Response |
+
+</div>
+
+### 🚀 Agentic Benchmarks
+
+These benchmarks go beyond basic reasoning and evaluate more advanced, autonomous, or "agentic" capabilities of models, such as planning and interaction.
+
+<div class="benchmark-table-container">
+
+| Benchmark | Description | Key Skills |
+|-----------|-------------|------------|
+| **GAIA** | Evaluates autonomous reasoning, planning, problem-solving, & multi-turn interactions. | Autonomous Reasoning, Planning, Problem Solving |
+| [**InterCode-CTF**](https://ukgovernmentbeis.github.io/inspect_evals/evals/cybersecurity/in_house_ctf/) | Capture-the-flag challenge focused on code interpretation & debugging. | Code Interpretation, Generative Debugging |
+| **GDM-In-House-CTF** | Capture-the-flag challenge testing web application security skills. | Web Security Skills, Security Awareness |
+| **AgentHarm** / **AgentHarm-Benign** | Measures harmfulness of LLM agents (and benign behavior baseline). | Harmfulness Detection, Agent Safety |
+| **SWE-Bench** | Tests AI agent ability to solve software engineering tasks. | Software Engineering Tasks, Bug Fixing |
+
+</div>
 """
 
-EVALUATION_QUEUE_TEXT = """
-## Some good practices before submitting a model
+REPRODUCIBILITY_TEXT = """
+## Reproduce and Extend the Leaderboard
 
 ### 1) Make sure you can load your model and tokenizer using AutoClasses:
 ```python
@@ -134,6 +142,3 @@ Make sure you have followed the above steps first.
 If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
 """
 
-CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
-CITATION_BUTTON_TEXT = r"""
-"""
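The new About and Reproducibility copy states that every score comes from an Inspect Evals run with published logs. A hedged sketch of reproducing a single benchmark locally, assuming the `inspect-ai` and `inspect_evals` packages are installed and a provider API key is in the environment (task, model, and `limit` are illustrative choices, not the leaderboard's exact configuration):

```python
# Sketch only: reproduce one leaderboard benchmark with Inspect.
# Assumes `pip install inspect-ai` plus the inspect_evals package,
# and an OPENAI_API_KEY in the environment.
from inspect_ai import eval

# Runs GSM8K from the Inspect Evals registry and writes an Inspect log,
# which can then be browsed with the `inspect view` log viewer.
eval("inspect_evals/gsm8k", model="openai/gpt-4o", limit=10)
```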
src/display/css_html_js.py CHANGED
@@ -1,219 +1,4 @@
-# custom_css = """
 
-# .markdown-text {
-#     font-size: 16px !important;
-# }
-
-# #models-to-add-text {
-#     font-size: 18px !important;
-# }
-
-# #citation-button span {
-#     font-size: 16px !important;
-# }
-
-# #citation-button textarea {
-#     font-size: 16px !important;
-# }
-
-# #citation-button > label > button {
-#     margin: 6px;
-#     transform: scale(1.3);
-# }
-
-# #leaderboard-table {
-#     margin-top: 15px
-# }
-
-# #leaderboard-table-lite {
-#     margin-top: 15px
-# }
-
-# #search-bar-table-box > div:first-child {
-#     background: none;
-#     border: none;
-# }
-
-# #search-bar {
-#     padding: 0px;
-# }
-
-# /* Limit the width of the first AutoEvalColumn so that names don't expand too much */
-# #leaderboard-table td:nth-child(2),
-# #leaderboard-table th:nth-child(2) {
-#     max-width: 400px;
-#     overflow: auto;
-#     white-space: nowrap;
-# }
-
-# .tab-buttons button {
-#     font-size: 20px;
-# }
-
-# #scale-logo {
-#     border-style: none !important;
-#     box-shadow: none;
-#     display: block;
-#     margin-left: auto;
-#     margin-right: auto;
-#     max-width: 600px;
-# }
-
-# #scale-logo .download {
-#     display: none;
-# }
-# #filter_type{
-#     border: 0;
-#     padding-left: 0;
-#     padding-top: 0;
-# }
-# #filter_type label {
-#     display: flex;
-# }
-# #filter_type label > span{
-#     margin-top: var(--spacing-lg);
-#     margin-right: 0.5em;
-# }
-# #filter_type label > .wrap{
-#     width: 103px;
-# }
-# #filter_type label > .wrap .wrap-inner{
-#     padding: 2px;
-# }
-# #filter_type label > .wrap .wrap-inner input{
-#     width: 1px
-# }
-# #filter-columns-type{
-#     border:0;
-#     padding:0.5;
-# }
-# #filter-columns-size{
-#     border:0;
-#     padding:0.5;
-# }
-# #box-filter > .form{
-#     border: 0
-# }
-
-# body, .gradio-container {
-#     font-family: Roboto, sans-serif;
-#     background-color: #ffffff;
-#     color: #000000; /* main text color */
-#     margin: 0;
-#     padding: 0;
-# }
-
-# h1, h2, h3, h4, h5, h6 {
-#     color: #eb088a; /* your brand color for headings */
-#     font-weight: 600;
-#     margin-bottom: 1rem;
-# }
-
-# /* Example ‘intro-block’ styling if you want extra flair */
-# .intro-block {
-#     background-color: #eb088a10; /* light tinted background */
-#     padding: 1.5rem;
-#     border-radius: 10px;
-#     margin-bottom: 2rem;
-# }
-
-# """
-
-# custom_css = """
-# /* 1) Load Karbon Font: Make sure this points to your actual font files */
-# @font-face {
-#     font-family: 'Karbon';
-#     src: url('path/to/Karbon.woff2') format('woff2'),
-#          url('path/to/Karbon.woff') format('woff');
-#     font-weight: normal;
-#     font-style: normal;
-# }
-
-# /* 2) Global Container */
-# body, .gradio-container {
-#     font-family: 'Karbon', sans-serif;
-#     margin: 0;
-#     padding: 0;
-#     background-color: #fafafa; /* Light background */
-#     color: #000000;
-# }
-
-# .gradio-container {
-#     max-width: 1200px;
-#     margin: 0 auto;
-#     padding: 2rem 1rem;
-# }
-
-# /* 3) Headings, with brand color #eb088a */
-# h1, h2, h3, h4, h5, h6 {
-#     color: #000000;
-#     margin-bottom: 1rem;
-#     font-weight: 600;
-# }
-
-# /* 4) Intro Block for a slight highlight */
-# .intro-block {
-#     background-color: #ffe2f1; /* lighter tint of #eb088a */
-#     padding: 1.5rem;
-#     border-radius: 8px;
-#     border: 1px solid #f8badb;
-#     margin-bottom: 2rem;
-# }
-
-# /* 5) Tab styling - remove default orange styling */
-# .tab-buttons {
-#     margin-top: 1rem;
-#     margin-bottom: 1rem;
-#     display: flex;
-# }
-# .tab-buttons > .tabitem {
-#     padding: 0.6rem 1.2rem;
-#     background-color: #ffffff;
-#     border: 1px solid #eb088a;
-#     border-radius: 6px;
-#     color: #eb088a;
-#     margin-right: 5px;
-#     cursor: pointer;
-#     transition: background-color 0.2s ease, color 0.2s ease;
-#     font-weight: 500;
-# }
-# .tab-buttons > .tabitem.selected {
-#     background-color: #eb088a;
-#     color: #ffffff;
-# }
-# .tab-buttons > .tabitem:hover {
-#     background-color: #eb088a;
-#     color: #ffffff;
-# }
-
-# /* 6) Dataframe Styling */
-# .gr-dataframe table {
-#     width: 100%;
-#     border-collapse: collapse;
-#     border: 1px solid #cccccc;
-#     margin-bottom: 2rem;
-# }
-# .gr-dataframe th {
-#     background-color: #eb088a;
-#     color: #ffffff;
-#     padding: 0.6rem;
-#     text-align: left;
-#     font-weight: 600;
-# }
-# .gr-dataframe td {
-#     padding: 0.6rem;
-#     border-bottom: 1px solid #e0e0e0;
-# }
-# .gr-dataframe tr:nth-child(even) {
-#     background-color: #fdfdfd;
-# }
-
-# /* 7) Make default markdown text nice */
-# .markdown-text p {
-#     margin-bottom: 1rem;
-#     line-height: 1.6;
-# }
-# """
 custom_js = """
 function tableLinkHack() {
     // This is a hack to make the table links work
@@ -228,6 +13,18 @@ function tableLinkHack() {
 
 custom_css = """
 
+.intro-text {
+    text-align: center; /* Center the text */
+    font-size: 3rem; /* Slightly larger font size if desired */
+    color: #555; /* A slightly softer color than black */
+    margin-bottom: 5px; /* Add some space below the text before the tabs */
+    padding: 0 10px; /* Add some horizontal padding for wider screens */
+    line-height: 1.2; /* Improve readability with line height */
+    max-width: 1200px; /* Limit width for better readability on large screens */
+    margin-left: auto; /* Center the block horizontally */
+    margin-right: auto; /* Center the block horizontally */
+}
+
 .tab-buttons button {
     font-size: 20px;
 }
@@ -311,15 +108,30 @@ custom_css = """
     }
 }
 
+.benchmark-table-container table {
+    width: 100%; /* Make table take full width of its container */
+    border-collapse: collapse; /* Remove spacing between table cells */
+    margin-bottom: 20px; /* Add some space below the table */
+}
 
-"""
+.benchmark-table-container th, .benchmark-table-container td {
+    border: 1px solid #ddd; /* Light gray border for cells */
+    padding: 8px; /* Padding within cells for better spacing */
+    text-align: left; /* Align text to the left within cells */
+    vertical-align: top; /* Align content to the top of cells */
+}
+
+.benchmark-table-container th {
+    background-color: #f2f2f2; /* Light gray background for header row */
+    font-weight: bold; /* Make header text bold */
+}
+
+.benchmark-table-container tbody tr:nth-child(even) {
+    background-color: #f9f9f9; /* Very light gray background for even rows for zebra striping */
+}
 
-# .selected.svelte-1tcem6n.svelte-1tcem6n {
-#     background-color: #000000 !important; /* Desired background color */
-#     color: #eb088a !important; /* Desired text color */
-#     border-color: #eb088a !important; /* Desired border color */
-# }
 
+"""
 
 get_window_url_params = """
 function(url_params) {
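For context on how the `custom_css` and `custom_js` strings above are consumed: Gradio's `Blocks` constructor accepts raw CSS and an on-load JS function. The wiring itself is outside this diff, so treat the snippet as an assumed sketch rather than the Space's exact code:

```python
import gradio as gr

# Illustrative stand-ins for the strings defined in src/display/css_html_js.py.
custom_css = ".intro-text { text-align: center; }"
custom_js = "function tableLinkHack() { console.log('table link hack'); }"

# css= injects the stylesheet; js= runs the given function once on page load.
with gr.Blocks(css=custom_css, js=custom_js) as demo:
    gr.Markdown("Hello, leaderboard", elem_classes="intro-text")

if __name__ == "__main__":
    demo.launch()
```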