Update about page
- app.py +1 -1
- src/about.py +39 -7
- src/display/css_html_js.py +1 -0
app.py
CHANGED
@@ -85,7 +85,7 @@ with demo:
         with gr.TabItem("Agentic Benchmark", elem_id="llm-benchmark-tab-table", id=1):
             leaderboard = init_leaderboard(AGENTIC_LEADERBOARD_DF, "agentic")
 
-        with gr.TabItem("
+        with gr.TabItem("About", elem_id="llm-benchmark-tab-table", id=2):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
 
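
For orientation, the app.py change above slots an About tab next to the existing benchmark tab. A minimal, self-contained Gradio sketch of that layout, using placeholder values for AGENTIC_LEADERBOARD_DF, LLM_BENCHMARKS_TEXT, and the init_leaderboard helper defined elsewhere in the repo:

    import gradio as gr
    import pandas as pd

    # Placeholders: the real DataFrame and about text are built elsewhere in the repo.
    AGENTIC_LEADERBOARD_DF = pd.DataFrame({"Model": ["model-a", "model-b"], "GAIA": [0.41, 0.37]})
    LLM_BENCHMARKS_TEXT = "# Vector State of Evaluation Leaderboard\n..."

    with gr.Blocks() as demo:
        with gr.Tabs():
            with gr.TabItem("Agentic Benchmark", elem_id="llm-benchmark-tab-table", id=1):
                # app.py builds this via init_leaderboard(); a plain Dataframe shows where it sits.
                gr.Dataframe(value=AGENTIC_LEADERBOARD_DF)
            with gr.TabItem("About", elem_id="llm-benchmark-tab-table", id=2):
                gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

    if __name__ == "__main__":
        demo.launch()
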
src/about.py
CHANGED
@@ -53,16 +53,48 @@ This leaderboard presents the performance of selected LLM models on a set of tasks
 
 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
-
-The following benchmarks are included:
+# Vector State of Evaluation Leaderboard
 
-
+## Overview
+The **Vector State of Evaluation Leaderboard** presents the performance of selected LLM models on a variety of tasks. These tasks are divided into two categories:
 
-
+- **Base Tasks**: ARC-Easy, ARC-Challenge, DROP, WinoGrande, GSM8K, HellaSwag, HumanEval, IFEval, MATH, MMLU, MMLU-Pro, GPQA-Diamond, MMMU-Multiple-Choice, MMMU-Open-Ended
+- **Agentic Tasks**: GAIA, GDM-InterCode-CTF
 
-
-
-
+Users can compare models side by side to see how they perform on both base-level understanding tasks and more advanced, “agentic” tasks.
+
+## Vector Institute
+The **Vector Institute** is dedicated to advancing the fields of artificial intelligence and machine learning through cutting-edge research, collaborative projects, and open-source contributions. This leaderboard is part of Vector’s broader effort to promote transparency and progress in AI research.
+
+## Model
+We evaluate a variety of **Large Language Models (LLMs)** across the included benchmarks. Each model:
+- Is tested on the same set of tasks.
+- Has standardized prompts or evaluation methodologies.
+- Generates performance metrics (accuracy, F1, etc.) for comparison.
+
+Our goal is to provide clear, reproducible metrics that shed light on how each model handles different task complexities and reasoning requirements.
+
+## Benchmarks
+Here is a closer look at each benchmark included in the leaderboard:
+
+### Base Benchmarks
+- **ARC-Easy / ARC-Challenge**: A set of multiple-choice science questions designed to measure a model’s scientific and commonsense reasoning.
+- **DROP**: A reading comprehension benchmark emphasizing discrete reasoning steps.
+- **WinoGrande**: A commonsense reasoning challenge focused on co-reference resolution.
+- **GSM8K**: Grade-school math word problems testing arithmetic and multi-step reasoning.
+- **HellaSwag**: A commonsense inference task centered on action completion.
+- **HumanEval**: Evaluates code generation and reasoning in a programming context.
+- **IFEval**: Tests how well a model follows explicit, verifiable natural-language instructions (formatting, length, and content constraints).
+- **MATH**: High school-level math questions requiring detailed solutions.
+- **MMLU / MMLU-Pro**: Multi-subject multiple-choice tests covering advanced high school and collegiate-level knowledge.
+- **GPQA-Diamond**: Graduate-level, “Google-proof” science questions that assess deep domain knowledge and reasoning.
+- **MMMU (Multiple-Choice / Open-Ended)**: A multimodal, multi-discipline suite testing both structured and open-form responses.
+
+### Agentic Benchmarks
+- **GAIA**: Evaluates more autonomous or “agentic” reasoning, including planning and problem-solving.
+- **GDM-InterCode-CTF**: A capture-the-flag style challenge focused on interpreting, generating, and debugging code to solve security tasks.
+
+---
 """
 
 EVALUATION_QUEUE_TEXT = """
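
The about text above notes that each model "generates performance metrics (accuracy, F1, etc.) for comparison." Purely as an illustration of that kind of per-task scoring (the leaderboard's actual numbers come from its own evaluation pipeline, not from this commit):

    # Illustrative only: per-task accuracy and macro F1 over hypothetical labels.
    from sklearn.metrics import accuracy_score, f1_score

    gold = ["A", "B", "C", "A", "D"]          # hypothetical reference answers
    predictions = ["A", "B", "A", "A", "D"]   # hypothetical model outputs

    print("accuracy:", accuracy_score(gold, predictions))
    print("macro F1:", f1_score(gold, predictions, average="macro"))
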
src/display/css_html_js.py
CHANGED
@@ -94,6 +94,7 @@ custom_css = """
 #box-filter > .form{
     border: 0
 }
+
 """
 
 get_window_url_params = """