Merge remote-tracking branch 'upstream/main'
- README.md +1 -1
- app.py +38 -48
- requirements.txt +3 -3
- scripts/create_request_file.py +5 -4
- src/display/about.py +78 -37
- src/display/formatting.py +4 -52
- src/display/utils.py +100 -56
- src/envs.py +6 -2
- src/leaderboard/read_evals.py +57 -40
- src/populate.py +10 -9
- src/submission/check_validity.py +30 -13
- src/submission/submit.py +32 -26
- src/tools/collections.py +3 -3
- src/tools/plots.py +50 -119
README.md
CHANGED
@@ -4,7 +4,7 @@ emoji: 🏆
 colorFrom: green
 colorTo: indigo
 sdk: gradio
-sdk_version:
+sdk_version: 4.4.0
 app_file: app.py
 pinned: true
 license: apache-2.0
app.py
CHANGED
@@ -1,56 +1,59 @@
+import gradio as gr
 import json
 import os
 from datetime import datetime, timezone
 
-import gradio as gr
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import snapshot_download
 
-from src.display.utils import (
-    COLS,
-    TYPES,
-    BENCHMARK_COLS,
-    EVAL_COLS,
-    EVAL_TYPES,
-    AutoEvalColumn,
-    ModelType,
-    NUMERIC_INTERVALS,
-    fields,
-)
-from src.display.css_html_js import custom_css, get_window_url_params
 from src.display.about import (
     CITATION_BUTTON_LABEL,
     CITATION_BUTTON_TEXT,
     EVALUATION_QUEUE_TEXT,
     INTRODUCTION_TEXT,
     LLM_BENCHMARKS_TEXT,
+    FAQ_TEXT,
     TITLE,
 )
+from src.display.css_html_js import custom_css
+from src.display.utils import (
+    BENCHMARK_COLS,
+    COLS,
+    EVAL_COLS,
+    EVAL_TYPES,
+    NUMERIC_INTERVALS,
+    TYPES,
+    AutoEvalColumn,
+    ModelType,
+    fields,
+    WeightType,
+    Precision
+)
+from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, H4_TOKEN, IS_PUBLIC, QUEUE_REPO, REPO_ID, RESULTS_REPO
+from src.populate import get_evaluation_queue_df, get_leaderboard_df
+from src.submission.submit import add_new_eval
+from src.submission.check_validity import already_submitted_models
+from src.tools.collections import update_collections
 from src.tools.plots import (
     create_metric_plot_obj,
-    create_scores_df,
     create_plot_df,
-    HUMAN_BASELINES,
+    create_scores_df,
 )
-from src.tools.collections import update_collections
-from src.populate import get_evaluation_queue_df, get_leaderboard_df
-from src.envs import H4_TOKEN, QUEUE_REPO, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, RESULTS_REPO, API, REPO_ID, IS_PUBLIC
-from src.submission.submit import add_new_eval
 
 
 def restart_space():
     API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
 
 try:
+    print(EVAL_REQUESTS_PATH)
     snapshot_download(
         repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
     )
 except Exception:
     restart_space()
 try:
+    print(EVAL_RESULTS_PATH)
     snapshot_download(
         repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
     )
@@ -58,13 +61,11 @@ except Exception:
     restart_space()
 
 
-original_df = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS)
+raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 update_collections(original_df.copy())
 leaderboard_df = original_df.copy()
 
-# plot_df = create_plot_df(create_scores_df(join_model_info_with_results(original_df)))
-#to_be_dumped = f"models = {repr(models)}\n"
+plot_df = create_plot_df(create_scores_df(raw_data))
 
 (
     finished_eval_queue_df,
@@ -73,26 +74,15 @@ leaderboard_df = original_df.copy()
 ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
 
-# Basics
-def change_tab(query_param: str):
-    query_param = query_param.replace("'", '"')
-    query_param = json.loads(query_param)
-
-    if isinstance(query_param, dict) and "tab" in query_param and query_param["tab"] == "evaluation":
-        return gr.Tabs.update(selected=1)
-    else:
-        return gr.Tabs.update(selected=0)
-
-
 # Searching and filtering
 def update_table(
+    hidden_df: pd.DataFrame,
+    columns: list,
+    type_query: list,
+    precision_query: str,
+    size_query: list,
+    show_deleted: bool,
+    query: str,
 ):
     filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted)
     filtered_df = filter_queries(query, filtered_df)
@@ -112,7 +102,7 @@ def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
     # We use COLS to maintain sorting
     filtered_df = df[
         always_here_cols + [c for c in COLS if c in df.columns and c in columns] + [AutoEvalColumn.dummy.name]
+    ]
     return filtered_df
 
 
@@ -137,7 +127,7 @@ def filter_queries(query: str, filtered_df: pd.DataFrame):
 
 
 def filter_models(
+    df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, show_deleted: bool
 ) -> pd.DataFrame:
     # Show all models
     if show_deleted:
@@ -146,8 +136,8 @@ def filter_models(
     filtered_df = df[df[AutoEvalColumn.still_on_hub.name] == True]
 
     type_emoji = [t[0] for t in type_query]
-    filtered_df = filtered_df[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
-    filtered_df = filtered_df[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
+    filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
+    filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
 
     numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
     params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
@@ -478,7 +468,7 @@ dummy1 = gr.Textbox(visible=False)
 hidden_leaderboard_table_for_search = gr.components.Dataframe(
     headers=COLS,
     datatype=TYPES,
+    max_rows=None,
     visible=False,
 )
 
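For readers following the `filter_models` change above: the rewrite switches to `.loc` indexing with boolean masks and keeps the `pd.Interval` buckets for the size filter. Below is a standalone sketch of that filtering pattern; the column names mirror the display columns, but the toy data and bucket values are invented for illustration only.

```python
# Standalone sketch of the mask-based filtering pattern used in filter_models.
# Column names ("T", "Precision", "#Params (B)") mirror the display columns;
# the data and size buckets below are made up for the example.
import pandas as pd

df = pd.DataFrame(
    {
        "T": ["🟢", "🔶", "🟢"],
        "Precision": ["float16", "bfloat16", "4bit"],
        "#Params (B)": [7.0, 13.0, 70.0],
    }
)

type_emoji = ["🟢"]                        # selected model-type symbols
precision_query = ["float16", "bfloat16"]  # selected precisions
size_buckets = pd.IntervalIndex(
    [pd.Interval(0, 10, closed="right"), pd.Interval(10, 100, closed="right")]
)

params = pd.to_numeric(df["#Params (B)"], errors="coerce")
mask = (
    df["T"].isin(type_emoji)
    & df["Precision"].isin(precision_query + ["None"])
    & params.apply(lambda x: any(size_buckets.contains(x)))
)
filtered_df = df.loc[mask]
print(filtered_df)
```

The real function applies its masks step by step on an already-filtered frame; collapsing them into a single mask, as above, is simply the shortest way to show the same logic.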
requirements.txt
CHANGED
@@ -13,8 +13,8 @@ pandas==2.0.0
 plotly==5.14.1
 python-dateutil==2.8.2
 requests==2.28.2
+sentencepiece
 semantic-version==2.10.0
 tqdm==4.65.0
-tokenizers>=0.15.0
+transformers==4.35.2
+tokenizers>=0.15.0
scripts/create_request_file.py
CHANGED
@@ -1,11 +1,12 @@
-from datetime import datetime, timezone
 import json
 import os
+import pprint
 import re
+from datetime import datetime, timezone
+
 import click
-from huggingface_hub import HfApi, snapshot_download
 from colorama import Fore
-import
+from huggingface_hub import HfApi, snapshot_download
 
 EVAL_REQUESTS_PATH = "eval-queue"
 QUEUE_REPO = "open-llm-leaderboard/requests"
@@ -19,7 +20,7 @@ def get_model_size(model_info, precision: str):
     size_pattern = size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
     try:
         model_size = round(model_info.safetensors["total"] / 1e9, 3)
-    except AttributeError:
+    except (AttributeError, TypeError):
         try:
             size_match = re.search(size_pattern, model_info.modelId.lower())
             model_size = size_match.group(0)
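The widened `except (AttributeError, TypeError)` above suggests that `model_info.safetensors` can be absent or `None` (subscripting `None` raises `TypeError`). A self-contained sketch of that fallback, using a hypothetical stand-in for `huggingface_hub`'s `ModelInfo`:

```python
# Sketch of the size-detection fallback: prefer the safetensors parameter
# count, otherwise look for a "7b"/"350m"-style hint in the model id.
# FakeModelInfo is a made-up stand-in for huggingface_hub.hf_api.ModelInfo.
import re
from dataclasses import dataclass
from typing import Optional

size_pattern = re.compile(r"(\d\.)?\d+(b|m)")

@dataclass
class FakeModelInfo:
    modelId: str
    safetensors: Optional[dict] = None

def get_model_size(model_info):
    try:
        # Raises TypeError when safetensors is None, AttributeError when absent.
        return round(model_info.safetensors["total"] / 1e9, 3)
    except (AttributeError, TypeError):
        match = re.search(size_pattern, model_info.modelId.lower())
        return match.group(0) if match else 0

print(get_model_size(FakeModelInfo("org/llama-7b")))                       # -> "7b"
print(get_model_size(FakeModelInfo("org/foo", {"total": 7_000_000_000})))  # -> 7.0
```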
src/display/about.py
CHANGED
@@ -13,20 +13,9 @@ LLM_BENCHMARKS_TEXT = f"""
 # Context
 With the plethora of large language models (LLMs) and chatbots being released week upon week, often with grandiose claims of their performance, it can be hard to filter out the genuine progress that is being made by the open-source community and which model is the current state of the art.
 
-## Icons
-{ModelType.PT.to_str(" : ")} model: new, base models, trained on a given corpora
-{ModelType.FT.to_str(" : ")} model: pretrained models finetuned on more data
-Specific fine-tune subcategories (more adapted to chat):
-{ModelType.IFT.to_str(" : ")} model: instruction fine-tunes, which are model fine-tuned specifically on datasets of task instruction
-{ModelType.RL.to_str(" : ")} model: reinforcement fine-tunes, which usually change the model loss a bit with an added policy.
-If there is no icon, we have not uploaded the information on the model yet, feel free to open an issue with the model information!
-
-"Flagged" indicates that this model has been flagged by the community, and should probably be ignored! Clicking the link will redirect you to the discussion about the model.
-(For ex, the model was trained on the evaluation data, and is therefore cheating on the leaderboard.)
-
 ## How it works
 
-📈 We evaluate models on
+📈 We evaluate models on 7 key benchmarks using the <a href="https://github.com/EleutherAI/lm-evaluation-harness" target="_blank"> Eleuther AI Language Model Evaluation Harness </a>, a unified framework to test generative language models on a large number of different evaluation tasks.
 
 - <a href="https://arxiv.org/abs/1803.05457" target="_blank"> AI2 Reasoning Challenge </a> (25-shot) - a set of grade-school science questions.
 - <a href="https://arxiv.org/abs/1905.07830" target="_blank"> HellaSwag </a> (10-shot) - a test of commonsense inference, which is easy for humans (~95%) but challenging for SOTA models.
@@ -34,7 +23,6 @@ If there is no icon, we have not uploaded the information on the model yet, feel
 - <a href="https://arxiv.org/abs/2109.07958" target="_blank"> TruthfulQA </a> (0-shot) - a test to measure a model's propensity to reproduce falsehoods commonly found online. Note: TruthfulQA in the Harness is actually a minima a 6-shots task, as it is prepended by 6 examples systematically, even when launched using 0 for the number of few-shot examples.
 - <a href="https://arxiv.org/abs/1907.10641" target="_blank"> Winogrande </a> (5-shot) - an adversarial and difficult Winograd benchmark at scale, for commonsense reasoning.
 - <a href="https://arxiv.org/abs/2110.14168" target="_blank"> GSM8k </a> (5-shot) - diverse grade school math word problems to measure a model's ability to solve multi-step mathematical reasoning problems.
-- <a href="https://arxiv.org/abs/1903.00161" target="_blank"> DROP </a> (3-shot) - English reading comprehension benchmark requiring Discrete Reasoning Over the content of Paragraphs.
 
 For all these evaluations, a higher score is a better score.
 We chose these benchmarks as they test a variety of reasoning and general knowledge across a wide variety of fields in 0-shot and few-shot settings.
@@ -47,10 +35,10 @@ You can find:
 
 ## Reproducibility
 To reproduce our results, here is the commands you can run, using [this version](https://github.com/EleutherAI/lm-evaluation-harness/tree/b281b0921b636bc36ad05c0b0b0763bd6dd43463) of the Eleuther AI Harness:
-`python main.py --model=hf-causal --model_args="pretrained=<your_model>,use_accelerate=True,revision=<your_model_revision>"`
-` --tasks=<task_list> --num_fewshot=<n_few_shot> --batch_size=
+`python main.py --model=hf-causal-experimental --model_args="pretrained=<your_model>,use_accelerate=True,revision=<your_model_revision>"`
+` --tasks=<task_list> --num_fewshot=<n_few_shot> --batch_size=1 --output_path=<output_path>`
 
-The total batch size we get for models which fit on one A100 node is
+The total batch size we get for models which fit on one A100 node is 8 (8 GPUs * 1). If you don't use parallelism, adapt your batch size to fit.
 *You can expect results to vary slightly for different batch sizes because of padding.*
 
 The tasks and few shots parameters are:
@@ -60,29 +48,95 @@ The tasks and few shots parameters are:
 - MMLU: 5-shot, *hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions* (average of all the results `acc`)
 - Winogrande: 5-shot, *winogrande* (`acc`)
 - GSM8k: 5-shot, *gsm8k* (`acc`)
-- DROP: 3-shot, *drop* (`f1`)
 
 Side note on the baseline scores:
 - for log-likelihood evaluation, we select the random baseline
-- for
+- for GSM8K, we select the score obtained in the paper after finetuning a 6B model on the full GSM8K training set for 50 epochs
+
+## Icons
+- {ModelType.PT.to_str(" : ")} model: new, base models, trained on a given corpora
+- {ModelType.FT.to_str(" : ")} model: pretrained models finetuned on more data
+Specific fine-tune subcategories (more adapted to chat):
+- {ModelType.IFT.to_str(" : ")} model: instruction fine-tunes, which are model fine-tuned specifically on datasets of task instruction
+- {ModelType.RL.to_str(" : ")} model: reinforcement fine-tunes, which usually change the model loss a bit with an added policy.
+If there is no icon, we have not uploaded the information on the model yet, feel free to open an issue with the model information!
+
+"Flagged" indicates that this model has been flagged by the community, and should probably be ignored! Clicking the link will redirect you to the discussion about the model.
 
 ## Quantization
 To get more information about quantization, see:
 - 8 bits: [blog post](https://huggingface.co/blog/hf-bitsandbytes-integration), [paper](https://arxiv.org/abs/2208.07339)
 - 4 bits: [blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes), [paper](https://arxiv.org/abs/2305.14314)
 
-##
+## Useful links
+- [Community resources](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/174)
+- [Collection of best models](https://huggingface.co/collections/open-llm-leaderboard/llm-leaderboard-best-models-652d6c7965a4619fb5c27a03)
 """
 
+FAQ_TEXT = """
+---------------------------
+# FAQ
+Below are some common questions - if this FAQ does not answer you, feel free to create a new issue, and we'll take care of it as soon as we can!
+
+## 1) Submitting a model
+My model requires `trust_remote_code=True`, can I submit it?
+- *We only support models that have been integrated in a stable version of the `transformers` library for automatic submission, as we don't want to run possibly unsage code on our cluster.*
+
+What about models of type X?
+- *We only support models that have been integrated in a stable version of the `transformers` library for automatic submission.*
+
+How can I follow when my model is launched?
+- *You can look for its request file [here](https://huggingface.co/datasets/open-llm-leaderboard/requests) and follow the status evolution, or directly in the queues above the submit form.*
+
+My model disappeared from all the queues, what happened?
+- *A model disappearing from all the queues usually means that there has been a failure. You can check if that is the case by looking for your model [here](https://huggingface.co/datasets/open-llm-leaderboard/requests).*
+
+What causes an evaluation failure?
+- *Most of the failures we get come from problems in the submissions (corrupted files, config problems, wrong parameters selected for eval ...), so we'll be grateful if you first make sure you have followed the steps in `About`. However, from time to time, we have failures on our side (hardware/node failures, problem with an update of our backend, connectivity problem ending up in the results not being saved, ...).*
+
+How can I report an evaluation failure?
+- *As we store the logs for all models, feel free to create an issue, **where you link to the requests file of your model** (look for it [here](https://huggingface.co/datasets/open-llm-leaderboard/requests/tree/main)), so we can investigate! If the model failed due to a problem on our side, we'll relaunch it right away!*
+*Note: Please do not re-upload your model under a different name, it will not help*
+
+## 2) Model results
+What kind of information can I find?
+- *Let's imagine you are interested in the Yi-34B results. You have access to 3 different information categories:*
+    - *The [request file](https://huggingface.co/datasets/open-llm-leaderboard/requests/blob/main/01-ai/Yi-34B_eval_request_False_bfloat16_Original.json): it gives you information about the status of the evaluation*
+    - *The [aggregated results folder](https://huggingface.co/datasets/open-llm-leaderboard/results/tree/main/01-ai/Yi-34B): it gives you aggregated scores, per experimental run*
+    - *The [details dataset](https://huggingface.co/datasets/open-llm-leaderboard/details_01-ai__Yi-34B/tree/main): it gives you the full details (scores and examples for each task and a given model)*
+
+
+Why do models appear several times in the leaderboard?
+- *We run evaluations with user selected precision and model commit. Sometimes, users submit specific models at different commits and at different precisions (for example, in float16 and 4bit to see how quantization affects performance). You should be able to verify this by displaying the `precision` and `model sha` columns in the display. If, however, you see models appearing several time with the same precision and hash commit, this is not normal.*
+
+What is this concept of "flagging"?
+- *This mechanism allows user to report models that have unfair performance on the leaderboard. This contains several categories: exceedingly good results on the leaderboard because the model was (maybe accidentally) trained on the evaluation data, models that are copy of other models not atrributed properly, etc.*
+
+My model has been flagged improperly, what can I do?
+- *Every flagged model has a discussion associated with it - feel free to plead your case there, and we'll see what to do together with the community.*
+
+## 3) Editing a submission
+I upgraded my model and want to re-submit, how can I do that?
+- *Please open an issue with the precise name of your model, and we'll remove your model from the leaderboard so you can resubmit. You can also resubmit directly with the new commit hash!*
+
+I need to rename my model, how can I do that?
+- *You can use @Weyaxi 's [super cool tool](https://huggingface.co/spaces/Weyaxi/open-llm-leaderboard-renamer) to request model name changes, then open a discussion where you link to the created pull request, and we'll check them and merge them as needed.*
+
+## 4) Other
+Why don't you display closed source model scores?
+- *This is a leaderboard for Open models, both for philosophical reasons (openness is cool) and for practical reasons: we want to ensure that the results we display are accurate and reproducible, but 1) commercial closed models can change their API thus rendering any scoring at a given time incorrect 2) we re-run everything on our cluster to ensure all models are run on the same setup and you can't do that for these models.*
+
+I have an issue about accessing the leaderboard through the Gradio API
+- *Since this is not the recommended way to access the leaderboard, we won't provide support for this, but you can look at tools provided by the community for inspiration!*
+"""
+
+
 EVALUATION_QUEUE_TEXT = """
 # Evaluation Queue for the 🤗 Open LLM Leaderboard
 
 Models added here will be automatically evaluated on the 🤗 cluster.
 
-##
+## First steps before submitting a model
 
 ### 1) Make sure you can load your model and tokenizer using AutoClasses:
 ```python
@@ -205,17 +259,4 @@ CITATION_BUTTON_TEXT = r"""
 archivePrefix={arXiv},
 primaryClass={cs.CL}
 }
-  title={{DROP:} {A} Reading Comprehension Benchmark Requiring Discrete Reasoning
-Over Paragraphs},
-  author={Dheeru Dua and
-  Yizhong Wang and
-  Pradeep Dasigi and
-  Gabriel Stanovsky and
-  Sameer Singh and
-  Matt Gardner},
-  year={2019},
-  eprinttype={arXiv},
-  eprint={1903.00161},
-  primaryClass={cs.CL}
-}"""
+"""
src/display/formatting.py
CHANGED
@@ -1,24 +1,11 @@
 import os
-from
-API = HfApi()
+from datetime import datetime, timezone
 
-    "huggingface/llama-13b",
-    "huggingface/llama-30b",
-    "huggingface/llama-65b",
-]
+from huggingface_hub import HfApi
+from huggingface_hub.hf_api import ModelInfo
 
-KOALA_LINK = "https://huggingface.co/TheBloke/koala-13B-HF"
-VICUNA_LINK = "https://huggingface.co/lmsys/vicuna-13b-delta-v1.1"
-OASST_LINK = "https://huggingface.co/OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5"
-DOLLY_LINK = "https://huggingface.co/databricks/dolly-v2-12b"
-MODEL_PAGE = "https://huggingface.co/models"
-LLAMA_LINK = "https://ai.facebook.com/blog/large-language-model-llama-meta-ai/"
-VICUNA_LINK = "https://huggingface.co/CarperAI/stable-vicuna-13b-delta"
-ALPACA_LINK = "https://crfm.stanford.edu/2023/03/13/alpaca.html"
 
+API = HfApi()
 
 def model_hyperlink(link, model_name):
     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
@@ -27,44 +14,9 @@ def model_hyperlink(link, model_name):
 def make_clickable_model(model_name):
     link = f"https://huggingface.co/{model_name}"
 
-    if model_name in LLAMAS:
-        link = LLAMA_LINK
-        model_name = model_name.split("/")[1]
-    elif model_name == "HuggingFaceH4/stable-vicuna-13b-2904":
-        link = VICUNA_LINK
-        model_name = "stable-vicuna-13b"
-    elif model_name == "HuggingFaceH4/llama-7b-ift-alpaca":
-        link = ALPACA_LINK
-        model_name = "alpaca-13b"
-    if model_name == "dolly-12b":
-        link = DOLLY_LINK
-    elif model_name == "vicuna-13b":
-        link = VICUNA_LINK
-    elif model_name == "koala-13b":
-        link = KOALA_LINK
-    elif model_name == "oasst-12b":
-        link = OASST_LINK
 
     details_model_name = model_name.replace("/", "__")
     details_link = f"https://huggingface.co/datasets/open-llm-leaderboard/details_{details_model_name}"
 
-    if not bool(os.getenv("DEBUG", "False")):
-        # We only add these checks when not debugging, as they are extremely slow
-        print(f"details_link: {details_link}")
-        try:
-            check_path = list(
-                API.list_files_info(
-                    repo_id=f"open-llm-leaderboard/details_{details_model_name}",
-                    paths="README.md",
-                    repo_type="dataset",
-                )
-            )
-            print(f"check_path: {check_path}")
-        except Exception as err:
-            # No details repo for this model
-            print(f"No details repo for this model: {err}")
-            return model_hyperlink(link, model_name)
 
     return model_hyperlink(link, model_name) + " " + model_hyperlink(details_link, "📑")
 
src/display/utils.py
CHANGED
@@ -1,7 +1,25 @@
-from dataclasses import dataclass
-import pandas as pd
+from dataclasses import dataclass, make_dataclass
 from enum import Enum
 
+import pandas as pd
+
+def fields(raw_class):
+    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
+
+
+@dataclass
+class Task:
+    benchmark: str
+    metric: str
+    col_name: str
+
+class Tasks(Enum):
+    arc = Task("arc:challenge", "acc_norm", "ARC")
+    hellaswag = Task("hellaswag", "acc_norm", "HellaSwag")
+    mmlu = Task("hendrycksTest", "acc", "MMLU")
+    truthfulqa = Task("truthfulqa:mc", "mc2", "TruthfulQA")
+    winogrande = Task("winogrande", "acc", "Winogrande")
+    gsm8k = Task("gsm8k", "acc", "GSM8K")
 
 # These classes are for user facing column names,
 # to avoid having to change them all around the code
@@ -15,35 +33,29 @@ class ColumnContent:
     never_hidden: bool = False
     dummy: bool = False
 
-    still_on_hub = ColumnContent("Available on the hub", "bool", False)
-    revision = ColumnContent("Model sha", "str", False, False)
-    dummy = ColumnContent(
-        "model_name_for_query", "str", False, dummy=True
-    )  # dummy col to implement search bar (hidden by custom CSS)
+auto_eval_column_dict = []
+# Init
+auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
+auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+#Scores
+auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
+for task in Tasks:
+    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+# Model information
+auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
+auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
+auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
+auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
+auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
+auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
+auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
+auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
+auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
+# Dummy column for the search bar (hidden by the custom CSS)
+auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])
+
+# We use make dataclass to dynamically fill the scores from Tasks
+AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
 
 @dataclass(frozen=True)
 class EvalQueueColumn:  # Queue column
@@ -59,31 +71,52 @@ baseline_row = {
     AutoEvalColumn.model.name: "<p>Baseline</p>",
     AutoEvalColumn.revision.name: "N/A",
     AutoEvalColumn.precision.name: None,
-    AutoEvalColumn.average.name:
+    AutoEvalColumn.average.name: 31.0,
     AutoEvalColumn.arc.name: 25.0,
     AutoEvalColumn.hellaswag.name: 25.0,
     AutoEvalColumn.mmlu.name: 25.0,
     AutoEvalColumn.truthfulqa.name: 25.0,
     AutoEvalColumn.winogrande.name: 50.0,
     AutoEvalColumn.gsm8k.name: 0.21,
-    AutoEvalColumn.drop.name: 0.47,
     AutoEvalColumn.dummy.name: "baseline",
     AutoEvalColumn.model_type.name: "",
 }
 
+# Average ⬆️ human baseline is 0.897 (source: averaging human baselines below)
+# ARC human baseline is 0.80 (source: https://lab42.global/arc/)
+# HellaSwag human baseline is 0.95 (source: https://deepgram.com/learn/hellaswag-llm-benchmark-guide)
+# MMLU human baseline is 0.898 (source: https://openreview.net/forum?id=d7KBjmI3GmQ)
+# TruthfulQA human baseline is 0.94(source: https://arxiv.org/pdf/2109.07958.pdf)
+# Winogrande: https://leaderboard.allenai.org/winogrande/submissions/public
+# GSM8K: paper
+# Define the human baselines
+human_baseline_row = {
+    AutoEvalColumn.model.name: "<p>Human performance</p>",
+    AutoEvalColumn.revision.name: "N/A",
+    AutoEvalColumn.precision.name: None,
+    AutoEvalColumn.average.name: 92.75,
+    AutoEvalColumn.arc.name: 80.0,
+    AutoEvalColumn.hellaswag.name: 95.0,
+    AutoEvalColumn.mmlu.name: 89.8,
+    AutoEvalColumn.truthfulqa.name: 94.0,
+    AutoEvalColumn.winogrande.name: 94.0,
+    AutoEvalColumn.gsm8k.name: 100,
+    AutoEvalColumn.dummy.name: "human_baseline",
+    AutoEvalColumn.model_type.name: "",
+}
 
 @dataclass
-class
+class ModelDetails:
     name: str
-    symbol: str
+    symbol: str = ""  # emoji, only for the model type
 
 
 class ModelType(Enum):
-    PT =
-    FT =
-    IFT =
-    RL =
-    Unknown =
+    PT = ModelDetails(name="pretrained", symbol="🟢")
+    FT = ModelDetails(name="fine-tuned", symbol="🔶")
+    IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
+    RL = ModelDetails(name="RL-tuned", symbol="🟦")
+    Unknown = ModelDetails(name="", symbol="?")
 
     def to_str(self, separator=" "):
         return f"{self.value.symbol}{separator}{self.value.name}"
@@ -100,22 +133,33 @@ class ModelType(Enum):
             return ModelType.IFT
         return ModelType.Unknown
 
+class WeightType(Enum):
+    Adapter = ModelDetails("Adapter")
+    Original = ModelDetails("Original")
+    Delta = ModelDetails("Delta")
+
+class Precision(Enum):
+    float16 = ModelDetails("float16")
+    bfloat16 = ModelDetails("bfloat16")
+    qt_8bit = ModelDetails("8bit")
+    qt_4bit = ModelDetails("4bit")
+    qt_GPTQ = ModelDetails("GPTQ")
+    Unknown = ModelDetails("?")
+
+    def from_str(precision):
+        if precision in ["torch.float16", "float16"]:
+            return Precision.float16
+        if precision in ["torch.bfloat16", "bfloat16"]:
+            return Precision.bfloat16
+        if precision in ["8bit"]:
+            return Precision.qt_8bit
+        if precision in ["4bit"]:
+            return Precision.qt_4bit
+        if precision in ["GPTQ", "None"]:
+            return Precision.qt_GPTQ
+        return Precision.Unknown
 
-@dataclass
-class Task:
-    benchmark: str
-    metric: str
-    col_name: str
-
-
-class Tasks(Enum):
-    arc = Task("arc:challenge", "acc_norm", AutoEvalColumn.arc.name)
-    hellaswag = Task("hellaswag", "acc_norm", AutoEvalColumn.hellaswag.name)
-    mmlu = Task("hendrycksTest", "acc", AutoEvalColumn.mmlu.name)
-    truthfulqa = Task("truthfulqa:mc", "mc2", AutoEvalColumn.truthfulqa.name)
-    winogrande = Task("winogrande", "acc", AutoEvalColumn.winogrande.name)
-    gsm8k = Task("gsm8k", "acc", AutoEvalColumn.gsm8k.name)
-    drop = Task("drop", "f1", AutoEvalColumn.drop.name)
 
 
 # Column selection
@@ -127,7 +171,7 @@ TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default a
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
-BENCHMARK_COLS = [t.value.col_name for t in Tasks
+BENCHMARK_COLS = [t.value.col_name for t in Tasks]
 
 NUMERIC_INTERVALS = {
     "?": pd.Interval(-1, 0, closed="right"),
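The biggest structural change in this file is that `AutoEvalColumn` is no longer hand-written: column descriptors are collected in `auto_eval_column_dict` and turned into a frozen dataclass with `make_dataclass`, while `fields()` simply walks the class `__dict__`. Below is a reduced demo of that pattern; the `ColumnContent` stand-in is declared frozen here so its instances are safe to use as class-level defaults, and everything is cut down to two columns.

```python
# Reduced demo of the make_dataclass pattern behind AutoEvalColumn.
from dataclasses import dataclass, make_dataclass

@dataclass(frozen=True)
class ColumnContent:  # frozen stand-in for the ColumnContent class above
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False
    dummy: bool = False

def fields(raw_class):
    # Same helper as in the diff: every non-dunder class attribute.
    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]

columns = [
    ["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)],
    ["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)],
]
DemoColumn = make_dataclass("DemoColumn", columns, frozen=True)

print(DemoColumn.model.name)                 # "Model"
print([c.name for c in fields(DemoColumn)])  # ["Model", "Average ⬆️"]
```

With this layout, adding a benchmark only means adding a `Task`; its score column is generated by the `for task in Tasks` loop rather than edited by hand.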
src/envs.py
CHANGED
@@ -1,4 +1,5 @@
 import os
+
 from huggingface_hub import HfApi
 
 # clone / pull the lmeh eval data
@@ -13,8 +14,10 @@ PRIVATE_RESULTS_REPO = "open-llm-leaderboard/private-results"
 
 IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))
 
+CACHE_PATH=os.getenv("HF_HOME", ".")
+
+EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
+EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
 
 EVAL_REQUESTS_PATH_PRIVATE = "eval-queue-private"
 EVAL_RESULTS_PATH_PRIVATE = "eval-results-private"
@@ -24,5 +27,6 @@ PATH_TO_COLLECTION = "open-llm-leaderboard/llm-leaderboard-best-models-652d6c796
 # Rate limit variables
 RATE_LIMIT_PERIOD = 7
 RATE_LIMIT_QUOTA = 5
+HAS_HIGHER_RATE_LIMIT = ["TheBloke"]
 
 API = HfApi(token=H4_TOKEN)
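A small illustration of what the new cache-path lines change in practice: everything the app downloads now lands under `HF_HOME` when that variable is set, and under the working directory otherwise. The printed paths are examples, not guaranteed locations.

```python
# Behaviour of the new CACHE_PATH resolution in src/envs.py.
import os

CACHE_PATH = os.getenv("HF_HOME", ".")
EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")

# With HF_HOME unset:      ./eval-queue and ./eval-results
# With HF_HOME=/data/hf:   /data/hf/eval-queue and /data/hf/eval-results
print(EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH)
```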
src/leaderboard/read_evals.py
CHANGED
@@ -1,37 +1,41 @@
|
|
|
|
1 |
import json
|
2 |
-
import os
|
3 |
import math
|
4 |
-
import
|
5 |
from dataclasses import dataclass
|
6 |
-
from typing import Dict, List, Tuple
|
7 |
|
8 |
import dateutil
|
|
|
|
|
9 |
import numpy as np
|
10 |
|
11 |
-
from src.display.utils import AutoEvalColumn, ModelType, Tasks
|
12 |
from src.display.formatting import make_clickable_model
|
|
|
13 |
from src.submission.check_validity import is_model_on_hub
|
14 |
|
15 |
|
16 |
@dataclass
|
17 |
class EvalResult:
|
18 |
-
|
19 |
-
|
20 |
-
|
|
|
21 |
model: str
|
22 |
-
revision: str
|
23 |
results: dict
|
24 |
-
precision:
|
25 |
-
model_type: ModelType = ModelType.Unknown
|
26 |
-
weight_type:
|
|
|
27 |
license: str = "?"
|
28 |
likes: int = 0
|
29 |
num_params: int = 0
|
30 |
-
date: str = ""
|
31 |
still_on_hub: bool = False
|
32 |
|
33 |
@classmethod
|
34 |
def init_from_json_file(self, json_filepath):
|
|
|
35 |
with open(json_filepath) as fp:
|
36 |
data = json.load(fp)
|
37 |
|
@@ -39,9 +43,7 @@ class EvalResult:
|
|
39 |
config = data.get("config", data.get("config_general", None))
|
40 |
|
41 |
# Precision
|
42 |
-
precision = config.get("model_dtype")
|
43 |
-
if precision == "None":
|
44 |
-
precision = "GPTQ"
|
45 |
|
46 |
# Get model and org
|
47 |
org_and_model = config.get("model_name", config.get("model_args", None))
|
@@ -50,13 +52,21 @@ class EvalResult:
|
|
50 |
if len(org_and_model) == 1:
|
51 |
org = None
|
52 |
model = org_and_model[0]
|
53 |
-
result_key = f"{model}_{precision}"
|
54 |
else:
|
55 |
org = org_and_model[0]
|
56 |
model = org_and_model[1]
|
57 |
-
result_key = f"{org}_{model}_{precision}"
|
|
|
58 |
|
59 |
-
still_on_hub
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
60 |
|
61 |
# Extract results available in this file (some results are split in several files)
|
62 |
results = {}
|
@@ -73,8 +83,8 @@ class EvalResult:
|
|
73 |
continue
|
74 |
|
75 |
# Some truthfulQA values are NaNs
|
76 |
-
if task.benchmark == "truthfulqa:mc" and
|
77 |
-
if math.isnan(float(data["results"][
|
78 |
results[task.benchmark] = 0.0
|
79 |
continue
|
80 |
|
@@ -88,37 +98,42 @@ class EvalResult:
|
|
88 |
|
89 |
return self(
|
90 |
eval_name=result_key,
|
91 |
-
full_model=
|
92 |
org=org,
|
93 |
model=model,
|
94 |
results=results,
|
95 |
-
precision=precision,
|
96 |
-
revision=config.get("model_sha", ""),
|
97 |
-
date=config.get("submission_date", ""),
|
98 |
still_on_hub=still_on_hub,
|
|
|
99 |
)
|
100 |
|
101 |
-
def update_with_request_file(self):
|
102 |
-
|
|
|
103 |
|
104 |
try:
|
105 |
with open(request_file, "r") as f:
|
106 |
request = json.load(f)
|
107 |
self.model_type = ModelType.from_str(request.get("model_type", ""))
|
|
|
108 |
self.license = request.get("license", "?")
|
109 |
self.likes = request.get("likes", 0)
|
110 |
self.num_params = request.get("params", 0)
|
|
|
111 |
except Exception:
|
112 |
print(f"Could not find request file for {self.org}/{self.model}")
|
113 |
|
114 |
def to_dict(self):
|
|
|
115 |
average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
|
116 |
data_dict = {
|
117 |
"eval_name": self.eval_name, # not a column, just a save name,
|
118 |
-
AutoEvalColumn.precision.name: self.precision,
|
119 |
AutoEvalColumn.model_type.name: self.model_type.value.name,
|
120 |
AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
|
121 |
-
AutoEvalColumn.weight_type.name: self.weight_type,
|
|
|
122 |
AutoEvalColumn.model.name: make_clickable_model(self.full_model),
|
123 |
AutoEvalColumn.dummy.name: self.full_model,
|
124 |
AutoEvalColumn.revision.name: self.revision,
|
@@ -135,9 +150,10 @@ class EvalResult:
|
|
135 |
return data_dict
|
136 |
|
137 |
|
138 |
-
def get_request_file_for_model(model_name, precision):
|
|
|
139 |
request_files = os.path.join(
|
140 |
-
|
141 |
f"{model_name}_eval_request_*.json",
|
142 |
)
|
143 |
request_files = glob.glob(request_files)
|
@@ -149,15 +165,16 @@ def get_request_file_for_model(model_name, precision):
|
|
149 |
with open(tmp_request_file, "r") as f:
|
150 |
req_content = json.load(f)
|
151 |
if (
|
152 |
-
req_content["status"] in ["FINISHED"
|
153 |
and req_content["precision"] == precision.split(".")[-1]
|
154 |
):
|
155 |
request_file = tmp_request_file
|
156 |
return request_file
|
157 |
|
158 |
|
159 |
-
def
|
160 |
-
|
|
|
161 |
|
162 |
for root, _, files in os.walk(results_path):
|
163 |
# We should only have json files in model results
|
@@ -170,15 +187,14 @@ def get_eval_results(results_path: str) -> List[EvalResult]:
|
|
170 |
except dateutil.parser._parser.ParserError:
|
171 |
files = [files[-1]]
|
172 |
|
173 |
-
# up_to_date = files[-1]
|
174 |
for file in files:
|
175 |
-
|
176 |
|
177 |
eval_results = {}
|
178 |
-
for
|
179 |
# Creation of result
|
180 |
-
eval_result = EvalResult.init_from_json_file(
|
181 |
-
eval_result.update_with_request_file()
|
182 |
|
183 |
# Store results of same eval together
|
184 |
eval_name = eval_result.eval_name
|
@@ -190,8 +206,9 @@ def get_eval_results(results_path: str) -> List[EvalResult]:
|
|
190 |
results = []
|
191 |
for v in eval_results.values():
|
192 |
try:
|
193 |
-
|
194 |
-
|
|
|
195 |
continue
|
196 |
|
197 |
return results
|
|
|
1 |
+
import glob
|
2 |
import json
|
|
|
3 |
import math
|
4 |
+
import os
|
5 |
from dataclasses import dataclass
|
|
|
6 |
|
7 |
import dateutil
|
8 |
+
from datetime import datetime
|
9 |
+
from transformers import AutoConfig
|
10 |
import numpy as np
|
11 |
|
|
|
12 |
from src.display.formatting import make_clickable_model
|
13 |
+
from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
|
14 |
from src.submission.check_validity import is_model_on_hub
|
15 |
|
16 |
|
17 |
@dataclass
|
18 |
class EvalResult:
|
19 |
+
# Also see src.display.utils.AutoEvalColumn for what will be displayed.
|
20 |
+
eval_name: str # org_model_precision (uid)
|
21 |
+
full_model: str # org/model (path on hub)
|
22 |
+
org: str
|
23 |
model: str
|
24 |
+
revision: str # commit hash, "" if main
|
25 |
     results: dict
+    precision: Precision = Precision.Unknown
+    model_type: ModelType = ModelType.Unknown  # Pretrained, fine tuned, ...
+    weight_type: WeightType = WeightType.Original  # Original or Adapter
+    architecture: str = "Unknown"  # From config file
     license: str = "?"
     likes: int = 0
     num_params: int = 0
+    date: str = ""  # submission date of request file
     still_on_hub: bool = False
 
     @classmethod
     def init_from_json_file(self, json_filepath):
+        """Inits the result from the specific model result file"""
         with open(json_filepath) as fp:
             data = json.load(fp)
…
         config = data.get("config", data.get("config_general", None))
 
         # Precision
+        precision = Precision.from_str(config.get("model_dtype"))
 
         # Get model and org
         org_and_model = config.get("model_name", config.get("model_args", None))
…
         if len(org_and_model) == 1:
             org = None
             model = org_and_model[0]
+            result_key = f"{model}_{precision.value.name}"
         else:
             org = org_and_model[0]
             model = org_and_model[1]
+            result_key = f"{org}_{model}_{precision.value.name}"
+        full_model = "/".join(org_and_model)
 
+        still_on_hub, error, model_config = is_model_on_hub(
+            full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
+        )
+        architecture = "?"
+        if model_config is not None:
+            architectures = getattr(model_config, "architectures", None)
+            if architectures:
+                architecture = ";".join(architectures)
 
         # Extract results available in this file (some results are split in several files)
         results = {}
…
                 continue
 
             # Some truthfulQA values are NaNs
+            if task.benchmark == "truthfulqa:mc" and "harness|truthfulqa:mc|0" in data["results"]:
+                if math.isnan(float(data["results"]["harness|truthfulqa:mc|0"][task.metric])):
                     results[task.benchmark] = 0.0
                     continue
…
         return self(
             eval_name=result_key,
+            full_model=full_model,
             org=org,
             model=model,
             results=results,
+            precision=precision,
+            revision= config.get("model_sha", ""),
             still_on_hub=still_on_hub,
+            architecture=architecture
         )
 
+    def update_with_request_file(self, requests_path):
+        """Finds the relevant request file for the current model and updates info with it"""
+        request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
 
         try:
             with open(request_file, "r") as f:
                 request = json.load(f)
             self.model_type = ModelType.from_str(request.get("model_type", ""))
+            self.weight_type = WeightType[request.get("weight_type", "Original")]
             self.license = request.get("license", "?")
             self.likes = request.get("likes", 0)
             self.num_params = request.get("params", 0)
+            self.date = request.get("submitted_time", "")
         except Exception:
             print(f"Could not find request file for {self.org}/{self.model}")
 
     def to_dict(self):
+        """Converts the Eval Result to a dict compatible with our dataframe display"""
         average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
         data_dict = {
             "eval_name": self.eval_name,  # not a column, just a save name,
+            AutoEvalColumn.precision.name: self.precision.value.name,
             AutoEvalColumn.model_type.name: self.model_type.value.name,
             AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
+            AutoEvalColumn.weight_type.name: self.weight_type.value.name,
+            AutoEvalColumn.architecture.name: self.architecture,
             AutoEvalColumn.model.name: make_clickable_model(self.full_model),
             AutoEvalColumn.dummy.name: self.full_model,
             AutoEvalColumn.revision.name: self.revision,
…
         return data_dict
 
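Taken together, the new fields and methods change how a single result file becomes a leaderboard row: init_from_json_file now resolves the precision, architecture and hub status itself, update_with_request_file pulls in the submission metadata, and to_dict averages over every task in Tasks. A minimal sketch of that lifecycle (the file and directory names below are hypothetical placeholders, not paths from this repo):

from src.leaderboard.read_evals import EvalResult

# Hypothetical paths, for illustration only
result = EvalResult.init_from_json_file("eval-results/some-org/some-model/results_2023-11-01.json")
result.update_with_request_file("eval-queue")  # fills model_type, weight_type, license, likes, num_params, date

row = result.to_dict()  # one leaderboard row; raises KeyError if a benchmark value is missing
print(row["eval_name"], result.date)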
+def get_request_file_for_model(requests_path, model_name, precision):
+    """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
     request_files = os.path.join(
+        requests_path,
         f"{model_name}_eval_request_*.json",
     )
     request_files = glob.glob(request_files)
…
         with open(tmp_request_file, "r") as f:
             req_content = json.load(f)
             if (
+                req_content["status"] in ["FINISHED"]
                 and req_content["precision"] == precision.split(".")[-1]
             ):
                 request_file = tmp_request_file
     return request_file
 
 
+def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
+    """From the path of the results folder root, extract all needed info for results"""
+    model_result_filepaths = []
 
     for root, _, files in os.walk(results_path):
         # We should only have json files in model results
…
         except dateutil.parser._parser.ParserError:
             files = [files[-1]]
 
         for file in files:
+            model_result_filepaths.append(os.path.join(root, file))
 
     eval_results = {}
+    for model_result_filepath in model_result_filepaths:
         # Creation of result
+        eval_result = EvalResult.init_from_json_file(model_result_filepath)
+        eval_result.update_with_request_file(requests_path)
 
         # Store results of same eval together
         eval_name = eval_result.eval_name
…
     results = []
     for v in eval_results.values():
         try:
+            v.to_dict()  # we test if the dict version is complete
+            results.append(v)
+        except KeyError:  # not all eval values present
             continue
 
     return results
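The two module-level helpers are the entry points the rest of the app relies on: get_request_file_for_model keeps only FINISHED requests whose precision matches, and get_raw_eval_results walks the results tree and drops models whose row is incomplete. A rough usage sketch, with placeholder directory names standing in for the queue and results folders:

from src.leaderboard.read_evals import get_raw_eval_results, get_request_file_for_model

# Latest FINISHED request for this model at this precision (placeholder arguments)
request_file = get_request_file_for_model("eval-queue", "some-org/some-model", "float16")

# All complete EvalResult objects found under the results folder
raw_results = get_raw_eval_results("eval-results", "eval-queue")
print(len(raw_results))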
src/populate.py
CHANGED
@@ -3,24 +3,25 @@ import os
 
 import pandas as pd
 
-from src.
-from src.leaderboard.read_evals import get_eval_results
-from src.display.formatting import make_clickable_model, has_no_nan_values
+from src.display.formatting import has_no_nan_values, make_clickable_model
 from src.display.utils import AutoEvalColumn, EvalQueueColumn, baseline_row
+from src.leaderboard.filter_models import filter_models
+from src.leaderboard.read_evals import get_raw_eval_results
 
 
-def get_leaderboard_df(results_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
-
-
-
+def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
+    raw_data = get_raw_eval_results(results_path, requests_path)
+    all_data_json = [v.to_dict() for v in raw_data]
+    all_data_json.append(baseline_row)
+    filter_models(all_data_json)
 
-    df = pd.DataFrame.from_records(
+    df = pd.DataFrame.from_records(all_data_json)
     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
     df = df[cols].round(decimals=2)
 
     # filter out if any of the benchmarks have not been produced
     df = df[has_no_nan_values(df, benchmark_cols)]
-    return df
+    return raw_data, df
 
 
 def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
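get_leaderboard_df now takes the requests path as well and returns both the raw EvalResult list and the display DataFrame, so callers have to unpack two values. A sketch of the new call, with placeholder directories standing in for EVAL_RESULTS_PATH and EVAL_REQUESTS_PATH:

from src.display.utils import AutoEvalColumn, BENCHMARK_COLS, COLS
from src.populate import get_leaderboard_df

raw_data, leaderboard_df = get_leaderboard_df("eval-results", "eval-queue", COLS, BENCHMARK_COLS)
print(leaderboard_df[AutoEvalColumn.average.name].head())  # already sorted by average and rounded to 2 decimals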
src/submission/check_validity.py
CHANGED
@@ -1,13 +1,16 @@
-import huggingface_hub
-import os
 import json
+import os
 import re
 from collections import defaultdict
-from
+from datetime import datetime, timedelta, timezone
+
+import huggingface_hub
 from huggingface_hub import ModelCard
-from
+from huggingface_hub.hf_api import ModelInfo
+from transformers import AutoConfig, AutoTokenizer
+from transformers.models.auto.tokenization_auto import tokenizer_class_from_name, get_tokenizer_config
 
-from
+from src.envs import HAS_HIGHER_RATE_LIMIT
 
 
 # ht to @Wauplin, thank you for the snippet!
@@ -34,26 +37,36 @@ def check_model_card(repo_id: str) -> tuple[bool, str]:
     return True, ""
 
 
-def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False) -> tuple[bool, str]:
+def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str]:
     try:
-        AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
-
+        config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
+        if test_tokenizer:
+            try:
+                AutoTokenizer.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
+            except ValueError as e:
+                return (
+                    False,
+                    f"uses a tokenizer which is not in a transformers release: {e}",
+                    None
+                )
+        return True, None, config
 
     except ValueError:
         return (
             False,
             "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
+            None
         )
 
-    except Exception:
-        return False, "was not found on hub!"
+    except Exception as e:
+        return False, "was not found on hub!", None
 
 
 def get_model_size(model_info: ModelInfo, precision: str):
     size_pattern = size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
     try:
         model_size = round(model_info.safetensors["total"] / 1e9, 3)
-    except AttributeError:
+    except (AttributeError, TypeError ):
         try:
             size_match = re.search(size_pattern, model_info.modelId.lower())
             model_size = size_match.group(0)
@@ -65,9 +78,10 @@ def get_model_size(model_info: ModelInfo, precision: str):
         model_size = size_factor * model_size
     return model_size
 
+def get_model_arch(model_info: ModelInfo):
+    return model_info.config.get("architectures", "Unknown")
 
-def user_submission_permission(
-    org_or_user, _ = submission_name.split("/")
+def user_submission_permission(org_or_user, users_to_submission_dates, rate_limit_period, rate_limit_quota):
     if org_or_user not in users_to_submission_dates:
         return True, ""
     submission_dates = sorted(users_to_submission_dates[org_or_user])
@@ -76,6 +90,9 @@ def user_submission_permission(submission_name, users_to_submission_dates, rate_
     submissions_after_timelimit = [d for d in submission_dates if d > time_limit]
 
     num_models_submitted_in_period = len(submissions_after_timelimit)
+    if org_or_user in HAS_HIGHER_RATE_LIMIT:
+        rate_limit_quota = 2 * rate_limit_quota
+
     if num_models_submitted_in_period > rate_limit_quota:
         error_msg = f"Organisation or user `{org_or_user}`"
         error_msg += f"already has {num_models_submitted_in_period} model requests submitted to the leaderboard "
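is_model_on_hub now returns a three-element tuple (found, error message, loaded config) and can optionally try the tokenizer as well, so every caller has to unpack three values. A hedged sketch (model name and revision are placeholders):

from src.submission.check_validity import is_model_on_hub

on_hub, error, config = is_model_on_hub("some-org/some-model", "main", trust_remote_code=True, test_tokenizer=True)
if not on_hub:
    print(f"some-org/some-model {error}")
elif config is not None:
    print(getattr(config, "architectures", None))  # what read_evals.py uses to fill the architecture column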
src/submission/submit.py
CHANGED
@@ -1,20 +1,20 @@
-import
-
+import json
+import os
 from datetime import datetime, timezone
 
-from src.display.formatting import styled_error,
+from src.display.formatting import styled_error, styled_message, styled_warning
+from src.envs import API, EVAL_REQUESTS_PATH, H4_TOKEN, QUEUE_REPO, RATE_LIMIT_PERIOD, RATE_LIMIT_QUOTA
 from src.leaderboard.filter_models import DO_NOT_SUBMIT_MODELS
 from src.submission.check_validity import (
-    user_submission_permission,
-    is_model_on_hub,
-    get_model_size,
-    check_model_card,
     already_submitted_models,
+    check_model_card,
+    get_model_size,
+    is_model_on_hub,
+    user_submission_permission,
 )
-from src.envs import RATE_LIMIT_QUOTA, RATE_LIMIT_PERIOD, H4_TOKEN, EVAL_REQUESTS_PATH, API, QUEUE_REPO
-
-requested_models, users_to_submission_dates = already_submitted_models(EVAL_REQUESTS_PATH)
 
+REQUESTED_MODELS = None
+USERS_TO_SUBMISSION_DATES = None
 
 def add_new_eval(
     model: str,
@@ -25,6 +25,17 @@
     weight_type: str,
     model_type: str,
 ):
+    global REQUESTED_MODELS
+    global USERS_TO_SUBMISSION_DATES
+    if not REQUESTED_MODELS:
+        REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
+
+    user_name = ""
+    model_path = model
+    if "/" in model:
+        user_name = model.split("/")[0]
+        model_path = model.split("/")[1]
+
     precision = precision.split(" ")[0]
     current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
 
@@ -32,11 +43,12 @@
         return styled_error("Please select a model type.")
 
     # Is the user rate limited?
-
-
-
-
-
+    if user_name != "":
+        user_can_submit, error_msg = user_submission_permission(
+            user_name, USERS_TO_SUBMISSION_DATES, RATE_LIMIT_PERIOD, RATE_LIMIT_QUOTA
+        )
+        if not user_can_submit:
+            return styled_error(error_msg)
 
     # Did the model authors forbid its submission to the leaderboard?
     if model in DO_NOT_SUBMIT_MODELS or base_model in DO_NOT_SUBMIT_MODELS:
@@ -48,12 +60,12 @@
 
     # Is the model on the hub?
     if weight_type in ["Delta", "Adapter"]:
-        base_model_on_hub, error = is_model_on_hub(base_model, revision, H4_TOKEN)
+        base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=H4_TOKEN, test_tokenizer=True)
         if not base_model_on_hub:
             return styled_error(f'Base model "{base_model}" {error}')
 
     if not weight_type == "Adapter":
-        model_on_hub, error = is_model_on_hub(model, revision)
+        model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, test_tokenizer=True)
         if not model_on_hub:
             return styled_error(f'Model "{model}" {error}')
 
@@ -93,21 +105,15 @@
         "license": license,
     }
 
-
-
-
-    user_name = model.split("/")[0]
-    model_path = model.split("/")[1]
+    # Check for duplicate submission
+    if f"{model}_{revision}_{precision}" in REQUESTED_MODELS:
+        return styled_warning("This model has been already submitted.")
 
     print("Creating eval file")
     OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
     os.makedirs(OUT_DIR, exist_ok=True)
     out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{precision}_{weight_type}.json"
 
-    # Check for duplicate submission
-    if f"{model}_{revision}_{precision}" in requested_models:
-        return styled_warning("This model has been already submitted.")
-
     with open(out_path, "w") as f:
         f.write(json.dumps(eval_entry))
 
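The rate-limit check now only runs for namespaced submissions (user_name stays empty otherwise) and goes through the shared user_submission_permission helper. A small sketch of that helper in isolation, with made-up submission dates in the same timestamp format add_new_eval writes and placeholder values for the period and quota:

from src.submission.check_validity import user_submission_permission

users_to_dates = {"some-org": ["2023-11-01T10:00:00Z", "2023-11-02T11:30:00Z"]}  # hypothetical history
can_submit, error_msg = user_submission_permission("some-org", users_to_dates, 7, 5)  # assumed 7-day window, quota of 5
if not can_submit:
    print(error_msg)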
src/tools/collections.py
CHANGED
@@ -1,11 +1,11 @@
 import os
+
 import pandas as pd
-from
-from huggingface_hub import get_collection, add_collection_item, update_collection_item, delete_collection_item
+from huggingface_hub import add_collection_item, delete_collection_item, get_collection, update_collection_item
 from huggingface_hub.utils._errors import HfHubHTTPError
+from pandas import DataFrame
 
 from src.display.utils import AutoEvalColumn, ModelType
-
 from src.envs import H4_TOKEN, PATH_TO_COLLECTION
 
 # Specific intervals for the collections
src/tools/plots.py
CHANGED
@@ -1,151 +1,84 @@
 import pandas as pd
+import numpy as np
 import plotly.express as px
 from plotly.graph_objs import Figure
-import pickle
-from datetime import datetime, timezone
-from typing import List, Dict, Tuple, Any
-from src.leaderboard.filter_models import FLAGGED_MODELS
-
-# Average ⬆️ human baseline is 0.897 (source: averaging human baselines below)
-# ARC human baseline is 0.80 (source: https://lab42.global/arc/)
-# HellaSwag human baseline is 0.95 (source: https://deepgram.com/learn/hellaswag-llm-benchmark-guide)
-# MMLU human baseline is 0.898 (source: https://openreview.net/forum?id=d7KBjmI3GmQ)
-# TruthfulQA human baseline is 0.94(source: https://arxiv.org/pdf/2109.07958.pdf)
-# Define the human baselines
-HUMAN_BASELINES = {
-    "Average ⬆️": 0.897 * 100,
-    "ARC": 0.80 * 100,
-    "HellaSwag": 0.95 * 100,
-    "MMLU": 0.898 * 100,
-    "TruthfulQA": 0.94 * 100,
-}
-
-
-def to_datetime(model_info: Tuple[str, Any]) -> datetime:
-    """
-    Converts the lastModified attribute of the object to datetime.
-
-    :param model_info: A tuple containing the name and object.
-        The object must have a lastModified attribute
-        with a string representing the date and time.
-    :return: A datetime object converted from the lastModified attribute of the input object.
-    """
-    name, obj = model_info
-    return datetime.strptime(obj.lastModified, "%Y-%m-%dT%H:%M:%S.%fZ").replace(tzinfo=timezone.utc)
-
-
-def join_model_info_with_results(results_df: pd.DataFrame) -> pd.DataFrame:
-    """
-    Integrates model information with the results DataFrame by matching 'Model sha'.
-    :param results_df: A DataFrame containing results information including 'Model sha' column.
-    :return: A DataFrame with updated 'Results Date' columns, which are synchronized with model information.
-    """
-    # copy dataframe to avoid modifying the original
-    df = results_df.copy(deep=True)
-
-    # Filter out FLAGGED_MODELS to ensure graph is not skewed by mistakes
-    df = df[~df["model_name_for_query"].isin(FLAGGED_MODELS.keys())].reset_index(drop=True)
-
-    # load cache from disk
-    try:
-        with open("model_info_cache.pkl", "rb") as f:
-            model_info_cache = pickle.load(f)
-    except (EOFError, FileNotFoundError):
-        model_info_cache = {}
 
-
-
-    # Define the date format string
-    date_format = "%Y-%m-%dT%H:%M:%S.%fZ"
-
-    # Iterate over sorted_dates and update the dataframe
-    for name, obj in sorted_dates:
-        # Convert the lastModified string to a datetime object
-        last_modified_datetime = datetime.strptime(obj.lastModified, date_format).replace(tzinfo=timezone.utc)
 
-        # Update the "Results Date" column where "Model sha" equals obj.sha
-        df.loc[df["Model sha"] == obj.sha, "Results Date"] = last_modified_datetime
-    return df
+from src.leaderboard.filter_models import FLAGGED_MODELS
+from src.display.utils import human_baseline_row as HUMAN_BASELINE, AutoEvalColumn, Tasks, Task, BENCHMARK_COLS
+from src.leaderboard.read_evals import EvalResult
 
 
-def create_scores_df(
+def create_scores_df(raw_data: list[EvalResult]) -> pd.DataFrame:
     """
-    Generates a DataFrame containing the maximum scores until each
+    Generates a DataFrame containing the maximum scores until each date.
 
-    :param results_df: A DataFrame containing result information including metric scores and
-    :return: A new DataFrame containing the maximum scores until each
+    :param results_df: A DataFrame containing result information including metric scores and dates.
+    :return: A new DataFrame containing the maximum scores until each date for every metric.
     """
-    # Step 1: Ensure '
-    results_df
-    results_df.
+    # Step 1: Ensure 'date' is in datetime format and sort the DataFrame by it
+    results_df = pd.DataFrame(raw_data)
+    #results_df["date"] = pd.to_datetime(results_df["date"], format="mixed", utc=True)
+    results_df.sort_values(by="date", inplace=True)
 
     # Step 2: Initialize the scores dictionary
-    scores = {
-        "Average ⬆️": [],
-        "ARC": [],
-        "HellaSwag": [],
-        "MMLU": [],
-        "TruthfulQA": [],
-        "Result Date": [],
-        "Model Name": [],
-    }
+    scores = {k: [] for k in BENCHMARK_COLS + [AutoEvalColumn.average.name]}
 
     # Step 3: Iterate over the rows of the DataFrame and update the scores dictionary
-    for
-
-
-
-            continue
-        if column == "Model Name":
-            scores[column].append(row["model_name_for_query"])
-            continue
-        current_max = scores[column][-1] if scores[column] else float("-inf")
-        scores[column].append(max(current_max, row[column]))
-
+    for task in [t.value for t in Tasks] + [Task("Average", "avg", AutoEvalColumn.average.name)]:
+        current_max = 0
+        last_date = ""
+        column = task.col_name
+        for _, row in results_df.iterrows():
+            current_model = row["full_model"]
+            if current_model in FLAGGED_MODELS:
+                continue
+
+            current_date = row["date"]
+            if task.benchmark == "Average":
+                current_score = np.mean(list(row["results"].values()))
+            else:
+                current_score = row["results"][task.benchmark]
+
+            if current_score > current_max:
+                if current_date == last_date and len(scores[column]) > 0:
+                    scores[column][-1] = {"model": current_model, "date": current_date, "score": current_score}
+                else:
+                    scores[column].append({"model": current_model, "date": current_date, "score": current_score})
+                current_max = current_score
+                last_date = current_date
 
-
+    # Step 4: Return all dictionaries as DataFrames
+    return {k: pd.DataFrame(v) for k, v in scores.items()}
+
+
+def create_plot_df(scores_df: dict[str: pd.DataFrame]) -> pd.DataFrame:
     """
     Transforms the scores DataFrame into a new format suitable for plotting.
 
-    :param scores_df: A DataFrame containing metric scores and
+    :param scores_df: A DataFrame containing metric scores and dates.
     :return: A new DataFrame reshaped for plotting purposes.
     """
-    # Sample columns
-    cols = ["Average ⬆️", "ARC", "HellaSwag", "MMLU", "TruthfulQA"]
-
     # Initialize the list to store DataFrames
     dfs = []
 
     # Iterate over the cols and create a new DataFrame for each column
-    for col in
-        d = scores_df[
-        d["
-        d.rename(columns={col: "Metric Value"}, inplace=True)
+    for col in BENCHMARK_COLS + [AutoEvalColumn.average.name]:
+        d = scores_df[col].reset_index(drop=True)
+        d["task"] = col
         dfs.append(d)
 
     # Concatenate all the created DataFrames
     concat_df = pd.concat(dfs, ignore_index=True)
 
-    # Sort values by '
-    concat_df.sort_values(by="
-    concat_df.reset_index(drop=True, inplace=True)
-
-    # Drop duplicates based on 'Metric Name' and 'Metric Value' and keep the first (earliest) occurrence
-    concat_df.drop_duplicates(subset=["Metric Name", "Metric Value"], keep="first", inplace=True)
-
+    # Sort values by 'date'
+    concat_df.sort_values(by="date", inplace=True)
     concat_df.reset_index(drop=True, inplace=True)
     return concat_df
 
 
 def create_metric_plot_obj(
-    df: pd.DataFrame, metrics:
+    df: pd.DataFrame, metrics: list[str], title: str
 ) -> Figure:
     """
     Create a Plotly figure object with lines representing different metrics
@@ -154,27 +87,25 @@ def create_metric_plot_obj(
     :param df: The DataFrame containing the metric values, names, and dates.
     :param metrics: A list of strings representing the names of the metrics
         to be included in the plot.
-    :param human_baselines: A dictionary where keys are metric names
-        and values are human baseline values for the metrics.
     :param title: A string representing the title of the plot.
     :return: A Plotly figure object with lines representing metrics and
         horizontal dotted lines representing human baselines.
     """
 
     # Filter the DataFrame based on the specified metrics
-    df = df[df["
+    df = df[df["task"].isin(metrics)]
 
     # Filter the human baselines based on the specified metrics
-    filtered_human_baselines = {k: v for k, v in
+    filtered_human_baselines = {k: v for k, v in HUMAN_BASELINE.items() if k in metrics}
 
     # Create a line figure using plotly express with specified markers and custom data
     fig = px.line(
         df,
-        x="
-        y="
-        color="
+        x="date",
+        y="score",
+        color="task",
         markers=True,
-        custom_data=["
+        custom_data=["task", "score", "model"],
         title=title,
     )
 
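With these changes the plotting pipeline runs straight off the EvalResult objects returned by the new get_leaderboard_df instead of re-querying the Hub, and the plot frame uses generic date, score and task columns. A sketch of how the three helpers chain together (the result and queue directories are placeholders; everything else comes from the modules shown above):

from src.display.utils import AutoEvalColumn, BENCHMARK_COLS, COLS
from src.populate import get_leaderboard_df
from src.tools.plots import create_metric_plot_obj, create_plot_df, create_scores_df

raw_data, _ = get_leaderboard_df("eval-results", "eval-queue", COLS, BENCHMARK_COLS)  # placeholder paths

scores_per_task = create_scores_df(raw_data)   # dict: column name -> best-score-so-far DataFrame
plot_df = create_plot_df(scores_per_task)      # long format with "date", "score" and "task" columns
fig = create_metric_plot_obj(
    plot_df,
    [AutoEvalColumn.average.name],             # e.g. plot only the running average
    title="Top scores over time",
)
fig.show()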