felix committed
Commit c5f708b · 2 Parent(s): 940ccca 2246286

Merge remote-tracking branch 'upstream/main'

README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 🏆
4
  colorFrom: green
5
  colorTo: indigo
6
  sdk: gradio
7
- sdk_version: 3.43.2
8
  app_file: app.py
9
  pinned: true
10
  license: apache-2.0
 
4
  colorFrom: green
5
  colorTo: indigo
6
  sdk: gradio
7
+ sdk_version: 4.4.0
8
  app_file: app.py
9
  pinned: true
10
  license: apache-2.0
app.py CHANGED
@@ -1,56 +1,59 @@
 
1
  import json
2
  import os
3
  from datetime import datetime, timezone
4
 
5
- import gradio as gr
6
  import pandas as pd
7
  from apscheduler.schedulers.background import BackgroundScheduler
8
  from huggingface_hub import snapshot_download
9
 
10
- from src.display.utils import (
11
- COLS,
12
- TYPES,
13
- BENCHMARK_COLS,
14
- EVAL_COLS,
15
- EVAL_TYPES,
16
- AutoEvalColumn,
17
- ModelType,
18
- NUMERIC_INTERVALS,
19
- fields,
20
- )
21
- from src.display.css_html_js import custom_css, get_window_url_params
22
  from src.display.about import (
23
  CITATION_BUTTON_LABEL,
24
  CITATION_BUTTON_TEXT,
25
  EVALUATION_QUEUE_TEXT,
26
  INTRODUCTION_TEXT,
27
  LLM_BENCHMARKS_TEXT,
 
28
  TITLE,
29
  )
 
 
30
  from src.tools.plots import (
31
  create_metric_plot_obj,
32
- create_scores_df,
33
  create_plot_df,
34
- join_model_info_with_results,
35
- HUMAN_BASELINES,
36
  )
37
- from src.tools.collections import update_collections
38
- from src.populate import get_evaluation_queue_df, get_leaderboard_df
39
- from src.envs import H4_TOKEN, QUEUE_REPO, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, RESULTS_REPO, API, REPO_ID, IS_PUBLIC
40
- from src.submission.submit import add_new_eval
41
 
42
 
43
  def restart_space():
44
  API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
45
 
46
-
47
  try:
 
48
  snapshot_download(
49
  repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
50
  )
51
  except Exception:
52
  restart_space()
53
  try:
 
54
  snapshot_download(
55
  repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
56
  )
@@ -58,13 +61,11 @@ except Exception:
58
  restart_space()
59
 
60
 
61
- original_df = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS)
62
  update_collections(original_df.copy())
63
  leaderboard_df = original_df.copy()
64
 
65
- #models = original_df["model_name_for_query"].tolist() # needed for model backlinks in their to the leaderboard
66
- # plot_df = create_plot_df(create_scores_df(join_model_info_with_results(original_df)))
67
- #to_be_dumped = f"models = {repr(models)}\n"
68
 
69
  (
70
  finished_eval_queue_df,
@@ -73,26 +74,15 @@ leaderboard_df = original_df.copy()
73
  ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
74
 
75
 
76
- # Basics
77
- def change_tab(query_param: str):
78
- query_param = query_param.replace("'", '"')
79
- query_param = json.loads(query_param)
80
-
81
- if isinstance(query_param, dict) and "tab" in query_param and query_param["tab"] == "evaluation":
82
- return gr.Tabs.update(selected=1)
83
- else:
84
- return gr.Tabs.update(selected=0)
85
-
86
-
87
  # Searching and filtering
88
  def update_table(
89
- hidden_df: pd.DataFrame,
90
- columns: list,
91
- type_query: list,
92
- precision_query: str,
93
- size_query: list,
94
- show_deleted: bool,
95
- query: str,
96
  ):
97
  filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted)
98
  filtered_df = filter_queries(query, filtered_df)
@@ -112,7 +102,7 @@ def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
112
  # We use COLS to maintain sorting
113
  filtered_df = df[
114
  always_here_cols + [c for c in COLS if c in df.columns and c in columns] + [AutoEvalColumn.dummy.name]
115
- ]
116
  return filtered_df
117
 
118
 
@@ -137,7 +127,7 @@ def filter_queries(query: str, filtered_df: pd.DataFrame):
137
 
138
 
139
  def filter_models(
140
- df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, show_deleted: bool
141
  ) -> pd.DataFrame:
142
  # Show all models
143
  if show_deleted:
@@ -146,8 +136,8 @@ def filter_models(
146
  filtered_df = df[df[AutoEvalColumn.still_on_hub.name] == True]
147
 
148
  type_emoji = [t[0] for t in type_query]
149
- filtered_df = filtered_df[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
150
- filtered_df = filtered_df[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
151
 
152
  numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
153
  params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
@@ -478,7 +468,7 @@ dummy1 = gr.Textbox(visible=False)
478
  hidden_leaderboard_table_for_search = gr.components.Dataframe(
479
  headers=COLS,
480
  datatype=TYPES,
481
- max_rows=None,
482
  visible=False,
483
  )
484
 
 
1
+ import gradio as gr
2
  import json
3
  import os
4
  from datetime import datetime, timezone
5
 
 
6
  import pandas as pd
7
  from apscheduler.schedulers.background import BackgroundScheduler
8
  from huggingface_hub import snapshot_download
9
 
 
 
10
  from src.display.about import (
11
  CITATION_BUTTON_LABEL,
12
  CITATION_BUTTON_TEXT,
13
  EVALUATION_QUEUE_TEXT,
14
  INTRODUCTION_TEXT,
15
  LLM_BENCHMARKS_TEXT,
16
+ FAQ_TEXT,
17
  TITLE,
18
  )
19
+ from src.display.css_html_js import custom_css
20
+ from src.display.utils import (
21
+ BENCHMARK_COLS,
22
+ COLS,
23
+ EVAL_COLS,
24
+ EVAL_TYPES,
25
+ NUMERIC_INTERVALS,
26
+ TYPES,
27
+ AutoEvalColumn,
28
+ ModelType,
29
+ fields,
30
+ WeightType,
31
+ Precision
32
+ )
33
+ from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, H4_TOKEN, IS_PUBLIC, QUEUE_REPO, REPO_ID, RESULTS_REPO
34
+ from src.populate import get_evaluation_queue_df, get_leaderboard_df
35
+ from src.submission.submit import add_new_eval
36
+ from src.submission.check_validity import already_submitted_models
37
+ from src.tools.collections import update_collections
38
  from src.tools.plots import (
39
  create_metric_plot_obj,
 
40
  create_plot_df,
41
+ create_scores_df,
 
42
  )
 
 
 
 
43
 
44
 
45
  def restart_space():
46
  API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
47
 
 
48
  try:
49
+ print(EVAL_REQUESTS_PATH)
50
  snapshot_download(
51
  repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
52
  )
53
  except Exception:
54
  restart_space()
55
  try:
56
+ print(EVAL_RESULTS_PATH)
57
  snapshot_download(
58
  repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
59
  )
 
61
  restart_space()
62
 
63
 
64
+ raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
65
  update_collections(original_df.copy())
66
  leaderboard_df = original_df.copy()
67
 
68
+ plot_df = create_plot_df(create_scores_df(raw_data))
 
 
69
 
70
  (
71
  finished_eval_queue_df,
 
74
  ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
75
 
76
 
 
77
  # Searching and filtering
78
  def update_table(
79
+ hidden_df: pd.DataFrame,
80
+ columns: list,
81
+ type_query: list,
82
+ precision_query: str,
83
+ size_query: list,
84
+ show_deleted: bool,
85
+ query: str,
86
  ):
87
  filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted)
88
  filtered_df = filter_queries(query, filtered_df)
 
102
  # We use COLS to maintain sorting
103
  filtered_df = df[
104
  always_here_cols + [c for c in COLS if c in df.columns and c in columns] + [AutoEvalColumn.dummy.name]
105
+ ]
106
  return filtered_df
107
 
108
 
 
127
 
128
 
129
  def filter_models(
130
+ df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, show_deleted: bool
131
  ) -> pd.DataFrame:
132
  # Show all models
133
  if show_deleted:
 
136
  filtered_df = df[df[AutoEvalColumn.still_on_hub.name] == True]
137
 
138
  type_emoji = [t[0] for t in type_query]
139
+ filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
140
+ filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
141
 
142
  numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
143
  params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
 
468
  hidden_leaderboard_table_for_search = gr.components.Dataframe(
469
  headers=COLS,
470
  datatype=TYPES,
471
+ max_rows=None,
472
  visible=False,
473
  )
474
 
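As a side note on the `.loc` change in `filter_models` above, here is a minimal, hypothetical sketch (made-up columns, not the leaderboard's real dataframe) of why masks computed on the original dataframe still index the already-filtered copy correctly:

```python
# Hypothetical sketch: boolean masks built from the original df, applied with .loc
# to an already-filtered copy, as filter_models does after this change.
import pandas as pd

df = pd.DataFrame(
    {
        "T": ["🟢", "🔶", "🟦", "🔶"],
        "Precision": ["float16", "bfloat16", "float16", "4bit"],
        "Available on the hub": [True, True, False, True],
    }
)

# First filter: keep only models still on the hub.
filtered_df = df[df["Available on the hub"] == True]

# Later filters reuse masks computed on the *original* df. With .loc, pandas aligns
# the mask on index labels, so a mask covering the full df can safely index the
# smaller filtered_df (its index is a subset of the mask's index).
type_emoji = ["🟢", "🔶"]
filtered_df = filtered_df.loc[df["T"].isin(type_emoji)]
filtered_df = filtered_df.loc[df["Precision"].isin(["float16", "None"])]

print(filtered_df)  # only the 🟢 float16 row survives
```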
requirements.txt CHANGED
@@ -13,8 +13,8 @@ pandas==2.0.0
13
  plotly==5.14.1
14
  python-dateutil==2.8.2
15
  requests==2.28.2
 
16
  semantic-version==2.10.0
17
  tqdm==4.65.0
18
- git+https://github.com/clefourrier/transformers.git@req-fix#egg=transformers
19
- #transformers==4.35.1
20
- tokenizers>=0.15.0
 
13
  plotly==5.14.1
14
  python-dateutil==2.8.2
15
  requests==2.28.2
16
+ sentencepiece
17
  semantic-version==2.10.0
18
  tqdm==4.65.0
19
+ transformers==4.35.2
20
+ tokenizers>=0.15.0
 
scripts/create_request_file.py CHANGED
@@ -1,11 +1,12 @@
1
- from datetime import datetime, timezone
2
  import json
3
  import os
 
4
  import re
 
 
5
  import click
6
- from huggingface_hub import HfApi, snapshot_download
7
  from colorama import Fore
8
- import pprint
9
 
10
  EVAL_REQUESTS_PATH = "eval-queue"
11
  QUEUE_REPO = "open-llm-leaderboard/requests"
@@ -19,7 +20,7 @@ def get_model_size(model_info, precision: str):
19
  size_pattern = size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
20
  try:
21
  model_size = round(model_info.safetensors["total"] / 1e9, 3)
22
- except AttributeError:
23
  try:
24
  size_match = re.search(size_pattern, model_info.modelId.lower())
25
  model_size = size_match.group(0)
 
 
1
  import json
2
  import os
3
+ import pprint
4
  import re
5
+ from datetime import datetime, timezone
6
+
7
  import click
 
8
  from colorama import Fore
9
+ from huggingface_hub import HfApi, snapshot_download
10
 
11
  EVAL_REQUESTS_PATH = "eval-queue"
12
  QUEUE_REPO = "open-llm-leaderboard/requests"
 
20
  size_pattern = size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
21
  try:
22
  model_size = round(model_info.safetensors["total"] / 1e9, 3)
23
+ except (AttributeError, TypeError):
24
  try:
25
  size_match = re.search(size_pattern, model_info.modelId.lower())
26
  model_size = size_match.group(0)
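For context on the widened `except (AttributeError, TypeError)` above: a simplified, hypothetical sketch of the size-lookup fallback. The `guess_model_size` name and the stand-in `model_info` object are illustrative, not the script's actual `get_model_size`:

```python
import re

size_pattern = re.compile(r"(\d\.)?\d+(b|m)")

def guess_model_size(model_info) -> float:
    """Return the parameter count in billions, falling back to the model id."""
    try:
        # Preferred path: exact parameter count from the hub's safetensors metadata.
        return round(model_info.safetensors["total"] / 1e9, 3)
    except (AttributeError, TypeError):  # attribute missing, or safetensors is None
        # Fallback: guess from the model id, e.g. "llama-2-7b" -> "7b".
        size_match = re.search(size_pattern, model_info.modelId.lower())
        if size_match is None:
            return 0.0
        size = size_match.group(0)
        value = float(size[:-1])
        return value if size[-1] == "b" else round(value / 1e3, 3)
```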
src/display/about.py CHANGED
@@ -13,20 +13,9 @@ LLM_BENCHMARKS_TEXT = f"""
13
  # Context
14
  With the plethora of large language models (LLMs) and chatbots being released week upon week, often with grandiose claims of their performance, it can be hard to filter out the genuine progress that is being made by the open-source community and which model is the current state of the art.
15
 
16
- ## Icons
17
- {ModelType.PT.to_str(" : ")} model: new, base models, trained on a given corpora
18
- {ModelType.FT.to_str(" : ")} model: pretrained models finetuned on more data
19
- Specific fine-tune subcategories (more adapted to chat):
20
- {ModelType.IFT.to_str(" : ")} model: instruction fine-tunes, which are model fine-tuned specifically on datasets of task instruction
21
- {ModelType.RL.to_str(" : ")} model: reinforcement fine-tunes, which usually change the model loss a bit with an added policy.
22
- If there is no icon, we have not uploaded the information on the model yet, feel free to open an issue with the model information!
23
-
24
- "Flagged" indicates that this model has been flagged by the community, and should probably be ignored! Clicking the link will redirect you to the discussion about the model.
25
- (For ex, the model was trained on the evaluation data, and is therefore cheating on the leaderboard.)
26
-
27
  ## How it works
28
 
29
- 📈 We evaluate models on 4 key benchmarks using the <a href="https://github.com/EleutherAI/lm-evaluation-harness" target="_blank"> Eleuther AI Language Model Evaluation Harness </a>, a unified framework to test generative language models on a large number of different evaluation tasks.
30
 
31
  - <a href="https://arxiv.org/abs/1803.05457" target="_blank"> AI2 Reasoning Challenge </a> (25-shot) - a set of grade-school science questions.
32
  - <a href="https://arxiv.org/abs/1905.07830" target="_blank"> HellaSwag </a> (10-shot) - a test of commonsense inference, which is easy for humans (~95%) but challenging for SOTA models.
@@ -34,7 +23,6 @@ If there is no icon, we have not uploaded the information on the model yet, feel
34
  - <a href="https://arxiv.org/abs/2109.07958" target="_blank"> TruthfulQA </a> (0-shot) - a test to measure a model's propensity to reproduce falsehoods commonly found online. Note: TruthfulQA in the Harness is in practice a minimum of a 6-shot task, as it is systematically prepended with 6 examples, even when launched using 0 for the number of few-shot examples.
35
  - <a href="https://arxiv.org/abs/1907.10641" target="_blank"> Winogrande </a> (5-shot) - an adversarial and difficult Winograd benchmark at scale, for commonsense reasoning.
36
  - <a href="https://arxiv.org/abs/2110.14168" target="_blank"> GSM8k </a> (5-shot) - diverse grade school math word problems to measure a model's ability to solve multi-step mathematical reasoning problems.
37
- - <a href="https://arxiv.org/abs/1903.00161" target="_blank"> DROP </a> (3-shot) - English reading comprehension benchmark requiring Discrete Reasoning Over the content of Paragraphs.
38
 
39
  For all these evaluations, a higher score is a better score.
40
  We chose these benchmarks as they test a variety of reasoning and general knowledge across a wide variety of fields in 0-shot and few-shot settings.
@@ -47,10 +35,10 @@ You can find:
47
 
48
  ## Reproducibility
49
  To reproduce our results, here is the commands you can run, using [this version](https://github.com/EleutherAI/lm-evaluation-harness/tree/b281b0921b636bc36ad05c0b0b0763bd6dd43463) of the Eleuther AI Harness:
50
- `python main.py --model=hf-causal --model_args="pretrained=<your_model>,use_accelerate=True,revision=<your_model_revision>"`
51
- ` --tasks=<task_list> --num_fewshot=<n_few_shot> --batch_size=2 --output_path=<output_path>`
52
 
53
- The total batch size we get for models which fit on one A100 node is 16 (8 GPUs * 2). If you don't use parallelism, adapt your batch size to fit.
54
  *You can expect results to vary slightly for different batch sizes because of padding.*
55
 
56
  The tasks and few shots parameters are:
@@ -60,29 +48,95 @@ The tasks and few shots parameters are:
60
  - MMLU: 5-shot, *hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions* (average of all the results `acc`)
61
  - Winogrande: 5-shot, *winogrande* (`acc`)
62
  - GSM8k: 5-shot, *gsm8k* (`acc`)
63
- - DROP: 3-shot, *drop* (`f1`)
64
 
65
  Side note on the baseline scores:
66
  - for log-likelihood evaluation, we select the random baseline
67
- - for DROP, we select the best submission score according to [their leaderboard](https://leaderboard.allenai.org/drop/submissions/public) when the paper came out (NAQANet score)
68
- - for GSM8K, we select the score obtained in the paper after inetuning a 6B model on the full GSM8K training set for 50 epochs
 
 
 
69
 
70
  ## Quantization
71
  To get more information about quantization, see:
72
  - 8 bits: [blog post](https://huggingface.co/blog/hf-bitsandbytes-integration), [paper](https://arxiv.org/abs/2208.07339)
73
  - 4 bits: [blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes), [paper](https://arxiv.org/abs/2305.14314)
74
 
75
- ## More resources
76
- If you still have questions, you can check our FAQ [here](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/179)!
77
- We also gather cool resources from the community, other teams, and other labs [here](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/174)!
78
  """
79
 
 
 
 
 
80
  EVALUATION_QUEUE_TEXT = """
81
  # Evaluation Queue for the 🤗 Open LLM Leaderboard
82
 
83
  Models added here will be automatically evaluated on the 🤗 cluster.
84
 
85
- ## Some good practices before submitting a model
86
 
87
  ### 1) Make sure you can load your model and tokenizer using AutoClasses:
88
  ```python
@@ -205,17 +259,4 @@ CITATION_BUTTON_TEXT = r"""
205
  archivePrefix={arXiv},
206
  primaryClass={cs.CL}
207
  }
208
- @misc{DBLP:journals/corr/abs-1903-00161,
209
- title={{DROP:} {A} Reading Comprehension Benchmark Requiring Discrete Reasoning
210
- Over Paragraphs},
211
- author={Dheeru Dua and
212
- Yizhong Wang and
213
- Pradeep Dasigi and
214
- Gabriel Stanovsky and
215
- Sameer Singh and
216
- Matt Gardner},
217
- year={2019},
218
- eprinttype={arXiv},
219
- eprint={1903.00161},
220
- primaryClass={cs.CL}
221
- }"""
 
13
  # Context
14
  With the plethora of large language models (LLMs) and chatbots being released week upon week, often with grandiose claims of their performance, it can be hard to filter out the genuine progress that is being made by the open-source community and which model is the current state of the art.
15
 
 
 
16
  ## How it works
17
 
18
+ 📈 We evaluate models on 7 key benchmarks using the <a href="https://github.com/EleutherAI/lm-evaluation-harness" target="_blank"> Eleuther AI Language Model Evaluation Harness </a>, a unified framework to test generative language models on a large number of different evaluation tasks.
19
 
20
  - <a href="https://arxiv.org/abs/1803.05457" target="_blank"> AI2 Reasoning Challenge </a> (25-shot) - a set of grade-school science questions.
21
  - <a href="https://arxiv.org/abs/1905.07830" target="_blank"> HellaSwag </a> (10-shot) - a test of commonsense inference, which is easy for humans (~95%) but challenging for SOTA models.
 
23
  - <a href="https://arxiv.org/abs/2109.07958" target="_blank"> TruthfulQA </a> (0-shot) - a test to measure a model's propensity to reproduce falsehoods commonly found online. Note: TruthfulQA in the Harness is in practice a minimum of a 6-shot task, as it is systematically prepended with 6 examples, even when launched using 0 for the number of few-shot examples.
24
  - <a href="https://arxiv.org/abs/1907.10641" target="_blank"> Winogrande </a> (5-shot) - an adversarial and difficult Winograd benchmark at scale, for commonsense reasoning.
25
  - <a href="https://arxiv.org/abs/2110.14168" target="_blank"> GSM8k </a> (5-shot) - diverse grade school math word problems to measure a model's ability to solve multi-step mathematical reasoning problems.
 
26
 
27
  For all these evaluations, a higher score is a better score.
28
  We chose these benchmarks as they test a variety of reasoning and general knowledge across a wide variety of fields in 0-shot and few-shot settings.
 
35
 
36
  ## Reproducibility
37
  To reproduce our results, here is the commands you can run, using [this version](https://github.com/EleutherAI/lm-evaluation-harness/tree/b281b0921b636bc36ad05c0b0b0763bd6dd43463) of the Eleuther AI Harness:
38
+ `python main.py --model=hf-causal-experimental --model_args="pretrained=<your_model>,use_accelerate=True,revision=<your_model_revision>"`
39
+ ` --tasks=<task_list> --num_fewshot=<n_few_shot> --batch_size=1 --output_path=<output_path>`
40
 
41
+ The total batch size we get for models which fit on one A100 node is 8 (8 GPUs * 1). If you don't use parallelism, adapt your batch size to fit.
42
  *You can expect results to vary slightly for different batch sizes because of padding.*
43
 
44
  The tasks and few shots parameters are:
 
48
  - MMLU: 5-shot, *hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions* (average of all the results `acc`)
49
  - Winogrande: 5-shot, *winogrande* (`acc`)
50
  - GSM8k: 5-shot, *gsm8k* (`acc`)
 
51
 
52
  Side note on the baseline scores:
53
  - for log-likelihood evaluation, we select the random baseline
54
+ - for GSM8K, we select the score obtained in the paper after finetuning a 6B model on the full GSM8K training set for 50 epochs
55
+
56
+ ## Icons
57
+ - {ModelType.PT.to_str(" : ")} model: new, base models, trained on a given corpus
58
+ - {ModelType.FT.to_str(" : ")} model: pretrained models finetuned on more data
59
+ Specific fine-tune subcategories (more adapted to chat):
60
+ - {ModelType.IFT.to_str(" : ")} model: instruction fine-tunes, which are models fine-tuned specifically on datasets of task instructions
61
+ - {ModelType.RL.to_str(" : ")} model: reinforcement fine-tunes, which usually change the model loss a bit with an added policy.
62
+ If there is no icon, we have not uploaded the information on the model yet, feel free to open an issue with the model information!
63
+
64
+ "Flagged" indicates that this model has been flagged by the community, and should probably be ignored! Clicking the link will redirect you to the discussion about the model.
65
 
66
  ## Quantization
67
  To get more information about quantization, see:
68
  - 8 bits: [blog post](https://huggingface.co/blog/hf-bitsandbytes-integration), [paper](https://arxiv.org/abs/2208.07339)
69
  - 4 bits: [blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes), [paper](https://arxiv.org/abs/2305.14314)
70
 
71
+ ## Useful links
72
+ - [Community resources](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/174)
73
+ - [Collection of best models](https://huggingface.co/collections/open-llm-leaderboard/llm-leaderboard-best-models-652d6c7965a4619fb5c27a03)
74
  """
75
 
76
+ FAQ_TEXT = """
77
+ ---------------------------
78
+ # FAQ
79
+ Below are some common questions - if this FAQ does not answer your question, feel free to create a new issue, and we'll take care of it as soon as we can!
80
+
81
+ ## 1) Submitting a model
82
+ My model requires `trust_remote_code=True`, can I submit it?
83
+ - *We only support models that have been integrated into a stable version of the `transformers` library for automatic submission, as we don't want to run possibly unsafe code on our cluster.*
84
+
85
+ What about models of type X?
86
+ - *We only support models that have been integrated in a stable version of the `transformers` library for automatic submission.*
87
+
88
+ How can I follow when my model is launched?
89
+ - *You can look for its request file [here](https://huggingface.co/datasets/open-llm-leaderboard/requests) and follow the status evolution, or directly in the queues above the submit form.*
90
+
91
+ My model disappeared from all the queues, what happened?
92
+ - *A model disappearing from all the queues usually means that there has been a failure. You can check if that is the case by looking for your model [here](https://huggingface.co/datasets/open-llm-leaderboard/requests).*
93
+
94
+ What causes an evaluation failure?
95
+ - *Most of the failures we get come from problems in the submissions (corrupted files, config problems, wrong parameters selected for eval ...), so we'll be grateful if you first make sure you have followed the steps in `About`. However, from time to time, we have failures on our side (hardware/node failures, problem with an update of our backend, connectivity problem ending up in the results not being saved, ...).*
96
+
97
+ How can I report an evaluation failure?
98
+ - *As we store the logs for all models, feel free to create an issue, **where you link to the requests file of your model** (look for it [here](https://huggingface.co/datasets/open-llm-leaderboard/requests/tree/main)), so we can investigate! If the model failed due to a problem on our side, we'll relaunch it right away!*
99
+ *Note: Please do not re-upload your model under a different name, it will not help*
100
+
101
+ ## 2) Model results
102
+ What kind of information can I find?
103
+ - *Let's imagine you are interested in the Yi-34B results. You have access to 3 different information categories:*
104
+ - *The [request file](https://huggingface.co/datasets/open-llm-leaderboard/requests/blob/main/01-ai/Yi-34B_eval_request_False_bfloat16_Original.json): it gives you information about the status of the evaluation*
105
+ - *The [aggregated results folder](https://huggingface.co/datasets/open-llm-leaderboard/results/tree/main/01-ai/Yi-34B): it gives you aggregated scores, per experimental run*
106
+ - *The [details dataset](https://huggingface.co/datasets/open-llm-leaderboard/details_01-ai__Yi-34B/tree/main): it gives you the full details (scores and examples for each task and a given model)*
107
+
108
+
109
+ Why do models appear several times in the leaderboard?
110
+ - *We run evaluations with user-selected precision and model commit. Sometimes, users submit specific models at different commits and at different precisions (for example, in float16 and 4bit to see how quantization affects performance). You should be able to verify this by displaying the `precision` and `model sha` columns in the display. If, however, you see models appearing several times with the same precision and commit hash, this is not normal.*
111
+
112
+ What is this concept of "flagging"?
113
+ - *This mechanism allows users to report models that have unfair performance on the leaderboard. This covers several categories: exceedingly good results on the leaderboard because the model was (maybe accidentally) trained on the evaluation data, models that are copies of other models without proper attribution, etc.*
114
+
115
+ My model has been flagged improperly, what can I do?
116
+ - *Every flagged model has a discussion associated with it - feel free to plead your case there, and we'll see what to do together with the community.*
117
+
118
+ ## 3) Editing a submission
119
+ I upgraded my model and want to re-submit, how can I do that?
120
+ - *Please open an issue with the precise name of your model, and we'll remove your model from the leaderboard so you can resubmit. You can also resubmit directly with the new commit hash!*
121
+
122
+ I need to rename my model, how can I do that?
123
+ - *You can use @Weyaxi 's [super cool tool](https://huggingface.co/spaces/Weyaxi/open-llm-leaderboard-renamer) to request model name changes, then open a discussion where you link to the created pull request, and we'll check them and merge them as needed.*
124
+
125
+ ## 4) Other
126
+ Why don't you display closed source model scores?
127
+ - *This is a leaderboard for Open models, both for philosophical reasons (openness is cool) and for practical reasons: we want to ensure that the results we display are accurate and reproducible, but 1) commercial closed models can change their API, rendering any scoring done at a given time incorrect, and 2) we re-run everything on our cluster to ensure all models are run on the same setup, which you can't do for these models.*
128
+
129
+ I have an issue accessing the leaderboard through the Gradio API
130
+ - *Since this is not the recommended way to access the leaderboard, we won't provide support for this, but you can look at tools provided by the community for inspiration!*
131
+ """
132
+
133
+
134
  EVALUATION_QUEUE_TEXT = """
135
  # Evaluation Queue for the 🤗 Open LLM Leaderboard
136
 
137
  Models added here will be automatically evaluated on the 🤗 cluster.
138
 
139
+ ## First steps before submitting a model
140
 
141
  ### 1) Make sure you can load your model and tokenizer using AutoClasses:
142
  ```python
 
259
  archivePrefix={arXiv},
260
  primaryClass={cs.CL}
261
  }
262
+ """
 
 
src/display/formatting.py CHANGED
@@ -1,24 +1,11 @@
1
  import os
2
- from huggingface_hub import HfApi
3
-
4
- API = HfApi()
5
 
6
- LLAMAS = [
7
- "huggingface/llama-7b",
8
- "huggingface/llama-13b",
9
- "huggingface/llama-30b",
10
- "huggingface/llama-65b",
11
- ]
12
 
13
- KOALA_LINK = "https://huggingface.co/TheBloke/koala-13B-HF"
14
- VICUNA_LINK = "https://huggingface.co/lmsys/vicuna-13b-delta-v1.1"
15
- OASST_LINK = "https://huggingface.co/OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5"
16
- DOLLY_LINK = "https://huggingface.co/databricks/dolly-v2-12b"
17
- MODEL_PAGE = "https://huggingface.co/models"
18
- LLAMA_LINK = "https://ai.facebook.com/blog/large-language-model-llama-meta-ai/"
19
- VICUNA_LINK = "https://huggingface.co/CarperAI/stable-vicuna-13b-delta"
20
- ALPACA_LINK = "https://crfm.stanford.edu/2023/03/13/alpaca.html"
21
 
 
22
 
23
  def model_hyperlink(link, model_name):
24
  return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
@@ -27,44 +14,9 @@ def model_hyperlink(link, model_name):
27
  def make_clickable_model(model_name):
28
  link = f"https://huggingface.co/{model_name}"
29
 
30
- if model_name in LLAMAS:
31
- link = LLAMA_LINK
32
- model_name = model_name.split("/")[1]
33
- elif model_name == "HuggingFaceH4/stable-vicuna-13b-2904":
34
- link = VICUNA_LINK
35
- model_name = "stable-vicuna-13b"
36
- elif model_name == "HuggingFaceH4/llama-7b-ift-alpaca":
37
- link = ALPACA_LINK
38
- model_name = "alpaca-13b"
39
- if model_name == "dolly-12b":
40
- link = DOLLY_LINK
41
- elif model_name == "vicuna-13b":
42
- link = VICUNA_LINK
43
- elif model_name == "koala-13b":
44
- link = KOALA_LINK
45
- elif model_name == "oasst-12b":
46
- link = OASST_LINK
47
-
48
  details_model_name = model_name.replace("/", "__")
49
  details_link = f"https://huggingface.co/datasets/open-llm-leaderboard/details_{details_model_name}"
50
 
51
- if not bool(os.getenv("DEBUG", "False")):
52
- # We only add these checks when not debugging, as they are extremely slow
53
- print(f"details_link: {details_link}")
54
- try:
55
- check_path = list(
56
- API.list_files_info(
57
- repo_id=f"open-llm-leaderboard/details_{details_model_name}",
58
- paths="README.md",
59
- repo_type="dataset",
60
- )
61
- )
62
- print(f"check_path: {check_path}")
63
- except Exception as err:
64
- # No details repo for this model
65
- print(f"No details repo for this model: {err}")
66
- return model_hyperlink(link, model_name)
67
-
68
  return model_hyperlink(link, model_name) + " " + model_hyperlink(details_link, "📑")
69
 
70
 
 
1
  import os
2
+ from datetime import datetime, timezone
 
 
3
 
4
+ from huggingface_hub import HfApi
5
+ from huggingface_hub.hf_api import ModelInfo
 
 
 
 
6
 
 
 
7
 
8
+ API = HfApi()
9
 
10
  def model_hyperlink(link, model_name):
11
  return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
 
14
  def make_clickable_model(model_name):
15
  link = f"https://huggingface.co/{model_name}"
16
 
 
 
17
  details_model_name = model_name.replace("/", "__")
18
  details_link = f"https://huggingface.co/datasets/open-llm-leaderboard/details_{details_model_name}"
19
 
 
 
20
  return model_hyperlink(link, model_name) + " " + model_hyperlink(details_link, "📑")
21
 
22
 
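With the special-casing removed, `make_clickable_model` always emits two links: the model page and its `details_` dataset. A small, self-contained usage sketch reusing the two helpers shown in the diff (the example model name is simply the Yi-34B repo referenced in the FAQ above):

```python
def model_hyperlink(link, model_name):
    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'

def make_clickable_model(model_name):
    link = f"https://huggingface.co/{model_name}"
    details_model_name = model_name.replace("/", "__")
    details_link = f"https://huggingface.co/datasets/open-llm-leaderboard/details_{details_model_name}"
    return model_hyperlink(link, model_name) + " " + model_hyperlink(details_link, "📑")

# Renders as a link to https://huggingface.co/01-ai/Yi-34B followed by a 📑 link to
# https://huggingface.co/datasets/open-llm-leaderboard/details_01-ai__Yi-34B
print(make_clickable_model("01-ai/Yi-34B"))
```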
src/display/utils.py CHANGED
@@ -1,7 +1,25 @@
1
- from dataclasses import dataclass
2
- import pandas as pd
3
  from enum import Enum
4
 
 
 
 
5
 
6
  # These classes are for user facing column names,
7
  # to avoid having to change them all around the code
@@ -15,35 +33,29 @@ class ColumnContent:
15
  never_hidden: bool = False
16
  dummy: bool = False
17
 
18
-
19
- def fields(raw_class):
20
- return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
21
-
22
-
23
- @dataclass(frozen=True)
24
- class AutoEvalColumn: # Auto evals column
25
- model_type_symbol = ColumnContent("T", "str", True, never_hidden=True)
26
- model = ColumnContent("Model", "markdown", True, never_hidden=True)
27
- average = ColumnContent("Average ⬆️", "number", True)
28
- arc = ColumnContent("ARC", "number", True)
29
- hellaswag = ColumnContent("HellaSwag", "number", True)
30
- mmlu = ColumnContent("MMLU", "number", True)
31
- truthfulqa = ColumnContent("TruthfulQA", "number", True)
32
- winogrande = ColumnContent("Winogrande", "number", True)
33
- gsm8k = ColumnContent("GSM8K", "number", True)
34
- drop = ColumnContent("DROP", "number", True)
35
- model_type = ColumnContent("Type", "str", False)
36
- weight_type = ColumnContent("Weight type", "str", False, True)
37
- precision = ColumnContent("Precision", "str", False) # , True)
38
- license = ColumnContent("Hub License", "str", False)
39
- params = ColumnContent("#Params (B)", "number", False)
40
- likes = ColumnContent("Hub ❤️", "number", False)
41
- still_on_hub = ColumnContent("Available on the hub", "bool", False)
42
- revision = ColumnContent("Model sha", "str", False, False)
43
- dummy = ColumnContent(
44
- "model_name_for_query", "str", False, dummy=True
45
- ) # dummy col to implement search bar (hidden by custom CSS)
46
-
47
 
48
  @dataclass(frozen=True)
49
  class EvalQueueColumn: # Queue column
@@ -59,31 +71,52 @@ baseline_row = {
59
  AutoEvalColumn.model.name: "<p>Baseline</p>",
60
  AutoEvalColumn.revision.name: "N/A",
61
  AutoEvalColumn.precision.name: None,
62
- AutoEvalColumn.average.name: 25.0,
63
  AutoEvalColumn.arc.name: 25.0,
64
  AutoEvalColumn.hellaswag.name: 25.0,
65
  AutoEvalColumn.mmlu.name: 25.0,
66
  AutoEvalColumn.truthfulqa.name: 25.0,
67
  AutoEvalColumn.winogrande.name: 50.0,
68
  AutoEvalColumn.gsm8k.name: 0.21,
69
- AutoEvalColumn.drop.name: 0.47,
70
  AutoEvalColumn.dummy.name: "baseline",
71
  AutoEvalColumn.model_type.name: "",
72
  }
73
 
 
 
 
74
 
75
  @dataclass
76
- class ModelInfo:
77
  name: str
78
- symbol: str # emoji
79
 
80
 
81
  class ModelType(Enum):
82
- PT = ModelInfo(name="pretrained", symbol="🟢")
83
- FT = ModelInfo(name="fine-tuned", symbol="🔶")
84
- IFT = ModelInfo(name="instruction-tuned", symbol="⭕")
85
- RL = ModelInfo(name="RL-tuned", symbol="🟦")
86
- Unknown = ModelInfo(name="", symbol="?")
87
 
88
  def to_str(self, separator=" "):
89
  return f"{self.value.symbol}{separator}{self.value.name}"
@@ -100,22 +133,33 @@ class ModelType(Enum):
100
  return ModelType.IFT
101
  return ModelType.Unknown
102
 
 
 
103
 
104
- @dataclass
105
- class Task:
106
- benchmark: str
107
- metric: str
108
- col_name: str
109
-
110
-
111
- class Tasks(Enum):
112
- arc = Task("arc:challenge", "acc_norm", AutoEvalColumn.arc.name)
113
- hellaswag = Task("hellaswag", "acc_norm", AutoEvalColumn.hellaswag.name)
114
- mmlu = Task("hendrycksTest", "acc", AutoEvalColumn.mmlu.name)
115
- truthfulqa = Task("truthfulqa:mc", "mc2", AutoEvalColumn.truthfulqa.name)
116
- winogrande = Task("winogrande", "acc", AutoEvalColumn.winogrande.name)
117
- gsm8k = Task("gsm8k", "acc", AutoEvalColumn.gsm8k.name)
118
- drop = Task("drop", "f1", AutoEvalColumn.drop.name)
119
 
120
 
121
  # Column selection
@@ -127,7 +171,7 @@ TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default a
127
  EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
128
  EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
129
 
130
- BENCHMARK_COLS = [t.value.col_name for t in Tasks if t.value.col_name in fields(AutoEvalColumn)]
131
 
132
  NUMERIC_INTERVALS = {
133
  "?": pd.Interval(-1, 0, closed="right"),
 
1
+ from dataclasses import dataclass, make_dataclass
 
2
  from enum import Enum
3
 
4
+ import pandas as pd
5
+
6
+ def fields(raw_class):
7
+ return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
8
+
9
+
10
+ @dataclass
11
+ class Task:
12
+ benchmark: str
13
+ metric: str
14
+ col_name: str
15
+
16
+ class Tasks(Enum):
17
+ arc = Task("arc:challenge", "acc_norm", "ARC")
18
+ hellaswag = Task("hellaswag", "acc_norm", "HellaSwag")
19
+ mmlu = Task("hendrycksTest", "acc", "MMLU")
20
+ truthfulqa = Task("truthfulqa:mc", "mc2", "TruthfulQA")
21
+ winogrande = Task("winogrande", "acc", "Winogrande")
22
+ gsm8k = Task("gsm8k", "acc", "GSM8K")
23
 
24
  # These classes are for user facing column names,
25
  # to avoid having to change them all around the code
 
33
  never_hidden: bool = False
34
  dummy: bool = False
35
 
36
+ auto_eval_column_dict = []
37
+ # Init
38
+ auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
39
+ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
40
+ #Scores
41
+ auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
42
+ for task in Tasks:
43
+ auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
44
+ # Model information
45
+ auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
46
+ auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
47
+ auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
48
+ auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
49
+ auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
50
+ auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
51
+ auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
52
+ auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
53
+ auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
54
+ # Dummy column for the search bar (hidden by the custom CSS)
55
+ auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])
56
+
57
+ # We use make dataclass to dynamically fill the scores from Tasks
58
+ AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
 
 
59
 
60
  @dataclass(frozen=True)
61
  class EvalQueueColumn: # Queue column
 
71
  AutoEvalColumn.model.name: "<p>Baseline</p>",
72
  AutoEvalColumn.revision.name: "N/A",
73
  AutoEvalColumn.precision.name: None,
74
+ AutoEvalColumn.average.name: 31.0,
75
  AutoEvalColumn.arc.name: 25.0,
76
  AutoEvalColumn.hellaswag.name: 25.0,
77
  AutoEvalColumn.mmlu.name: 25.0,
78
  AutoEvalColumn.truthfulqa.name: 25.0,
79
  AutoEvalColumn.winogrande.name: 50.0,
80
  AutoEvalColumn.gsm8k.name: 0.21,
 
81
  AutoEvalColumn.dummy.name: "baseline",
82
  AutoEvalColumn.model_type.name: "",
83
  }
84
 
85
+ # Average ⬆️ human baseline is 0.897 (source: averaging human baselines below)
86
+ # ARC human baseline is 0.80 (source: https://lab42.global/arc/)
87
+ # HellaSwag human baseline is 0.95 (source: https://deepgram.com/learn/hellaswag-llm-benchmark-guide)
88
+ # MMLU human baseline is 0.898 (source: https://openreview.net/forum?id=d7KBjmI3GmQ)
89
+ # TruthfulQA human baseline is 0.94 (source: https://arxiv.org/pdf/2109.07958.pdf)
90
+ # Winogrande: https://leaderboard.allenai.org/winogrande/submissions/public
91
+ # GSM8K human baseline is 100 (source: the GSM8K paper)
92
+ # Define the human baselines
93
+ human_baseline_row = {
94
+ AutoEvalColumn.model.name: "<p>Human performance</p>",
95
+ AutoEvalColumn.revision.name: "N/A",
96
+ AutoEvalColumn.precision.name: None,
97
+ AutoEvalColumn.average.name: 92.75,
98
+ AutoEvalColumn.arc.name: 80.0,
99
+ AutoEvalColumn.hellaswag.name: 95.0,
100
+ AutoEvalColumn.mmlu.name: 89.8,
101
+ AutoEvalColumn.truthfulqa.name: 94.0,
102
+ AutoEvalColumn.winogrande.name: 94.0,
103
+ AutoEvalColumn.gsm8k.name: 100,
104
+ AutoEvalColumn.dummy.name: "human_baseline",
105
+ AutoEvalColumn.model_type.name: "",
106
+ }
107
 
108
  @dataclass
109
+ class ModelDetails:
110
  name: str
111
+ symbol: str = "" # emoji, only for the model type
112
 
113
 
114
  class ModelType(Enum):
115
+ PT = ModelDetails(name="pretrained", symbol="🟢")
116
+ FT = ModelDetails(name="fine-tuned", symbol="🔶")
117
+ IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
118
+ RL = ModelDetails(name="RL-tuned", symbol="🟦")
119
+ Unknown = ModelDetails(name="", symbol="?")
120
 
121
  def to_str(self, separator=" "):
122
  return f"{self.value.symbol}{separator}{self.value.name}"
 
133
  return ModelType.IFT
134
  return ModelType.Unknown
135
 
136
+ class WeightType(Enum):
137
+ Adapter = ModelDetails("Adapter")
138
+ Original = ModelDetails("Original")
139
+ Delta = ModelDetails("Delta")
140
+
141
+ class Precision(Enum):
142
+ float16 = ModelDetails("float16")
143
+ bfloat16 = ModelDetails("bfloat16")
144
+ qt_8bit = ModelDetails("8bit")
145
+ qt_4bit = ModelDetails("4bit")
146
+ qt_GPTQ = ModelDetails("GPTQ")
147
+ Unknown = ModelDetails("?")
148
+
149
+ def from_str(precision):
150
+ if precision in ["torch.float16", "float16"]:
151
+ return Precision.float16
152
+ if precision in ["torch.bfloat16", "bfloat16"]:
153
+ return Precision.bfloat16
154
+ if precision in ["8bit"]:
155
+ return Precision.qt_8bit
156
+ if precision in ["4bit"]:
157
+ return Precision.qt_4bit
158
+ if precision in ["GPTQ", "None"]:
159
+ return Precision.qt_GPTQ
160
+ return Precision.Unknown
161
+
162
 
 
163
 
164
 
165
  # Column selection
 
171
  EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
172
  EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
173
 
174
+ BENCHMARK_COLS = [t.value.col_name for t in Tasks]
175
 
176
  NUMERIC_INTERVALS = {
177
  "?": pd.Interval(-1, 0, closed="right"),
src/envs.py CHANGED
@@ -1,4 +1,5 @@
1
  import os
 
2
  from huggingface_hub import HfApi
3
 
4
  # clone / pull the lmeh eval data
@@ -13,8 +14,10 @@ PRIVATE_RESULTS_REPO = "open-llm-leaderboard/private-results"
13
 
14
  IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))
15
 
16
- EVAL_REQUESTS_PATH = "eval-queue"
17
- EVAL_RESULTS_PATH = "eval-results"
 
 
18
 
19
  EVAL_REQUESTS_PATH_PRIVATE = "eval-queue-private"
20
  EVAL_RESULTS_PATH_PRIVATE = "eval-results-private"
@@ -24,5 +27,6 @@ PATH_TO_COLLECTION = "open-llm-leaderboard/llm-leaderboard-best-models-652d6c796
24
  # Rate limit variables
25
  RATE_LIMIT_PERIOD = 7
26
  RATE_LIMIT_QUOTA = 5
 
27
 
28
  API = HfApi(token=H4_TOKEN)
 
1
  import os
2
+
3
  from huggingface_hub import HfApi
4
 
5
  # clone / pull the lmeh eval data
 
14
 
15
  IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))
16
 
17
+ CACHE_PATH=os.getenv("HF_HOME", ".")
18
+
19
+ EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
20
+ EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
21
 
22
  EVAL_REQUESTS_PATH_PRIVATE = "eval-queue-private"
23
  EVAL_RESULTS_PATH_PRIVATE = "eval-results-private"
 
27
  # Rate limit variables
28
  RATE_LIMIT_PERIOD = 7
29
  RATE_LIMIT_QUOTA = 5
30
+ HAS_HIGHER_RATE_LIMIT = ["TheBloke"]
31
 
32
  API = HfApi(token=H4_TOKEN)
src/leaderboard/read_evals.py CHANGED
@@ -1,37 +1,41 @@
 
1
  import json
2
- import os
3
  import math
4
- import glob
5
  from dataclasses import dataclass
6
- from typing import Dict, List, Tuple
7
 
8
  import dateutil
 
 
9
  import numpy as np
10
 
11
- from src.display.utils import AutoEvalColumn, ModelType, Tasks
12
  from src.display.formatting import make_clickable_model
 
13
  from src.submission.check_validity import is_model_on_hub
14
 
15
 
16
  @dataclass
17
  class EvalResult:
18
- eval_name: str
19
- full_model: str
20
- org: str
 
21
  model: str
22
- revision: str
23
  results: dict
24
- precision: str = ""
25
- model_type: ModelType = ModelType.Unknown
26
- weight_type: str = "Original"
 
27
  license: str = "?"
28
  likes: int = 0
29
  num_params: int = 0
30
- date: str = ""
31
  still_on_hub: bool = False
32
 
33
  @classmethod
34
  def init_from_json_file(self, json_filepath):
 
35
  with open(json_filepath) as fp:
36
  data = json.load(fp)
37
 
@@ -39,9 +43,7 @@ class EvalResult:
39
  config = data.get("config", data.get("config_general", None))
40
 
41
  # Precision
42
- precision = config.get("model_dtype")
43
- if precision == "None":
44
- precision = "GPTQ"
45
 
46
  # Get model and org
47
  org_and_model = config.get("model_name", config.get("model_args", None))
@@ -50,13 +52,21 @@ class EvalResult:
50
  if len(org_and_model) == 1:
51
  org = None
52
  model = org_and_model[0]
53
- result_key = f"{model}_{precision}"
54
  else:
55
  org = org_and_model[0]
56
  model = org_and_model[1]
57
- result_key = f"{org}_{model}_{precision}"
 
58
 
59
- still_on_hub = is_model_on_hub("/".join(org_and_model), config.get("model_sha", "main"), trust_remote_code=True)[0]
 
 
60
 
61
  # Extract results available in this file (some results are split in several files)
62
  results = {}
@@ -73,8 +83,8 @@ class EvalResult:
73
  continue
74
 
75
  # Some truthfulQA values are NaNs
76
- if task.benchmark == "truthfulqa:mc" and task.benchmark in data["results"]:
77
- if math.isnan(float(data["results"][task.benchmark][task.metric])):
78
  results[task.benchmark] = 0.0
79
  continue
80
 
@@ -88,37 +98,42 @@ class EvalResult:
88
 
89
  return self(
90
  eval_name=result_key,
91
- full_model="/".join(org_and_model),
92
  org=org,
93
  model=model,
94
  results=results,
95
- precision=precision, # todo model_type=, weight_type=
96
- revision=config.get("model_sha", ""),
97
- date=config.get("submission_date", ""),
98
  still_on_hub=still_on_hub,
 
99
  )
100
 
101
- def update_with_request_file(self):
102
- request_file = get_request_file_for_model(self.full_model, self.precision)
 
103
 
104
  try:
105
  with open(request_file, "r") as f:
106
  request = json.load(f)
107
  self.model_type = ModelType.from_str(request.get("model_type", ""))
 
108
  self.license = request.get("license", "?")
109
  self.likes = request.get("likes", 0)
110
  self.num_params = request.get("params", 0)
 
111
  except Exception:
112
  print(f"Could not find request file for {self.org}/{self.model}")
113
 
114
  def to_dict(self):
 
115
  average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
116
  data_dict = {
117
  "eval_name": self.eval_name, # not a column, just a save name,
118
- AutoEvalColumn.precision.name: self.precision,
119
  AutoEvalColumn.model_type.name: self.model_type.value.name,
120
  AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
121
- AutoEvalColumn.weight_type.name: self.weight_type,
 
122
  AutoEvalColumn.model.name: make_clickable_model(self.full_model),
123
  AutoEvalColumn.dummy.name: self.full_model,
124
  AutoEvalColumn.revision.name: self.revision,
@@ -135,9 +150,10 @@ class EvalResult:
135
  return data_dict
136
 
137
 
138
- def get_request_file_for_model(model_name, precision):
 
139
  request_files = os.path.join(
140
- "eval-queue",
141
  f"{model_name}_eval_request_*.json",
142
  )
143
  request_files = glob.glob(request_files)
@@ -149,15 +165,16 @@ def get_request_file_for_model(model_name, precision):
149
  with open(tmp_request_file, "r") as f:
150
  req_content = json.load(f)
151
  if (
152
- req_content["status"] in ["FINISHED", "PENDING_NEW_EVAL"]
153
  and req_content["precision"] == precision.split(".")[-1]
154
  ):
155
  request_file = tmp_request_file
156
  return request_file
157
 
158
 
159
- def get_eval_results(results_path: str) -> List[EvalResult]:
160
- json_filepaths = []
 
161
 
162
  for root, _, files in os.walk(results_path):
163
  # We should only have json files in model results
@@ -170,15 +187,14 @@ def get_eval_results(results_path: str) -> List[EvalResult]:
170
  except dateutil.parser._parser.ParserError:
171
  files = [files[-1]]
172
 
173
- # up_to_date = files[-1]
174
  for file in files:
175
- json_filepaths.append(os.path.join(root, file))
176
 
177
  eval_results = {}
178
- for json_filepath in json_filepaths:
179
  # Creation of result
180
- eval_result = EvalResult.init_from_json_file(json_filepath)
181
- eval_result.update_with_request_file()
182
 
183
  # Store results of same eval together
184
  eval_name = eval_result.eval_name
@@ -190,8 +206,9 @@ def get_eval_results(results_path: str) -> List[EvalResult]:
190
  results = []
191
  for v in eval_results.values():
192
  try:
193
- results.append(v.to_dict())
194
- except KeyError: # not all eval values present
 
195
  continue
196
 
197
  return results
 
1
+ import glob
2
  import json
 
3
  import math
4
+ import os
5
  from dataclasses import dataclass
 
6
 
7
  import dateutil
8
+ from datetime import datetime
9
+ from transformers import AutoConfig
10
  import numpy as np
11
 
 
12
  from src.display.formatting import make_clickable_model
13
+ from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
14
  from src.submission.check_validity import is_model_on_hub
15
 
16
 
17
  @dataclass
18
  class EvalResult:
19
+ # Also see src.display.utils.AutoEvalColumn for what will be displayed.
20
+ eval_name: str # org_model_precision (uid)
21
+ full_model: str # org/model (path on hub)
22
+ org: str
23
  model: str
24
+ revision: str # commit hash, "" if main
25
  results: dict
26
+ precision: Precision = Precision.Unknown
27
+ model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
28
+ weight_type: WeightType = WeightType.Original # Original or Adapter
29
+ architecture: str = "Unknown" # From config file
30
  license: str = "?"
31
  likes: int = 0
32
  num_params: int = 0
33
+ date: str = "" # submission date of request file
34
  still_on_hub: bool = False
35
 
36
  @classmethod
37
  def init_from_json_file(self, json_filepath):
38
+ """Inits the result from the specific model result file"""
39
  with open(json_filepath) as fp:
40
  data = json.load(fp)
41
 
 
43
  config = data.get("config", data.get("config_general", None))
44
 
45
  # Precision
46
+ precision = Precision.from_str(config.get("model_dtype"))
 
 
47
 
48
  # Get model and org
49
  org_and_model = config.get("model_name", config.get("model_args", None))
 
52
  if len(org_and_model) == 1:
53
  org = None
54
  model = org_and_model[0]
55
+ result_key = f"{model}_{precision.value.name}"
56
  else:
57
  org = org_and_model[0]
58
  model = org_and_model[1]
59
+ result_key = f"{org}_{model}_{precision.value.name}"
60
+ full_model = "/".join(org_and_model)
61
 
62
+ still_on_hub, error, model_config = is_model_on_hub(
63
+ full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
64
+ )
65
+ architecture = "?"
66
+ if model_config is not None:
67
+ architectures = getattr(model_config, "architectures", None)
68
+ if architectures:
69
+ architecture = ";".join(architectures)
70
 
71
  # Extract results available in this file (some results are split in several files)
72
  results = {}
 
83
  continue
84
 
85
  # Some truthfulQA values are NaNs
86
+ if task.benchmark == "truthfulqa:mc" and "harness|truthfulqa:mc|0" in data["results"]:
87
+ if math.isnan(float(data["results"]["harness|truthfulqa:mc|0"][task.metric])):
88
  results[task.benchmark] = 0.0
89
  continue
90
 
 
98
 
99
  return self(
100
  eval_name=result_key,
101
+ full_model=full_model,
102
  org=org,
103
  model=model,
104
  results=results,
105
+ precision=precision,
106
+ revision= config.get("model_sha", ""),
 
107
  still_on_hub=still_on_hub,
108
+ architecture=architecture
109
  )
110
 
111
+ def update_with_request_file(self, requests_path):
112
+ """Finds the relevant request file for the current model and updates info with it"""
113
+ request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
114
 
115
  try:
116
  with open(request_file, "r") as f:
117
  request = json.load(f)
118
  self.model_type = ModelType.from_str(request.get("model_type", ""))
119
+ self.weight_type = WeightType[request.get("weight_type", "Original")]
120
  self.license = request.get("license", "?")
121
  self.likes = request.get("likes", 0)
122
  self.num_params = request.get("params", 0)
123
+ self.date = request.get("submitted_time", "")
124
  except Exception:
125
  print(f"Could not find request file for {self.org}/{self.model}")
126
 
127
  def to_dict(self):
128
+ """Converts the Eval Result to a dict compatible with our dataframe display"""
129
  average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
130
  data_dict = {
131
  "eval_name": self.eval_name, # not a column, just a save name,
132
+ AutoEvalColumn.precision.name: self.precision.value.name,
133
  AutoEvalColumn.model_type.name: self.model_type.value.name,
134
  AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
135
+ AutoEvalColumn.weight_type.name: self.weight_type.value.name,
136
+ AutoEvalColumn.architecture.name: self.architecture,
137
  AutoEvalColumn.model.name: make_clickable_model(self.full_model),
138
  AutoEvalColumn.dummy.name: self.full_model,
139
  AutoEvalColumn.revision.name: self.revision,
 
150
  return data_dict
151
 
152
 
153
+ def get_request_file_for_model(requests_path, model_name, precision):
154
+ """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
155
  request_files = os.path.join(
156
+ requests_path,
157
  f"{model_name}_eval_request_*.json",
158
  )
159
  request_files = glob.glob(request_files)
 
165
  with open(tmp_request_file, "r") as f:
166
  req_content = json.load(f)
167
  if (
168
+ req_content["status"] in ["FINISHED"]
169
  and req_content["precision"] == precision.split(".")[-1]
170
  ):
171
  request_file = tmp_request_file
172
  return request_file
173
 
174
 
175
+ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
176
+ """From the path of the results folder root, extract all needed info for results"""
177
+ model_result_filepaths = []
178
 
179
  for root, _, files in os.walk(results_path):
180
  # We should only have json files in model results
 
187
  except dateutil.parser._parser.ParserError:
188
  files = [files[-1]]
189
 
 
190
  for file in files:
191
+ model_result_filepaths.append(os.path.join(root, file))
192
 
193
  eval_results = {}
194
+ for model_result_filepath in model_result_filepaths:
195
  # Creation of result
196
+ eval_result = EvalResult.init_from_json_file(model_result_filepath)
197
+ eval_result.update_with_request_file(requests_path)
198
 
199
  # Store results of same eval together
200
  eval_name = eval_result.eval_name
 
206
  results = []
207
  for v in eval_results.values():
208
  try:
209
+ v.to_dict() # we test if the dict version is complete
210
+ results.append(v)
211
+ except KeyError: # not all eval values present
212
  continue
213
 
214
  return results
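Note (not part of the diff): a minimal sketch of how the reworked read_evals flow is intended to be driven, assuming local snapshots of the results and requests datasets have already been downloaded; the directory names below are placeholders.

```python
from src.leaderboard.read_evals import get_raw_eval_results

# get_raw_eval_results walks the results folder, builds one EvalResult per
# model/precision pair and enriches it from the matching FINISHED request file.
raw_results = get_raw_eval_results(
    results_path="eval-results",  # hypothetical local path to the results snapshot
    requests_path="eval-queue",   # hypothetical local path to the requests snapshot
)

# Results whose dict conversion is incomplete (missing benchmarks) have already
# been filtered out, so each remaining entry can be turned into a display row.
rows = [result.to_dict() for result in raw_results]
```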
src/populate.py CHANGED
@@ -3,24 +3,25 @@ import os
3
 
4
  import pandas as pd
5
 
6
- from src.leaderboard.filter_models import filter_models
7
- from src.leaderboard.read_evals import get_eval_results
8
- from src.display.formatting import make_clickable_model, has_no_nan_values
9
  from src.display.utils import AutoEvalColumn, EvalQueueColumn, baseline_row
 
 
10
 
11
 
12
- def get_leaderboard_df(results_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
13
- all_data = get_eval_results(results_path)
14
- all_data.append(baseline_row)
15
- filter_models(all_data)
 
16
 
17
- df = pd.DataFrame.from_records(all_data)
18
  df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
19
  df = df[cols].round(decimals=2)
20
 
21
  # filter out if any of the benchmarks have not been produced
22
  df = df[has_no_nan_values(df, benchmark_cols)]
23
- return df
24
 
25
 
26
  def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
 
3
 
4
  import pandas as pd
5
 
6
+ from src.display.formatting import has_no_nan_values, make_clickable_model
 
 
7
  from src.display.utils import AutoEvalColumn, EvalQueueColumn, baseline_row
8
+ from src.leaderboard.filter_models import filter_models
9
+ from src.leaderboard.read_evals import get_raw_eval_results
10
 
11
 
12
+ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> tuple[list, pd.DataFrame]:
13
+ raw_data = get_raw_eval_results(results_path, requests_path)
14
+ all_data_json = [v.to_dict() for v in raw_data]
15
+ all_data_json.append(baseline_row)
16
+ filter_models(all_data_json)
17
 
18
+ df = pd.DataFrame.from_records(all_data_json)
19
  df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
20
  df = df[cols].round(decimals=2)
21
 
22
  # filter out if any of the benchmarks have not been produced
23
  df = df[has_no_nan_values(df, benchmark_cols)]
24
+ return raw_data, df
25
 
26
 
27
  def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
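Note (not part of the diff): `get_leaderboard_df` now takes the requests path as well and returns both the raw results and the display dataframe; a minimal usage sketch, with placeholder paths.

```python
from src.display.utils import BENCHMARK_COLS, COLS
from src.populate import get_leaderboard_df

# Both the raw EvalResult list (used for the plots) and the rounded, filtered
# display dataframe come from the same call.
raw_data, leaderboard_df = get_leaderboard_df(
    "eval-results",  # hypothetical results path
    "eval-queue",    # hypothetical requests path
    COLS,
    BENCHMARK_COLS,
)
```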
src/submission/check_validity.py CHANGED
@@ -1,13 +1,16 @@
1
- import huggingface_hub
2
- import os
3
  import json
 
4
  import re
5
  from collections import defaultdict
6
- from huggingface_hub.hf_api import ModelInfo
 
 
7
  from huggingface_hub import ModelCard
8
- from transformers import AutoConfig
 
 
9
 
10
- from datetime import datetime, timedelta, timezone
11
 
12
 
13
  # ht to @Wauplin, thank you for the snippet!
@@ -34,26 +37,36 @@ def check_model_card(repo_id: str) -> tuple[bool, str]:
34
  return True, ""
35
 
36
 
37
- def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False) -> tuple[bool, str]:
38
  try:
39
- AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
40
- return True, None
 
 
 
 
 
 
 
 
 
41
 
42
  except ValueError:
43
  return (
44
  False,
45
  "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
 
46
  )
47
 
48
- except Exception:
49
- return False, "was not found on hub!"
50
 
51
 
52
  def get_model_size(model_info: ModelInfo, precision: str):
53
  size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
54
  try:
55
  model_size = round(model_info.safetensors["total"] / 1e9, 3)
56
- except AttributeError:
57
  try:
58
  size_match = re.search(size_pattern, model_info.modelId.lower())
59
  model_size = size_match.group(0)
@@ -65,9 +78,10 @@ def get_model_size(model_info: ModelInfo, precision: str):
65
  model_size = size_factor * model_size
66
  return model_size
67
 
 
 
68
 
69
- def user_submission_permission(submission_name, users_to_submission_dates, rate_limit_period, rate_limit_quota):
70
- org_or_user, _ = submission_name.split("/")
71
  if org_or_user not in users_to_submission_dates:
72
  return True, ""
73
  submission_dates = sorted(users_to_submission_dates[org_or_user])
@@ -76,6 +90,9 @@ def user_submission_permission(submission_name, users_to_submission_dates, rate_
76
  submissions_after_timelimit = [d for d in submission_dates if d > time_limit]
77
 
78
  num_models_submitted_in_period = len(submissions_after_timelimit)
 
 
 
79
  if num_models_submitted_in_period > rate_limit_quota:
80
  error_msg = f"Organisation or user `{org_or_user}`"
81
  error_msg += f"already has {num_models_submitted_in_period} model requests submitted to the leaderboard "
 
 
 
1
  import json
2
+ import os
3
  import re
4
  from collections import defaultdict
5
+ from datetime import datetime, timedelta, timezone
6
+
7
+ import huggingface_hub
8
  from huggingface_hub import ModelCard
9
+ from huggingface_hub.hf_api import ModelInfo
10
+ from transformers import AutoConfig, AutoTokenizer
11
+ from transformers.models.auto.tokenization_auto import tokenizer_class_from_name, get_tokenizer_config
12
 
13
+ from src.envs import HAS_HIGHER_RATE_LIMIT
14
 
15
 
16
  # ht to @Wauplin, thank you for the snippet!
 
37
  return True, ""
38
 
39
 
40
+ def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str, AutoConfig | None]:
41
  try:
42
+ config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
43
+ if test_tokenizer:
44
+ try:
45
+ AutoTokenizer.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
46
+ except ValueError as e:
47
+ return (
48
+ False,
49
+ f"uses a tokenizer which is not in a transformers release: {e}",
50
+ None
51
+ )
52
+ return True, None, config
53
 
54
  except ValueError:
55
  return (
56
  False,
57
  "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
58
+ None
59
  )
60
 
61
+ except Exception as e:
62
+ return False, "was not found on hub!", None
63
 
64
 
65
  def get_model_size(model_info: ModelInfo, precision: str):
66
  size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
67
  try:
68
  model_size = round(model_info.safetensors["total"] / 1e9, 3)
69
+ except (AttributeError, TypeError):
70
  try:
71
  size_match = re.search(size_pattern, model_info.modelId.lower())
72
  model_size = size_match.group(0)
 
78
  model_size = size_factor * model_size
79
  return model_size
80
 
81
+ def get_model_arch(model_info: ModelInfo):
82
+ return model_info.config.get("architectures", "Unknown")
83
 
84
+ def user_submission_permission(org_or_user, users_to_submission_dates, rate_limit_period, rate_limit_quota):
 
85
  if org_or_user not in users_to_submission_dates:
86
  return True, ""
87
  submission_dates = sorted(users_to_submission_dates[org_or_user])
 
90
  submissions_after_timelimit = [d for d in submission_dates if d > time_limit]
91
 
92
  num_models_submitted_in_period = len(submissions_after_timelimit)
93
+ if org_or_user in HAS_HIGHER_RATE_LIMIT:
94
+ rate_limit_quota = 2 * rate_limit_quota
95
+
96
  if num_models_submitted_in_period > rate_limit_quota:
97
  error_msg = f"Organisation or user `{org_or_user}`"
98
  error_msg += f"already has {num_models_submitted_in_period} model requests submitted to the leaderboard "
src/submission/submit.py CHANGED
@@ -1,20 +1,20 @@
1
- import os, json
2
-
3
  from datetime import datetime, timezone
4
 
5
- from src.display.formatting import styled_error, styled_warning, styled_message
 
6
  from src.leaderboard.filter_models import DO_NOT_SUBMIT_MODELS
7
  from src.submission.check_validity import (
8
- user_submission_permission,
9
- is_model_on_hub,
10
- get_model_size,
11
- check_model_card,
12
  already_submitted_models,
 
 
 
 
13
  )
14
- from src.envs import RATE_LIMIT_QUOTA, RATE_LIMIT_PERIOD, H4_TOKEN, EVAL_REQUESTS_PATH, API, QUEUE_REPO
15
-
16
- requested_models, users_to_submission_dates = already_submitted_models(EVAL_REQUESTS_PATH)
17
 
 
 
18
 
19
  def add_new_eval(
20
  model: str,
@@ -25,6 +25,17 @@ def add_new_eval(
25
  weight_type: str,
26
  model_type: str,
27
  ):
 
 
 
 
 
 
 
 
 
 
 
28
  precision = precision.split(" ")[0]
29
  current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
30
 
@@ -32,11 +43,12 @@ def add_new_eval(
32
  return styled_error("Please select a model type.")
33
 
34
  # Is the user rate limited?
35
- user_can_submit, error_msg = user_submission_permission(
36
- model, users_to_submission_dates, RATE_LIMIT_PERIOD, RATE_LIMIT_QUOTA
37
- )
38
- if not user_can_submit:
39
- return styled_error(error_msg)
 
40
 
41
  # Did the model authors forbid its submission to the leaderboard?
42
  if model in DO_NOT_SUBMIT_MODELS or base_model in DO_NOT_SUBMIT_MODELS:
@@ -48,12 +60,12 @@ def add_new_eval(
48
 
49
  # Is the model on the hub?
50
  if weight_type in ["Delta", "Adapter"]:
51
- base_model_on_hub, error = is_model_on_hub(base_model, revision, H4_TOKEN)
52
  if not base_model_on_hub:
53
  return styled_error(f'Base model "{base_model}" {error}')
54
 
55
  if not weight_type == "Adapter":
56
- model_on_hub, error = is_model_on_hub(model, revision)
57
  if not model_on_hub:
58
  return styled_error(f'Model "{model}" {error}')
59
 
@@ -93,21 +105,15 @@ def add_new_eval(
93
  "license": license,
94
  }
95
 
96
- user_name = ""
97
- model_path = model
98
- if "/" in model:
99
- user_name = model.split("/")[0]
100
- model_path = model.split("/")[1]
101
 
102
  print("Creating eval file")
103
  OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
104
  os.makedirs(OUT_DIR, exist_ok=True)
105
  out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{precision}_{weight_type}.json"
106
 
107
- # Check for duplicate submission
108
- if f"{model}_{revision}_{precision}" in requested_models:
109
- return styled_warning("This model has been already submitted.")
110
-
111
  with open(out_path, "w") as f:
112
  f.write(json.dumps(eval_entry))
113
 
 
1
+ import json
2
+ import os
3
  from datetime import datetime, timezone
4
 
5
+ from src.display.formatting import styled_error, styled_message, styled_warning
6
+ from src.envs import API, EVAL_REQUESTS_PATH, H4_TOKEN, QUEUE_REPO, RATE_LIMIT_PERIOD, RATE_LIMIT_QUOTA
7
  from src.leaderboard.filter_models import DO_NOT_SUBMIT_MODELS
8
  from src.submission.check_validity import (
 
 
 
 
9
  already_submitted_models,
10
+ check_model_card,
11
+ get_model_size,
12
+ is_model_on_hub,
13
+ user_submission_permission,
14
  )
 
 
 
15
 
16
+ REQUESTED_MODELS = None
17
+ USERS_TO_SUBMISSION_DATES = None
18
 
19
  def add_new_eval(
20
  model: str,
 
25
  weight_type: str,
26
  model_type: str,
27
  ):
28
+ global REQUESTED_MODELS
29
+ global USERS_TO_SUBMISSION_DATES
30
+ if not REQUESTED_MODELS:
31
+ REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
32
+
33
+ user_name = ""
34
+ model_path = model
35
+ if "/" in model:
36
+ user_name = model.split("/")[0]
37
+ model_path = model.split("/")[1]
38
+
39
  precision = precision.split(" ")[0]
40
  current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
41
 
 
43
  return styled_error("Please select a model type.")
44
 
45
  # Is the user rate limited?
46
+ if user_name != "":
47
+ user_can_submit, error_msg = user_submission_permission(
48
+ user_name, USERS_TO_SUBMISSION_DATES, RATE_LIMIT_PERIOD, RATE_LIMIT_QUOTA
49
+ )
50
+ if not user_can_submit:
51
+ return styled_error(error_msg)
52
 
53
  # Did the model authors forbid its submission to the leaderboard?
54
  if model in DO_NOT_SUBMIT_MODELS or base_model in DO_NOT_SUBMIT_MODELS:
 
60
 
61
  # Is the model on the hub?
62
  if weight_type in ["Delta", "Adapter"]:
63
+ base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=H4_TOKEN, test_tokenizer=True)
64
  if not base_model_on_hub:
65
  return styled_error(f'Base model "{base_model}" {error}')
66
 
67
  if not weight_type == "Adapter":
68
+ model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, test_tokenizer=True)
69
  if not model_on_hub:
70
  return styled_error(f'Model "{model}" {error}')
71
 
 
105
  "license": license,
106
  }
107
 
108
+ # Check for duplicate submission
109
+ if f"{model}_{revision}_{precision}" in REQUESTED_MODELS:
110
+ return styled_warning("This model has been already submitted.")
 
 
111
 
112
  print("Creating eval file")
113
  OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
114
  os.makedirs(OUT_DIR, exist_ok=True)
115
  out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{precision}_{weight_type}.json"
116
 
 
 
 
 
117
  with open(out_path, "w") as f:
118
  f.write(json.dumps(eval_entry))
119
 
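Note (not part of the diff): the request index is now built lazily inside `add_new_eval` instead of at import time; a small sketch of the duplicate-submission key it checks, with placeholder values.

```python
from src.submission.check_validity import already_submitted_models

# Build the same index add_new_eval caches in REQUESTED_MODELS on its first call.
requested_models, users_to_submission_dates = already_submitted_models("eval-queue")  # placeholder path

model, revision, precision = "org/some-model", "main", "float16"  # placeholder values
if f"{model}_{revision}_{precision}" in requested_models:
    print("This model has been already submitted.")
```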
src/tools/collections.py CHANGED
@@ -1,11 +1,11 @@
1
  import os
 
2
  import pandas as pd
3
- from pandas import DataFrame
4
- from huggingface_hub import get_collection, add_collection_item, update_collection_item, delete_collection_item
5
  from huggingface_hub.utils._errors import HfHubHTTPError
 
6
 
7
  from src.display.utils import AutoEvalColumn, ModelType
8
-
9
  from src.envs import H4_TOKEN, PATH_TO_COLLECTION
10
 
11
  # Specific intervals for the collections
 
1
  import os
2
+
3
  import pandas as pd
4
+ from huggingface_hub import add_collection_item, delete_collection_item, get_collection, update_collection_item
 
5
  from huggingface_hub.utils._errors import HfHubHTTPError
6
+ from pandas import DataFrame
7
 
8
  from src.display.utils import AutoEvalColumn, ModelType
 
9
  from src.envs import H4_TOKEN, PATH_TO_COLLECTION
10
 
11
  # Specific intervals for the collections
src/tools/plots.py CHANGED
@@ -1,151 +1,84 @@
1
  import pandas as pd
 
2
  import plotly.express as px
3
  from plotly.graph_objs import Figure
4
- import pickle
5
- from datetime import datetime, timezone
6
- from typing import List, Dict, Tuple, Any
7
- from src.leaderboard.filter_models import FLAGGED_MODELS
8
-
9
- # Average ⬆️ human baseline is 0.897 (source: averaging human baselines below)
10
- # ARC human baseline is 0.80 (source: https://lab42.global/arc/)
11
- # HellaSwag human baseline is 0.95 (source: https://deepgram.com/learn/hellaswag-llm-benchmark-guide)
12
- # MMLU human baseline is 0.898 (source: https://openreview.net/forum?id=d7KBjmI3GmQ)
13
- # TruthfulQA human baseline is 0.94(source: https://arxiv.org/pdf/2109.07958.pdf)
14
- # Define the human baselines
15
- HUMAN_BASELINES = {
16
- "Average ⬆️": 0.897 * 100,
17
- "ARC": 0.80 * 100,
18
- "HellaSwag": 0.95 * 100,
19
- "MMLU": 0.898 * 100,
20
- "TruthfulQA": 0.94 * 100,
21
- }
22
-
23
-
24
- def to_datetime(model_info: Tuple[str, Any]) -> datetime:
25
- """
26
- Converts the lastModified attribute of the object to datetime.
27
-
28
- :param model_info: A tuple containing the name and object.
29
- The object must have a lastModified attribute
30
- with a string representing the date and time.
31
- :return: A datetime object converted from the lastModified attribute of the input object.
32
- """
33
- name, obj = model_info
34
- return datetime.strptime(obj.lastModified, "%Y-%m-%dT%H:%M:%S.%fZ").replace(tzinfo=timezone.utc)
35
-
36
-
37
- def join_model_info_with_results(results_df: pd.DataFrame) -> pd.DataFrame:
38
- """
39
- Integrates model information with the results DataFrame by matching 'Model sha'.
40
- :param results_df: A DataFrame containing results information including 'Model sha' column.
41
- :return: A DataFrame with updated 'Results Date' columns, which are synchronized with model information.
42
- """
43
- # copy dataframe to avoid modifying the original
44
- df = results_df.copy(deep=True)
45
-
46
- # Filter out FLAGGED_MODELS to ensure graph is not skewed by mistakes
47
- df = df[~df["model_name_for_query"].isin(FLAGGED_MODELS.keys())].reset_index(drop=True)
48
-
49
- # load cache from disk
50
- try:
51
- with open("model_info_cache.pkl", "rb") as f:
52
- model_info_cache = pickle.load(f)
53
- except (EOFError, FileNotFoundError):
54
- model_info_cache = {}
55
 
56
- # Sort date strings using datetime objects as keys
57
- sorted_dates = sorted(list(model_info_cache.items()), key=to_datetime, reverse=True)
58
- df["Results Date"] = datetime.now().replace(tzinfo=timezone.utc)
59
-
60
- # Define the date format string
61
- date_format = "%Y-%m-%dT%H:%M:%S.%fZ"
62
-
63
- # Iterate over sorted_dates and update the dataframe
64
- for name, obj in sorted_dates:
65
- # Convert the lastModified string to a datetime object
66
- last_modified_datetime = datetime.strptime(obj.lastModified, date_format).replace(tzinfo=timezone.utc)
67
 
68
- # Update the "Results Date" column where "Model sha" equals obj.sha
69
- df.loc[df["Model sha"] == obj.sha, "Results Date"] = last_modified_datetime
70
- return df
71
 
72
 
73
- def create_scores_df(results_df: pd.DataFrame) -> pd.DataFrame:
74
  """
75
- Generates a DataFrame containing the maximum scores until each result date.
76
 
77
- :param results_df: A DataFrame containing result information including metric scores and result dates.
78
- :return: A new DataFrame containing the maximum scores until each result date for every metric.
79
  """
80
- # Step 1: Ensure 'Results Date' is in datetime format and sort the DataFrame by it
81
- results_df["Results Date"] = pd.to_datetime(results_df["Results Date"])
82
- results_df.sort_values(by="Results Date", inplace=True)
 
83
 
84
  # Step 2: Initialize the scores dictionary
85
- scores = {
86
- "Average ⬆️": [],
87
- "ARC": [],
88
- "HellaSwag": [],
89
- "MMLU": [],
90
- "TruthfulQA": [],
91
- "Result Date": [],
92
- "Model Name": [],
93
- }
94
 
95
  # Step 3: Iterate over the rows of the DataFrame and update the scores dictionary
96
- for i, row in results_df.iterrows():
97
- date = row["Results Date"]
98
- for column in scores.keys():
99
- if column == "Result Date":
100
- if not scores[column] or scores[column][-1] <= date:
101
- scores[column].append(date)
 
102
  continue
103
- if column == "Model Name":
104
- scores[column].append(row["model_name_for_query"])
105
- continue
106
- current_max = scores[column][-1] if scores[column] else float("-inf")
107
- scores[column].append(max(current_max, row[column]))
108
 
109
- # Step 4: Convert the dictionary to a DataFrame
110
- return pd.DataFrame(scores)
 
 
 
 
 
 
 
 
 
 
 
111
 
 
 
112
 
113
- def create_plot_df(scores_df: pd.DataFrame) -> pd.DataFrame:
 
114
  """
115
  Transforms the scores DataFrame into a new format suitable for plotting.
116
 
117
- :param scores_df: A DataFrame containing metric scores and result dates.
118
  :return: A new DataFrame reshaped for plotting purposes.
119
  """
120
- # Sample columns
121
- cols = ["Average ⬆️", "ARC", "HellaSwag", "MMLU", "TruthfulQA"]
122
-
123
  # Initialize the list to store DataFrames
124
  dfs = []
125
 
126
  # Iterate over the cols and create a new DataFrame for each column
127
- for col in cols:
128
- d = scores_df[[col, "Model Name", "Result Date"]].copy().reset_index(drop=True)
129
- d["Metric Name"] = col
130
- d.rename(columns={col: "Metric Value"}, inplace=True)
131
  dfs.append(d)
132
 
133
  # Concatenate all the created DataFrames
134
  concat_df = pd.concat(dfs, ignore_index=True)
135
 
136
- # Sort values by 'Result Date'
137
- concat_df.sort_values(by="Result Date", inplace=True)
138
- concat_df.reset_index(drop=True, inplace=True)
139
-
140
- # Drop duplicates based on 'Metric Name' and 'Metric Value' and keep the first (earliest) occurrence
141
- concat_df.drop_duplicates(subset=["Metric Name", "Metric Value"], keep="first", inplace=True)
142
-
143
  concat_df.reset_index(drop=True, inplace=True)
144
  return concat_df
145
 
146
 
147
  def create_metric_plot_obj(
148
- df: pd.DataFrame, metrics: List[str], human_baselines: Dict[str, float], title: str
149
  ) -> Figure:
150
  """
151
  Create a Plotly figure object with lines representing different metrics
@@ -154,27 +87,25 @@ def create_metric_plot_obj(
154
  :param df: The DataFrame containing the metric values, names, and dates.
155
  :param metrics: A list of strings representing the names of the metrics
156
  to be included in the plot.
157
- :param human_baselines: A dictionary where keys are metric names
158
- and values are human baseline values for the metrics.
159
  :param title: A string representing the title of the plot.
160
  :return: A Plotly figure object with lines representing metrics and
161
  horizontal dotted lines representing human baselines.
162
  """
163
 
164
  # Filter the DataFrame based on the specified metrics
165
- df = df[df["Metric Name"].isin(metrics)]
166
 
167
  # Filter the human baselines based on the specified metrics
168
- filtered_human_baselines = {k: v for k, v in human_baselines.items() if k in metrics}
169
 
170
  # Create a line figure using plotly express with specified markers and custom data
171
  fig = px.line(
172
  df,
173
- x="Result Date",
174
- y="Metric Value",
175
- color="Metric Name",
176
  markers=True,
177
- custom_data=["Metric Name", "Metric Value", "Model Name"],
178
  title=title,
179
  )
180
 
 
1
  import pandas as pd
2
+ import numpy as np
3
  import plotly.express as px
4
  from plotly.graph_objs import Figure
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
+ from src.leaderboard.filter_models import FLAGGED_MODELS
7
+ from src.display.utils import human_baseline_row as HUMAN_BASELINE, AutoEvalColumn, Tasks, Task, BENCHMARK_COLS
8
+ from src.leaderboard.read_evals import EvalResult
 
 
 
 
 
 
 
 
9
 
 
 
 
10
 
11
 
12
+ def create_scores_df(raw_data: list[EvalResult]) -> dict[str, pd.DataFrame]:
13
  """
14
+ Generates DataFrames containing the maximum scores until each date for every metric.
15
 
16
+ :param raw_data: A list of EvalResult objects containing metric scores and dates.
17
+ :return: A dictionary mapping each metric to a DataFrame of the maximum scores until each date.
18
  """
19
+ # Step 1: Ensure 'date' is in datetime format and sort the DataFrame by it
20
+ results_df = pd.DataFrame(raw_data)
21
+ #results_df["date"] = pd.to_datetime(results_df["date"], format="mixed", utc=True)
22
+ results_df.sort_values(by="date", inplace=True)
23
 
24
  # Step 2: Initialize the scores dictionary
25
+ scores = {k: [] for k in BENCHMARK_COLS + [AutoEvalColumn.average.name]}
 
 
 
 
 
 
 
 
26
 
27
  # Step 3: Iterate over the rows of the DataFrame and update the scores dictionary
28
+ for task in [t.value for t in Tasks] + [Task("Average", "avg", AutoEvalColumn.average.name)]:
29
+ current_max = 0
30
+ last_date = ""
31
+ column = task.col_name
32
+ for _, row in results_df.iterrows():
33
+ current_model = row["full_model"]
34
+ if current_model in FLAGGED_MODELS:
35
  continue
 
 
 
 
 
36
 
37
+ current_date = row["date"]
38
+ if task.benchmark == "Average":
39
+ current_score = np.mean(list(row["results"].values()))
40
+ else:
41
+ current_score = row["results"][task.benchmark]
42
+
43
+ if current_score > current_max:
44
+ if current_date == last_date and len(scores[column]) > 0:
45
+ scores[column][-1] = {"model": current_model, "date": current_date, "score": current_score}
46
+ else:
47
+ scores[column].append({"model": current_model, "date": current_date, "score": current_score})
48
+ current_max = current_score
49
+ last_date = current_date
50
 
51
+ # Step 4: Return all dictionaries as DataFrames
52
+ return {k: pd.DataFrame(v) for k, v in scores.items()}
53
 
54
+
55
+ def create_plot_df(scores_df: dict[str, pd.DataFrame]) -> pd.DataFrame:
56
  """
57
  Transforms the scores DataFrame into a new format suitable for plotting.
58
 
59
+ :param scores_df: A dictionary of DataFrames containing metric scores and dates.
60
  :return: A new DataFrame reshaped for plotting purposes.
61
  """
 
 
 
62
  # Initialize the list to store DataFrames
63
  dfs = []
64
 
65
  # Iterate over the cols and create a new DataFrame for each column
66
+ for col in BENCHMARK_COLS + [AutoEvalColumn.average.name]:
67
+ d = scores_df[col].reset_index(drop=True)
68
+ d["task"] = col
 
69
  dfs.append(d)
70
 
71
  # Concatenate all the created DataFrames
72
  concat_df = pd.concat(dfs, ignore_index=True)
73
 
74
+ # Sort values by 'date'
75
+ concat_df.sort_values(by="date", inplace=True)
 
 
 
 
 
76
  concat_df.reset_index(drop=True, inplace=True)
77
  return concat_df
78
 
79
 
80
  def create_metric_plot_obj(
81
+ df: pd.DataFrame, metrics: list[str], title: str
82
  ) -> Figure:
83
  """
84
  Create a Plotly figure object with lines representing different metrics
 
87
  :param df: The DataFrame containing the metric values, names, and dates.
88
  :param metrics: A list of strings representing the names of the metrics
89
  to be included in the plot.
 
 
90
  :param title: A string representing the title of the plot.
91
  :return: A Plotly figure object with lines representing metrics and
92
  horizontal dotted lines representing human baselines.
93
  """
94
 
95
  # Filter the DataFrame based on the specified metrics
96
+ df = df[df["task"].isin(metrics)]
97
 
98
  # Filter the human baselines based on the specified metrics
99
+ filtered_human_baselines = {k: v for k, v in HUMAN_BASELINE.items() if k in metrics}
100
 
101
  # Create a line figure using plotly express with specified markers and custom data
102
  fig = px.line(
103
  df,
104
+ x="date",
105
+ y="score",
106
+ color="task",
107
  markers=True,
108
+ custom_data=["task", "score", "model"],
109
  title=title,
110
  )
111