Merge remote-tracking branch 'upstream/main'
- README.md +1 -1
- app.py +38 -48
- requirements.txt +3 -3
- scripts/create_request_file.py +5 -4
- src/display/about.py +78 -37
- src/display/formatting.py +4 -52
- src/display/utils.py +100 -56
- src/envs.py +6 -2
- src/leaderboard/read_evals.py +57 -40
- src/populate.py +10 -9
- src/submission/check_validity.py +30 -13
- src/submission/submit.py +32 -26
- src/tools/collections.py +3 -3
- src/tools/plots.py +50 -119
README.md
CHANGED
@@ -4,7 +4,7 @@ emoji: 🏆
 colorFrom: green
 colorTo: indigo
 sdk: gradio
-sdk_version:
+sdk_version: 4.4.0
 app_file: app.py
 pinned: true
 license: apache-2.0
app.py
CHANGED
@@ -1,56 +1,59 @@
+import gradio as gr
 import json
 import os
 from datetime import datetime, timezone
 
-import gradio as gr
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import snapshot_download
 
-from src.display.utils import (
-    COLS,
-    TYPES,
-    BENCHMARK_COLS,
-    EVAL_COLS,
-    EVAL_TYPES,
-    AutoEvalColumn,
-    ModelType,
-    NUMERIC_INTERVALS,
-    fields,
-)
-from src.display.css_html_js import custom_css, get_window_url_params
 from src.display.about import (
     CITATION_BUTTON_LABEL,
     CITATION_BUTTON_TEXT,
     EVALUATION_QUEUE_TEXT,
     INTRODUCTION_TEXT,
     LLM_BENCHMARKS_TEXT,
+    FAQ_TEXT,
     TITLE,
 )
+from src.display.css_html_js import custom_css
+from src.display.utils import (
+    BENCHMARK_COLS,
+    COLS,
+    EVAL_COLS,
+    EVAL_TYPES,
+    NUMERIC_INTERVALS,
+    TYPES,
+    AutoEvalColumn,
+    ModelType,
+    fields,
+    WeightType,
+    Precision
+)
+from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, H4_TOKEN, IS_PUBLIC, QUEUE_REPO, REPO_ID, RESULTS_REPO
+from src.populate import get_evaluation_queue_df, get_leaderboard_df
+from src.submission.submit import add_new_eval
+from src.submission.check_validity import already_submitted_models
+from src.tools.collections import update_collections
 from src.tools.plots import (
     create_metric_plot_obj,
-    create_scores_df,
     create_plot_df,
-    HUMAN_BASELINES,
+    create_scores_df,
 )
-from src.tools.collections import update_collections
-from src.populate import get_evaluation_queue_df, get_leaderboard_df
-from src.envs import H4_TOKEN, QUEUE_REPO, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, RESULTS_REPO, API, REPO_ID, IS_PUBLIC
-from src.submission.submit import add_new_eval
 
 
 def restart_space():
     API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
 
 try:
+    print(EVAL_REQUESTS_PATH)
     snapshot_download(
         repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
     )
 except Exception:
     restart_space()
 try:
+    print(EVAL_RESULTS_PATH)
     snapshot_download(
         repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
     )
@@ -58,13 +61,11 @@ except Exception:
     restart_space()
 
 
-original_df = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS)
+raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 update_collections(original_df.copy())
 leaderboard_df = original_df.copy()
 
-# plot_df = create_plot_df(create_scores_df(join_model_info_with_results(original_df)))
-#to_be_dumped = f"models = {repr(models)}\n"
+plot_df = create_plot_df(create_scores_df(raw_data))
 
 (
     finished_eval_queue_df,
@@ -73,26 +74,15 @@ leaderboard_df = original_df.copy()
 ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
 
-# Basics
-def change_tab(query_param: str):
-    query_param = query_param.replace("'", '"')
-    query_param = json.loads(query_param)
-
-    if isinstance(query_param, dict) and "tab" in query_param and query_param["tab"] == "evaluation":
-        return gr.Tabs.update(selected=1)
-    else:
-        return gr.Tabs.update(selected=0)
-
-
 # Searching and filtering
 def update_table(
+    hidden_df: pd.DataFrame,
+    columns: list,
+    type_query: list,
+    precision_query: str,
+    size_query: list,
+    show_deleted: bool,
+    query: str,
 ):
     filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted)
     filtered_df = filter_queries(query, filtered_df)
@@ -112,7 +102,7 @@ def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
     # We use COLS to maintain sorting
     filtered_df = df[
         always_here_cols + [c for c in COLS if c in df.columns and c in columns] + [AutoEvalColumn.dummy.name]
+    ]
     return filtered_df
 
 
@@ -137,7 +127,7 @@ def filter_queries(query: str, filtered_df: pd.DataFrame):
 
 
 def filter_models(
+    df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, show_deleted: bool
 ) -> pd.DataFrame:
     # Show all models
     if show_deleted:
@@ -146,8 +136,8 @@ def filter_models(
     filtered_df = df[df[AutoEvalColumn.still_on_hub.name] == True]
 
     type_emoji = [t[0] for t in type_query]
-    filtered_df = filtered_df[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
-    filtered_df = filtered_df[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
+    filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
+    filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
 
     numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
     params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
@@ -478,7 +468,7 @@ dummy1 = gr.Textbox(visible=False)
 hidden_leaderboard_table_for_search = gr.components.Dataframe(
     headers=COLS,
     datatype=TYPES,
+    max_rows=None,
     visible=False,
 )
 
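For readers following the `filter_models` change above: the rewrite switches to `.loc` indexing with boolean masks and keeps the `pd.Interval` buckets for the size filter. Below is a standalone sketch of that filtering pattern; the column names mirror the display columns, but the toy data and bucket values are invented for illustration only.

```python
# Standalone sketch of the mask-based filtering pattern used in filter_models.
# Column names ("T", "Precision", "#Params (B)") mirror the display columns;
# the data and size buckets below are made up for the example.
import pandas as pd

df = pd.DataFrame(
    {
        "T": ["🟢", "🔶", "🟢"],
        "Precision": ["float16", "bfloat16", "4bit"],
        "#Params (B)": [7.0, 13.0, 70.0],
    }
)

type_emoji = ["🟢"]                        # selected model-type symbols
precision_query = ["float16", "bfloat16"]  # selected precisions
size_buckets = pd.IntervalIndex(
    [pd.Interval(0, 10, closed="right"), pd.Interval(10, 100, closed="right")]
)

params = pd.to_numeric(df["#Params (B)"], errors="coerce")
mask = (
    df["T"].isin(type_emoji)
    & df["Precision"].isin(precision_query + ["None"])
    & params.apply(lambda x: any(size_buckets.contains(x)))
)
filtered_df = df.loc[mask]
print(filtered_df)
```

The real function applies its masks step by step on an already-filtered frame; collapsing them into a single mask, as above, is simply the shortest way to show the same logic.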
requirements.txt
CHANGED
@@ -13,8 +13,8 @@ pandas==2.0.0
 plotly==5.14.1
 python-dateutil==2.8.2
 requests==2.28.2
+sentencepiece
 semantic-version==2.10.0
 tqdm==4.65.0
-tokenizers>=0.15.0
+transformers==4.35.2
+tokenizers>=0.15.0
scripts/create_request_file.py
CHANGED
@@ -1,11 +1,12 @@
-from datetime import datetime, timezone
 import json
 import os
+import pprint
 import re
+from datetime import datetime, timezone
+
 import click
-from huggingface_hub import HfApi, snapshot_download
 from colorama import Fore
-import
+from huggingface_hub import HfApi, snapshot_download
 
 EVAL_REQUESTS_PATH = "eval-queue"
 QUEUE_REPO = "open-llm-leaderboard/requests"
@@ -19,7 +20,7 @@ def get_model_size(model_info, precision: str):
     size_pattern = size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
     try:
         model_size = round(model_info.safetensors["total"] / 1e9, 3)
-    except AttributeError:
+    except (AttributeError, TypeError):
         try:
             size_match = re.search(size_pattern, model_info.modelId.lower())
             model_size = size_match.group(0)
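The widened `except (AttributeError, TypeError)` above suggests that `model_info.safetensors` can be absent or `None` (subscripting `None` raises `TypeError`). A self-contained sketch of that fallback, using a hypothetical stand-in for `huggingface_hub`'s `ModelInfo`:

```python
# Sketch of the size-detection fallback: prefer the safetensors parameter
# count, otherwise look for a "7b"/"350m"-style hint in the model id.
# FakeModelInfo is a made-up stand-in for huggingface_hub.hf_api.ModelInfo.
import re
from dataclasses import dataclass
from typing import Optional

size_pattern = re.compile(r"(\d\.)?\d+(b|m)")

@dataclass
class FakeModelInfo:
    modelId: str
    safetensors: Optional[dict] = None

def get_model_size(model_info):
    try:
        # Raises TypeError when safetensors is None, AttributeError when absent.
        return round(model_info.safetensors["total"] / 1e9, 3)
    except (AttributeError, TypeError):
        match = re.search(size_pattern, model_info.modelId.lower())
        return match.group(0) if match else 0

print(get_model_size(FakeModelInfo("org/llama-7b")))                       # -> "7b"
print(get_model_size(FakeModelInfo("org/foo", {"total": 7_000_000_000})))  # -> 7.0
```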
src/display/about.py
CHANGED
@@ -13,20 +13,9 @@ LLM_BENCHMARKS_TEXT = f"""
 # Context
 With the plethora of large language models (LLMs) and chatbots being released week upon week, often with grandiose claims of their performance, it can be hard to filter out the genuine progress that is being made by the open-source community and which model is the current state of the art.
 
-## Icons
-{ModelType.PT.to_str(" : ")} model: new, base models, trained on a given corpora
-{ModelType.FT.to_str(" : ")} model: pretrained models finetuned on more data
-Specific fine-tune subcategories (more adapted to chat):
-{ModelType.IFT.to_str(" : ")} model: instruction fine-tunes, which are model fine-tuned specifically on datasets of task instruction
-{ModelType.RL.to_str(" : ")} model: reinforcement fine-tunes, which usually change the model loss a bit with an added policy.
-If there is no icon, we have not uploaded the information on the model yet, feel free to open an issue with the model information!
-
-"Flagged" indicates that this model has been flagged by the community, and should probably be ignored! Clicking the link will redirect you to the discussion about the model.
-(For ex, the model was trained on the evaluation data, and is therefore cheating on the leaderboard.)
-
 ## How it works
 
-📈 We evaluate models on
+📈 We evaluate models on 7 key benchmarks using the <a href="https://github.com/EleutherAI/lm-evaluation-harness" target="_blank"> Eleuther AI Language Model Evaluation Harness </a>, a unified framework to test generative language models on a large number of different evaluation tasks.
 
 - <a href="https://arxiv.org/abs/1803.05457" target="_blank"> AI2 Reasoning Challenge </a> (25-shot) - a set of grade-school science questions.
 - <a href="https://arxiv.org/abs/1905.07830" target="_blank"> HellaSwag </a> (10-shot) - a test of commonsense inference, which is easy for humans (~95%) but challenging for SOTA models.
@@ -34,7 +23,6 @@ If there is no icon, we have not uploaded the information on the model yet, feel
 - <a href="https://arxiv.org/abs/2109.07958" target="_blank"> TruthfulQA </a> (0-shot) - a test to measure a model's propensity to reproduce falsehoods commonly found online. Note: TruthfulQA in the Harness is actually a minima a 6-shots task, as it is prepended by 6 examples systematically, even when launched using 0 for the number of few-shot examples.
 - <a href="https://arxiv.org/abs/1907.10641" target="_blank"> Winogrande </a> (5-shot) - an adversarial and difficult Winograd benchmark at scale, for commonsense reasoning.
 - <a href="https://arxiv.org/abs/2110.14168" target="_blank"> GSM8k </a> (5-shot) - diverse grade school math word problems to measure a model's ability to solve multi-step mathematical reasoning problems.
-- <a href="https://arxiv.org/abs/1903.00161" target="_blank"> DROP </a> (3-shot) - English reading comprehension benchmark requiring Discrete Reasoning Over the content of Paragraphs.
 
 For all these evaluations, a higher score is a better score.
 We chose these benchmarks as they test a variety of reasoning and general knowledge across a wide variety of fields in 0-shot and few-shot settings.
@@ -47,10 +35,10 @@ You can find:
 
 ## Reproducibility
 To reproduce our results, here is the commands you can run, using [this version](https://github.com/EleutherAI/lm-evaluation-harness/tree/b281b0921b636bc36ad05c0b0b0763bd6dd43463) of the Eleuther AI Harness:
-`python main.py --model=hf-causal --model_args="pretrained=<your_model>,use_accelerate=True,revision=<your_model_revision>"`
-` --tasks=<task_list> --num_fewshot=<n_few_shot> --batch_size=
+`python main.py --model=hf-causal-experimental --model_args="pretrained=<your_model>,use_accelerate=True,revision=<your_model_revision>"`
+` --tasks=<task_list> --num_fewshot=<n_few_shot> --batch_size=1 --output_path=<output_path>`
 
-The total batch size we get for models which fit on one A100 node is
+The total batch size we get for models which fit on one A100 node is 8 (8 GPUs * 1). If you don't use parallelism, adapt your batch size to fit.
 *You can expect results to vary slightly for different batch sizes because of padding.*
 
 The tasks and few shots parameters are:
@@ -60,29 +48,95 @@ The tasks and few shots parameters are:
 - MMLU: 5-shot, *hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions* (average of all the results `acc`)
 - Winogrande: 5-shot, *winogrande* (`acc`)
 - GSM8k: 5-shot, *gsm8k* (`acc`)
-- DROP: 3-shot, *drop* (`f1`)
 
 Side note on the baseline scores:
 - for log-likelihood evaluation, we select the random baseline
-- for
+- for GSM8K, we select the score obtained in the paper after finetuning a 6B model on the full GSM8K training set for 50 epochs
+
+## Icons
+- {ModelType.PT.to_str(" : ")} model: new, base models, trained on a given corpora
+- {ModelType.FT.to_str(" : ")} model: pretrained models finetuned on more data
+Specific fine-tune subcategories (more adapted to chat):
+- {ModelType.IFT.to_str(" : ")} model: instruction fine-tunes, which are model fine-tuned specifically on datasets of task instruction
+- {ModelType.RL.to_str(" : ")} model: reinforcement fine-tunes, which usually change the model loss a bit with an added policy.
+If there is no icon, we have not uploaded the information on the model yet, feel free to open an issue with the model information!
+
+"Flagged" indicates that this model has been flagged by the community, and should probably be ignored! Clicking the link will redirect you to the discussion about the model.
 
 ## Quantization
 To get more information about quantization, see:
 - 8 bits: [blog post](https://huggingface.co/blog/hf-bitsandbytes-integration), [paper](https://arxiv.org/abs/2208.07339)
 - 4 bits: [blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes), [paper](https://arxiv.org/abs/2305.14314)
 
-##
+## Useful links
+- [Community resources](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/174)
+- [Collection of best models](https://huggingface.co/collections/open-llm-leaderboard/llm-leaderboard-best-models-652d6c7965a4619fb5c27a03)
 """
 
+FAQ_TEXT = """
+---------------------------
+# FAQ
+Below are some common questions - if this FAQ does not answer you, feel free to create a new issue, and we'll take care of it as soon as we can!
+
+## 1) Submitting a model
+My model requires `trust_remote_code=True`, can I submit it?
+- *We only support models that have been integrated in a stable version of the `transformers` library for automatic submission, as we don't want to run possibly unsage code on our cluster.*
+
+What about models of type X?
+- *We only support models that have been integrated in a stable version of the `transformers` library for automatic submission.*
+
+How can I follow when my model is launched?
+- *You can look for its request file [here](https://huggingface.co/datasets/open-llm-leaderboard/requests) and follow the status evolution, or directly in the queues above the submit form.*
+
+My model disappeared from all the queues, what happened?
+- *A model disappearing from all the queues usually means that there has been a failure. You can check if that is the case by looking for your model [here](https://huggingface.co/datasets/open-llm-leaderboard/requests).*
+
+What causes an evaluation failure?
+- *Most of the failures we get come from problems in the submissions (corrupted files, config problems, wrong parameters selected for eval ...), so we'll be grateful if you first make sure you have followed the steps in `About`. However, from time to time, we have failures on our side (hardware/node failures, problem with an update of our backend, connectivity problem ending up in the results not being saved, ...).*
+
+How can I report an evaluation failure?
+- *As we store the logs for all models, feel free to create an issue, **where you link to the requests file of your model** (look for it [here](https://huggingface.co/datasets/open-llm-leaderboard/requests/tree/main)), so we can investigate! If the model failed due to a problem on our side, we'll relaunch it right away!*
+*Note: Please do not re-upload your model under a different name, it will not help*
+
+## 2) Model results
+What kind of information can I find?
+- *Let's imagine you are interested in the Yi-34B results. You have access to 3 different information categories:*
+    - *The [request file](https://huggingface.co/datasets/open-llm-leaderboard/requests/blob/main/01-ai/Yi-34B_eval_request_False_bfloat16_Original.json): it gives you information about the status of the evaluation*
+    - *The [aggregated results folder](https://huggingface.co/datasets/open-llm-leaderboard/results/tree/main/01-ai/Yi-34B): it gives you aggregated scores, per experimental run*
+    - *The [details dataset](https://huggingface.co/datasets/open-llm-leaderboard/details_01-ai__Yi-34B/tree/main): it gives you the full details (scores and examples for each task and a given model)*
+
+
+Why do models appear several times in the leaderboard?
+- *We run evaluations with user selected precision and model commit. Sometimes, users submit specific models at different commits and at different precisions (for example, in float16 and 4bit to see how quantization affects performance). You should be able to verify this by displaying the `precision` and `model sha` columns in the display. If, however, you see models appearing several time with the same precision and hash commit, this is not normal.*
+
+What is this concept of "flagging"?
+- *This mechanism allows user to report models that have unfair performance on the leaderboard. This contains several categories: exceedingly good results on the leaderboard because the model was (maybe accidentally) trained on the evaluation data, models that are copy of other models not atrributed properly, etc.*
+
+My model has been flagged improperly, what can I do?
+- *Every flagged model has a discussion associated with it - feel free to plead your case there, and we'll see what to do together with the community.*
+
+## 3) Editing a submission
+I upgraded my model and want to re-submit, how can I do that?
+- *Please open an issue with the precise name of your model, and we'll remove your model from the leaderboard so you can resubmit. You can also resubmit directly with the new commit hash!*
+
+I need to rename my model, how can I do that?
+- *You can use @Weyaxi 's [super cool tool](https://huggingface.co/spaces/Weyaxi/open-llm-leaderboard-renamer) to request model name changes, then open a discussion where you link to the created pull request, and we'll check them and merge them as needed.*
+
+## 4) Other
+Why don't you display closed source model scores?
+- *This is a leaderboard for Open models, both for philosophical reasons (openness is cool) and for practical reasons: we want to ensure that the results we display are accurate and reproducible, but 1) commercial closed models can change their API thus rendering any scoring at a given time incorrect 2) we re-run everything on our cluster to ensure all models are run on the same setup and you can't do that for these models.*
+
+I have an issue about accessing the leaderboard through the Gradio API
+- *Since this is not the recommended way to access the leaderboard, we won't provide support for this, but you can look at tools provided by the community for inspiration!*
+"""
+
+
 EVALUATION_QUEUE_TEXT = """
 # Evaluation Queue for the 🤗 Open LLM Leaderboard
 
 Models added here will be automatically evaluated on the 🤗 cluster.
 
-##
+## First steps before submitting a model
 
 ### 1) Make sure you can load your model and tokenizer using AutoClasses:
 ```python
@@ -205,17 +259,4 @@ CITATION_BUTTON_TEXT = r"""
 archivePrefix={arXiv},
 primaryClass={cs.CL}
 }
-  title={{DROP:} {A} Reading Comprehension Benchmark Requiring Discrete Reasoning
-Over Paragraphs},
-  author={Dheeru Dua and
-  Yizhong Wang and
-  Pradeep Dasigi and
-  Gabriel Stanovsky and
-  Sameer Singh and
-  Matt Gardner},
-  year={2019},
-  eprinttype={arXiv},
-  eprint={1903.00161},
-  primaryClass={cs.CL}
-}"""
+"""
src/display/formatting.py
CHANGED
@@ -1,24 +1,11 @@
 import os
-from
-API = HfApi()
+from datetime import datetime, timezone
 
-    "huggingface/llama-13b",
-    "huggingface/llama-30b",
-    "huggingface/llama-65b",
-]
+from huggingface_hub import HfApi
+from huggingface_hub.hf_api import ModelInfo
 
-KOALA_LINK = "https://huggingface.co/TheBloke/koala-13B-HF"
-VICUNA_LINK = "https://huggingface.co/lmsys/vicuna-13b-delta-v1.1"
-OASST_LINK = "https://huggingface.co/OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5"
-DOLLY_LINK = "https://huggingface.co/databricks/dolly-v2-12b"
-MODEL_PAGE = "https://huggingface.co/models"
-LLAMA_LINK = "https://ai.facebook.com/blog/large-language-model-llama-meta-ai/"
-VICUNA_LINK = "https://huggingface.co/CarperAI/stable-vicuna-13b-delta"
-ALPACA_LINK = "https://crfm.stanford.edu/2023/03/13/alpaca.html"
 
+API = HfApi()
 
 def model_hyperlink(link, model_name):
     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
@@ -27,44 +14,9 @@ def model_hyperlink(link, model_name):
 def make_clickable_model(model_name):
     link = f"https://huggingface.co/{model_name}"
 
-    if model_name in LLAMAS:
-        link = LLAMA_LINK
-        model_name = model_name.split("/")[1]
-    elif model_name == "HuggingFaceH4/stable-vicuna-13b-2904":
-        link = VICUNA_LINK
-        model_name = "stable-vicuna-13b"
-    elif model_name == "HuggingFaceH4/llama-7b-ift-alpaca":
-        link = ALPACA_LINK
-        model_name = "alpaca-13b"
-    if model_name == "dolly-12b":
-        link = DOLLY_LINK
-    elif model_name == "vicuna-13b":
-        link = VICUNA_LINK
-    elif model_name == "koala-13b":
-        link = KOALA_LINK
-    elif model_name == "oasst-12b":
-        link = OASST_LINK
 
     details_model_name = model_name.replace("/", "__")
     details_link = f"https://huggingface.co/datasets/open-llm-leaderboard/details_{details_model_name}"
 
-    if not bool(os.getenv("DEBUG", "False")):
-        # We only add these checks when not debugging, as they are extremely slow
-        print(f"details_link: {details_link}")
-        try:
-            check_path = list(
-                API.list_files_info(
-                    repo_id=f"open-llm-leaderboard/details_{details_model_name}",
-                    paths="README.md",
-                    repo_type="dataset",
-                )
-            )
-            print(f"check_path: {check_path}")
-        except Exception as err:
-            # No details repo for this model
-            print(f"No details repo for this model: {err}")
-            return model_hyperlink(link, model_name)
 
     return model_hyperlink(link, model_name) + " " + model_hyperlink(details_link, "📑")
 
src/display/utils.py
CHANGED
@@ -1,7 +1,25 @@
-from dataclasses import dataclass
-import pandas as pd
+from dataclasses import dataclass, make_dataclass
 from enum import Enum
 
+import pandas as pd
+
+def fields(raw_class):
+    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
+
+
+@dataclass
+class Task:
+    benchmark: str
+    metric: str
+    col_name: str
+
+class Tasks(Enum):
+    arc = Task("arc:challenge", "acc_norm", "ARC")
+    hellaswag = Task("hellaswag", "acc_norm", "HellaSwag")
+    mmlu = Task("hendrycksTest", "acc", "MMLU")
+    truthfulqa = Task("truthfulqa:mc", "mc2", "TruthfulQA")
+    winogrande = Task("winogrande", "acc", "Winogrande")
+    gsm8k = Task("gsm8k", "acc", "GSM8K")
 
 # These classes are for user facing column names,
 # to avoid having to change them all around the code
@@ -15,35 +33,29 @@ class ColumnContent:
     never_hidden: bool = False
     dummy: bool = False
 
-    still_on_hub = ColumnContent("Available on the hub", "bool", False)
-    revision = ColumnContent("Model sha", "str", False, False)
-    dummy = ColumnContent(
-        "model_name_for_query", "str", False, dummy=True
-    )  # dummy col to implement search bar (hidden by custom CSS)
+auto_eval_column_dict = []
+# Init
+auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
+auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+#Scores
+auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
+for task in Tasks:
+    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+# Model information
+auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
+auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
+auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
+auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
+auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
+auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
+auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
+auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
+auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
+# Dummy column for the search bar (hidden by the custom CSS)
+auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])
+
+# We use make dataclass to dynamically fill the scores from Tasks
+AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
 
 @dataclass(frozen=True)
 class EvalQueueColumn:  # Queue column
@@ -59,31 +71,52 @@ baseline_row = {
     AutoEvalColumn.model.name: "<p>Baseline</p>",
     AutoEvalColumn.revision.name: "N/A",
     AutoEvalColumn.precision.name: None,
-    AutoEvalColumn.average.name:
+    AutoEvalColumn.average.name: 31.0,
     AutoEvalColumn.arc.name: 25.0,
     AutoEvalColumn.hellaswag.name: 25.0,
     AutoEvalColumn.mmlu.name: 25.0,
     AutoEvalColumn.truthfulqa.name: 25.0,
     AutoEvalColumn.winogrande.name: 50.0,
     AutoEvalColumn.gsm8k.name: 0.21,
-    AutoEvalColumn.drop.name: 0.47,
     AutoEvalColumn.dummy.name: "baseline",
     AutoEvalColumn.model_type.name: "",
 }
 
+# Average ⬆️ human baseline is 0.897 (source: averaging human baselines below)
+# ARC human baseline is 0.80 (source: https://lab42.global/arc/)
+# HellaSwag human baseline is 0.95 (source: https://deepgram.com/learn/hellaswag-llm-benchmark-guide)
+# MMLU human baseline is 0.898 (source: https://openreview.net/forum?id=d7KBjmI3GmQ)
+# TruthfulQA human baseline is 0.94(source: https://arxiv.org/pdf/2109.07958.pdf)
+# Winogrande: https://leaderboard.allenai.org/winogrande/submissions/public
+# GSM8K: paper
+# Define the human baselines
+human_baseline_row = {
+    AutoEvalColumn.model.name: "<p>Human performance</p>",
+    AutoEvalColumn.revision.name: "N/A",
+    AutoEvalColumn.precision.name: None,
+    AutoEvalColumn.average.name: 92.75,
+    AutoEvalColumn.arc.name: 80.0,
+    AutoEvalColumn.hellaswag.name: 95.0,
+    AutoEvalColumn.mmlu.name: 89.8,
+    AutoEvalColumn.truthfulqa.name: 94.0,
+    AutoEvalColumn.winogrande.name: 94.0,
+    AutoEvalColumn.gsm8k.name: 100,
+    AutoEvalColumn.dummy.name: "human_baseline",
+    AutoEvalColumn.model_type.name: "",
+}
 
 @dataclass
-class
+class ModelDetails:
     name: str
-    symbol: str
+    symbol: str = ""  # emoji, only for the model type
 
 
 class ModelType(Enum):
-    PT =
-    FT =
-    IFT =
-    RL =
-    Unknown =
+    PT = ModelDetails(name="pretrained", symbol="🟢")
+    FT = ModelDetails(name="fine-tuned", symbol="🔶")
+    IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
+    RL = ModelDetails(name="RL-tuned", symbol="🟦")
+    Unknown = ModelDetails(name="", symbol="?")
 
     def to_str(self, separator=" "):
         return f"{self.value.symbol}{separator}{self.value.name}"
@@ -100,22 +133,33 @@ class ModelType(Enum):
             return ModelType.IFT
         return ModelType.Unknown
 
+class WeightType(Enum):
+    Adapter = ModelDetails("Adapter")
+    Original = ModelDetails("Original")
+    Delta = ModelDetails("Delta")
+
+class Precision(Enum):
+    float16 = ModelDetails("float16")
+    bfloat16 = ModelDetails("bfloat16")
+    qt_8bit = ModelDetails("8bit")
+    qt_4bit = ModelDetails("4bit")
+    qt_GPTQ = ModelDetails("GPTQ")
+    Unknown = ModelDetails("?")
+
+    def from_str(precision):
+        if precision in ["torch.float16", "float16"]:
+            return Precision.float16
+        if precision in ["torch.bfloat16", "bfloat16"]:
+            return Precision.bfloat16
+        if precision in ["8bit"]:
+            return Precision.qt_8bit
+        if precision in ["4bit"]:
+            return Precision.qt_4bit
+        if precision in ["GPTQ", "None"]:
+            return Precision.qt_GPTQ
+        return Precision.Unknown
 
-@dataclass
-class Task:
-    benchmark: str
-    metric: str
-    col_name: str
-
-
-class Tasks(Enum):
-    arc = Task("arc:challenge", "acc_norm", AutoEvalColumn.arc.name)
-    hellaswag = Task("hellaswag", "acc_norm", AutoEvalColumn.hellaswag.name)
-    mmlu = Task("hendrycksTest", "acc", AutoEvalColumn.mmlu.name)
-    truthfulqa = Task("truthfulqa:mc", "mc2", AutoEvalColumn.truthfulqa.name)
-    winogrande = Task("winogrande", "acc", AutoEvalColumn.winogrande.name)
-    gsm8k = Task("gsm8k", "acc", AutoEvalColumn.gsm8k.name)
-    drop = Task("drop", "f1", AutoEvalColumn.drop.name)
 
 
 # Column selection
@@ -127,7 +171,7 @@ TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default a
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
-BENCHMARK_COLS = [t.value.col_name for t in Tasks
+BENCHMARK_COLS = [t.value.col_name for t in Tasks]
 
 NUMERIC_INTERVALS = {
     "?": pd.Interval(-1, 0, closed="right"),
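The biggest structural change in this file is that `AutoEvalColumn` is no longer hand-written: column descriptors are collected in `auto_eval_column_dict` and turned into a frozen dataclass with `make_dataclass`, while `fields()` simply walks the class `__dict__`. Below is a reduced demo of that pattern; the `ColumnContent` stand-in is declared frozen here so its instances are safe to use as class-level defaults, and everything is cut down to two columns.

```python
# Reduced demo of the make_dataclass pattern behind AutoEvalColumn.
from dataclasses import dataclass, make_dataclass

@dataclass(frozen=True)
class ColumnContent:  # frozen stand-in for the ColumnContent class above
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False
    dummy: bool = False

def fields(raw_class):
    # Same helper as in the diff: every non-dunder class attribute.
    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]

columns = [
    ["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)],
    ["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)],
]
DemoColumn = make_dataclass("DemoColumn", columns, frozen=True)

print(DemoColumn.model.name)                 # "Model"
print([c.name for c in fields(DemoColumn)])  # ["Model", "Average ⬆️"]
```

With this layout, adding a benchmark only means adding a `Task`; its score column is generated by the `for task in Tasks` loop rather than edited by hand.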
src/envs.py
CHANGED
@@ -1,4 +1,5 @@
 import os
+
 from huggingface_hub import HfApi
 
 # clone / pull the lmeh eval data
@@ -13,8 +14,10 @@ PRIVATE_RESULTS_REPO = "open-llm-leaderboard/private-results"
 
 IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))
 
+CACHE_PATH=os.getenv("HF_HOME", ".")
+
+EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
+EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
 
 EVAL_REQUESTS_PATH_PRIVATE = "eval-queue-private"
 EVAL_RESULTS_PATH_PRIVATE = "eval-results-private"
@@ -24,5 +27,6 @@ PATH_TO_COLLECTION = "open-llm-leaderboard/llm-leaderboard-best-models-652d6c796
 # Rate limit variables
 RATE_LIMIT_PERIOD = 7
 RATE_LIMIT_QUOTA = 5
+HAS_HIGHER_RATE_LIMIT = ["TheBloke"]
 
 API = HfApi(token=H4_TOKEN)
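A small illustration of what the new cache-path lines change in practice: everything the app downloads now lands under `HF_HOME` when that variable is set, and under the working directory otherwise. The printed paths are examples, not guaranteed locations.

```python
# Behaviour of the new CACHE_PATH resolution in src/envs.py.
import os

CACHE_PATH = os.getenv("HF_HOME", ".")
EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")

# With HF_HOME unset:      ./eval-queue and ./eval-results
# With HF_HOME=/data/hf:   /data/hf/eval-queue and /data/hf/eval-results
print(EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH)
```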
src/leaderboard/read_evals.py
CHANGED
@@ -1,37 +1,41 @@
|
|
|
|
1 |
import json
|
2 |
-
import os
|
3 |
import math
|
4 |
-
import
|
5 |
from dataclasses import dataclass
|
6 |
-
from typing import Dict, List, Tuple
|
7 |
|
8 |
import dateutil
|
|
|
|
|
9 |
import numpy as np
|
10 |
|
11 |
-
from src.display.utils import AutoEvalColumn, ModelType, Tasks
|
12 |
from src.display.formatting import make_clickable_model
|
|
|
13 |
from src.submission.check_validity import is_model_on_hub
|
14 |
|
15 |
|
16 |
@dataclass
|
17 |
class EvalResult:
|
18 |
-
|
19 |
-
|
20 |
-
|
|
|
21 |
model: str
|
22 |
-
revision: str
|
23 |
results: dict
|
24 |
-
precision:
|
25 |
-
model_type: ModelType = ModelType.Unknown
|
26 |
-
weight_type:
|
|
|
27 |
license: str = "?"
|
28 |
likes: int = 0
|
29 |
num_params: int = 0
|
30 |
-
date: str = ""
|
31 |
still_on_hub: bool = False
|
32 |
|
33 |
@classmethod
|
34 |
def init_from_json_file(self, json_filepath):
|
|
|
35 |
with open(json_filepath) as fp:
|
36 |
data = json.load(fp)
|
37 |
|
@@ -39,9 +43,7 @@ class EvalResult:
|
|
39 |
config = data.get("config", data.get("config_general", None))
|
40 |
|
41 |
# Precision
|
42 |
-
precision = config.get("model_dtype")
|
43 |
-
if precision == "None":
|
44 |
-
precision = "GPTQ"
|
45 |
|
46 |
# Get model and org
|
47 |
org_and_model = config.get("model_name", config.get("model_args", None))
|
@@ -50,13 +52,21 @@ class EvalResult:
|
|
50 |
if len(org_and_model) == 1:
|
51 |
org = None
|
52 |
model = org_and_model[0]
|
53 |
-
result_key = f"{model}_{precision}"
|
54 |
else:
|
55 |
org = org_and_model[0]
|
56 |
model = org_and_model[1]
|
57 |
-
result_key = f"{org}_{model}_{precision}"
|
|
|
58 |
|
59 |
-
still_on_hub
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
60 |
|
61 |
# Extract results available in this file (some results are split in several files)
|
62 |
results = {}
|
@@ -73,8 +83,8 @@ class EvalResult:
|
|
73 |
continue
|
74 |
|
75 |
# Some truthfulQA values are NaNs
|
76 |
-
if task.benchmark == "truthfulqa:mc" and
|
77 |
-
if math.isnan(float(data["results"][
|
78 |
results[task.benchmark] = 0.0
|
79 |
continue
|
80 |
|
@@ -88,37 +98,42 @@ class EvalResult:
|
|
88 |
|
89 |
return self(
|
90 |
eval_name=result_key,
|
91 |
-
full_model=
|
92 |
org=org,
|
93 |
model=model,
|
94 |
results=results,
|
95 |
-
precision=precision,
|
96 |
-
revision=config.get("model_sha", ""),
|
97 |
-
date=config.get("submission_date", ""),
|
98 |
still_on_hub=still_on_hub,
|
|
|
99 |
)
|
100 |
|
101 |
-
def update_with_request_file(self):
|
102 |
-
|
|
|
103 |
|
104 |
try:
|
105 |
with open(request_file, "r") as f:
|
106 |
request = json.load(f)
|
107 |
self.model_type = ModelType.from_str(request.get("model_type", ""))
|
|
|
108 |
self.license = request.get("license", "?")
|
109 |
self.likes = request.get("likes", 0)
|
110 |
self.num_params = request.get("params", 0)
|
|
|
111 |
except Exception:
|
112 |
print(f"Could not find request file for {self.org}/{self.model}")
|
113 |
|
114 |
def to_dict(self):
|
|
|
115 |
average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
|
116 |
data_dict = {
|
117 |
"eval_name": self.eval_name, # not a column, just a save name,
|
118 |
-
AutoEvalColumn.precision.name: self.precision,
|
119 |
AutoEvalColumn.model_type.name: self.model_type.value.name,
|
120 |
AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
|
121 |
-
AutoEvalColumn.weight_type.name: self.weight_type,
|
|
|
122 |
AutoEvalColumn.model.name: make_clickable_model(self.full_model),
|
123 |
AutoEvalColumn.dummy.name: self.full_model,
|
124 |
AutoEvalColumn.revision.name: self.revision,
|
@@ -135,9 +150,10 @@ class EvalResult:
|
|
135 |
return data_dict
|
136 |
|
137 |
|
138 |
-
def get_request_file_for_model(model_name, precision):
|
|
|
139 |
request_files = os.path.join(
|
140 |
-
|
141 |
f"{model_name}_eval_request_*.json",
|
142 |
)
|
143 |
request_files = glob.glob(request_files)
|
@@ -149,15 +165,16 @@ def get_request_file_for_model(model_name, precision):
|
|
149 |
with open(tmp_request_file, "r") as f:
|
150 |
req_content = json.load(f)
|
151 |
if (
|
152 |
-
req_content["status"] in ["FINISHED"
|
153 |
and req_content["precision"] == precision.split(".")[-1]
|
154 |
):
|
155 |
request_file = tmp_request_file
|
156 |
return request_file
|
157 |
|
158 |
|
159 |
-
def
|
160 |
-
|
|
|
161 |
|
162 |
for root, _, files in os.walk(results_path):
|
163 |
# We should only have json files in model results
|
@@ -170,15 +187,14 @@ def get_eval_results(results_path: str) -> List[EvalResult]:
|
|
170 |
except dateutil.parser._parser.ParserError:
|
171 |
files = [files[-1]]
|
172 |
|
173 |
-
# up_to_date = files[-1]
|
174 |
for file in files:
|
175 |
-
|
176 |
|
177 |
eval_results = {}
|
178 |
-
for
|
179 |
# Creation of result
|
180 |
-
eval_result = EvalResult.init_from_json_file(
|
181 |
-
eval_result.update_with_request_file()
|
182 |
|
183 |
# Store results of same eval together
|
184 |
eval_name = eval_result.eval_name
|
@@ -190,8 +206,9 @@ def get_eval_results(results_path: str) -> List[EvalResult]:
|
|
190 |
results = []
|
191 |
for v in eval_results.values():
|
192 |
try:
|
193 |
-
|
194 |
-
|
|
|
195 |
continue
|
196 |
|
197 |
return results
|
|
|
1 |
+
import glob
|
2 |
import json
|
|
|
3 |
import math
|
4 |
+
import os
|
5 |
from dataclasses import dataclass
|
|
|
6 |
|
7 |
import dateutil
|
8 |
+
from datetime import datetime
|
9 |
+
from transformers import AutoConfig
|
10 |
import numpy as np
|
11 |
|
|
|
12 |
from src.display.formatting import make_clickable_model
|
13 |
+
from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
|
14 |
from src.submission.check_validity import is_model_on_hub
|
15 |
|
16 |
|
17 |
@dataclass
|
18 |
class EvalResult:
|
19 |
+
# Also see src.display.utils.AutoEvalColumn for what will be displayed.
|
20 |
+
eval_name: str # org_model_precision (uid)
|
21 |
+
full_model: str # org/model (path on hub)
|
22 |
+
org: str
|
23 |
model: str
|
24 |
+
revision: str # commit hash, "" if main
|
25 |
     results: dict
+    precision: Precision = Precision.Unknown
+    model_type: ModelType = ModelType.Unknown  # Pretrained, fine tuned, ...
+    weight_type: WeightType = WeightType.Original  # Original or Adapter
+    architecture: str = "Unknown"  # From config file
     license: str = "?"
     likes: int = 0
     num_params: int = 0
+    date: str = ""  # submission date of request file
     still_on_hub: bool = False
 
     @classmethod
     def init_from_json_file(self, json_filepath):
+        """Inits the result from the specific model result file"""
         with open(json_filepath) as fp:
             data = json.load(fp)
…
         config = data.get("config", data.get("config_general", None))
 
         # Precision
+        precision = Precision.from_str(config.get("model_dtype"))
 
         # Get model and org
         org_and_model = config.get("model_name", config.get("model_args", None))
…
         if len(org_and_model) == 1:
             org = None
             model = org_and_model[0]
+            result_key = f"{model}_{precision.value.name}"
         else:
             org = org_and_model[0]
             model = org_and_model[1]
+            result_key = f"{org}_{model}_{precision.value.name}"
+        full_model = "/".join(org_and_model)
 
+        still_on_hub, error, model_config = is_model_on_hub(
+            full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
+        )
+        architecture = "?"
+        if model_config is not None:
+            architectures = getattr(model_config, "architectures", None)
+            if architectures:
+                architecture = ";".join(architectures)
 
         # Extract results available in this file (some results are split in several files)
         results = {}
…
                 continue
 
             # Some truthfulQA values are NaNs
+            if task.benchmark == "truthfulqa:mc" and "harness|truthfulqa:mc|0" in data["results"]:
+                if math.isnan(float(data["results"]["harness|truthfulqa:mc|0"][task.metric])):
                     results[task.benchmark] = 0.0
                     continue
…
         return self(
             eval_name=result_key,
+            full_model=full_model,
             org=org,
             model=model,
             results=results,
+            precision=precision,
+            revision= config.get("model_sha", ""),
             still_on_hub=still_on_hub,
+            architecture=architecture
         )
 
+    def update_with_request_file(self, requests_path):
+        """Finds the relevant request file for the current model and updates info with it"""
+        request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
 
         try:
             with open(request_file, "r") as f:
                 request = json.load(f)
             self.model_type = ModelType.from_str(request.get("model_type", ""))
+            self.weight_type = WeightType[request.get("weight_type", "Original")]
             self.license = request.get("license", "?")
             self.likes = request.get("likes", 0)
             self.num_params = request.get("params", 0)
+            self.date = request.get("submitted_time", "")
         except Exception:
             print(f"Could not find request file for {self.org}/{self.model}")
 
     def to_dict(self):
+        """Converts the Eval Result to a dict compatible with our dataframe display"""
         average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
         data_dict = {
             "eval_name": self.eval_name,  # not a column, just a save name,
+            AutoEvalColumn.precision.name: self.precision.value.name,
             AutoEvalColumn.model_type.name: self.model_type.value.name,
             AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
+            AutoEvalColumn.weight_type.name: self.weight_type.value.name,
+            AutoEvalColumn.architecture.name: self.architecture,
             AutoEvalColumn.model.name: make_clickable_model(self.full_model),
             AutoEvalColumn.dummy.name: self.full_model,
             AutoEvalColumn.revision.name: self.revision,
…
         return data_dict
 
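Taken together, the new fields and methods change how a single result file becomes a leaderboard row: init_from_json_file now resolves the precision, architecture and hub status itself, update_with_request_file pulls in the submission metadata, and to_dict averages over every task in Tasks. A minimal sketch of that lifecycle (the file and directory names below are hypothetical placeholders, not paths from this repo):

from src.leaderboard.read_evals import EvalResult

# Hypothetical paths, for illustration only
result = EvalResult.init_from_json_file("eval-results/some-org/some-model/results_2023-11-01.json")
result.update_with_request_file("eval-queue")  # fills model_type, weight_type, license, likes, num_params, date

row = result.to_dict()  # one leaderboard row; raises KeyError if a benchmark value is missing
print(row["eval_name"], result.date)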
+def get_request_file_for_model(requests_path, model_name, precision):
+    """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
     request_files = os.path.join(
+        requests_path,
         f"{model_name}_eval_request_*.json",
     )
     request_files = glob.glob(request_files)
…
         with open(tmp_request_file, "r") as f:
             req_content = json.load(f)
             if (
+                req_content["status"] in ["FINISHED"]
                 and req_content["precision"] == precision.split(".")[-1]
             ):
                 request_file = tmp_request_file
     return request_file
 
 
+def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
+    """From the path of the results folder root, extract all needed info for results"""
+    model_result_filepaths = []
 
     for root, _, files in os.walk(results_path):
         # We should only have json files in model results
…
         except dateutil.parser._parser.ParserError:
             files = [files[-1]]
 
         for file in files:
+            model_result_filepaths.append(os.path.join(root, file))
 
     eval_results = {}
+    for model_result_filepath in model_result_filepaths:
         # Creation of result
+        eval_result = EvalResult.init_from_json_file(model_result_filepath)
+        eval_result.update_with_request_file(requests_path)
 
         # Store results of same eval together
         eval_name = eval_result.eval_name
…
     results = []
     for v in eval_results.values():
         try:
+            v.to_dict()  # we test if the dict version is complete
+            results.append(v)
+        except KeyError:  # not all eval values present
             continue
 
     return results
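The two module-level helpers are the entry points the rest of the app relies on: get_request_file_for_model keeps only FINISHED requests whose precision matches, and get_raw_eval_results walks the results tree and drops models whose row is incomplete. A rough usage sketch, with placeholder directory names standing in for the queue and results folders:

from src.leaderboard.read_evals import get_raw_eval_results, get_request_file_for_model

# Latest FINISHED request for this model at this precision (placeholder arguments)
request_file = get_request_file_for_model("eval-queue", "some-org/some-model", "float16")

# All complete EvalResult objects found under the results folder
raw_results = get_raw_eval_results("eval-results", "eval-queue")
print(len(raw_results))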
src/populate.py
CHANGED
@@ -3,24 +3,25 @@ import os
 
 import pandas as pd
 
-from src.
-from src.leaderboard.read_evals import get_eval_results
-from src.display.formatting import make_clickable_model, has_no_nan_values
+from src.display.formatting import has_no_nan_values, make_clickable_model
 from src.display.utils import AutoEvalColumn, EvalQueueColumn, baseline_row
+from src.leaderboard.filter_models import filter_models
+from src.leaderboard.read_evals import get_raw_eval_results
 
 
-def get_leaderboard_df(results_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
-
-
-
+def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
+    raw_data = get_raw_eval_results(results_path, requests_path)
+    all_data_json = [v.to_dict() for v in raw_data]
+    all_data_json.append(baseline_row)
+    filter_models(all_data_json)
 
-    df = pd.DataFrame.from_records(
+    df = pd.DataFrame.from_records(all_data_json)
     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
     df = df[cols].round(decimals=2)
 
     # filter out if any of the benchmarks have not been produced
     df = df[has_no_nan_values(df, benchmark_cols)]
-    return df
+    return raw_data, df
 
 
 def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
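get_leaderboard_df now takes the requests path as well and returns both the raw EvalResult list and the display DataFrame, so callers have to unpack two values. A sketch of the new call, with placeholder directories standing in for EVAL_RESULTS_PATH and EVAL_REQUESTS_PATH:

from src.display.utils import AutoEvalColumn, BENCHMARK_COLS, COLS
from src.populate import get_leaderboard_df

raw_data, leaderboard_df = get_leaderboard_df("eval-results", "eval-queue", COLS, BENCHMARK_COLS)
print(leaderboard_df[AutoEvalColumn.average.name].head())  # already sorted by average and rounded to 2 decimals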
src/submission/check_validity.py
CHANGED
@@ -1,13 +1,16 @@
-import huggingface_hub
-import os
 import json
+import os
 import re
 from collections import defaultdict
-from
+from datetime import datetime, timedelta, timezone
+
+import huggingface_hub
 from huggingface_hub import ModelCard
-from
+from huggingface_hub.hf_api import ModelInfo
+from transformers import AutoConfig, AutoTokenizer
+from transformers.models.auto.tokenization_auto import tokenizer_class_from_name, get_tokenizer_config
 
-from
+from src.envs import HAS_HIGHER_RATE_LIMIT
 
 
 # ht to @Wauplin, thank you for the snippet!
@@ -34,26 +37,36 @@ def check_model_card(repo_id: str) -> tuple[bool, str]:
     return True, ""
 
 
-def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False) -> tuple[bool, str]:
+def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str]:
     try:
-        AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
-
+        config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
+        if test_tokenizer:
+            try:
+                AutoTokenizer.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
+            except ValueError as e:
+                return (
+                    False,
+                    f"uses a tokenizer which is not in a transformers release: {e}",
+                    None
+                )
+        return True, None, config
 
     except ValueError:
         return (
             False,
             "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
+            None
         )
 
-    except Exception:
-        return False, "was not found on hub!"
+    except Exception as e:
+        return False, "was not found on hub!", None
 
 
 def get_model_size(model_info: ModelInfo, precision: str):
     size_pattern = size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
     try:
         model_size = round(model_info.safetensors["total"] / 1e9, 3)
-    except AttributeError:
+    except (AttributeError, TypeError ):
         try:
             size_match = re.search(size_pattern, model_info.modelId.lower())
             model_size = size_match.group(0)
@@ -65,9 +78,10 @@ def get_model_size(model_info: ModelInfo, precision: str):
         model_size = size_factor * model_size
     return model_size
 
+def get_model_arch(model_info: ModelInfo):
+    return model_info.config.get("architectures", "Unknown")
 
-def user_submission_permission(
-    org_or_user, _ = submission_name.split("/")
+def user_submission_permission(org_or_user, users_to_submission_dates, rate_limit_period, rate_limit_quota):
     if org_or_user not in users_to_submission_dates:
         return True, ""
     submission_dates = sorted(users_to_submission_dates[org_or_user])
@@ -76,6 +90,9 @@ def user_submission_permission(submission_name, users_to_submission_dates, rate_
     submissions_after_timelimit = [d for d in submission_dates if d > time_limit]
 
     num_models_submitted_in_period = len(submissions_after_timelimit)
+    if org_or_user in HAS_HIGHER_RATE_LIMIT:
+        rate_limit_quota = 2 * rate_limit_quota
+
     if num_models_submitted_in_period > rate_limit_quota:
         error_msg = f"Organisation or user `{org_or_user}`"
         error_msg += f"already has {num_models_submitted_in_period} model requests submitted to the leaderboard "
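is_model_on_hub now returns a three-element tuple (found, error message, loaded config) and can optionally try the tokenizer as well, so every caller has to unpack three values. A hedged sketch (model name and revision are placeholders):

from src.submission.check_validity import is_model_on_hub

on_hub, error, config = is_model_on_hub("some-org/some-model", "main", trust_remote_code=True, test_tokenizer=True)
if not on_hub:
    print(f"some-org/some-model {error}")
elif config is not None:
    print(getattr(config, "architectures", None))  # what read_evals.py uses to fill the architecture column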
src/submission/submit.py
CHANGED
@@ -1,20 +1,20 @@
-import
-
+import json
+import os
 from datetime import datetime, timezone
 
-from src.display.formatting import styled_error,
+from src.display.formatting import styled_error, styled_message, styled_warning
+from src.envs import API, EVAL_REQUESTS_PATH, H4_TOKEN, QUEUE_REPO, RATE_LIMIT_PERIOD, RATE_LIMIT_QUOTA
 from src.leaderboard.filter_models import DO_NOT_SUBMIT_MODELS
 from src.submission.check_validity import (
-    user_submission_permission,
-    is_model_on_hub,
-    get_model_size,
-    check_model_card,
     already_submitted_models,
+    check_model_card,
+    get_model_size,
+    is_model_on_hub,
+    user_submission_permission,
 )
-from src.envs import RATE_LIMIT_QUOTA, RATE_LIMIT_PERIOD, H4_TOKEN, EVAL_REQUESTS_PATH, API, QUEUE_REPO
-
-requested_models, users_to_submission_dates = already_submitted_models(EVAL_REQUESTS_PATH)
 
+REQUESTED_MODELS = None
+USERS_TO_SUBMISSION_DATES = None
 
 def add_new_eval(
     model: str,
@@ -25,6 +25,17 @@
     weight_type: str,
     model_type: str,
 ):
+    global REQUESTED_MODELS
+    global USERS_TO_SUBMISSION_DATES
+    if not REQUESTED_MODELS:
+        REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
+
+    user_name = ""
+    model_path = model
+    if "/" in model:
+        user_name = model.split("/")[0]
+        model_path = model.split("/")[1]
+
     precision = precision.split(" ")[0]
     current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
 
@@ -32,11 +43,12 @@
         return styled_error("Please select a model type.")
 
     # Is the user rate limited?
-
-
-
-
-
+    if user_name != "":
+        user_can_submit, error_msg = user_submission_permission(
+            user_name, USERS_TO_SUBMISSION_DATES, RATE_LIMIT_PERIOD, RATE_LIMIT_QUOTA
+        )
+        if not user_can_submit:
+            return styled_error(error_msg)
 
     # Did the model authors forbid its submission to the leaderboard?
     if model in DO_NOT_SUBMIT_MODELS or base_model in DO_NOT_SUBMIT_MODELS:
@@ -48,12 +60,12 @@
 
     # Is the model on the hub?
     if weight_type in ["Delta", "Adapter"]:
-        base_model_on_hub, error = is_model_on_hub(base_model, revision, H4_TOKEN)
+        base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=H4_TOKEN, test_tokenizer=True)
         if not base_model_on_hub:
             return styled_error(f'Base model "{base_model}" {error}')
 
     if not weight_type == "Adapter":
-        model_on_hub, error = is_model_on_hub(model, revision)
+        model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, test_tokenizer=True)
         if not model_on_hub:
             return styled_error(f'Model "{model}" {error}')
 
@@ -93,21 +105,15 @@
         "license": license,
     }
 
-
-
-
-    user_name = model.split("/")[0]
-    model_path = model.split("/")[1]
+    # Check for duplicate submission
+    if f"{model}_{revision}_{precision}" in REQUESTED_MODELS:
+        return styled_warning("This model has been already submitted.")
 
     print("Creating eval file")
     OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
     os.makedirs(OUT_DIR, exist_ok=True)
     out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{precision}_{weight_type}.json"
 
-    # Check for duplicate submission
-    if f"{model}_{revision}_{precision}" in requested_models:
-        return styled_warning("This model has been already submitted.")
-
     with open(out_path, "w") as f:
         f.write(json.dumps(eval_entry))
 
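The rate-limit check now only runs for namespaced submissions (user_name stays empty otherwise) and goes through the shared user_submission_permission helper. A small sketch of that helper in isolation, with made-up submission dates in the same timestamp format add_new_eval writes and placeholder values for the period and quota:

from src.submission.check_validity import user_submission_permission

users_to_dates = {"some-org": ["2023-11-01T10:00:00Z", "2023-11-02T11:30:00Z"]}  # hypothetical history
can_submit, error_msg = user_submission_permission("some-org", users_to_dates, 7, 5)  # assumed 7-day window, quota of 5
if not can_submit:
    print(error_msg)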
src/tools/collections.py
CHANGED
@@ -1,11 +1,11 @@
 import os
+
 import pandas as pd
-from
-from huggingface_hub import get_collection, add_collection_item, update_collection_item, delete_collection_item
+from huggingface_hub import add_collection_item, delete_collection_item, get_collection, update_collection_item
 from huggingface_hub.utils._errors import HfHubHTTPError
+from pandas import DataFrame
 
 from src.display.utils import AutoEvalColumn, ModelType
-
 from src.envs import H4_TOKEN, PATH_TO_COLLECTION
 
 # Specific intervals for the collections
src/tools/plots.py
CHANGED
@@ -1,151 +1,84 @@
 import pandas as pd
+import numpy as np
 import plotly.express as px
 from plotly.graph_objs import Figure
-import pickle
-from datetime import datetime, timezone
-from typing import List, Dict, Tuple, Any
-from src.leaderboard.filter_models import FLAGGED_MODELS
-
-# Average ⬆️ human baseline is 0.897 (source: averaging human baselines below)
-# ARC human baseline is 0.80 (source: https://lab42.global/arc/)
-# HellaSwag human baseline is 0.95 (source: https://deepgram.com/learn/hellaswag-llm-benchmark-guide)
-# MMLU human baseline is 0.898 (source: https://openreview.net/forum?id=d7KBjmI3GmQ)
-# TruthfulQA human baseline is 0.94(source: https://arxiv.org/pdf/2109.07958.pdf)
-# Define the human baselines
-HUMAN_BASELINES = {
-    "Average ⬆️": 0.897 * 100,
-    "ARC": 0.80 * 100,
-    "HellaSwag": 0.95 * 100,
-    "MMLU": 0.898 * 100,
-    "TruthfulQA": 0.94 * 100,
-}
-
-
-def to_datetime(model_info: Tuple[str, Any]) -> datetime:
-    """
-    Converts the lastModified attribute of the object to datetime.
-
-    :param model_info: A tuple containing the name and object.
-        The object must have a lastModified attribute
-        with a string representing the date and time.
-    :return: A datetime object converted from the lastModified attribute of the input object.
-    """
-    name, obj = model_info
-    return datetime.strptime(obj.lastModified, "%Y-%m-%dT%H:%M:%S.%fZ").replace(tzinfo=timezone.utc)
-
-
-def join_model_info_with_results(results_df: pd.DataFrame) -> pd.DataFrame:
-    """
-    Integrates model information with the results DataFrame by matching 'Model sha'.
-    :param results_df: A DataFrame containing results information including 'Model sha' column.
-    :return: A DataFrame with updated 'Results Date' columns, which are synchronized with model information.
-    """
-    # copy dataframe to avoid modifying the original
-    df = results_df.copy(deep=True)
-
-    # Filter out FLAGGED_MODELS to ensure graph is not skewed by mistakes
-    df = df[~df["model_name_for_query"].isin(FLAGGED_MODELS.keys())].reset_index(drop=True)
-
-    # load cache from disk
-    try:
-        with open("model_info_cache.pkl", "rb") as f:
-            model_info_cache = pickle.load(f)
-    except (EOFError, FileNotFoundError):
-        model_info_cache = {}
 
-
-
-    # Define the date format string
-    date_format = "%Y-%m-%dT%H:%M:%S.%fZ"
-
-    # Iterate over sorted_dates and update the dataframe
-    for name, obj in sorted_dates:
-        # Convert the lastModified string to a datetime object
-        last_modified_datetime = datetime.strptime(obj.lastModified, date_format).replace(tzinfo=timezone.utc)
 
-        # Update the "Results Date" column where "Model sha" equals obj.sha
-        df.loc[df["Model sha"] == obj.sha, "Results Date"] = last_modified_datetime
-    return df
+from src.leaderboard.filter_models import FLAGGED_MODELS
+from src.display.utils import human_baseline_row as HUMAN_BASELINE, AutoEvalColumn, Tasks, Task, BENCHMARK_COLS
+from src.leaderboard.read_evals import EvalResult
 
 
-def create_scores_df(
+def create_scores_df(raw_data: list[EvalResult]) -> pd.DataFrame:
     """
-    Generates a DataFrame containing the maximum scores until each
+    Generates a DataFrame containing the maximum scores until each date.
 
-    :param results_df: A DataFrame containing result information including metric scores and
-    :return: A new DataFrame containing the maximum scores until each
+    :param results_df: A DataFrame containing result information including metric scores and dates.
+    :return: A new DataFrame containing the maximum scores until each date for every metric.
     """
-    # Step 1: Ensure '
-    results_df
-    results_df.
+    # Step 1: Ensure 'date' is in datetime format and sort the DataFrame by it
+    results_df = pd.DataFrame(raw_data)
+    #results_df["date"] = pd.to_datetime(results_df["date"], format="mixed", utc=True)
+    results_df.sort_values(by="date", inplace=True)
 
     # Step 2: Initialize the scores dictionary
-    scores = {
-        "Average ⬆️": [],
-        "ARC": [],
-        "HellaSwag": [],
-        "MMLU": [],
-        "TruthfulQA": [],
-        "Result Date": [],
-        "Model Name": [],
-    }
+    scores = {k: [] for k in BENCHMARK_COLS + [AutoEvalColumn.average.name]}
 
     # Step 3: Iterate over the rows of the DataFrame and update the scores dictionary
-    for
-
-
-
-            continue
-        if column == "Model Name":
-            scores[column].append(row["model_name_for_query"])
-            continue
-        current_max = scores[column][-1] if scores[column] else float("-inf")
-        scores[column].append(max(current_max, row[column]))
-
+    for task in [t.value for t in Tasks] + [Task("Average", "avg", AutoEvalColumn.average.name)]:
+        current_max = 0
+        last_date = ""
+        column = task.col_name
+        for _, row in results_df.iterrows():
+            current_model = row["full_model"]
+            if current_model in FLAGGED_MODELS:
+                continue
+
+            current_date = row["date"]
+            if task.benchmark == "Average":
+                current_score = np.mean(list(row["results"].values()))
+            else:
+                current_score = row["results"][task.benchmark]
+
+            if current_score > current_max:
+                if current_date == last_date and len(scores[column]) > 0:
+                    scores[column][-1] = {"model": current_model, "date": current_date, "score": current_score}
+                else:
+                    scores[column].append({"model": current_model, "date": current_date, "score": current_score})
+                current_max = current_score
+                last_date = current_date
 
-
+    # Step 4: Return all dictionaries as DataFrames
+    return {k: pd.DataFrame(v) for k, v in scores.items()}
+
+
+def create_plot_df(scores_df: dict[str: pd.DataFrame]) -> pd.DataFrame:
     """
     Transforms the scores DataFrame into a new format suitable for plotting.
 
-    :param scores_df: A DataFrame containing metric scores and
+    :param scores_df: A DataFrame containing metric scores and dates.
     :return: A new DataFrame reshaped for plotting purposes.
     """
-    # Sample columns
-    cols = ["Average ⬆️", "ARC", "HellaSwag", "MMLU", "TruthfulQA"]
-
     # Initialize the list to store DataFrames
     dfs = []
 
     # Iterate over the cols and create a new DataFrame for each column
-    for col in
-        d = scores_df[
-        d["
-        d.rename(columns={col: "Metric Value"}, inplace=True)
+    for col in BENCHMARK_COLS + [AutoEvalColumn.average.name]:
+        d = scores_df[col].reset_index(drop=True)
+        d["task"] = col
         dfs.append(d)
 
     # Concatenate all the created DataFrames
     concat_df = pd.concat(dfs, ignore_index=True)
 
-    # Sort values by '
-    concat_df.sort_values(by="
-    concat_df.reset_index(drop=True, inplace=True)
-
-    # Drop duplicates based on 'Metric Name' and 'Metric Value' and keep the first (earliest) occurrence
-    concat_df.drop_duplicates(subset=["Metric Name", "Metric Value"], keep="first", inplace=True)
-
+    # Sort values by 'date'
+    concat_df.sort_values(by="date", inplace=True)
     concat_df.reset_index(drop=True, inplace=True)
     return concat_df
 
 
 def create_metric_plot_obj(
-    df: pd.DataFrame, metrics:
+    df: pd.DataFrame, metrics: list[str], title: str
 ) -> Figure:
     """
     Create a Plotly figure object with lines representing different metrics
@@ -154,27 +87,25 @@ def create_metric_plot_obj(
     :param df: The DataFrame containing the metric values, names, and dates.
     :param metrics: A list of strings representing the names of the metrics
         to be included in the plot.
-    :param human_baselines: A dictionary where keys are metric names
-        and values are human baseline values for the metrics.
     :param title: A string representing the title of the plot.
     :return: A Plotly figure object with lines representing metrics and
         horizontal dotted lines representing human baselines.
     """
 
     # Filter the DataFrame based on the specified metrics
-    df = df[df["
+    df = df[df["task"].isin(metrics)]
 
     # Filter the human baselines based on the specified metrics
-    filtered_human_baselines = {k: v for k, v in
+    filtered_human_baselines = {k: v for k, v in HUMAN_BASELINE.items() if k in metrics}
 
     # Create a line figure using plotly express with specified markers and custom data
     fig = px.line(
         df,
-        x="
-        y="
-        color="
+        x="date",
+        y="score",
+        color="task",
         markers=True,
-        custom_data=["
+        custom_data=["task", "score", "model"],
         title=title,
     )
 
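With these changes the plotting pipeline runs straight off the EvalResult objects returned by the new get_leaderboard_df instead of re-querying the Hub, and the plot frame uses generic date, score and task columns. A sketch of how the three helpers chain together (the result and queue directories are placeholders; everything else comes from the modules shown above):

from src.display.utils import AutoEvalColumn, BENCHMARK_COLS, COLS
from src.populate import get_leaderboard_df
from src.tools.plots import create_metric_plot_obj, create_plot_df, create_scores_df

raw_data, _ = get_leaderboard_df("eval-results", "eval-queue", COLS, BENCHMARK_COLS)  # placeholder paths

scores_per_task = create_scores_df(raw_data)   # dict: column name -> best-score-so-far DataFrame
plot_df = create_plot_df(scores_per_task)      # long format with "date", "score" and "task" columns
fig = create_metric_plot_obj(
    plot_df,
    [AutoEvalColumn.average.name],             # e.g. plot only the running average
    title="Top scores over time",
)
fig.show()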