Commit d16cee2
1 Parent(s): e868f35

Using the new backend

Files changed:
- README.md +1 -0
- app.py +52 -27
- src/assets/text_content.py +35 -5
- src/auto_leaderboard/load_results.py +44 -41
- src/init.py +21 -10
- src/utils_display.py +6 -6
README.md CHANGED

@@ -8,6 +8,7 @@ sdk_version: 3.27.0
 app_file: app.py
 pinned: true
 license: apache-2.0
+duplicated_from: HuggingFaceH4/open_llm_leaderboard
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED

@@ -15,26 +15,40 @@ from src.assets.text_content import *
 from src.auto_leaderboard.load_results import get_eval_results_dicts, make_clickable_model
 from src.assets.hardcoded_evals import gpt4_values, gpt35_values, baseline
 from src.assets.css_html_js import custom_css, get_window_url_params
-from src.utils_display import AutoEvalColumn, EvalQueueColumn,
-from src.init import load_all_info_from_hub
+from src.utils_display import AutoEvalColumn, EvalQueueColumn, fields, styled_error, styled_warning, styled_message
+from src.init import get_all_requested_models, load_all_info_from_hub
 
 # clone / pull the lmeh eval data
 H4_TOKEN = os.environ.get("H4_TOKEN", None)
-
+
+QUEUE_REPO = "open-llm-leaderboard/requests"
+RESULTS_REPO = "open-llm-leaderboard/results"
+
+PRIVATE_QUEUE_REPO = "open-llm-leaderboard/private-requests"
+PRIVATE_RESULTS_REPO = "open-llm-leaderboard/private-results"
+
 IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))
 ADD_PLOTS = False
 
-EVAL_REQUESTS_PATH = "
+EVAL_REQUESTS_PATH = "eval-queue"
+EVAL_RESULTS_PATH = "eval-results"
 
-
+EVAL_REQUESTS_PATH_PRIVATE = "eval-queue-private"
+EVAL_RESULTS_PATH_PRIVATE = "eval-results-private"
 
+api = HfApi()
 
 def restart_space():
     api.restart_space(
         repo_id="HuggingFaceH4/open_llm_leaderboard", token=H4_TOKEN
     )
 
-
+eval_queue, requested_models, eval_results = load_all_info_from_hub(QUEUE_REPO, RESULTS_REPO, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH)
+
+if not IS_PUBLIC:
+    eval_queue_private, requested_models_private, eval_results_private = load_all_info_from_hub(PRIVATE_QUEUE_REPO, PRIVATE_RESULTS_REPO, EVAL_REQUESTS_PATH_PRIVATE, EVAL_RESULTS_PATH_PRIVATE)
+else:
+    eval_queue_private, eval_results_private = None, None
 
 COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
@@ -60,9 +74,12 @@ def has_nan_values(df, columns):
 
 
 def get_leaderboard_df():
-    if
+    if eval_results:
         print("Pulling evaluation results for the leaderboard.")
-
+        eval_results.git_pull()
+    if eval_results_private:
+        print("Pulling evaluation results for the leaderboard.")
+        eval_results_private.git_pull()
 
     all_data = get_eval_results_dicts(IS_PUBLIC)
 
@@ -84,9 +101,12 @@ def get_leaderboard_df():
 
 def get_evaluation_queue_df():
     # todo @saylortwift: replace the repo by the one you created for the eval queue
-    if
+    if eval_queue:
+        print("Pulling changes for the evaluation queue.")
+        eval_queue.git_pull()
+    if eval_queue_private:
         print("Pulling changes for the evaluation queue.")
-
+        eval_queue_private.git_pull()
 
     entries = [
         entry
@@ -106,7 +126,7 @@ def get_evaluation_queue_df():
             data["revision"] = data.get("revision", "main")
 
             all_evals.append(data)
-
+        elif ".md" not in entry:
             # this is a folder
             sub_entries = [
                 e
@@ -124,10 +144,10 @@ def get_evaluation_queue_df():
 
     pending_list = [e for e in all_evals if e["status"] == "PENDING"]
    running_list = [e for e in all_evals if e["status"] == "RUNNING"]
-    finished_list = [e for e in all_evals if e["status"]
-    df_pending = pd.DataFrame.from_records(pending_list)
-    df_running = pd.DataFrame.from_records(running_list)
-    df_finished = pd.DataFrame.from_records(finished_list)
+    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED")]
+    df_pending = pd.DataFrame.from_records(pending_list, columns=EVAL_COLS)
+    df_running = pd.DataFrame.from_records(running_list, columns=EVAL_COLS)
+    df_finished = pd.DataFrame.from_records(finished_list, columns=EVAL_COLS)
     return df_finished[EVAL_COLS], df_running[EVAL_COLS], df_pending[EVAL_COLS]
 
 
@@ -149,7 +169,7 @@ def is_model_on_hub(model_name, revision) -> bool:
         return False, "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard."
 
     except Exception as e:
-        print("Could not get the model config from the hub.:
+        print(f"Could not get the model config from the hub.: {e}")
         return False, "was not found on hub!"
 
 
@@ -200,7 +220,7 @@ def add_new_eval(
     out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{is_8_bit_eval}_{is_delta_weight}.json"
 
     # Check for duplicate submission
-    if out_path.split("
+    if out_path.split("eval-queue/")[1].lower() in requested_models:
         return styled_warning("This model has been already submitted.")
 
     with open(out_path, "w") as f:
@@ -208,13 +228,17 @@ def add_new_eval(
 
     api.upload_file(
         path_or_fileobj=out_path,
-        path_in_repo=out_path,
-        repo_id=
+        path_in_repo=out_path.split("eval-queue/")[1],
+        repo_id=QUEUE_REPO,
         token=H4_TOKEN,
         repo_type="dataset",
+        commit_message=f"Add {model} to eval queue",
     )
 
-
+    # remove the local file
+    os.remove(out_path)
+
+    return styled_message("Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list.")
 
 
 def refresh():
@@ -310,13 +334,6 @@ with demo:
             )
         with gr.TabItem("About", elem_id="llm-benchmark-tab-table", id=2):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-            with gr.Accordion("📙 Citation", open=False):
-                citation_button = gr.Textbox(
-                    value=CITATION_BUTTON_TEXT,
-                    label=CITATION_BUTTON_LABEL,
-                    elem_id="citation-button",
-                ).style(show_copy_button=True)
-
 
     with gr.Column():
         with gr.Row():
@@ -396,6 +413,14 @@ with demo:
             submission_result,
         )
 
+    with gr.Row():
+        with gr.Accordion("📙 Citation", open=False):
+            citation_button = gr.Textbox(
+                value=CITATION_BUTTON_TEXT,
+                label=CITATION_BUTTON_LABEL,
+                elem_id="citation-button",
+            ).style(show_copy_button=True)
+
     dummy = gr.Textbox(visible=False)
     demo.load(
         change_tab,
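A side note on the queue-DataFrame change above (`pd.DataFrame.from_records(..., columns=EVAL_COLS)`): when a status bucket is empty, `from_records` without `columns=` returns a frame with no columns at all, so the later `df[EVAL_COLS]` selection would fail. The sketch below is illustrative only; the column names are placeholders, not the real `EVAL_COLS`.

```python
import pandas as pd

# Placeholder schema; the real app builds EVAL_COLS from EvalQueueColumn.
EVAL_COLS = ["model", "revision", "status"]

records = []  # e.g. nothing is currently RUNNING

# Without columns=, the empty frame has no columns, so df[EVAL_COLS] would raise a KeyError.
df_without_schema = pd.DataFrame.from_records(records)
print(list(df_without_schema.columns))  # []

# With columns=, the empty frame keeps the expected schema and stays selectable.
df_with_schema = pd.DataFrame.from_records(records, columns=EVAL_COLS)
print(df_with_schema[EVAL_COLS].shape)  # (0, 3)
```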
src/assets/text_content.py CHANGED

@@ -61,7 +61,7 @@ INTRODUCTION_TEXT = f"""
 
 🤗 Anyone from the community can submit a model for automated evaluation on the 🤗 GPU cluster, as long as it is a 🤗 Transformers model with weights on the Hub. We also support evaluation of models with delta-weights for non-commercial licensed models, such as LLaMa.
 
-Other cool benchmarks for LLMs are developed at HuggingFace, go check them out: 🙋🤖 [human and GPT4 evals](https://huggingface.co/spaces/HuggingFaceH4/human_eval_llm_leaderboard), 🖥️ [performance
+Other cool benchmarks for LLMs are developed at HuggingFace, go check them out: 🙋🤖 [human and GPT4 evals](https://huggingface.co/spaces/HuggingFaceH4/human_eval_llm_leaderboard), 🖥️ [performance benchmarks](https://huggingface.co/spaces/optimum/llm-perf-leaderboard)
 """
 
 LLM_BENCHMARKS_TEXT = f"""
@@ -78,6 +78,29 @@ With the plethora of large language models (LLMs) and chatbots being released we
 We chose these benchmarks as they test a variety of reasoning and general knowledge across a wide variety of fields in 0-shot and few-shot settings.
 
 
+# Some good practices before submitting a model
+
+## 1) Make sure you can load your model and tokenizer using AutoClasses:
+```python
+from transformers import AutoConfig, AutoModel, AutoTokenizer
+config = AutoConfig.from_pretrained("your model name", revision=revision)
+model = AutoModel.from_pretrained("your model name", revision=revision)
+tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
+```
+If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
+
+Note: make sure your model is public!
+Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
+
+## 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
+It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of weights of your model to the `Extended Viewer`!
+
+## 3) Make sure your model has an open license!
+This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
+
+## 4) Fill out your model card
+When we add extra information about models to the leaderboard, it will be automatically taken from the model card.
+
 # Reproduction
 To reproduce our results, here are the commands you can run, using [this version](https://github.com/EleutherAI/lm-evaluation-harness/tree/e47e01beea79cfe87421e2dac49e64d499c240b4) of the Eleuther AI Harness:
 `python main.py --model=hf-causal --model_args="pretrained=<your_model>,use_accelerate=True,revision=<your_model_revision>"`
@@ -87,10 +110,17 @@ The total batch size we get for models which fit on one A100 node is 16 (8 GPUs
 *You can expect results to vary slightly for different batch sizes because of padding.*
 
 The tasks and few-shot parameters are:
-- ARC: 25-shot, *arc-challenge*
-- HellaSwag: 10-shot, *hellaswag*
-- TruthfulQA: 0-shot, *truthfulqa-mc* (mc2
-- MMLU: 5-shot, *hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions*
+- ARC: 25-shot, *arc-challenge* (`acc_norm`)
+- HellaSwag: 10-shot, *hellaswag* (`acc_norm`)
+- TruthfulQA: 0-shot, *truthfulqa-mc* (`mc2`)
+- MMLU: 5-shot, *hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions* (`acc` of `all`)
+
+# In case of model failure
+If your model is displayed in the `FAILED` category, its execution stopped.
+Make sure you have followed the above steps first.
+If everything is done, check you can launch the EleutherAI Harness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
+
+
 """
 
 EVALUATION_QUEUE_TEXT = f"""
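The new "good practices" text above asks submitters to ship safetensors weights. A minimal sketch of one way to do the conversion with 🤗 Transformers, assuming a standard causal-LM checkpoint (the model id and output directory below are placeholders):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "your-org/your-model"  # placeholder, not a real checkpoint

# Load the existing (PyTorch .bin) checkpoint.
model = AutoModelForCausalLM.from_pretrained(model_id, revision="main")
tokenizer = AutoTokenizer.from_pretrained(model_id, revision="main")

# Re-save it with safetensors serialization, then optionally push it back to the Hub.
model.save_pretrained("your-model-safetensors", safe_serialization=True)
tokenizer.save_pretrained("your-model-safetensors")
# model.push_to_hub(model_id, safe_serialization=True)
```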
src/auto_leaderboard/load_results.py CHANGED

@@ -7,14 +7,13 @@ from typing import Dict, List, Tuple
 from src.utils_display import AutoEvalColumn, make_clickable_model
 import numpy as np
 
-
-
-BENCHMARKS = ["arc_challenge", "hellaswag", "hendrycks", "truthfulqa_mc"]
+METRICS = ["acc_norm", "acc_norm", "acc", "mc2"]
+BENCHMARKS = ["arc:challenge", "hellaswag", "hendrycksTest", "truthfulqa:mc"]
 BENCH_TO_NAME = {
-    "
+    "arc:challenge": AutoEvalColumn.arc.name,
     "hellaswag": AutoEvalColumn.hellaswag.name,
-    "
-    "
+    "hendrycksTest": AutoEvalColumn.mmlu.name,
+    "truthfulqa:mc": AutoEvalColumn.truthfulqa.name,
 }
 
 
@@ -24,8 +23,8 @@ class EvalResult:
     org: str
     model: str
     revision: str
-    is_8bit: bool
     results: dict
+    is_8bit: bool = False
 
     def to_dict(self):
         if self.org is not None:
@@ -44,7 +43,7 @@ class EvalResult:
         )
 
         for benchmark in BENCHMARKS:
-            if not
+            if benchmark not in self.results.keys():
                 self.results[benchmark] = None
 
         for k, v in BENCH_TO_NAME.items():
@@ -53,57 +52,61 @@ class EvalResult:
         return data_dict
 
 
-def parse_eval_result(json_filepath: str) -> Tuple[str, dict]:
+def parse_eval_result(json_filepath: str) -> Tuple[str, list[dict]]:
     with open(json_filepath) as fp:
         data = json.load(fp)
 
-
-
-    model
-
-
-
-
-
+    config = data["config"]
+    model = config.get("model_name", None)
+    if model is None:
+        model = config.get("model_args", None)
+
+    model_sha = config.get("model_sha", "")
+    eval_sha = config.get("lighteval_sha", "")
+    model_split = model.split("/", 1)
+
+    model = model_split[-1]
+
+    if len(model_split) == 1:
+        org = None
+        model = model_split[0]
+        result_key = f"{model}_{model_sha}_{eval_sha}"
     else:
-        org =
-
+        org = model_split[0]
+        model = model_split[1]
+        result_key = f"{org}_{model}_{model_sha}_{eval_sha}"
 
-
+    eval_results = []
     for benchmark, metric in zip(BENCHMARKS, METRICS):
-        if benchmark in
-
-
-
-
-
+        accs = np.array([v[metric] for k, v in data["results"].items() if benchmark in k])
+        if accs.size == 0:
+            continue
+        mean_acc = round(np.mean(accs) * 100.0, 1)
+        eval_results.append(EvalResult(
+            result_key, org, model, model_sha, {benchmark: mean_acc}
+        ))
 
-    return result_key,
+    return result_key, eval_results
 
 
 def get_eval_results(is_public) -> List[EvalResult]:
     json_filepaths = glob.glob(
-        "
+        "eval-results/**/results*.json", recursive=True
     )
     if not is_public:
         json_filepaths += glob.glob(
-            "
-        )
-        json_filepaths += glob.glob(
-            "auto_evals/eval_results/private/**/*.json", recursive=True
+            "private-eval-results/**/results*.json", recursive=True
         )
-
-        json_filepaths += glob.glob(
-            "auto_evals/eval_results/public/**/8bit/*.json", recursive=True
-        )
+
     eval_results = {}
 
     for json_filepath in json_filepaths:
-        result_key,
-
-        eval_results
-
-
+        result_key, results = parse_eval_result(json_filepath)
+        for eval_result in results:
+            if result_key in eval_results.keys():
+                eval_results[result_key].results.update(eval_result.results)
+            else:
+                eval_results[result_key] = eval_result
 
     eval_results = [v for v in eval_results.values()]
 
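For context on the rewritten `parse_eval_result` above: each benchmark score is the mean, over every entry of `data["results"]` whose key contains the benchmark name, of the metric paired with it in `METRICS`, scaled to a percentage. Below is a small self-contained sketch of that aggregation; the toy results dict is invented and only mimics the key/metric layout the parser expects.

```python
import numpy as np

BENCHMARKS = ["arc:challenge", "hellaswag", "hendrycksTest", "truthfulqa:mc"]
METRICS = ["acc_norm", "acc_norm", "acc", "mc2"]

# Invented payload: MMLU arrives as many hendrycksTest-* sub-tasks that get averaged together.
results = {
    "arc:challenge": {"acc_norm": 0.523},
    "hendrycksTest-abstract_algebra": {"acc": 0.30},
    "hendrycksTest-anatomy": {"acc": 0.42},
}

for benchmark, metric in zip(BENCHMARKS, METRICS):
    accs = np.array([v[metric] for k, v in results.items() if benchmark in k])
    if accs.size == 0:
        continue  # this file has no entries for the benchmark
    print(benchmark, round(np.mean(accs) * 100.0, 1))
    # arc:challenge 52.3, hendrycksTest 36.0
```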
src/init.py CHANGED

@@ -13,26 +13,37 @@ def get_all_requested_models(requested_models_dir):
         if current_depth == depth:
             file_names.extend([os.path.join(root, file) for file in files])
 
-    return set([file_name.lower().split("
+    return set([file_name.lower().split("eval-queue/")[1] for file_name in file_names])
 
-def load_all_info_from_hub(
-
+def load_all_info_from_hub(QUEUE_REPO, RESULTS_REPO, QUEUE_PATH, RESULTS_PATH):
+    eval_queue_repo = None
+    eval_results_repo = None
     requested_models = None
+
     if H4_TOKEN:
         print("Pulling evaluation requests and results.")
 
-
-            local_dir=
-            clone_from=
+        eval_queue_repo = Repository(
+            local_dir=QUEUE_PATH,
+            clone_from=QUEUE_REPO,
+            use_auth_token=H4_TOKEN,
+            repo_type="dataset",
+        )
+        eval_queue_repo.git_pull()
+
+        eval_results_repo = Repository(
+            local_dir=RESULTS_PATH,
+            clone_from=RESULTS_REPO,
             use_auth_token=H4_TOKEN,
             repo_type="dataset",
         )
-
+        eval_results_repo.git_pull()
 
-
-
+        requested_models = get_all_requested_models("eval-queue")
+    else:
+        print("No HuggingFace token provided. Skipping evaluation requests and results.")
 
-    return
+    return eval_queue_repo, requested_models, eval_results_repo
 
 
 #def load_results(model, benchmark, metric):
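A short illustration of how `get_all_requested_models` above pairs with the duplicate-submission check in `app.py`: both reduce a path to its lower-cased part after `eval-queue/`, so a resubmitted model maps to a key that is already in the set. The file name below is a made-up example following the `_eval_request_` naming pattern.

```python
# Hypothetical request file found in the locally cloned eval-queue repo.
queued_file = "eval-queue/SomeOrg/Some-Model_eval_request_False_False_False.json"
requested_models = {queued_file.lower().split("eval-queue/")[1]}

# A new submission that would write to the same path is flagged in add_new_eval.
out_path = "eval-queue/SomeOrg/Some-Model_eval_request_False_False_False.json"
is_duplicate = out_path.split("eval-queue/")[1].lower() in requested_models
print(is_duplicate)  # True
```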
src/utils_display.py CHANGED

@@ -15,17 +15,17 @@ def fields(raw_class):
 @dataclass(frozen=True)
 class AutoEvalColumn: # Auto evals column
     model = ColumnContent("Model", "markdown", True)
-
+    average = ColumnContent("Average ⬆️", "number", True)
+    arc = ColumnContent("ARC ⬆️", "number", True)
+    hellaswag = ColumnContent("HellaSwag ⬆️", "number", True)
+    mmlu = ColumnContent("MMLU ⬆️", "number", True)
+    truthfulqa = ColumnContent("TruthfulQA (MC) ⬆️", "number", True)
     model_type = ColumnContent("Type", "bool", False)
     is_8bit = ColumnContent("8bit", "bool", False, True)
     license = ColumnContent("Hub License", "str", False)
     params = ColumnContent("#Params (B)", "number", False)
     likes = ColumnContent("Hub ❤️", "number", False)
-
-    arc = ColumnContent("ARC (25-s) ⬆️", "number", True)
-    hellaswag = ColumnContent("HellaSwag (10-s) ⬆️", "number", True)
-    mmlu = ColumnContent("MMLU (5-s) ⬆️", "number", True)
-    truthfulqa = ColumnContent("TruthfulQA (MC) (0-s) ⬆️", "number", True)
+    revision = ColumnContent("Model sha", "str", False, False)
     dummy = ColumnContent("model_name_for_query", "str", True) # dummy col to implement search bar (hidden by custom CSS)
 
 @dataclass(frozen=True)
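`AutoEvalColumn` above leans on a `ColumnContent` helper and the `fields()` utility named in the hunk header, both defined earlier in `src/utils_display.py` and not shown in this diff. A rough sketch of what they plausibly look like, inferred from how they are used here and in `app.py` (`c.name`, `c.type`, `c.hidden`); the exact definitions are an assumption:

```python
from dataclasses import dataclass


@dataclass
class ColumnContent:
    name: str               # column header shown in the Gradio Dataframe
    type: str               # Dataframe datatype: "markdown", "number", "str", "bool", ...
    displayed_by_default: bool
    hidden: bool = False    # hidden columns are filtered out of COLS/TYPES in app.py


def fields(raw_class):
    # AutoEvalColumn is used as a plain namespace, so iterate its class attributes.
    return [v for k, v in raw_class.__dict__.items() if not k.startswith("__")]
```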