Commit · b6b9254
1 Parent(s): 85128b4

update result format

Files changed:
- README.md +2 -2
- src/display/about.py +5 -5
- src/display/utils.py +9 -6
- src/leaderboard/read_evals.py +2 -2
- src/scripts/create_request_file.py +1 -1
README.md CHANGED

@@ -1,5 +1,5 @@
 ---
-title: Open LLM Leaderboard
+title: Chinese Open LLM Leaderboard
 emoji: 🏆
 colorFrom: green
 colorTo: indigo
@@ -8,7 +8,7 @@ sdk_version: 4.9.0
 app_file: app.py
 pinned: true
 license: apache-2.0
-duplicated_from: HuggingFaceH4/open_llm_leaderboard
+# duplicated_from: HuggingFaceH4/open_llm_leaderboard
 fullWidth: true
 space_ci: # See https://huggingface.co/spaces/Wauplin/gradio-space-ci
 private: true
src/display/about.py CHANGED

@@ -1,10 +1,10 @@
 from src.display.utils import ModelType
 
-TITLE = """<h1 align="center" id="space-title">🤗
+TITLE = """<h1 align="center" id="space-title">🤗 Open Chinese LLM Leaderboard</h1>"""
 
 INTRODUCTION_TEXT = """
-📈 The 🤗
-[FlagEval](https://flageval.baai.ac.cn/)
+📈 The 🤗 Open Chinese LLM Leaderboard aims to track, rank and evaluate open LLMs and chatbots.
+This leaderboard is subset of the [FlagEval](https://flageval.baai.ac.cn/)
 
 🤗 Submit a model for automated evaluation on the 🤗 GPU cluster on the "Submit" page!
 The leaderboard's backend runs the great [Eleuther AI Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness) - read more details in the "About" page!
@@ -69,8 +69,8 @@ To get more information about quantization, see:
 - 4 bits: [blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes), [paper](https://arxiv.org/abs/2305.14314)
 
 ## Useful links
-- [Community resources](https://huggingface.co/spaces/
-- [Collection of best models](https://huggingface.co/collections/open-llm-leaderboard/llm-leaderboard-best-models-
+- [Community resources](https://huggingface.co/spaces/BAAI/open_cn_llm_leaderboard/discussions/174)
+- [Collection of best models](https://huggingface.co/collections/open-cn-llm-leaderboard/chinese-llm-leaderboard-best-models-65b0d4511dbd85fd0c3ad9cd)
 """
 
 FAQ_TEXT = """
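How `TITLE` and `INTRODUCTION_TEXT` get rendered is not part of this commit (app.py is untouched). As a rough sketch only, assuming the Space's app renders these constants with Gradio components (the README metadata above points to a Gradio app), the wiring would look roughly like this; it is not the leaderboard's actual code:

```python
# Sketch: typical consumption of the display strings from src/display/about.py.
# The exact layout in app.py is an assumption, not shown in this diff.
import gradio as gr

from src.display.about import TITLE, INTRODUCTION_TEXT

with gr.Blocks() as demo:
    gr.HTML(TITLE)                  # raw HTML heading
    gr.Markdown(INTRODUCTION_TEXT)  # markdown intro with links

if __name__ == "__main__":
    demo.launch()
```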
src/display/utils.py CHANGED

@@ -14,12 +14,13 @@ class Task:
     col_name: str
 
 class Tasks(Enum):
-    arc = Task("arc:challenge", "acc_norm", "ARC")
-    hellaswag = Task("hellaswag", "acc_norm", "HellaSwag")
-
-
-
-
+    arc = Task("arc:challenge", "acc_norm", "C-ARC")
+    hellaswag = Task("hellaswag", "acc_norm", "C-HellaSwag")
+    truthfulqa = Task("truthfulqa:mc", "mc2", "C-TruthfulQA")
+    winogrande = Task("winogrande", "acc", "C-Winogrande")
+    gsm8k = Task("gsm8k", "acc", "C-GSM8K")
+    c_sem = Task("c-sem-v2", "acc", "C-SEM")
+    mmlu = Task("cmmlu", "acc", "C-MMLU")
 
 # These classes are for user facing column names,
 # to avoid having to change them all around the code
@@ -82,6 +83,7 @@ baseline_row = {
     AutoEvalColumn.truthfulqa.name: 25.0,
     AutoEvalColumn.winogrande.name: 50.0,
     AutoEvalColumn.gsm8k.name: 0.21,
+    AutoEvalColumn.c_sem.name: 25.0,
     AutoEvalColumn.dummy.name: "baseline",
     AutoEvalColumn.model_type.name: "",
     AutoEvalColumn.flagged.name: False,
@@ -107,6 +109,7 @@ human_baseline_row = {
     AutoEvalColumn.truthfulqa.name: 94.0,
     AutoEvalColumn.winogrande.name: 94.0,
     AutoEvalColumn.gsm8k.name: 100,
+    AutoEvalColumn.c_sem.name: 100,
     AutoEvalColumn.dummy.name: "human_baseline",
     AutoEvalColumn.model_type.name: "",
    AutoEvalColumn.flagged.name: False,
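The `Task(...)` calls above bundle a results-file key, a metric name, and a user-facing column name. A minimal, self-contained sketch of that pattern follows; the field names `benchmark` and `metric` are inferred from the call sites (only `col_name` appears in the hunk context), so treat them as assumptions:

```python
# Sketch of the Task dataclass / Tasks enum pattern used in src/display/utils.py.
from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    benchmark: str  # key used in the raw results files, e.g. "cmmlu" (assumed field name)
    metric: str     # metric read from the results, e.g. "acc" (assumed field name)
    col_name: str   # user-facing column name, e.g. "C-MMLU"


class Tasks(Enum):
    c_sem = Task("c-sem-v2", "acc", "C-SEM")
    mmlu = Task("cmmlu", "acc", "C-MMLU")


# Iterating the enum yields each Task via .value, which is how the
# leaderboard builds one column per benchmark:
for task in Tasks:
    print(task.value.col_name, "<-", task.value.benchmark)
```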
src/leaderboard/read_evals.py CHANGED

@@ -87,7 +87,7 @@ class EvalResult:
             if accs.size == 0 or any([acc is None for acc in accs]):
                 continue
 
-            mean_acc = np.mean(accs)
+            mean_acc = np.mean(accs)
             results[task.benchmark] = mean_acc
 
         return self(
@@ -149,7 +149,7 @@
         }
 
         for task in Tasks:
-            data_dict[task.value.col_name] = self.results
+            data_dict[task.value.col_name] = self.results.get(task.value.benchmark, 0)
 
         return data_dict
 
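The switch to `self.results.get(task.value.benchmark, 0)` makes the per-task lookup defensive: result files produced before this commit may not contain the newly added benchmarks (e.g. "c-sem-v2", "cmmlu"), and a plain dictionary lookup would fail for them. A toy illustration of the difference:

```python
# Why .get(..., 0) matters: missing benchmarks default to 0 instead of crashing.
results = {"arc:challenge": 61.2, "hellaswag": 83.5}  # hypothetical result dict, no "cmmlu" entry

# Direct indexing raises for a benchmark that is absent:
# results["cmmlu"]  # -> KeyError

# The defensive lookup used in the new code:
value = results.get("cmmlu", 0)
print(value)  # 0
```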
src/scripts/create_request_file.py CHANGED

@@ -11,7 +11,7 @@ from src.submission.check_validity import get_model_size
 from src.display.utils import ModelType, WeightType
 
 EVAL_REQUESTS_PATH = "eval-queue"
-QUEUE_REPO = "
+QUEUE_REPO = "open-cn-llm-leaderboard/requests"
 
 precisions = ("float16", "bfloat16", "8bit (LLM.int8)", "4bit (QLoRA / FP4)", "GPTQ")
 model_types = [e.name for e in ModelType]