Commit · b6b9254
1 Parent(s): 85128b4

update result format

Files changed:
- README.md +2 -2
- src/display/about.py +5 -5
- src/display/utils.py +9 -6
- src/leaderboard/read_evals.py +2 -2
- src/scripts/create_request_file.py +1 -1
README.md CHANGED

@@ -1,5 +1,5 @@
 ---
-title: Open LLM Leaderboard
+title: Chinese Open LLM Leaderboard
 emoji: 🏆
 colorFrom: green
 colorTo: indigo
@@ -8,7 +8,7 @@ sdk_version: 4.9.0
 app_file: app.py
 pinned: true
 license: apache-2.0
-duplicated_from: HuggingFaceH4/open_llm_leaderboard
+# duplicated_from: HuggingFaceH4/open_llm_leaderboard
 fullWidth: true
 space_ci: # See https://huggingface.co/spaces/Wauplin/gradio-space-ci
 private: true
src/display/about.py CHANGED

@@ -1,10 +1,10 @@
 from src.display.utils import ModelType
 
-TITLE = """<h1 align="center" id="space-title">🤗
+TITLE = """<h1 align="center" id="space-title">🤗 Open Chinese LLM Leaderboard</h1>"""
 
 INTRODUCTION_TEXT = """
-📈 The 🤗
-[FlagEval](https://flageval.baai.ac.cn/)
+📈 The 🤗 Open Chinese LLM Leaderboard aims to track, rank and evaluate open LLMs and chatbots.
+This leaderboard is subset of the [FlagEval](https://flageval.baai.ac.cn/)
 
 🤗 Submit a model for automated evaluation on the 🤗 GPU cluster on the "Submit" page!
 The leaderboard's backend runs the great [Eleuther AI Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness) - read more details in the "About" page!
@@ -69,8 +69,8 @@ To get more information about quantization, see:
 - 4 bits: [blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes), [paper](https://arxiv.org/abs/2305.14314)
 
 ## Useful links
-- [Community resources](https://huggingface.co/spaces/
-- [Collection of best models](https://huggingface.co/collections/open-llm-leaderboard/llm-leaderboard-best-models-
+- [Community resources](https://huggingface.co/spaces/BAAI/open_cn_llm_leaderboard/discussions/174)
+- [Collection of best models](https://huggingface.co/collections/open-cn-llm-leaderboard/chinese-llm-leaderboard-best-models-65b0d4511dbd85fd0c3ad9cd)
 """
 
 FAQ_TEXT = """
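How `TITLE` and `INTRODUCTION_TEXT` get rendered is not part of this commit (app.py is untouched). As a rough sketch only, assuming the Space's app renders these constants with Gradio components (the README metadata above points to a Gradio app), the wiring would look roughly like this; it is not the leaderboard's actual code:

```python
# Sketch: typical consumption of the display strings from src/display/about.py.
# The exact layout in app.py is an assumption, not shown in this diff.
import gradio as gr

from src.display.about import TITLE, INTRODUCTION_TEXT

with gr.Blocks() as demo:
    gr.HTML(TITLE)                  # raw HTML heading
    gr.Markdown(INTRODUCTION_TEXT)  # markdown intro with links

if __name__ == "__main__":
    demo.launch()
```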
src/display/utils.py CHANGED

@@ -14,12 +14,13 @@ class Task:
     col_name: str
 
 class Tasks(Enum):
-    arc = Task("arc:challenge", "acc_norm", "ARC")
-    hellaswag = Task("hellaswag", "acc_norm", "HellaSwag")
-
-
-
-
+    arc = Task("arc:challenge", "acc_norm", "C-ARC")
+    hellaswag = Task("hellaswag", "acc_norm", "C-HellaSwag")
+    truthfulqa = Task("truthfulqa:mc", "mc2", "C-TruthfulQA")
+    winogrande = Task("winogrande", "acc", "C-Winogrande")
+    gsm8k = Task("gsm8k", "acc", "C-GSM8K")
+    c_sem = Task("c-sem-v2", "acc", "C-SEM")
+    mmlu = Task("cmmlu", "acc", "C-MMLU")
 
 # These classes are for user facing column names,
 # to avoid having to change them all around the code
@@ -82,6 +83,7 @@ baseline_row = {
     AutoEvalColumn.truthfulqa.name: 25.0,
     AutoEvalColumn.winogrande.name: 50.0,
     AutoEvalColumn.gsm8k.name: 0.21,
+    AutoEvalColumn.c_sem.name: 25.0,
     AutoEvalColumn.dummy.name: "baseline",
     AutoEvalColumn.model_type.name: "",
     AutoEvalColumn.flagged.name: False,
@@ -107,6 +109,7 @@ human_baseline_row = {
     AutoEvalColumn.truthfulqa.name: 94.0,
     AutoEvalColumn.winogrande.name: 94.0,
     AutoEvalColumn.gsm8k.name: 100,
+    AutoEvalColumn.c_sem.name: 100,
     AutoEvalColumn.dummy.name: "human_baseline",
     AutoEvalColumn.model_type.name: "",
    AutoEvalColumn.flagged.name: False,
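The `Task(...)` calls above bundle a results-file key, a metric name, and a user-facing column name. A minimal, self-contained sketch of that pattern follows; the field names `benchmark` and `metric` are inferred from the call sites (only `col_name` appears in the hunk context), so treat them as assumptions:

```python
# Sketch of the Task dataclass / Tasks enum pattern used in src/display/utils.py.
from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    benchmark: str  # key used in the raw results files, e.g. "cmmlu" (assumed field name)
    metric: str     # metric read from the results, e.g. "acc" (assumed field name)
    col_name: str   # user-facing column name, e.g. "C-MMLU"


class Tasks(Enum):
    c_sem = Task("c-sem-v2", "acc", "C-SEM")
    mmlu = Task("cmmlu", "acc", "C-MMLU")


# Iterating the enum yields each Task via .value, which is how the
# leaderboard builds one column per benchmark:
for task in Tasks:
    print(task.value.col_name, "<-", task.value.benchmark)
```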
src/leaderboard/read_evals.py CHANGED

@@ -87,7 +87,7 @@ class EvalResult:
             if accs.size == 0 or any([acc is None for acc in accs]):
                 continue
 
-            mean_acc = np.mean(accs)
+            mean_acc = np.mean(accs)
             results[task.benchmark] = mean_acc
 
         return self(
@@ -149,7 +149,7 @@
         }
 
         for task in Tasks:
-            data_dict[task.value.col_name] = self.results
+            data_dict[task.value.col_name] = self.results.get(task.value.benchmark, 0)
 
         return data_dict
 
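The switch to `self.results.get(task.value.benchmark, 0)` makes the per-task lookup defensive: result files produced before this commit may not contain the newly added benchmarks (e.g. "c-sem-v2", "cmmlu"), and a plain dictionary lookup would fail for them. A toy illustration of the difference:

```python
# Why .get(..., 0) matters: missing benchmarks default to 0 instead of crashing.
results = {"arc:challenge": 61.2, "hellaswag": 83.5}  # hypothetical result dict, no "cmmlu" entry

# Direct indexing raises for a benchmark that is absent:
# results["cmmlu"]  # -> KeyError

# The defensive lookup used in the new code:
value = results.get("cmmlu", 0)
print(value)  # 0
```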
src/scripts/create_request_file.py CHANGED

@@ -11,7 +11,7 @@ from src.submission.check_validity import get_model_size
 from src.display.utils import ModelType, WeightType
 
 EVAL_REQUESTS_PATH = "eval-queue"
-QUEUE_REPO = "
+QUEUE_REPO = "open-cn-llm-leaderboard/requests"
 
 precisions = ("float16", "bfloat16", "8bit (LLM.int8)", "4bit (QLoRA / FP4)", "GPTQ")
 model_types = [e.name for e in ModelType]