Spaces: Running on CPU Upgrade
pr/87 (#87) by choco9966 - opened

Files changed:
- app.py +42 -20
- requirements.txt +9 -14
- src/display/about.py +87 -138
- src/display/formatting.py +4 -3
- src/display/host_sponsor.png +0 -0
- src/display/main_logo.png +0 -0
- src/display/utils.py +37 -33
- src/leaderboard/read_evals.py +40 -12
- src/submission/submit.py +4 -7
app.py
CHANGED
@@ -2,7 +2,7 @@ import gradio as gr
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import snapshot_download
-from gradio_space_ci
+from gradio_space_ci import configure_space_ci  # FOR CI

 from src.display.about import (
     CITATION_BUTTON_LABEL,
@@ -32,6 +32,11 @@ from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, H4_TOKEN, IS_PU
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
 from src.tools.collections import update_collections
+from src.tools.plots import (
+    create_metric_plot_obj,
+    create_plot_df,
+    create_scores_df,
+)


 def restart_space():
@@ -58,6 +63,8 @@ if REPO_ID == "upstage/open-ko-llm-leaderboard": # update only when it's from re
     update_collections(original_df.copy())
 leaderboard_df = original_df.copy()

+plot_df = create_plot_df(create_scores_df(raw_data))
+
 (
     finished_eval_queue_df,
     running_eval_queue_df,
@@ -148,6 +155,7 @@ def filter_models(
         params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
         mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
         filtered_df = filtered_df.loc[mask]
+
     return filtered_df

 leaderboard_df = filter_models(leaderboard_df, [t.to_str(" : ") for t in ModelType], list(NUMERIC_INTERVALS.keys()), [i.value.name for i in Precision], False, False, False)
@@ -291,13 +299,28 @@ with demo:
                 leaderboard_table,
                 queue=True,
             )
-
+
+        with gr.TabItem("📈 Metrics through time", elem_id="llm-benchmark-tab-table", id=4):
+            with gr.Row():
+                with gr.Column():
+                    chart = create_metric_plot_obj(
+                        plot_df,
+                        [AutoEvalColumn.average.name],
+                        title="Average of Top Scores Over Time (from last update)",
+                    )
+                    gr.Plot(value=chart, min_width=500)
+                with gr.Column():
+                    chart = create_metric_plot_obj(
+                        plot_df,
+                        BENCHMARK_COLS,
+                        title="Top Scores Over Time (from last update)",
+                    )
+                    gr.Plot(value=chart, min_width=500)
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
             gr.Markdown(FAQ_TEXT, elem_classes="markdown-text")

-
-        with gr.TabItem("Submission Info", elem_id="llm-benchmark-tab-table", id=3):
+        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
             with gr.Column():
                 with gr.Row():
                     gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
@@ -360,7 +383,7 @@ with demo:
                         choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
                         label="Model type",
                         multiselect=False,
-                        value=ModelType.
+                        value=ModelType.IFT.to_str(" : "),
                         interactive=True,
                     )
@@ -381,22 +404,21 @@ with demo:
                     )
                     base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")

-
-            submit_button = gr.Button("We are no longer accepting submissions.", interactive=False)
+            submit_button = gr.Button("Submit Evaluation!")
             submission_result = gr.Markdown()
-
-
-
-
-
-
-
-
-
-
-
-
-
+            submit_button.click(
+                add_new_eval,
+                [
+                    model_name_textbox,
+                    base_model_name_textbox,
+                    revision_name_textbox,
+                    precision,
+                    private,
+                    weight_type,
+                    model_type,
+                ],
+                submission_result,
+            )

     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
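Note: the restored submission tab is plain Gradio event wiring, with `Button.click(fn, inputs, outputs)` passing each input component's value to `add_new_eval` positionally and rendering the return value into `submission_result`. A minimal, self-contained sketch of the same pattern; the stub handler and the reduced input list are illustrative, not the Space's actual code:

```python
import gradio as gr

def add_new_eval_stub(model_name: str, base_model: str) -> str:
    # Stand-in for add_new_eval in src/submission/submit.py: inputs arrive
    # positionally, and the returned string renders into the output component.
    return f"Queued `{model_name}` (base: `{base_model or 'n/a'}`) for evaluation."

with gr.Blocks() as demo:
    model_name_textbox = gr.Textbox(label="Model name")
    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
    submit_button = gr.Button("Submit Evaluation!")
    submission_result = gr.Markdown()
    # Same wiring shape as the PR: handler, list of input components, output component.
    submit_button.click(add_new_eval_stub, [model_name_textbox, base_model_name_textbox], submission_result)

demo.launch()
```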
requirements.txt
CHANGED
@@ -2,22 +2,17 @@ APScheduler==3.10.1
 black==23.11.0
 click==8.1.3
 datasets==2.14.5
-
-
-
-
+gradio==4.19.2
+gradio_client==0.10.1
+huggingface-hub>=0.18.0
+matplotlib==3.7.1
+numpy==1.24.2
+pandas==2.0.0
 plotly==5.14.1
 python-dateutil==2.8.2
+requests==2.28.2
 sentencepiece
 tqdm==4.65.0
-transformers==4.
+transformers==4.38.2
 tokenizers>=0.15.0
-gradio-space-ci @ git+https://huggingface.co/spaces/Wauplin/[email protected]
-isort
-ruff
-gradio==4.31.0
-gradio[oauth]
-gradio_leaderboard==0.0.11
-requests==2.31.0
-requests-oauthlib== 1.3.1
-schedule == 1.2.2
+gradio-space-ci @ git+https://huggingface.co/spaces/Wauplin/gradio-space-ci@0.1.2  # CI !!!
src/display/about.py
CHANGED
@@ -1,60 +1,63 @@
-import os
-import base64
 from src.display.utils import ModelType

-current_dir = os.path.dirname(os.path.realpath(__file__))
-
-
-with open(os.path.join(current_dir, "host_sponsor.png"), "rb") as image_file:
-    host_sponsor = base64.b64encode(image_file.read()).decode('utf-8')
-
-TITLE = f"""<img src="data:image/jpeg;base64,{main_logo}" style="width:30%;display:block;margin-left:auto;margin-right:auto">"""
-BOTTOM_LOGO = f"""<img src="data:image/jpeg;base64,{host_sponsor}" style="width:75%;display:block;margin-left:auto;margin-right:auto">"""
+TITLE = """<img src="https://upstage-open-ko-llm-leaderboard-logos.s3.ap-northeast-2.amazonaws.com/header_logo.png" style="width:30%;display:block;margin-left:auto;margin-right:auto">"""
+BOTTOM_LOGO = """<img src="https://upstage-open-ko-llm-leaderboard-logos.s3.ap-northeast-2.amazonaws.com/footer_logo_240715.png" style="width:50%;display:block;margin-left:auto;margin-right:auto">"""

 INTRODUCTION_TEXT = f"""
-The
-
-🚀 The Open Ko-LLM Leaderboard2 🇰🇷 objectively evaluates the performance of Korean Large Language Model (LLM). When you submit a model on the "Submit here!" page, it is automatically evaluated.
-
+🚀 The Open Ko-LLM Leaderboard 🇰🇷 objectively evaluates the performance of Korean Large Language Model (LLM).
+
+When you submit a model on the "Submit here!" page, it is automatically evaluated. The GPU used for evaluation is operated with the support of __[KT](https://cloud.kt.com/)__.
+The data used for evaluation consists of datasets to assess reasoning, language understanding, hallucination, and commonsense.
+The evaluation dataset is exclusively private and only available for evaluation process.
+More detailed information about the benchmark dataset is provided on the “About” page.
+
+This leaderboard is co-hosted by __[Upstage](https://www.upstage.ai)__, and __[NIA](https://www.nia.or.kr/site/nia_kor/main.do)__ that provides various Korean Data Sets through __[AI-Hub](https://aihub.or.kr)__, and operated by __[Upstage](https://www.upstage.ai)__.
 """

 LLM_BENCHMARKS_TEXT = f"""
-#
-
+# Context
 While outstanding LLM models are being released competitively, most of them are centered on English and are familiar with the English cultural sphere. We operate the Korean leaderboard, 🚀 Open Ko-LLM, to evaluate models that reflect the characteristics of the Korean language and Korean culture. Through this, we hope that users can conveniently use the leaderboard, participate, and contribute to the advancement of research in Korean.

-##
+## Icons
+{ModelType.PT.to_str(" : ")} model
+{ModelType.IFT.to_str(" : ")} model
+{ModelType.RL.to_str(" : ")} model
+If there is no icon, it indicates that there is insufficient information about the model.
+Please provide information about the model through an issue! 🤩

+🏴‍☠️ : This icon indicates that the model has been selected as a subject of caution by the community, implying that users should exercise restraint when using it. Clicking on the icon will take you to a discussion about that model.
+(Models that have used the evaluation set for training to achieve a high leaderboard ranking, among others, are selected as subjects of caution.)

+## How it works

+📈 We evaluate models using the [Eleuther AI Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness), a unified framework to test generative language models on a large number of different evaluation tasks.

-
-- Ko-WinoGrande (provided by [Flitto](https://www.flitto.com/portal/en))
-- Ko-GSM8K (provided by [Flitto](https://www.flitto.com/portal/en))
-- Ko-EQ-Bench (provided by [Flitto](https://www.flitto.com/portal/en))
-- Ko-IFEval (provided by [Flitto](https://www.flitto.com/portal/en))
-- KorNAT-Knowledge (provided by [SELECTSTAR](https://selectstar.ai/ko/) and [KAIST AI](https://gsai.kaist.ac.kr/?lang=ko&ckattempt=1))
-- KorNAT-Social-Value (provided by [SELECTSTAR](https://selectstar.ai/ko/) and [KAIST AI](https://gsai.kaist.ac.kr/?lang=ko&ckattempt=1))
-- Ko-Harmlessness (provided by [SELECTSTAR](https://selectstar.ai/ko/) and [KAIST AI](https://gsai.kaist.ac.kr/?lang=ko&ckattempt=1))
-- Ko-Helpfulness (provided by [SELECTSTAR](https://selectstar.ai/ko/) and [KAIST AI](https://gsai.kaist.ac.kr/?lang=ko&ckattempt=1))
+We have set up a benchmark using datasets translated into Korean, and applied variations by human experts, from the six tasks (HellaSwag, MMLU, Arc, Truthful QA, Winogrande, GSM8k) operated by __HuggingFace [Open LLM Leaderboard](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard)__. We have also added a new dataset prepared from scratch.
+- Ko-HellaSwag (provided by __[Upstage](https://www.upstage.ai/)__, machine translation)
+- Ko-MMLU (provided by __[Upstage](https://www.upstage.ai/)__, human translation and variation)
+- Ko-Arc (provided by __[Upstage](https://www.upstage.ai/)__, human translation and variation)
+- Ko-Truthful QA (provided by __[Upstage](https://www.upstage.ai/)__, human translation and variation)
+- Ko-Winogrande (provided by __[Flitto](https://www.flitto.com/portal/en)__, human translation and variation)
+- Ko-GSM8k (provided by __[Flitto](https://www.flitto.com/portal/en)__, human translation and variation)
+- Ko-CommonGen V2 (provided by __[Korea University NLP&AI Lab](http://nlp.korea.ac.kr/)__, created from scratch)
+- Ko-EQ Bench (provided by __[Flitto](https://www.flitto.com/portal/en)__, human translation and variation)
+- Ko-InstFollow (provided by __[Flitto](https://www.flitto.com/portal/en)__, human translation and variation)
+- KorNAT-CKA (provided by __[SELECTSTAR](https://selectstar.ai/ko/)__ and __[KAIST AI](https://gsai.kaist.ac.kr/?lang=ko&ckattempt=1)__, created from scratch)
+- KorNAT-SVA (provided by __[SELECTSTAR](https://selectstar.ai/ko/)__ and __[KAIST AI](https://gsai.kaist.ac.kr/?lang=ko&ckattempt=1)__, created from scratch)
+- Ko-Harmlessness (provided by __[SELECTSTAR](https://selectstar.ai/ko/)__ and __[KAIST AI](https://gsai.kaist.ac.kr/?lang=ko&ckattempt=1)__, created from scratch)
+- Ko-Helpfulness (provided by __[SELECTSTAR](https://selectstar.ai/ko/)__ and __[KAIST AI](https://gsai.kaist.ac.kr/?lang=ko&ckattempt=1)__, created from scratch)

+To provide an evaluation befitting the LLM era, we've selected benchmark datasets suitable for assessing these elements: expertise, inference, hallucination, truthfulness and common sense. The final score is converted to the average score from each evaluation datasets.

+GPUs are provided by __[KT](https://cloud.kt.com/)__ for the evaluations.

+## Details and Logs
+- Detailed numerical results in the `results` Upstage dataset: https://huggingface.co/datasets/open-ko-llm-leaderboard/results
+- Community queries and running status in the `requests` Upstage dataset: https://huggingface.co/datasets/open-ko-llm-leaderboard/requests

 ## More resources
-
-If you still have questions, you can check our FAQ [here](https://huggingface.co/spaces/upstage/open-ko-llm-leaderboard/discussions/1)!
+If you still have questions, you can check our FAQ [here](https://huggingface.co/spaces/upstage/open-ko-llm-leaderboard/discussions/1)!
 """
@@ -63,71 +66,38 @@ FAQ_TEXT = """


 EVALUATION_QUEUE_TEXT = f"""
-# Evaluation Queue for the
-
-Models added here will be automatically evaluated on the 🤗 cluster.
-
-## Submission Disclaimer
-
-
-- Your submission will be visible to the community and you may be contacted regarding your model.
-- Please submit carefully and responsibly 💛
-
-## First Steps Before Submitting a Model
-
-### 1. Ensure Your Model Loads with AutoClasses
-
-Verify that you can load your model and tokenizer using AutoClasses:
-
-```jsx
+# Evaluation Queue for the 🚀 Open Ko-LLM Leaderboard
+Models added here will be automatically evaluated on the KT GPU cluster.
+
+## <Some good practices before submitting a model>
+
+### 1️⃣ Make sure you can load your model and tokenizer using AutoClasses
+```python
 from transformers import AutoConfig, AutoModel, AutoTokenizer
 config = AutoConfig.from_pretrained("your model name", revision=revision)
 model = AutoModel.from_pretrained("your model name", revision=revision)
 tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
 ```

+If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.

-- Ensure your model is public.
-- We are working on adding support for models requiring `use_remote_code=True`.
+⚠️ Make sure your model is public!

+⚠️ Make sure your model runs with [Eleuther AI Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness)

+⚠️ If your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!

-###
-
+### 2️⃣ Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
+It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!

-### 4. Complete Your Model Card
+### 3️⃣ Make sure your model has an open license!
+This is a leaderboard for 🚀 Open Ko-LLMs, and we'd love for as many people as possible to know they can use your model
+
+### 4️⃣ Fill up your model card
 When we add extra information about models to the leaderboard, it will be automatically taken from the model card

-Choose the right precision to avoid evaluation errors:
-
-- Not all models convert properly from float16 to bfloat16.
-- Incorrect precision can cause issues (e.g., loading a bf16 model in fp16 may generate NaNs).
-
-> Important: When submitting, git branches and tags will be strictly tied to the specific commit present at the time of submission to ensure revision consistency.
->
-
-## Model types
-
-- 🟢 : 🟢 pretrained model: new, base models, trained on a given text corpora using masked modelling
-- 🟩 : 🟩 continuously pretrained model: new, base models, continuously trained on further corpus (which may include IFT/chat data) using masked modelling
-- 🔶 : 🔶 fine-tuned on domain-specific datasets model: pretrained models finetuned on more data
-- 💬 : 💬 chat models (RLHF, DPO, IFT, ...) model: chat like fine-tunes, either using IFT (datasets of task instruction), RLHF or DPO (changing the model loss a bit with an added policy), etc
-- 🤝 : 🤝 base merges and moerges model: merges or MoErges, models which have been merged or fused without additional fine-tuning.
-
-Please provide information about the model through an issue! 🤩
-
-🏴‍☠️ : 🏴‍☠️ This icon indicates that the model has been selected as a subject of caution by the community, implying that users should exercise restraint when using it. Clicking on the icon will take you to a discussion about that model. (Models that have used the evaluation set for training to achieve a high leaderboard ranking, among others, are selected as subjects of caution.)
+## In case of model failure
+If your model is displayed in the `FAILED` category, its execution stopped. Make sure you have followed the above steps first. If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
 """

 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results. Authors of open-ko-llm-leaderboard are ordered alphabetically."
@@ -136,10 +106,8 @@ CITATION_BUTTON_TEXT = r"""
     title={Open Ko-LLM Leaderboard: Evaluating Large Language Models in Korean with Ko-H5 Benchmark},
     author={Chanjun Park and Hyeonwoo Kim and Dahyun Kim and Seonghwan Cho and Sanghoon Kim and Sukyung Lee and Yungi Kim and Hwalsuk Lee},
     year={2024},
-    booktitle={
+    booktitle={ACL Main}
 }
-
-
 @software{eval-harness,
     author = {Gao, Leo and
               Tow, Jonathan and
@@ -164,59 +132,40 @@ CITATION_BUTTON_TEXT = r"""
     publisher = {Zenodo},
     version = {v0.0.1},
     doi = {10.5281/zenodo.5371628},
-    url = {https://doi.org/10.5281/zenodo.5371628}
-}
-
-@misc{rein2023gpqagraduatelevelgoogleproofqa,
-    title={GPQA: A Graduate-Level Google-Proof Q&A Benchmark},
-    author={David Rein and Betty Li Hou and Asa Cooper Stickland and Jackson Petty and Richard Yuanzhe Pang and Julien Dirani and Julian Michael and Samuel R. Bowman},
-    year={2023},
-    eprint={2311.12022},
-    archivePrefix={arXiv},
-    primaryClass={cs.AI},
-    url={https://arxiv.org/abs/2311.12022},
-}
-
-@article{sakaguchi2021winogrande,
-    title={Winogrande: An adversarial winograd schema challenge at scale},
-    author={Sakaguchi, Keisuke and Bras, Ronan Le and Bhagavatula, Chandra and Choi, Yejin},
-    journal={Communications of the ACM},
-    volume={64},
-    number={9},
-    pages={99--106},
-    year={2021},
-    publisher={ACM New York, NY, USA}
+    url = {https://doi.org/10.5281/zenodo.5371628}
 }
-
+@misc{seo2023kocommongen,
+    title={Korean Commonsense Reasoning Evaluation for Large Language Models},
+    author={Jaehyung Seo, Chanjun Park, Hyeonseok Moon, Sugyeong Eo, Aram So, Heuiseok Lim},
+    year={2023},
+    affilation={Korea University, NLP&AI},
+    booktitle={Proceedings of the 35th Annual Conference on Human & Cognitive Language Technology}}
+@misc{park2023koarc,
+    title={Ko-ARC},
+    original_title={Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge},
+    author={Hyunbyung Park, Chanjun Park},
+    original_author={Peter Clark and Isaac Cowhey and Oren Etzioni and Tushar Khot and Ashish Sabharwal and Carissa Schoenick and Oyvind Tafjord},
+    year={2023}
 }
-
+@misc{park2023kohellaswag,
+    title={Ko-HellaSwag},
+    original_title={HellaSwag: Can a Machine Really Finish Your Sentence?},
+    author={Hyunbyung Park, Chanjun Park},
+    original_author={Rowan Zellers and Ari Holtzman and Yonatan Bisk and Ali Farhadi and Yejin Choi},
+    year={2023}
 }
-
-    eprint={2311.07911},
-    archivePrefix={arXiv},
-    primaryClass={cs.CL},
-    url={https://arxiv.org/abs/2311.07911},
+@misc{park2023kommlu,
+    title={Ko-MMLU},
+    original_title={Measuring Massive Multitask Language Understanding},
+    author={Hyunbyung Park, Chanjun Park},
+    original_author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
+    year={2023}
 }
-
+@misc{park2023kotruthfulqa,
+    title={Ko-TruthfulQA},
+    original_title={TruthfulQA: Measuring How Models Mimic Human Falsehoods},
+    author={Hyunbyung Park, Chanjun Park},
+    original_author={Stephanie Lin and Jacob Hilton and Owain Evans},
+    year={2023}
 }
 """
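Note: the queue text above asks submitters to check that their model runs under the Eleuther harness locally, with a `--limit` to keep the run small. A hedged sketch of that sanity check via the harness's Python API, assuming lm-eval 0.4.x; the public `hellaswag` task and the model name are placeholders, since the leaderboard's Korean task configs are private:

```python
# Sanity-check a model under the Eleuther AI evaluation harness before
# submitting. Assumes lm-eval 0.4.x; "hellaswag" is a public stand-in task
# and "your-org/your-model" is a placeholder.
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=your-org/your-model,revision=main",
    tasks=["hellaswag"],
    limit=10,  # like the CLI's --limit: only 10 examples per task
)
print(results["results"])
```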
src/display/formatting.py
CHANGED
@@ -14,9 +14,10 @@ def model_hyperlink(link, model_name):
 def make_clickable_model(model_name):
     link = f"https://huggingface.co/{model_name}"

-
-
-
+    details_model_name = model_name.replace("/", "__")
+    details_link = f"https://huggingface.co/datasets/open-ko-llm-leaderboard/details_{details_model_name}"
+
+    return model_hyperlink(link, model_name) + " " + model_hyperlink(details_link, "📑")


 def styled_error(error):
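Note: end to end, the patched `make_clickable_model` renders two links per row: the model page, and a 📑 link to its per-model details dataset. The `model_hyperlink` body below is an assumed minimal reimplementation (the real helper is defined earlier in this file), and the model name is only an example:

```python
def model_hyperlink(link, model_name):
    # Assumed minimal version of the helper defined above in formatting.py.
    return f'<a target="_blank" href="{link}">{model_name}</a>'

def make_clickable_model(model_name):
    link = f"https://huggingface.co/{model_name}"
    # "org/model" becomes "org__model" in the details dataset name.
    details_model_name = model_name.replace("/", "__")
    details_link = f"https://huggingface.co/datasets/open-ko-llm-leaderboard/details_{details_model_name}"
    return model_hyperlink(link, model_name) + " " + model_hyperlink(details_link, "📑")

print(make_clickable_model("your-org/your-model"))
# -> a link to the model page plus a 📑 link to
#    https://huggingface.co/datasets/open-ko-llm-leaderboard/details_your-org__your-model
```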
src/display/host_sponsor.png
DELETED
Binary file (131 kB)
src/display/main_logo.png
DELETED
Binary file (345 kB)
src/display/utils.py
CHANGED
@@ -14,15 +14,19 @@ class Task:
     col_name: str

 class Tasks(Enum):
-
-
-
-
-
-
-
-
-
+    arc = Task("ko_arc_challenge", "acc_norm", "Ko-ARC")
+    hellaswag = Task("ko_hellaswag", "acc_norm", "Ko-HellaSwag")
+    mmlu = Task("ko_mmlu", "acc", "Ko-MMLU")
+    truthfulqa = Task("ko_truthfulqa_mc", "mc2", "Ko-TruthfulQA")
+    winogrande = Task("ko_winogrande", "acc_norm", "Ko-Winogrande")
+    gsm8k = Task("ko_gsm8k", "acc_norm", "Ko-GSM8k")
+    commongen_v2 = Task("ko_commongen_v2", "acc_norm", "Ko-CommonGen V2")
+    eqBench = Task("ko_eq_bench", "acc_norm", "Ko-EQ Bench")
+    instFollow = Task("ko_inst_follow", "acc_norm", "Ko-InstFollow")
+    korNatCka = Task("kor_nat_cka", "acc_norm", "KorNAT-CKA")
+    korNatSva = Task("kor_nat_sva", "acc_norm", "KorNAT-SVA")
+    harmlessness = Task("ko_harmlessness", "acc_norm", "Ko-Harmlessness")
+    helpfulness = Task("ko_helpfulness", "acc_norm", "Ko-Helpfulness")


 # These classes are for user facing column names,
@@ -85,30 +89,26 @@ class ModelDetails:

 class ModelType(Enum):
     PT = ModelDetails(name="pretrained", symbol="🟢")
-
-
-
-
-    Unknown = ModelDetails(name="other", symbol="❓")
+    # FT = ModelDetails(name="fine-tuned", symbol="🔶")
+    IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
+    RL = ModelDetails(name="RL-tuned", symbol="🟦")
+    Unknown = ModelDetails(name="", symbol="?")

     def to_str(self, separator=" "):
         return f"{self.value.symbol}{separator}{self.value.name}"

     @staticmethod
-    def from_str(
-
-    if "
-        return ModelType.CPT
-    if "pretrained" in m_type or "🟢" in m_type:
+    def from_str(type):
+        # if "fine-tuned" in type or "🔶" in type:
+        #     return ModelType.FT
+        if "pretrained" in type or "🟢" in type:
             return ModelType.PT
-    if
-        return ModelType.
-    if "
-        return ModelType.
+        if "RL-tuned" in type or "🟦" in type:
+            return ModelType.RL
+        if "instruction-tuned" in type or "⭕" in type:
+            return ModelType.IFT
         return ModelType.Unknown

-
 class WeightType(Enum):
     Adapter = ModelDetails("Adapter")
     Original = ModelDetails("Original")
@@ -116,13 +116,12 @@ class WeightType(Enum):

 class Precision(Enum):
     float16 = ModelDetails("float16")
-    bfloat16 = ModelDetails("bfloat16")
-    qt_8bit = ModelDetails("8bit")
-    qt_4bit = ModelDetails("4bit")
-    qt_GPTQ = ModelDetails("GPTQ")
+    # bfloat16 = ModelDetails("bfloat16")
+    # qt_8bit = ModelDetails("8bit")
+    # qt_4bit = ModelDetails("4bit")
+    # qt_GPTQ = ModelDetails("GPTQ")
     Unknown = ModelDetails("?")

-    @staticmethod
     def from_str(precision):
         if precision in ["torch.float16", "float16"]:
             return Precision.float16
@@ -135,10 +134,15 @@ class Precision(Enum):
         if precision in ["GPTQ", "None"]:
             return Precision.qt_GPTQ
         return Precision.Unknown
+
+
+

 # Column selection
-COLS = [c.name for c in fields(AutoEvalColumn)]
-TYPES = [c.type for c in fields(AutoEvalColumn)]
+COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
+TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
+COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
+TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]

 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
@@ -153,4 +157,4 @@ NUMERIC_INTERVALS = {
     "13~35B": pd.Interval(13, 35, closed="right"),
     "35~60B": pd.Interval(35, 60, closed="right"),
     "60B+": pd.Interval(60, 10000, closed="right"),
-}
+}
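Note: the `NUMERIC_INTERVALS` buckets at the end of this file feed the parameter-size filter in the app.py hunk above. A self-contained sketch of that interaction, with made-up model sizes; the `#Params (B)` column name is assumed from the leaderboard's display columns:

```python
import pandas as pd

NUMERIC_INTERVALS = {
    "0~3B": pd.Interval(0, 3, closed="right"),
    "3~13B": pd.Interval(3, 13, closed="right"),
    "13~35B": pd.Interval(13, 35, closed="right"),
}

df = pd.DataFrame({"#Params (B)": [1.3, 7.0, 10.7, 30.0]})
size_query = ["3~13B"]  # buckets the user ticked in the UI

# Mirror of filter_models in app.py: build an IntervalIndex from the selected
# buckets, then keep rows whose size falls inside any selected interval.
numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
params_column = pd.to_numeric(df["#Params (B)"], errors="coerce")
mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
print(df.loc[mask])  # keeps the 7.0 and 10.7 rows
```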
src/leaderboard/read_evals.py
CHANGED
@@ -48,7 +48,7 @@ class EvalResult:
         precision = Precision.from_str(config.get("model_dtype"))

         # Get model and org
-        org_and_model = config.get("model_name", None)
+        org_and_model = config.get("model_name", config.get("model_args", None))
         org_and_model = org_and_model.split("/", 1)

         if len(org_and_model) == 1:
@@ -96,18 +96,26 @@ class EvalResult:
         results = {}
         for task in Tasks:
             task = task.value
-
-
-
-
-
-
+
+            # Some truthfulQA values are NaNs
+            if task.benchmark == "truthfulqa:mc" and "harness|truthfulqa:mc|0" in data["results"]:
+                if math.isnan(float(data["results"]["harness|truthfulqa:mc|0"][task.metric])):
+                    results[task.benchmark] = 0.0
+                    continue
+
+            # New tasks have been added, we need to skip them if not exists
+            if task.benchmark in ["ko_winogrande", "ko_gsm8k", "ko_eq_bench", "ko_inst_follow", "kor_nat_cka", "kor_nat_sva", "ko_harmlessness", "ko_helpfulness"]:
+                accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
                 if accs.size == 0 or any([acc is None for acc in accs]):
+                    results[task.benchmark] = 0.0
                     continue
-
-
-
+
+            # We average all scores of a given metric (mostly for mmlu)
+            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
+            if accs.size == 0 or any([acc is None for acc in accs]):
+                continue
+
+            mean_acc = np.mean(accs) * 100.0
             results[task.benchmark] = mean_acc

         return self(
@@ -143,7 +151,27 @@ class EvalResult:
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""

-
+        # Skip the new tasks for now
+        # TODO: safely remove this code when the task results are all added
+        skip_avg_len = 0
+        if self.results['ko_winogrande'] == 0.0:
+            skip_avg_len += 1
+        if self.results['ko_gsm8k'] == 0.0:
+            skip_avg_len += 1
+        if self.results['ko_eq_bench'] == 0.0:
+            skip_avg_len += 1
+        if self.results['ko_inst_follow'] == 0.0:
+            skip_avg_len += 1
+        if self.results['kor_nat_cka'] == 0.0:
+            skip_avg_len += 1
+        if self.results['kor_nat_sva'] == 0.0:
+            skip_avg_len += 1
+        if self.results['ko_harmlessness'] == 0.0:
+            skip_avg_len += 1
+        if self.results['ko_helpfulness'] == 0.0:
+            skip_avg_len += 1
+
+        average = sum([v for v in self.results.values() if v is not None]) / (len(Tasks) - skip_avg_len)

         data_dict = {
             "eval_name": self.eval_name,  # not a column, just a save name,
src/submission/submit.py
CHANGED
@@ -1,7 +1,6 @@
 import json
 import os
 from datetime import datetime, timezone
-import pandas as pd

 from src.display.formatting import styled_error, styled_message, styled_warning
 from src.envs import API, EVAL_REQUESTS_PATH, H4_TOKEN, QUEUE_REPO, RATE_LIMIT_PERIOD, RATE_LIMIT_QUOTA
@@ -13,7 +12,6 @@ from src.submission.check_validity import (
     is_model_on_hub,
     user_submission_permission,
 )
-from src.populate import get_evaluation_queue_df

 REQUESTED_MODELS = None
 USERS_TO_SUBMISSION_DATES = None
@@ -40,7 +38,10 @@ def add_new_eval(

     precision = precision.split(" ")[0]
     current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
-
+    # 리더보드 종료 (leaderboard closed)
+    if True:
+        return styled_error("The current Season 1 will conclude on Friday, August 2, and the new season will commence on August 12.")
+
     if model_type is None or model_type == "":
         return styled_error("Please select a model type.")
@@ -99,9 +100,6 @@ def add_new_eval(

     # Seems good, creating the eval
     print("Adding new eval")
-    # dfs = get_evaluation_queue_df(EVAL_REQUESTS_PATH, cols=["job_id"])
-    # dfs = pd.concat(dfs).reset_index(drop=True)
-    # max_job_id = max([int(c) for c in dfs["job_id"].values])

     eval_entry = {
         "model": model,
@@ -116,7 +114,6 @@ def add_new_eval(
         "likes": model_info.likes,
         "params": model_size,
         "license": license,
-        # "job_id": max_job_id+1
     }

     # Check for duplicate submission
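Note: the last hunk ends at the duplicate-submission check. A hypothetical sketch of how such a guard commonly works in these leaderboard Spaces; `REQUESTED_MODELS` and the key format are assumptions based on the fields visible in `eval_entry`, not the Space's verbatim code:

```python
# Assumed shape: keys built from the pending/finished request files.
REQUESTED_MODELS = {"org/model-a_main_float16"}

def is_duplicate(model: str, revision: str, precision: str) -> bool:
    return f"{model}_{revision}_{precision}" in REQUESTED_MODELS

if is_duplicate("org/model-a", "main", "float16"):
    print("This model has already been submitted.")
```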