app.py CHANGED
@@ -2,7 +2,7 @@ import gradio as gr
2
  import pandas as pd
3
  from apscheduler.schedulers.background import BackgroundScheduler
4
  from huggingface_hub import snapshot_download
5
- from gradio_space_ci.webhook import configure_space_ci
6
 
7
  from src.display.about import (
8
  CITATION_BUTTON_LABEL,
@@ -32,6 +32,11 @@ from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, H4_TOKEN, IS_PU
32
  from src.populate import get_evaluation_queue_df, get_leaderboard_df
33
  from src.submission.submit import add_new_eval
34
  from src.tools.collections import update_collections
35
 
36
 
37
  def restart_space():
@@ -58,6 +63,8 @@ if REPO_ID == "upstage/open-ko-llm-leaderboard": # update only when it's from re
58
  update_collections(original_df.copy())
59
  leaderboard_df = original_df.copy()
60
 
 
 
61
  (
62
  finished_eval_queue_df,
63
  running_eval_queue_df,
@@ -148,6 +155,7 @@ def filter_models(
148
  params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
149
  mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
150
  filtered_df = filtered_df.loc[mask]
 
151
  return filtered_df
152
 
153
  leaderboard_df = filter_models(leaderboard_df, [t.to_str(" : ") for t in ModelType], list(NUMERIC_INTERVALS.keys()), [i.value.name for i in Precision], False, False, False)
@@ -291,13 +299,28 @@ with demo:
291
  leaderboard_table,
292
  queue=True,
293
  )
294
- 
295
  with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
296
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
297
  gr.Markdown(FAQ_TEXT, elem_classes="markdown-text")
298
 
299
- # with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
300
- with gr.TabItem("Submission Info", elem_id="llm-benchmark-tab-table", id=3):
301
  with gr.Column():
302
  with gr.Row():
303
  gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
@@ -360,7 +383,7 @@ with demo:
360
  choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
361
  label="Model type",
362
  multiselect=False,
363
- value=ModelType.FT.to_str(" : "),
364
  interactive=True,
365
  )
366
 
@@ -381,22 +404,21 @@ with demo:
381
  )
382
  base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
383
 
384
- # submit_button = gr.Button("Submit Evalulation!")
385
- submit_button = gr.Button("We are no longer accepting submissions.", interactive=False)
386
  submission_result = gr.Markdown()
387
- # submit_button.click(
388
- # add_new_eval,
389
- # [
390
- # model_name_textbox,
391
- # base_model_name_textbox,
392
- # revision_name_textbox,
393
- # precision,
394
- # private,
395
- # weight_type,
396
- # model_type,
397
- # ],
398
- # submission_result,
399
- # )
400
 
401
  with gr.Row():
402
  with gr.Accordion("📙 Citation", open=False):
 
2
  import pandas as pd
3
  from apscheduler.schedulers.background import BackgroundScheduler
4
  from huggingface_hub import snapshot_download
5
+ from gradio_space_ci import configure_space_ci # FOR CI
6
 
7
  from src.display.about import (
8
  CITATION_BUTTON_LABEL,
 
32
  from src.populate import get_evaluation_queue_df, get_leaderboard_df
33
  from src.submission.submit import add_new_eval
34
  from src.tools.collections import update_collections
35
+ from src.tools.plots import (
36
+ create_metric_plot_obj,
37
+ create_plot_df,
38
+ create_scores_df,
39
+ )
40
 
41
 
42
  def restart_space():
 
63
  update_collections(original_df.copy())
64
  leaderboard_df = original_df.copy()
65
 
66
+ plot_df = create_plot_df(create_scores_df(raw_data))
67
+
68
  (
69
  finished_eval_queue_df,
70
  running_eval_queue_df,
 
155
  params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
156
  mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
157
  filtered_df = filtered_df.loc[mask]
158
+
159
  return filtered_df
160
 
161
  leaderboard_df = filter_models(leaderboard_df, [t.to_str(" : ") for t in ModelType], list(NUMERIC_INTERVALS.keys()), [i.value.name for i in Precision], False, False, False)
 
299
  leaderboard_table,
300
  queue=True,
301
  )
302
+
303
+ with gr.TabItem("📈 Metrics through time", elem_id="llm-benchmark-tab-table", id=4):
304
+ with gr.Row():
305
+ with gr.Column():
306
+ chart = create_metric_plot_obj(
307
+ plot_df,
308
+ [AutoEvalColumn.average.name],
309
+ title="Average of Top Scores Over Time (from last update)",
310
+ )
311
+ gr.Plot(value=chart, min_width=500)
312
+ with gr.Column():
313
+ chart = create_metric_plot_obj(
314
+ plot_df,
315
+ BENCHMARK_COLS,
316
+ title="Top Scores Over Time (from last update)",
317
+ )
318
+ gr.Plot(value=chart, min_width=500)
319
  with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
320
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
321
  gr.Markdown(FAQ_TEXT, elem_classes="markdown-text")
322
 
323
+ with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
 
324
  with gr.Column():
325
  with gr.Row():
326
  gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
 
383
  choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
384
  label="Model type",
385
  multiselect=False,
386
+ value=ModelType.IFT.to_str(" : "),
387
  interactive=True,
388
  )
389
 
 
404
  )
405
  base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
406
 
407
+ submit_button = gr.Button("Submit Evaluation!")
 
408
  submission_result = gr.Markdown()
409
+ submit_button.click(
410
+ add_new_eval,
411
+ [
412
+ model_name_textbox,
413
+ base_model_name_textbox,
414
+ revision_name_textbox,
415
+ precision,
416
+ private,
417
+ weight_type,
418
+ model_type,
419
+ ],
420
+ submission_result,
421
+ )
422
 
423
  with gr.Row():
424
  with gr.Accordion("📙 Citation", open=False):
requirements.txt CHANGED
@@ -2,22 +2,17 @@ APScheduler==3.10.1
2
  black==23.11.0
3
  click==8.1.3
4
  datasets==2.14.5
5
- huggingface-hub==0.24.7
6
- matplotlib==3.8.4
7
- numpy==1.26.0
8
- pandas==2.2.2
9
  plotly==5.14.1
10
  python-dateutil==2.8.2
 
11
  sentencepiece
12
  tqdm==4.65.0
13
- transformers==4.43.1
14
  tokenizers>=0.15.0
15
- gradio-space-ci @ git+https://huggingface.co/spaces/Wauplin/gradio-space-ci@0.2.3 # CI !!!
16
- isort
17
- ruff
18
- gradio==4.31.0
19
- gradio[oauth]
20
- gradio_leaderboard==0.0.11
21
- requests==2.31.0
22
- requests-oauthlib== 1.3.1
23
- schedule == 1.2.2
 
2
  black==23.11.0
3
  click==8.1.3
4
  datasets==2.14.5
5
+ gradio==4.19.2
6
+ gradio_client==0.10.1
7
+ huggingface-hub>=0.18.0
8
+ matplotlib==3.7.1
9
+ numpy==1.24.2
10
+ pandas==2.0.0
11
  plotly==5.14.1
12
  python-dateutil==2.8.2
13
+ requests==2.28.2
14
  sentencepiece
15
  tqdm==4.65.0
16
+ transformers==4.38.2
17
  tokenizers>=0.15.0
18
+ gradio-space-ci @ git+https://huggingface.co/spaces/Wauplin/gradio-space-ci@0.1.2 # CI !!!
 
src/display/about.py CHANGED
@@ -1,60 +1,63 @@
1
- import os
2
- import base64
3
  from src.display.utils import ModelType
4
 
5
- current_dir = os.path.dirname(os.path.realpath(__file__))
6
 
7
- with open(os.path.join(current_dir, "main_logo.png"), "rb") as image_file:
8
- main_logo = base64.b64encode(image_file.read()).decode('utf-8')
9
- with open(os.path.join(current_dir, "host_sponsor.png"), "rb") as image_file:
10
- host_sponsor = base64.b64encode(image_file.read()).decode('utf-8')
11
-
12
- TITLE = f"""<img src="data:image/jpeg;base64,{main_logo}" style="width:30%;display:block;margin-left:auto;margin-right:auto">"""
13
- BOTTOM_LOGO = f"""<img src="data:image/jpeg;base64,{host_sponsor}" style="width:75%;display:block;margin-left:auto;margin-right:auto">"""
14
 
15
  INTRODUCTION_TEXT = f"""
16
- The previous Leaderboard version is live [here](https://huggingface.co/spaces/choco9966/open-ko-llm-leaderboard-old) 📊
17
-
18
- 🚀 The Open Ko-LLM Leaderboard 2 🇰🇷 objectively evaluates the performance of Korean Large Language Models (LLMs). When you submit a model on the "Submit here!" page, it is automatically evaluated.
19
 
20
- This leaderboard is co-hosted by [Upstage](https://www.upstage.ai/) and [NIA](https://www.nia.or.kr/site/nia_kor/main.do), which provides various Korean datasets through [AI-Hub](https://aihub.or.kr/), and is operated by [Upstage](https://www.upstage.ai/). The GPUs used for evaluation are provided with the support of [KT](https://cloud.kt.com/) and [AICA](https://aica-gj.kr/main.php). While Season 1 focused on evaluating LLM capabilities such as reasoning, language understanding, hallucination, and commonsense through academic benchmarks, Season 2 focuses on assessing the practical abilities and reliability of LLMs. The datasets for this season are sponsored by [Flitto](https://www.flitto.com/portal/en), [SELECTSTAR](https://selectstar.ai/ko/), and [KAIST AI](https://gsai.kaist.ac.kr/?lang=ko&ckattempt=1). The evaluation datasets are kept private and are used only during the evaluation process. More detailed information about the benchmark datasets is provided on the “About” page.
21
 
22
- You'll notably find explanations on the evaluations we are using, reproducibility guidelines, best practices on how to submit a model, and our FAQ.
23
  """
24
 
25
  LLM_BENCHMARKS_TEXT = f"""
26
- # Motivation
27
-
28
  While outstanding LLMs are being released at a competitive pace, most of them are centered on English and the English-speaking cultural sphere. We operate the Korean leaderboard, 🚀 Open Ko-LLM, to evaluate models that reflect the characteristics of the Korean language and Korean culture. Through this, we hope that users can conveniently use the leaderboard, participate, and contribute to the advancement of research in Korean.
29
 
30
- ## How it works
31
 
32
- 📈 We evaluate models on 9 key benchmarks using the [Eleuther AI Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness), a unified framework to test generative language models on a large number of different evaluation tasks.
 
33
 
34
- - Ko-GPQA (provided by [Flitto](https://www.flitto.com/portal/en))
35
- - Ko-WinoGrande (provided by [Flitto](https://www.flitto.com/portal/en))
36
- - Ko-GSM8K (provided by [Flitto](https://www.flitto.com/portal/en))
37
- - Ko-EQ-Bench (provided by [Flitto](https://www.flitto.com/portal/en))
38
- - Ko-IFEval (provided by [Flitto](https://www.flitto.com/portal/en))
39
- - KorNAT-Knowledge (provided by [SELECTSTAR](https://selectstar.ai/ko/) and [KAIST AI](https://gsai.kaist.ac.kr/?lang=ko&ckattempt=1))
40
- - KorNAT-Social-Value (provided by [SELECTSTAR](https://selectstar.ai/ko/) and [KAIST AI](https://gsai.kaist.ac.kr/?lang=ko&ckattempt=1))
41
- - Ko-Harmlessness (provided by [SELECTSTAR](https://selectstar.ai/ko/) and [KAIST AI](https://gsai.kaist.ac.kr/?lang=ko&ckattempt=1))
42
- - Ko-Helpfulness (provided by [SELECTSTAR](https://selectstar.ai/ko/) and [KAIST AI](https://gsai.kaist.ac.kr/?lang=ko&ckattempt=1))
43
 
44
- For all these evaluations, a higher score is a better score. We chose these benchmarks as they test reasoning, harmlessness, helpfulness, and general knowledge across a wide variety of fields in 0-shot and few-shot settings.
45
 
46
- The final score is the average of the scores from each evaluation dataset.
47
 
48
- GPUs are provided by [KT](https://cloud.kt.com/) and [AICA](https://aica-gj.kr/main.php) for the evaluations.
49
 
50
- ## **Results**
51
 
52
- - Detailed numerical results in the `results` Upstage dataset: https://huggingface.co/datasets/open-ko-llm-leaderboard/results
53
- - Community queries and running status in the `requests` Upstage dataset: https://huggingface.co/datasets/open-ko-llm-leaderboard/requests
 
54
 
55
  ## More resources
56
-
57
- If you still have questions, you can check our FAQ [here](https://huggingface.co/spaces/upstage/open-ko-llm-leaderboard/discussions/1)!
58
  """
59
 
60
 
@@ -63,71 +66,38 @@ FAQ_TEXT = """
63
 
64
 
65
  EVALUATION_QUEUE_TEXT = f"""
66
- # Evaluation Queue for the 🤗 Open Ko-LLM Leaderboard
67
-
68
- Models added here will be automatically evaluated on the 🤗 cluster.
69
-
70
- ## Submission Disclaimer
71
 
72
- **By submitting a model, you acknowledge that:**
73
 
74
- - We store information about who submitted each model in [Requests dataset](https://huggingface.co/datasets/open-ko-llm-leaderboard/requests).
75
- - This practice helps maintain the integrity of our leaderboard, prevent spam, and ensure responsible submissions.
76
- - Your submission will be visible to the community and you may be contacted regarding your model.
77
- - Please submit carefully and responsibly 💛
78
-
79
- ## First Steps Before Submitting a Model
80
-
81
- ### 1. Ensure Your Model Loads with AutoClasses
82
-
83
- Verify that you can load your model and tokenizer using AutoClasses:
84
-
85
- ```jsx
86
  from transformers import AutoConfig, AutoModel, AutoTokenizer
87
  config = AutoConfig.from_pretrained("your model name", revision=revision)
88
  model = AutoModel.from_pretrained("your model name", revision=revision)
89
  tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
90
  ```
91
 
92
- Note:
93
 
94
- - If this step fails, debug your model before submitting.
95
- - Ensure your model is public.
96
- - We are working on adding support for models requiring `use_remote_code=True`.
97
 
98
- ### 2. Convert Weights to Safetensors
99
 
100
- [Safetensors](https://huggingface.co/docs/safetensors/index) is a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
101
 
102
- ### 3. Verify Your Model Open License
 
103
 
104
- This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
105
-
106
- ### 4. Complete Your Model Card
107
 
 
108
  When we add extra information about models to the leaderboard, it will be automatically taken from the model card
109
 
110
- ### 5. Select Correct Precision
111
-
112
- Choose the right precision to avoid evaluation errors:
113
-
114
- - Not all models convert properly from float16 to bfloat16.
115
- - Incorrect precision can cause issues (e.g., loading a bf16 model in fp16 may generate NaNs).
116
-
117
- > Important: When submitting, git branches and tags will be strictly tied to the specific commit present at the time of submission to ensure revision consistency.
118
- >
119
-
120
- ## Model types
121
-
122
- - 🟢 : 🟢 pretrained model: new, base models, trained on a given text corpora using masked modelling
123
- - 🟩 : 🟩 continuously pretrained model: new, base models, continuously trained on further corpus (which may include IFT/chat data) using masked modelling
124
- - 🔶 : 🔶 fine-tuned on domain-specific datasets model: pretrained models finetuned on more data
125
- - 💬 : 💬 chat models (RLHF, DPO, IFT, ...) model: chat like fine-tunes, either using IFT (datasets of task instruction), RLHF or DPO (changing the model loss a bit with an added policy), etc
126
- - 🤝 : 🤝 base merges and moerges model: merges or MoErges, models which have been merged or fused without additional fine-tuning.
127
-
128
- Please provide information about the model through an issue! 🤩
129
-
130
- 🏴‍☠️ : 🏴‍☠️ This icon indicates that the model has been selected as a subject of caution by the community, implying that users should exercise restraint when using it. Clicking on the icon will take you to a discussion about that model. (Models that have used the evaluation set for training to achieve a high leaderboard ranking, among others, are selected as subjects of caution.)
131
  """
132
 
133
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results. Authors of open-ko-llm-leaderboard are ordered alphabetically."
@@ -136,10 +106,8 @@ CITATION_BUTTON_TEXT = r"""
136
  title={Open Ko-LLM Leaderboard: Evaluating Large Language Models in Korean with Ko-H5 Benchmark},
137
  author={Chanjun Park and Hyeonwoo Kim and Dahyun Kim and Seonghwan Cho and Sanghoon Kim and Sukyung Lee and Yungi Kim and Hwalsuk Lee},
138
  year={2024},
139
- booktitle={The 62nd Annual Meeting of the Association for Computational Linguistics (ACL 2024) }
140
  }
141
-
142
-
143
  @software{eval-harness,
144
  author = {Gao, Leo and
145
  Tow, Jonathan and
@@ -164,59 +132,40 @@ CITATION_BUTTON_TEXT = r"""
164
  publisher = {Zenodo},
165
  version = {v0.0.1},
166
  doi = {10.5281/zenodo.5371628},
167
- url = {https://doi.org/10.5281/zenodo.5371628},
168
- }
169
-
170
- @misc{rein2023gpqagraduatelevelgoogleproofqa,
171
- title={GPQA: A Graduate-Level Google-Proof Q&A Benchmark},
172
- author={David Rein and Betty Li Hou and Asa Cooper Stickland and Jackson Petty and Richard Yuanzhe Pang and Julien Dirani and Julian Michael and Samuel R. Bowman},
173
- year={2023},
174
- eprint={2311.12022},
175
- archivePrefix={arXiv},
176
- primaryClass={cs.AI},
177
- url={https://arxiv.org/abs/2311.12022},
178
- }
179
-
180
- @article{sakaguchi2021winogrande,
181
- title={Winogrande: An adversarial winograd schema challenge at scale},
182
- author={Sakaguchi, Keisuke and Bras, Ronan Le and Bhagavatula, Chandra and Choi, Yejin},
183
- journal={Communications of the ACM},
184
- volume={64},
185
- number={9},
186
- pages={99--106},
187
- year={2021},
188
- publisher={ACM New York, NY, USA}
189
  }
190
-
191
- @article{cobbe2021training,
192
- title={Training verifiers to solve math word problems},
193
- author={Cobbe, Karl and Kosaraju, Vineet and Bavarian, Mohammad and Chen, Mark and Jun, Heewoo and Kaiser, Lukasz and Plappert, Matthias and Tworek, Jerry and Hilton, Jacob and Nakano, Reiichiro and others},
194
- journal={arXiv preprint arXiv:2110.14168},
195
- year={2021}
 
 
 
 
 
 
196
  }
197
-
198
- article{paech2023eq,
199
- title={Eq-bench: An emotional intelligence benchmark for large language models},
200
- author={Paech, Samuel J},
201
- journal={arXiv preprint arXiv:2312.06281},
202
- year={2023}
203
  }
204
-
205
-
206
- @misc{zhou2023instructionfollowingevaluationlargelanguage,
207
- title={Instruction-Following Evaluation for Large Language Models},
208
- author={Jeffrey Zhou and Tianjian Lu and Swaroop Mishra and Siddhartha Brahma and Sujoy Basu and Yi Luan and Denny Zhou and Le Hou},
209
- year={2023},
210
- eprint={2311.07911},
211
- archivePrefix={arXiv},
212
- primaryClass={cs.CL},
213
- url={https://arxiv.org/abs/2311.07911},
214
  }
215
-
216
- @article{lee2024kornat,
217
- title={KorNAT: LLM Alignment Benchmark for Korean Social Values and Common Knowledge},
218
- author={Lee, Jiyoung and Kim, Minwoo and Kim, Seungho and Kim, Junghwan and Won, Seunghyun and Lee, Hwaran and Choi, Edward},
219
- journal={arXiv preprint arXiv:2402.13605},
220
- year={2024}
221
  }
222
  """
 
 
 
1
  from src.display.utils import ModelType
2
 
 
3
 
4
+ TITLE = """<img src="https://upstage-open-ko-llm-leaderboard-logos.s3.ap-northeast-2.amazonaws.com/header_logo.png" style="width:30%;display:block;margin-left:auto;margin-right:auto">"""
5
+ BOTTOM_LOGO = """<img src="https://upstage-open-ko-llm-leaderboard-logos.s3.ap-northeast-2.amazonaws.com/footer_logo_240715.png" style="width:50%;display:block;margin-left:auto;margin-right:auto">"""
6
 
7
  INTRODUCTION_TEXT = f"""
8
+ 🚀 The Open Ko-LLM Leaderboard 🇰🇷 objectively evaluates the performance of Korean Large Language Models (LLMs).
 
 
9
 
10
+ When you submit a model on the "Submit here!" page, it is automatically evaluated. The GPUs used for evaluation are provided with the support of __[KT](https://cloud.kt.com/)__.
11
+ The data used for evaluation consists of datasets to assess reasoning, language understanding, hallucination, and commonsense.
12
+ The evaluation datasets are kept private and are used only during the evaluation process.
13
+ More detailed information about the benchmark dataset is provided on the “About” page.
14
 
15
+ This leaderboard is co-hosted by __[Upstage](https://www.upstage.ai)__ and __[NIA](https://www.nia.or.kr/site/nia_kor/main.do)__, which provides various Korean datasets through __[AI-Hub](https://aihub.or.kr)__, and is operated by __[Upstage](https://www.upstage.ai)__.
16
  """
17
 
18
  LLM_BENCHMARKS_TEXT = f"""
19
+ # Context
 
20
  While outstanding LLMs are being released at a competitive pace, most of them are centered on English and the English-speaking cultural sphere. We operate the Korean leaderboard, 🚀 Open Ko-LLM, to evaluate models that reflect the characteristics of the Korean language and Korean culture. Through this, we hope that users can conveniently use the leaderboard, participate, and contribute to the advancement of research in Korean.
21
 
22
+ ## Icons
23
+ {ModelType.PT.to_str(" : ")} model
24
+ {ModelType.IFT.to_str(" : ")} model
25
+ {ModelType.RL.to_str(" : ")} model
26
+ If there is no icon, it indicates that there is insufficient information about the model.
27
+ Please provide information about the model through an issue! 🤩
28
 
29
+ 🏴‍☠️ : This icon indicates that the model has been selected as a subject of caution by the community, implying that users should exercise restraint when using it. Clicking on the icon will take you to a discussion about that model.
30
+ (Models that have used the evaluation set for training to achieve a high leaderboard ranking, among others, are selected as subjects of caution.)
31
 
32
+ ## How it works
33
 
34
+ 📈 We evaluate models using the [Eleuther AI Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness), a unified framework to test generative language models on a large number of different evaluation tasks.
35
 
36
+ We have set up a benchmark using datasets translated into Korean, with variations applied by human experts, based on the six tasks (HellaSwag, MMLU, Arc, Truthful QA, Winogrande, GSM8k) operated by the __HuggingFace [Open LLM Leaderboard](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard)__. We have also added new datasets prepared from scratch.
37
+ - Ko-HellaSwag (provided by __[Upstage](https://www.upstage.ai/)__, machine translation)
38
+ - Ko-MMLU (provided by __[Upstage](https://www.upstage.ai/)__, human translation and variation)
39
+ - Ko-Arc (provided by __[Upstage](https://www.upstage.ai/)__, human translation and variation)
40
+ - Ko-Truthful QA (provided by __[Upstage](https://www.upstage.ai/)__, human translation and variation)
41
+ - Ko-Winogrande (provided by __[Flitto](https://www.flitto.com/portal/en)__, human translation and variation)
42
+ - Ko-GSM8k (provided by __[Flitto](https://www.flitto.com/portal/en)__, human translation and variation)
43
+ - Ko-CommonGen V2 (provided by __[Korea University NLP&AI Lab](http://nlp.korea.ac.kr/)__, created from scratch)
44
+ - Ko-EQ Bench (provided by __[Flitto](https://www.flitto.com/portal/en)__, human translation and variation)
45
+ - Ko-InstFollow (provided by __[Flitto](https://www.flitto.com/portal/en)__, human translation and variation)
46
+ - KorNAT-CKA (provided by __[SELECTSTAR](https://selectstar.ai/ko/)__ and __[KAIST AI](https://gsai.kaist.ac.kr/?lang=ko&ckattempt=1)__, created from scratch)
47
+ - KorNAT-SVA (provided by __[SELECTSTAR](https://selectstar.ai/ko/)__ and __[KAIST AI](https://gsai.kaist.ac.kr/?lang=ko&ckattempt=1)__, created from scratch)
48
+ - Ko-Harmlessness (provided by __[SELECTSTAR](https://selectstar.ai/ko/)__ and __[KAIST AI](https://gsai.kaist.ac.kr/?lang=ko&ckattempt=1)__, created from scratch)
49
+ - Ko-Helpfulness (provided by __[SELECTSTAR](https://selectstar.ai/ko/)__ and __[KAIST AI](https://gsai.kaist.ac.kr/?lang=ko&ckattempt=1)__, created from scratch)
50
 
51
+ To provide an evaluation befitting the LLM era, we've selected benchmark datasets suitable for assessing these elements: expertise, inference, hallucination, truthfulness, and common sense. The final score is the average of the scores from each evaluation dataset.
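For illustration, a minimal sketch of that averaging, assuming made-up per-benchmark scores and mirroring how `EvalResult.to_dict` in `src/leaderboard/read_evals.py` excludes benchmarks that have not been scored yet:

```python
# Made-up per-benchmark scores for one model; 0.0 marks a benchmark that has
# not been evaluated yet and is therefore excluded from the denominator.
results = {
    "Ko-HellaSwag": 62.4,
    "Ko-MMLU": 41.7,
    "Ko-Arc": 45.2,
    "Ko-Winogrande": 0.0,
}

scored = [v for v in results.values() if v]  # drop unevaluated (0.0) entries
average = sum(scored) / len(scored)          # final leaderboard score
print(f"Average over {len(scored)} benchmarks: {average:.2f}")
```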
52
 
53
+ GPUs are provided by __[KT](https://cloud.kt.com/)__ for the evaluations.
54
 
55
+ ## Details and Logs
56
+ - Detailed numerical results in the `results` Upstage dataset: https://huggingface.co/datasets/open-ko-llm-leaderboard/results
57
+ - Community queries and running status in the `requests` Upstage dataset: https://huggingface.co/datasets/open-ko-llm-leaderboard/requests
58
 
59
  ## More resources
60
+ If you still have questions, you can check our FAQ [here](https://huggingface.co/spaces/upstage/open-ko-llm-leaderboard/discussions/1)!
 
61
  """
62
 
63
 
 
66
 
67
 
68
  EVALUATION_QUEUE_TEXT = f"""
69
+ # Evaluation Queue for the 🚀 Open Ko-LLM Leaderboard
70
+ Models added here will be automatically evaluated on the KT GPU cluster.
71
 
72
+ ## <Some good practices before submitting a model>
73
 
74
+ ### 1️⃣ Make sure you can load your model and tokenizer using AutoClasses
75
+ ```python
76
  from transformers import AutoConfig, AutoModel, AutoTokenizer
77
  config = AutoConfig.from_pretrained("your model name", revision=revision)
78
  model = AutoModel.from_pretrained("your model name", revision=revision)
79
  tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
80
  ```
81
 
82
+ If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
83
 
84
+ ⚠️ Make sure your model is public!
 
 
85
 
86
+ ⚠️ Make sure your model runs with the [Eleuther AI Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness)
87
 
88
+ ⚠️ If your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
89
 
90
+ ### 2️⃣ Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
91
+ It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
92
 
93
+ ### 3️⃣ Make sure your model has an open license!
94
+ This is a leaderboard for 🚀 Open Ko-LLMs, and we'd love for as many people as possible to know they can use your model
 
95
 
96
+ ### 4️⃣ Fill up your model card
97
  When we add extra information about models to the leaderboard, it will be automatically taken from the model card
98
 
99
+ ## In case of model failure
100
+ If your model is displayed in the `FAILED` category, its execution stopped. Make sure you have followed the above steps first. If everything is done, check that you can run the [Eleuther AI Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness) on your model locally (you can add `--limit` to limit the number of examples per task).
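A hedged sketch of such a local sanity check through the harness's Python API; `simple_evaluate` is the entry point of recent harness releases, and the public `hellaswag` task below merely stands in for the private Korean tasks:

```python
import lm_eval

# Quick local check that the harness can load and run the model at all;
# the model name is a placeholder and `limit` keeps the run short.
results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=your-org/your-model,dtype=float16",
    tasks=["hellaswag"],
    limit=10,
)
print(results["results"])
```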
101
  """
102
 
103
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results. Authors of open-ko-llm-leaderboard are ordered alphabetically."
 
106
  title={Open Ko-LLM Leaderboard: Evaluating Large Language Models in Korean with Ko-H5 Benchmark},
107
  author={Chanjun Park and Hyeonwoo Kim and Dahyun Kim and Seonghwan Cho and Sanghoon Kim and Sukyung Lee and Yungi Kim and Hwalsuk Lee},
108
  year={2024},
109
+ booktitle={ACL Main}
110
  }
 
 
111
  @software{eval-harness,
112
  author = {Gao, Leo and
113
  Tow, Jonathan and
 
132
  publisher = {Zenodo},
133
  version = {v0.0.1},
134
  doi = {10.5281/zenodo.5371628},
135
+ url = {https://doi.org/10.5281/zenodo.5371628}
136
  }
137
+ @misc{seo2023kocommongen,
138
+ title={Korean Commonsense Reasoning Evaluation for Large Language Models},
139
+ author={Jaehyung Seo, Chanjun Park, Hyeonseok Moon, Sugyeong Eo, Aram So, Heuiseok Lim},
140
+ year={2023},
141
+ affiliation={Korea University, NLP&AI},
142
+ booktitle={Proceedings of the 35th Annual Conference on Human & Cognitive Language Technology}}
143
+ @misc{park2023koarc,
144
+ title={Ko-ARC},
145
+ original_title={Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge},
146
+ author={Hyunbyung Park, Chanjun Park},
147
+ original_author={Peter Clark and Isaac Cowhey and Oren Etzioni and Tushar Khot and Ashish Sabharwal and Carissa Schoenick and Oyvind Tafjord},
148
+ year={2023}
149
  }
150
+ @misc{park2023kohellaswag,
151
+ title={Ko-HellaSwag},
152
+ original_title={HellaSwag: Can a Machine Really Finish Your Sentence?},
153
+ author={Hyunbyung Park, Chanjun Park},
154
+ original_author={Rowan Zellers and Ari Holtzman and Yonatan Bisk and Ali Farhadi and Yejin Choi},
155
+ year={2023}
156
  }
157
+ @misc{park2023kommlu,
158
+ title={Ko-MMLU},
159
+ original_title={Measuring Massive Multitask Language Understanding},
160
+ author={Hyunbyung Park, Chanjun Park},
161
+ original_author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
162
+ year={2023}
163
  }
164
+ @misc{park2023kotruthfulqa,
165
+ title={Ko-TruthfulQA},
166
+ original_title={TruthfulQA: Measuring How Models Mimic Human Falsehoods},
167
+ author={Hyunbyung Park, Chanjun Park},
168
+ original_author={Stephanie Lin and Jacob Hilton and Owain Evans},
169
+ year={2023}
170
  }
171
  """
src/display/formatting.py CHANGED
@@ -14,9 +14,10 @@ def model_hyperlink(link, model_name):
14
  def make_clickable_model(model_name):
15
  link = f"https://huggingface.co/{model_name}"
16
 
17
- # details_model_name = model_name.replace("/", "__")
18
- # details_link = f"https://huggingface.co/datasets/open-ko-llm-leaderboard/details_{details_model_name}"
19
- return model_hyperlink(link, model_name) # + " " + model_hyperlink(details_link, "📑")
 
20
 
21
 
22
  def styled_error(error):
 
14
  def make_clickable_model(model_name):
15
  link = f"https://huggingface.co/{model_name}"
16
 
17
+ details_model_name = model_name.replace("/", "__")
18
+ details_link = f"https://huggingface.co/datasets/open-ko-llm-leaderboard/details_{details_model_name}"
19
+
20
+ return model_hyperlink(link, model_name) + " " + model_hyperlink(details_link, "📑")
21
 
22
 
23
  def styled_error(error):
src/display/host_sponsor.png DELETED
Binary file (131 kB)
 
src/display/main_logo.png DELETED
Binary file (345 kB)
 
src/display/utils.py CHANGED
@@ -14,15 +14,19 @@ class Task:
14
  col_name: str
15
 
16
  class Tasks(Enum):
17
- gpqa = Task("ko_gpqa_diamond_zeroshot", "acc_norm,none", "Ko-GPQA")
18
- winogrande = Task("ko_winogrande", "acc,none", "Ko-Winogrande")
19
- gsm8k = Task("ko_gsm8k", "exact_match,strict-match", "Ko-GSM8k")
20
- eqBench = Task("ko_eqbench", "eqbench,none", "Ko-EQ Bench")
21
- instFollow = Task("ko_ifeval", "strict_acc,none", "Ko-IFEval")
22
- korNatCka = Task("kornat_common", "acc_norm,none", "KorNAT-CKA")
23
- korNatSva = Task("kornat_social", "A-SVA,none", "KorNAT-SVA")
24
- harmlessness = Task("kornat_harmless", "acc_norm,none", "Ko-Harmlessness")
25
- helpfulness = Task("kornat_helpful", "acc_norm,none", "Ko-Helpfulness")
 
 
 
 
26
 
27
 
28
  # These classes are for user facing column names,
@@ -85,30 +89,26 @@ class ModelDetails:
85
 
86
  class ModelType(Enum):
87
  PT = ModelDetails(name="pretrained", symbol="🟢")
88
- CPT = ModelDetails(name="continuously pretrained", symbol="🟩")
89
- FT = ModelDetails(name="fine-tuned on domain-specific datasets", symbol="🔶")
90
- chat = ModelDetails(name="chat models (RLHF, DPO, IFT, ...)", symbol="💬")
91
- merges = ModelDetails(name="base merges and moerges", symbol="🤝")
92
- Unknown = ModelDetails(name="other", symbol="❓")
93
 
94
  def to_str(self, separator=" "):
95
  return f"{self.value.symbol}{separator}{self.value.name}"
96
 
97
  @staticmethod
98
- def from_str(m_type):
99
- if any([k for k in m_type if k in ["fine-tuned","🔶", "finetuned"]]):
100
- return ModelType.FT
101
- if "continuously pretrained" in m_type or "🟩" in m_type:
102
- return ModelType.CPT
103
- if "pretrained" in m_type or "🟢" in m_type:
104
  return ModelType.PT
105
- if any([k in m_type for k in ["instruction-tuned", "RL-tuned", "chat", "🟦", "⭕", "💬"]]):
106
- return ModelType.chat
107
- if "merge" in m_type or "🤝" in m_type:
108
- return ModelType.merges
109
  return ModelType.Unknown
110
 
111
-
112
  class WeightType(Enum):
113
  Adapter = ModelDetails("Adapter")
114
  Original = ModelDetails("Original")
@@ -116,13 +116,12 @@ class WeightType(Enum):
116
 
117
  class Precision(Enum):
118
  float16 = ModelDetails("float16")
119
- bfloat16 = ModelDetails("bfloat16")
120
- qt_8bit = ModelDetails("8bit")
121
- qt_4bit = ModelDetails("4bit")
122
- qt_GPTQ = ModelDetails("GPTQ")
123
  Unknown = ModelDetails("?")
124
 
125
- @staticmethod
126
  def from_str(precision):
127
  if precision in ["torch.float16", "float16"]:
128
  return Precision.float16
@@ -135,10 +134,15 @@ class Precision(Enum):
135
  if precision in ["GPTQ", "None"]:
136
  return Precision.qt_GPTQ
137
  return Precision.Unknown
 
 
 
138
 
139
  # Column selection
140
- COLS = [c.name for c in fields(AutoEvalColumn)]
141
- TYPES = [c.type for c in fields(AutoEvalColumn)]
 
 
142
 
143
  EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
144
  EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
@@ -153,4 +157,4 @@ NUMERIC_INTERVALS = {
153
  "13~35B": pd.Interval(13, 35, closed="right"),
154
  "35~60B": pd.Interval(35, 60, closed="right"),
155
  "60B+": pd.Interval(60, 10000, closed="right"),
156
- }
 
14
  col_name: str
15
 
16
  class Tasks(Enum):
17
+ arc = Task("ko_arc_challenge", "acc_norm", "Ko-ARC")
18
+ hellaswag = Task("ko_hellaswag", "acc_norm", "Ko-HellaSwag")
19
+ mmlu = Task("ko_mmlu", "acc", "Ko-MMLU")
20
+ truthfulqa = Task("ko_truthfulqa_mc", "mc2", "Ko-TruthfulQA")
21
+ winogrande = Task("ko_winogrande", "acc_norm", "Ko-Winogrande")
22
+ gsm8k = Task("ko_gsm8k", "acc_norm", "Ko-GSM8k")
23
+ commongen_v2 = Task("ko_commongen_v2", "acc_norm", "Ko-CommonGen V2")
24
+ eqBench = Task("ko_eq_bench", "acc_norm", "Ko-EQ Bench")
25
+ instFollow = Task("ko_inst_follow", "acc_norm", "Ko-InstFollow")
26
+ korNatCka = Task("kor_nat_cka", "acc_norm", "KorNAT-CKA")
27
+ korNatSva = Task("kor_nat_sva", "acc_norm", "KorNAT-SVA")
28
+ harmlessness = Task("ko_harmlessness", "acc_norm", "Ko-Harmlessness")
29
+ helpfulness = Task("ko_helpfulness", "acc_norm", "Ko-Helpfulness")
30
 
31
 
32
  # These classes are for user facing column names,
 
89
 
90
  class ModelType(Enum):
91
  PT = ModelDetails(name="pretrained", symbol="🟢")
92
+ # FT = ModelDetails(name="fine-tuned", symbol="🔶")
93
+ IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
94
+ RL = ModelDetails(name="RL-tuned", symbol="🟦")
95
+ Unknown = ModelDetails(name="", symbol="?")
 
96
 
97
  def to_str(self, separator=" "):
98
  return f"{self.value.symbol}{separator}{self.value.name}"
99
 
100
  @staticmethod
101
+ def from_str(type):
102
+ # if "fine-tuned" in type or "🔶" in type:
103
+ # return ModelType.FT
104
+ if "pretrained" in type or "🟢" in type:
 
 
105
  return ModelType.PT
106
+ if "RL-tuned" in type or "🟦" in type:
107
+ return ModelType.RL
108
+ if "instruction-tuned" in type or "" in type:
109
+ return ModelType.IFT
110
  return ModelType.Unknown
111
 
 
112
  class WeightType(Enum):
113
  Adapter = ModelDetails("Adapter")
114
  Original = ModelDetails("Original")
 
116
 
117
  class Precision(Enum):
118
  float16 = ModelDetails("float16")
119
+ # bfloat16 = ModelDetails("bfloat16")
120
+ # qt_8bit = ModelDetails("8bit")
121
+ # qt_4bit = ModelDetails("4bit")
122
+ # qt_GPTQ = ModelDetails("GPTQ")
123
  Unknown = ModelDetails("?")
124
 
 
125
  def from_str(precision):
126
  if precision in ["torch.float16", "float16"]:
127
  return Precision.float16
 
134
  if precision in ["GPTQ", "None"]:
135
  return Precision.qt_GPTQ
136
  return Precision.Unknown
137
+
138
+
139
+
140
 
141
  # Column selection
142
+ COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
143
+ TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
144
+ COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
145
+ TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
146
 
147
  EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
148
  EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
157
  "13~35B": pd.Interval(13, 35, closed="right"),
158
  "35~60B": pd.Interval(35, 60, closed="right"),
159
  "60B+": pd.Interval(60, 10000, closed="right"),
160
+ }
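For context, a hedged sketch of how these `pd.Interval` buckets are applied to a parameter-count column, mirroring the `filter_models` snippet in `app.py` above; the data frame, column name, and sample values are illustrative:

```python
import pandas as pd

# Subset of the buckets defined above.
NUMERIC_INTERVALS = {
    "0~3B": pd.Interval(0, 3, closed="right"),
    "3~13B": pd.Interval(3, 13, closed="right"),
    "13~35B": pd.Interval(13, 35, closed="right"),
}

df = pd.DataFrame({"model": ["a", "b", "c"], "#Params (B)": [1.3, 7.0, 30.0]})
selected = ["3~13B", "13~35B"]  # sizes ticked in the UI filter

numeric_interval = pd.IntervalIndex([NUMERIC_INTERVALS[s] for s in selected])
params_column = pd.to_numeric(df["#Params (B)"], errors="coerce")
mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
print(df.loc[mask])  # keeps models "b" and "c"
```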
src/leaderboard/read_evals.py CHANGED
@@ -48,7 +48,7 @@ class EvalResult:
48
  precision = Precision.from_str(config.get("model_dtype"))
49
 
50
  # Get model and org
51
- org_and_model = config.get("model_name", None)
52
  org_and_model = org_and_model.split("/", 1)
53
 
54
  if len(org_and_model) == 1:
@@ -96,18 +96,26 @@ class EvalResult:
96
  results = {}
97
  for task in Tasks:
98
  task = task.value
99
- if task.benchmark in ["ko_ifeval"]:
100
- ko_ifeval = data["results"]["ko_ifeval"]
101
- accs = np.mean([ko_ifeval["prompt_level_strict_acc,none"], ko_ifeval["inst_level_strict_acc,none"]])
102
- mean_acc = np.mean(accs) * 100.0
103
- if task.benchmark in ["ko_winogrande", "ko_gsm8k", "ko_eqbench", "kornat_common", "kornat_social", "kornat_harmless", "kornat_helpful", "ko_gpqa_diamond_zeroshot"]:
104
- accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
 
 
 
 
105
  if accs.size == 0 or any([acc is None for acc in accs]):
 
106
  continue
107
- if task.benchmark not in ["ko_eqbench"]:
108
- mean_acc = accs[0] * 100.0
109
- else:
110
- mean_acc = accs[0]
 
 
 
111
  results[task.benchmark] = mean_acc
112
 
113
  return self(
@@ -143,7 +151,27 @@ class EvalResult:
143
  def to_dict(self):
144
  """Converts the Eval Result to a dict compatible with our dataframe display"""
145
 
146
- average = sum([v for v in self.results.values() if v is not None]) / (len(Tasks))
147
 
148
  data_dict = {
149
  "eval_name": self.eval_name, # not a column, just a save name,
 
48
  precision = Precision.from_str(config.get("model_dtype"))
49
 
50
  # Get model and org
51
+ org_and_model = config.get("model_name", config.get("model_args", None))
52
  org_and_model = org_and_model.split("/", 1)
53
 
54
  if len(org_and_model) == 1:
 
96
  results = {}
97
  for task in Tasks:
98
  task = task.value
99
+
100
+ # Some truthfulQA values are NaNs
101
+ if task.benchmark == "truthfulqa:mc" and "harness|truthfulqa:mc|0" in data["results"]:
102
+ if math.isnan(float(data["results"]["harness|truthfulqa:mc|0"][task.metric])):
103
+ results[task.benchmark] = 0.0
104
+ continue
105
+
106
+ # New tasks have been added, we need to skip them if not exists
107
+ if task.benchmark in ["ko_winogrande", "ko_gsm8k", "ko_eq_bench", "ko_inst_follow", "kor_nat_cka", "kor_nat_sva", "ko_harmlessness", "ko_helpfulness"]:
108
+ accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
109
  if accs.size == 0 or any([acc is None for acc in accs]):
110
+ results[task.benchmark] = 0.0
111
  continue
112
+
113
+ # We average all scores of a given metric (mostly for mmlu)
114
+ accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
115
+ if accs.size == 0 or any([acc is None for acc in accs]):
116
+ continue
117
+
118
+ mean_acc = np.mean(accs) * 100.0
119
  results[task.benchmark] = mean_acc
120
 
121
  return self(
 
151
  def to_dict(self):
152
  """Converts the Eval Result to a dict compatible with our dataframe display"""
153
 
154
+ # Skip the new tasks for now
155
+ # TODO: safely remove this code when the task results are all added
156
+ skip_avg_len = 0
157
+ if self.results['ko_winogrande'] == 0.0:
158
+ skip_avg_len += 1
159
+ if self.results['ko_gsm8k'] == 0.0:
160
+ skip_avg_len += 1
161
+ if self.results['ko_eq_bench'] == 0.0:
162
+ skip_avg_len += 1
163
+ if self.results['ko_inst_follow'] == 0.0:
164
+ skip_avg_len += 1
165
+ if self.results['kor_nat_cka'] == 0.0:
166
+ skip_avg_len += 1
167
+ if self.results['kor_nat_sva'] == 0.0:
168
+ skip_avg_len += 1
169
+ if self.results['ko_harmlessness'] == 0.0:
170
+ skip_avg_len += 1
171
+ if self.results['ko_helpfulness'] == 0.0:
172
+ skip_avg_len += 1
173
+
174
+ average = sum([v for v in self.results.values() if v is not None]) / (len(Tasks) - skip_avg_len)
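A sketch of an equivalent, more compact way to get the same denominator, assuming the task keys from the `Tasks` enum above and made-up scores for the example call:

```python
NEW_TASKS = [
    "ko_winogrande", "ko_gsm8k", "ko_eq_bench", "ko_inst_follow",
    "kor_nat_cka", "kor_nat_sva", "ko_harmlessness", "ko_helpfulness",
]

def average_score(results: dict, total_tasks: int) -> float:
    # Count the new tasks whose scores are still missing (stored as 0.0)
    # and average only over the remaining benchmarks.
    skipped = sum(1 for t in NEW_TASKS if results.get(t) == 0.0)
    scored = [v for v in results.values() if v is not None]
    return sum(scored) / (total_tasks - skipped)

print(average_score({"ko_mmlu": 40.0, "ko_winogrande": 0.0}, total_tasks=13))
```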
175
 
176
  data_dict = {
177
  "eval_name": self.eval_name, # not a column, just a save name,
src/submission/submit.py CHANGED
@@ -1,7 +1,6 @@
1
  import json
2
  import os
3
  from datetime import datetime, timezone
4
- import pandas as pd
5
 
6
  from src.display.formatting import styled_error, styled_message, styled_warning
7
  from src.envs import API, EVAL_REQUESTS_PATH, H4_TOKEN, QUEUE_REPO, RATE_LIMIT_PERIOD, RATE_LIMIT_QUOTA
@@ -13,7 +12,6 @@ from src.submission.check_validity import (
13
  is_model_on_hub,
14
  user_submission_permission,
15
  )
16
- from src.populate import get_evaluation_queue_df
17
 
18
  REQUESTED_MODELS = None
19
  USERS_TO_SUBMISSION_DATES = None
@@ -40,7 +38,10 @@ def add_new_eval(
40
 
41
  precision = precision.split(" ")[0]
42
  current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
43
-
 
 
 
44
  if model_type is None or model_type == "":
45
  return styled_error("Please select a model type.")
46
 
@@ -99,9 +100,6 @@ def add_new_eval(
99
 
100
  # Seems good, creating the eval
101
  print("Adding new eval")
102
- # dfs = get_evaluation_queue_df(EVAL_REQUESTS_PATH, cols=["job_id"])
103
- # dfs = pd.concat(dfs).reset_index(drop=True)
104
- # max_job_id = max([int(c) for c in dfs["job_id"].values])
105
 
106
  eval_entry = {
107
  "model": model,
@@ -116,7 +114,6 @@ def add_new_eval(
116
  "likes": model_info.likes,
117
  "params": model_size,
118
  "license": license,
119
- # "job_id": max_job_id+1
120
  }
121
 
122
  # Check for duplicate submission
 
1
  import json
2
  import os
3
  from datetime import datetime, timezone
 
4
 
5
  from src.display.formatting import styled_error, styled_message, styled_warning
6
  from src.envs import API, EVAL_REQUESTS_PATH, H4_TOKEN, QUEUE_REPO, RATE_LIMIT_PERIOD, RATE_LIMIT_QUOTA
 
12
  is_model_on_hub,
13
  user_submission_permission,
14
  )
 
15
 
16
  REQUESTED_MODELS = None
17
  USERS_TO_SUBMISSION_DATES = None
 
38
 
39
  precision = precision.split(" ")[0]
40
  current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
41
+ # Leaderboard closed (리더보드 종료)
42
+ if True:
43
+ return styled_error("The current Season 1 will conclude on Friday, August 2, and the new season will commence on August 12.")
44
+
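As an aside, a hedged sketch of expressing the hard-coded `if True:` gate as a date check; `submissions_closed` is a hypothetical helper and the year is an assumption (the message itself only gives month and day):

```python
from datetime import datetime, timezone

# Hypothetical reopening date: Season 2 start on August 12 (year assumed).
REOPEN_DATE = datetime(2024, 8, 12, tzinfo=timezone.utc)

def submissions_closed() -> bool:
    """True while the leaderboard is paused between seasons."""
    return datetime.now(timezone.utc) < REOPEN_DATE

if __name__ == "__main__":
    print("closed" if submissions_closed() else "open")
```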
45
  if model_type is None or model_type == "":
46
  return styled_error("Please select a model type.")
47
 
 
100
 
101
  # Seems good, creating the eval
102
  print("Adding new eval")
 
 
 
103
 
104
  eval_entry = {
105
  "model": model,
 
114
  "likes": model_info.likes,
115
  "params": model_size,
116
  "license": license,
 
117
  }
118
 
119
  # Check for duplicate submission