Spaces: Running on CPU Upgrade
pr/87 (#87) by choco9966 - opened

Files changed:
- app.py +42 -20
- requirements.txt +9 -14
- src/display/about.py +87 -138
- src/display/formatting.py +4 -3
- src/display/host_sponsor.png +0 -0
- src/display/main_logo.png +0 -0
- src/display/utils.py +37 -33
- src/leaderboard/read_evals.py +40 -12
- src/submission/submit.py +4 -7
app.py
CHANGED
@@ -2,7 +2,7 @@ import gradio as gr
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import snapshot_download
-from gradio_space_ci
+from gradio_space_ci import configure_space_ci  # FOR CI

 from src.display.about import (
     CITATION_BUTTON_LABEL,
@@ -32,6 +32,11 @@ from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, H4_TOKEN, IS_PU
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
 from src.tools.collections import update_collections
+from src.tools.plots import (
+    create_metric_plot_obj,
+    create_plot_df,
+    create_scores_df,
+)


 def restart_space():
@@ -58,6 +63,8 @@ if REPO_ID == "upstage/open-ko-llm-leaderboard": # update only when it's from re
     update_collections(original_df.copy())
 leaderboard_df = original_df.copy()

+plot_df = create_plot_df(create_scores_df(raw_data))
+
 (
     finished_eval_queue_df,
     running_eval_queue_df,
@@ -148,6 +155,7 @@ def filter_models(
         params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
         mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
         filtered_df = filtered_df.loc[mask]
+
     return filtered_df

 leaderboard_df = filter_models(leaderboard_df, [t.to_str(" : ") for t in ModelType], list(NUMERIC_INTERVALS.keys()), [i.value.name for i in Precision], False, False, False)
@@ -291,13 +299,28 @@ with demo:
                 leaderboard_table,
                 queue=True,
             )
-
+
+        with gr.TabItem("📈 Metrics through time", elem_id="llm-benchmark-tab-table", id=4):
+            with gr.Row():
+                with gr.Column():
+                    chart = create_metric_plot_obj(
+                        plot_df,
+                        [AutoEvalColumn.average.name],
+                        title="Average of Top Scores Over Time (from last update)",
+                    )
+                    gr.Plot(value=chart, min_width=500)
+                with gr.Column():
+                    chart = create_metric_plot_obj(
+                        plot_df,
+                        BENCHMARK_COLS,
+                        title="Top Scores Over Time (from last update)",
+                    )
+                    gr.Plot(value=chart, min_width=500)
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
             gr.Markdown(FAQ_TEXT, elem_classes="markdown-text")

-
-        with gr.TabItem("Submission Info", elem_id="llm-benchmark-tab-table", id=3):
+        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
             with gr.Column():
                 with gr.Row():
                     gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
@@ -360,7 +383,7 @@ with demo:
                         choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
                         label="Model type",
                         multiselect=False,
-                        value=ModelType.
+                        value=ModelType.IFT.to_str(" : "),
                         interactive=True,
                     )
@@ -381,22 +404,21 @@ with demo:
                     )
                     base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")

-
-            submit_button = gr.Button("We are no longer accepting submissions.", interactive=False)
+            submit_button = gr.Button("Submit Evaluation!")
             submission_result = gr.Markdown()
-
-
-
-
-
-
-
-
-
-
-
-
-
+            submit_button.click(
+                add_new_eval,
+                [
+                    model_name_textbox,
+                    base_model_name_textbox,
+                    revision_name_textbox,
+                    precision,
+                    private,
+                    weight_type,
+                    model_type,
+                ],
+                submission_result,
+            )

     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
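Note: the restored submission tab is plain Gradio event wiring, with `Button.click(fn, inputs, outputs)` passing each input component's value to `add_new_eval` positionally and rendering the return value into `submission_result`. A minimal, self-contained sketch of the same pattern; the stub handler and the reduced input list are illustrative, not the Space's actual code:

```python
import gradio as gr

def add_new_eval_stub(model_name: str, base_model: str) -> str:
    # Stand-in for add_new_eval in src/submission/submit.py: inputs arrive
    # positionally, and the returned string renders into the output component.
    return f"Queued `{model_name}` (base: `{base_model or 'n/a'}`) for evaluation."

with gr.Blocks() as demo:
    model_name_textbox = gr.Textbox(label="Model name")
    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
    submit_button = gr.Button("Submit Evaluation!")
    submission_result = gr.Markdown()
    # Same wiring shape as the PR: handler, list of input components, output component.
    submit_button.click(add_new_eval_stub, [model_name_textbox, base_model_name_textbox], submission_result)

demo.launch()
```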
requirements.txt
CHANGED
@@ -2,22 +2,17 @@ APScheduler==3.10.1
 black==23.11.0
 click==8.1.3
 datasets==2.14.5
-
-
-
-
+gradio==4.19.2
+gradio_client==0.10.1
+huggingface-hub>=0.18.0
+matplotlib==3.7.1
+numpy==1.24.2
+pandas==2.0.0
 plotly==5.14.1
 python-dateutil==2.8.2
+requests==2.28.2
 sentencepiece
 tqdm==4.65.0
-transformers==4.
+transformers==4.38.2
 tokenizers>=0.15.0
-gradio-space-ci @ git+https://huggingface.co/spaces/Wauplin/[email protected]
-isort
-ruff
-gradio==4.31.0
-gradio[oauth]
-gradio_leaderboard==0.0.11
-requests==2.31.0
-requests-oauthlib== 1.3.1
-schedule == 1.2.2
+gradio-space-ci @ git+https://huggingface.co/spaces/Wauplin/gradio-space-ci@0.1.2  # CI !!!
src/display/about.py
CHANGED
@@ -1,60 +1,63 @@
-import os
-import base64
 from src.display.utils import ModelType

-current_dir = os.path.dirname(os.path.realpath(__file__))
-
-
-with open(os.path.join(current_dir, "host_sponsor.png"), "rb") as image_file:
-    host_sponsor = base64.b64encode(image_file.read()).decode('utf-8')
-
-TITLE = f"""<img src="data:image/jpeg;base64,{main_logo}" style="width:30%;display:block;margin-left:auto;margin-right:auto">"""
-BOTTOM_LOGO = f"""<img src="data:image/jpeg;base64,{host_sponsor}" style="width:75%;display:block;margin-left:auto;margin-right:auto">"""
+TITLE = """<img src="https://upstage-open-ko-llm-leaderboard-logos.s3.ap-northeast-2.amazonaws.com/header_logo.png" style="width:30%;display:block;margin-left:auto;margin-right:auto">"""
+BOTTOM_LOGO = """<img src="https://upstage-open-ko-llm-leaderboard-logos.s3.ap-northeast-2.amazonaws.com/footer_logo_240715.png" style="width:50%;display:block;margin-left:auto;margin-right:auto">"""

 INTRODUCTION_TEXT = f"""
-The
-
-🚀 The Open Ko-LLM Leaderboard2 🇰🇷 objectively evaluates the performance of Korean Large Language Model (LLM). When you submit a model on the "Submit here!" page, it is automatically evaluated.
-
+🚀 The Open Ko-LLM Leaderboard 🇰🇷 objectively evaluates the performance of Korean Large Language Model (LLM).
+
+When you submit a model on the "Submit here!" page, it is automatically evaluated. The GPU used for evaluation is operated with the support of __[KT](https://cloud.kt.com/)__.
+The data used for evaluation consists of datasets to assess reasoning, language understanding, hallucination, and commonsense.
+The evaluation dataset is exclusively private and only available for evaluation process.
+More detailed information about the benchmark dataset is provided on the “About” page.
+
+This leaderboard is co-hosted by __[Upstage](https://www.upstage.ai)__, and __[NIA](https://www.nia.or.kr/site/nia_kor/main.do)__ that provides various Korean Data Sets through __[AI-Hub](https://aihub.or.kr)__, and operated by __[Upstage](https://www.upstage.ai)__.
 """

 LLM_BENCHMARKS_TEXT = f"""
-#
-
+# Context
 While outstanding LLM models are being released competitively, most of them are centered on English and are familiar with the English cultural sphere. We operate the Korean leaderboard, 🚀 Open Ko-LLM, to evaluate models that reflect the characteristics of the Korean language and Korean culture. Through this, we hope that users can conveniently use the leaderboard, participate, and contribute to the advancement of research in Korean.

-##
+## Icons
+{ModelType.PT.to_str(" : ")} model
+{ModelType.IFT.to_str(" : ")} model
+{ModelType.RL.to_str(" : ")} model
+If there is no icon, it indicates that there is insufficient information about the model.
+Please provide information about the model through an issue! 🤩

+🏴‍☠️ : This icon indicates that the model has been selected as a subject of caution by the community, implying that users should exercise restraint when using it. Clicking on the icon will take you to a discussion about that model.
+(Models that have used the evaluation set for training to achieve a high leaderboard ranking, among others, are selected as subjects of caution.)

+## How it works

+📈 We evaluate models using the [Eleuther AI Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness), a unified framework to test generative language models on a large number of different evaluation tasks.

-
-- Ko-WinoGrande (provided by [Flitto](https://www.flitto.com/portal/en))
-- Ko-GSM8K (provided by [Flitto](https://www.flitto.com/portal/en))
-- Ko-EQ-Bench (provided by [Flitto](https://www.flitto.com/portal/en))
-- Ko-IFEval (provided by [Flitto](https://www.flitto.com/portal/en))
-- KorNAT-Knowledge (provided by [SELECTSTAR](https://selectstar.ai/ko/) and [KAIST AI](https://gsai.kaist.ac.kr/?lang=ko&ckattempt=1))
-- KorNAT-Social-Value (provided by [SELECTSTAR](https://selectstar.ai/ko/) and [KAIST AI](https://gsai.kaist.ac.kr/?lang=ko&ckattempt=1))
-- Ko-Harmlessness (provided by [SELECTSTAR](https://selectstar.ai/ko/) and [KAIST AI](https://gsai.kaist.ac.kr/?lang=ko&ckattempt=1))
-- Ko-Helpfulness (provided by [SELECTSTAR](https://selectstar.ai/ko/) and [KAIST AI](https://gsai.kaist.ac.kr/?lang=ko&ckattempt=1))
+We have set up a benchmark using datasets translated into Korean, and applied variations by human experts, from the six tasks (HellaSwag, MMLU, Arc, Truthful QA, Winogrande, GSM8k) operated by __HuggingFace [Open LLM Leaderboard](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard)__. We have also added a new dataset prepared from scratch.
+- Ko-HellaSwag (provided by __[Upstage](https://www.upstage.ai/)__, machine translation)
+- Ko-MMLU (provided by __[Upstage](https://www.upstage.ai/)__, human translation and variation)
+- Ko-Arc (provided by __[Upstage](https://www.upstage.ai/)__, human translation and variation)
+- Ko-Truthful QA (provided by __[Upstage](https://www.upstage.ai/)__, human translation and variation)
+- Ko-Winogrande (provided by __[Flitto](https://www.flitto.com/portal/en)__, human translation and variation)
+- Ko-GSM8k (provided by __[Flitto](https://www.flitto.com/portal/en)__, human translation and variation)
+- Ko-CommonGen V2 (provided by __[Korea University NLP&AI Lab](http://nlp.korea.ac.kr/)__, created from scratch)
+- Ko-EQ Bench (provided by __[Flitto](https://www.flitto.com/portal/en)__, human translation and variation)
+- Ko-InstFollow (provided by __[Flitto](https://www.flitto.com/portal/en)__, human translation and variation)
+- KorNAT-CKA (provided by __[SELECTSTAR](https://selectstar.ai/ko/)__ and __[KAIST AI](https://gsai.kaist.ac.kr/?lang=ko&ckattempt=1)__, created from scratch)
+- KorNAT-SVA (provided by __[SELECTSTAR](https://selectstar.ai/ko/)__ and __[KAIST AI](https://gsai.kaist.ac.kr/?lang=ko&ckattempt=1)__, created from scratch)
+- Ko-Harmlessness (provided by __[SELECTSTAR](https://selectstar.ai/ko/)__ and __[KAIST AI](https://gsai.kaist.ac.kr/?lang=ko&ckattempt=1)__, created from scratch)
+- Ko-Helpfulness (provided by __[SELECTSTAR](https://selectstar.ai/ko/)__ and __[KAIST AI](https://gsai.kaist.ac.kr/?lang=ko&ckattempt=1)__, created from scratch)

+To provide an evaluation befitting the LLM era, we've selected benchmark datasets suitable for assessing these elements: expertise, inference, hallucination, truthfulness and common sense. The final score is converted to the average score from each evaluation datasets.

+GPUs are provided by __[KT](https://cloud.kt.com/)__ for the evaluations.

+## Details and Logs
+- Detailed numerical results in the `results` Upstage dataset: https://huggingface.co/datasets/open-ko-llm-leaderboard/results
+- Community queries and running status in the `requests` Upstage dataset: https://huggingface.co/datasets/open-ko-llm-leaderboard/requests

 ## More resources
-
-If you still have questions, you can check our FAQ [here](https://huggingface.co/spaces/upstage/open-ko-llm-leaderboard/discussions/1)!
+If you still have questions, you can check our FAQ [here](https://huggingface.co/spaces/upstage/open-ko-llm-leaderboard/discussions/1)!
 """
@@ -63,71 +66,38 @@ FAQ_TEXT = """


 EVALUATION_QUEUE_TEXT = f"""
-# Evaluation Queue for the
-
-Models added here will be automatically evaluated on the 🤗 cluster.
-
-## Submission Disclaimer
-
-
-- Your submission will be visible to the community and you may be contacted regarding your model.
-- Please submit carefully and responsibly 💛
-
-## First Steps Before Submitting a Model
-
-### 1. Ensure Your Model Loads with AutoClasses
-
-Verify that you can load your model and tokenizer using AutoClasses:
-
-```jsx
+# Evaluation Queue for the 🚀 Open Ko-LLM Leaderboard
+Models added here will be automatically evaluated on the KT GPU cluster.
+
+## <Some good practices before submitting a model>
+
+### 1️⃣ Make sure you can load your model and tokenizer using AutoClasses
+```python
 from transformers import AutoConfig, AutoModel, AutoTokenizer
 config = AutoConfig.from_pretrained("your model name", revision=revision)
 model = AutoModel.from_pretrained("your model name", revision=revision)
 tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
 ```

+If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.

-- Ensure your model is public.
-- We are working on adding support for models requiring `use_remote_code=True`.
+⚠️ Make sure your model is public!

+⚠️ Make sure your model runs with [Eleuther AI Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness)

+⚠️ If your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!

-###
-
+### 2️⃣ Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
+It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!

-### 4. Complete Your Model Card
+### 3️⃣ Make sure your model has an open license!
+This is a leaderboard for 🚀 Open Ko-LLMs, and we'd love for as many people as possible to know they can use your model
+
+### 4️⃣ Fill up your model card
 When we add extra information about models to the leaderboard, it will be automatically taken from the model card

-Choose the right precision to avoid evaluation errors:
-
-- Not all models convert properly from float16 to bfloat16.
-- Incorrect precision can cause issues (e.g., loading a bf16 model in fp16 may generate NaNs).
-
-> Important: When submitting, git branches and tags will be strictly tied to the specific commit present at the time of submission to ensure revision consistency.
->
-
-## Model types
-
-- 🟢 : 🟢 pretrained model: new, base models, trained on a given text corpora using masked modelling
-- 🟩 : 🟩 continuously pretrained model: new, base models, continuously trained on further corpus (which may include IFT/chat data) using masked modelling
-- 🔶 : 🔶 fine-tuned on domain-specific datasets model: pretrained models finetuned on more data
-- 💬 : 💬 chat models (RLHF, DPO, IFT, ...) model: chat like fine-tunes, either using IFT (datasets of task instruction), RLHF or DPO (changing the model loss a bit with an added policy), etc
-- 🤝 : 🤝 base merges and moerges model: merges or MoErges, models which have been merged or fused without additional fine-tuning.
-
-Please provide information about the model through an issue! 🤩
-
-🏴‍☠️ : 🏴‍☠️ This icon indicates that the model has been selected as a subject of caution by the community, implying that users should exercise restraint when using it. Clicking on the icon will take you to a discussion about that model. (Models that have used the evaluation set for training to achieve a high leaderboard ranking, among others, are selected as subjects of caution.)
+## In case of model failure
+If your model is displayed in the `FAILED` category, its execution stopped. Make sure you have followed the above steps first. If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
 """

 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results. Authors of open-ko-llm-leaderboard are ordered alphabetically."
@@ -136,10 +106,8 @@ CITATION_BUTTON_TEXT = r"""
     title={Open Ko-LLM Leaderboard: Evaluating Large Language Models in Korean with Ko-H5 Benchmark},
     author={Chanjun Park and Hyeonwoo Kim and Dahyun Kim and Seonghwan Cho and Sanghoon Kim and Sukyung Lee and Yungi Kim and Hwalsuk Lee},
     year={2024},
-    booktitle={
+    booktitle={ACL Main}
 }
-
-
 @software{eval-harness,
     author = {Gao, Leo and
               Tow, Jonathan and
@@ -164,59 +132,40 @@ CITATION_BUTTON_TEXT = r"""
     publisher = {Zenodo},
     version = {v0.0.1},
     doi = {10.5281/zenodo.5371628},
-    url = {https://doi.org/10.5281/zenodo.5371628}
-}
-
-@misc{rein2023gpqagraduatelevelgoogleproofqa,
-    title={GPQA: A Graduate-Level Google-Proof Q&A Benchmark},
-    author={David Rein and Betty Li Hou and Asa Cooper Stickland and Jackson Petty and Richard Yuanzhe Pang and Julien Dirani and Julian Michael and Samuel R. Bowman},
-    year={2023},
-    eprint={2311.12022},
-    archivePrefix={arXiv},
-    primaryClass={cs.AI},
-    url={https://arxiv.org/abs/2311.12022},
-}
-
-@article{sakaguchi2021winogrande,
-    title={Winogrande: An adversarial winograd schema challenge at scale},
-    author={Sakaguchi, Keisuke and Bras, Ronan Le and Bhagavatula, Chandra and Choi, Yejin},
-    journal={Communications of the ACM},
-    volume={64},
-    number={9},
-    pages={99--106},
-    year={2021},
-    publisher={ACM New York, NY, USA}
+    url = {https://doi.org/10.5281/zenodo.5371628}
 }
-
+@misc{seo2023kocommongen,
+    title={Korean Commonsense Reasoning Evaluation for Large Language Models},
+    author={Jaehyung Seo, Chanjun Park, Hyeonseok Moon, Sugyeong Eo, Aram So, Heuiseok Lim},
+    year={2023},
+    affilation={Korea University, NLP&AI},
+    booktitle={Proceedings of the 35th Annual Conference on Human & Cognitive Language Technology}}
+@misc{park2023koarc,
+    title={Ko-ARC},
+    original_title={Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge},
+    author={Hyunbyung Park, Chanjun Park},
+    original_author={Peter Clark and Isaac Cowhey and Oren Etzioni and Tushar Khot and Ashish Sabharwal and Carissa Schoenick and Oyvind Tafjord},
+    year={2023}
 }
-
+@misc{park2023kohellaswag,
+    title={Ko-HellaSwag},
+    original_title={HellaSwag: Can a Machine Really Finish Your Sentence?},
+    author={Hyunbyung Park, Chanjun Park},
+    original_author={Rowan Zellers and Ari Holtzman and Yonatan Bisk and Ali Farhadi and Yejin Choi},
+    year={2023}
 }
-
-    eprint={2311.07911},
-    archivePrefix={arXiv},
-    primaryClass={cs.CL},
-    url={https://arxiv.org/abs/2311.07911},
+@misc{park2023kommlu,
+    title={Ko-MMLU},
+    original_title={Measuring Massive Multitask Language Understanding},
+    author={Hyunbyung Park, Chanjun Park},
+    original_author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
+    year={2023}
 }
-
+@misc{park2023kotruthfulqa,
+    title={Ko-TruthfulQA},
+    original_title={TruthfulQA: Measuring How Models Mimic Human Falsehoods},
+    author={Hyunbyung Park, Chanjun Park},
+    original_author={Stephanie Lin and Jacob Hilton and Owain Evans},
+    year={2023}
 }
 """
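Note: the queue text above asks submitters to check that their model runs under the Eleuther harness locally, with a `--limit` to keep the run small. A hedged sketch of that sanity check via the harness's Python API, assuming lm-eval 0.4.x; the public `hellaswag` task and the model name are placeholders, since the leaderboard's Korean task configs are private:

```python
# Sanity-check a model under the Eleuther AI evaluation harness before
# submitting. Assumes lm-eval 0.4.x; "hellaswag" is a public stand-in task
# and "your-org/your-model" is a placeholder.
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=your-org/your-model,revision=main",
    tasks=["hellaswag"],
    limit=10,  # like the CLI's --limit: only 10 examples per task
)
print(results["results"])
```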
src/display/formatting.py
CHANGED
@@ -14,9 +14,10 @@ def model_hyperlink(link, model_name):
 def make_clickable_model(model_name):
     link = f"https://huggingface.co/{model_name}"

-
-
-
+    details_model_name = model_name.replace("/", "__")
+    details_link = f"https://huggingface.co/datasets/open-ko-llm-leaderboard/details_{details_model_name}"
+
+    return model_hyperlink(link, model_name) + " " + model_hyperlink(details_link, "📑")


 def styled_error(error):
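Note: end to end, the patched `make_clickable_model` renders two links per row: the model page, and a 📑 link to its per-model details dataset. The `model_hyperlink` body below is an assumed minimal reimplementation (the real helper is defined earlier in this file), and the model name is only an example:

```python
def model_hyperlink(link, model_name):
    # Assumed minimal version of the helper defined above in formatting.py.
    return f'<a target="_blank" href="{link}">{model_name}</a>'

def make_clickable_model(model_name):
    link = f"https://huggingface.co/{model_name}"
    # "org/model" becomes "org__model" in the details dataset name.
    details_model_name = model_name.replace("/", "__")
    details_link = f"https://huggingface.co/datasets/open-ko-llm-leaderboard/details_{details_model_name}"
    return model_hyperlink(link, model_name) + " " + model_hyperlink(details_link, "📑")

print(make_clickable_model("your-org/your-model"))
# -> a link to the model page plus a 📑 link to
#    https://huggingface.co/datasets/open-ko-llm-leaderboard/details_your-org__your-model
```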
src/display/host_sponsor.png
DELETED
Binary file (131 kB)
src/display/main_logo.png
DELETED
Binary file (345 kB)
src/display/utils.py
CHANGED
@@ -14,15 +14,19 @@ class Task:
     col_name: str

 class Tasks(Enum):
-
-
-
-
-
-
-
-
-
+    arc = Task("ko_arc_challenge", "acc_norm", "Ko-ARC")
+    hellaswag = Task("ko_hellaswag", "acc_norm", "Ko-HellaSwag")
+    mmlu = Task("ko_mmlu", "acc", "Ko-MMLU")
+    truthfulqa = Task("ko_truthfulqa_mc", "mc2", "Ko-TruthfulQA")
+    winogrande = Task("ko_winogrande", "acc_norm", "Ko-Winogrande")
+    gsm8k = Task("ko_gsm8k", "acc_norm", "Ko-GSM8k")
+    commongen_v2 = Task("ko_commongen_v2", "acc_norm", "Ko-CommonGen V2")
+    eqBench = Task("ko_eq_bench", "acc_norm", "Ko-EQ Bench")
+    instFollow = Task("ko_inst_follow", "acc_norm", "Ko-InstFollow")
+    korNatCka = Task("kor_nat_cka", "acc_norm", "KorNAT-CKA")
+    korNatSva = Task("kor_nat_sva", "acc_norm", "KorNAT-SVA")
+    harmlessness = Task("ko_harmlessness", "acc_norm", "Ko-Harmlessness")
+    helpfulness = Task("ko_helpfulness", "acc_norm", "Ko-Helpfulness")


 # These classes are for user facing column names,
@@ -85,30 +89,26 @@ class ModelDetails:

 class ModelType(Enum):
     PT = ModelDetails(name="pretrained", symbol="🟢")
-
-
-
-
-    Unknown = ModelDetails(name="other", symbol="❓")
+    # FT = ModelDetails(name="fine-tuned", symbol="🔶")
+    IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
+    RL = ModelDetails(name="RL-tuned", symbol="🟦")
+    Unknown = ModelDetails(name="", symbol="?")

     def to_str(self, separator=" "):
         return f"{self.value.symbol}{separator}{self.value.name}"

     @staticmethod
-    def from_str(
-
-    if "
-        return ModelType.CPT
-    if "pretrained" in m_type or "🟢" in m_type:
+    def from_str(type):
+        # if "fine-tuned" in type or "🔶" in type:
+        #     return ModelType.FT
+        if "pretrained" in type or "🟢" in type:
             return ModelType.PT
-    if
-        return ModelType.
-    if "
-        return ModelType.
+        if "RL-tuned" in type or "🟦" in type:
+            return ModelType.RL
+        if "instruction-tuned" in type or "⭕" in type:
+            return ModelType.IFT
         return ModelType.Unknown

-
 class WeightType(Enum):
     Adapter = ModelDetails("Adapter")
     Original = ModelDetails("Original")
@@ -116,13 +116,12 @@ class WeightType(Enum):

 class Precision(Enum):
     float16 = ModelDetails("float16")
-    bfloat16 = ModelDetails("bfloat16")
-    qt_8bit = ModelDetails("8bit")
-    qt_4bit = ModelDetails("4bit")
-    qt_GPTQ = ModelDetails("GPTQ")
+    # bfloat16 = ModelDetails("bfloat16")
+    # qt_8bit = ModelDetails("8bit")
+    # qt_4bit = ModelDetails("4bit")
+    # qt_GPTQ = ModelDetails("GPTQ")
     Unknown = ModelDetails("?")

-    @staticmethod
     def from_str(precision):
         if precision in ["torch.float16", "float16"]:
             return Precision.float16
@@ -135,10 +134,15 @@ class Precision(Enum):
         if precision in ["GPTQ", "None"]:
             return Precision.qt_GPTQ
         return Precision.Unknown
+
+
+

 # Column selection
-COLS = [c.name for c in fields(AutoEvalColumn)]
-TYPES = [c.type for c in fields(AutoEvalColumn)]
+COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
+TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
+COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
+TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]

 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
@@ -153,4 +157,4 @@ NUMERIC_INTERVALS = {
     "13~35B": pd.Interval(13, 35, closed="right"),
     "35~60B": pd.Interval(35, 60, closed="right"),
     "60B+": pd.Interval(60, 10000, closed="right"),
-}
+}
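Note: the `NUMERIC_INTERVALS` buckets at the end of this file feed the parameter-size filter in the app.py hunk above. A self-contained sketch of that interaction, with made-up model sizes; the `#Params (B)` column name is assumed from the leaderboard's display columns:

```python
import pandas as pd

NUMERIC_INTERVALS = {
    "0~3B": pd.Interval(0, 3, closed="right"),
    "3~13B": pd.Interval(3, 13, closed="right"),
    "13~35B": pd.Interval(13, 35, closed="right"),
}

df = pd.DataFrame({"#Params (B)": [1.3, 7.0, 10.7, 30.0]})
size_query = ["3~13B"]  # buckets the user ticked in the UI

# Mirror of filter_models in app.py: build an IntervalIndex from the selected
# buckets, then keep rows whose size falls inside any selected interval.
numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
params_column = pd.to_numeric(df["#Params (B)"], errors="coerce")
mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
print(df.loc[mask])  # keeps the 7.0 and 10.7 rows
```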
src/leaderboard/read_evals.py
CHANGED
@@ -48,7 +48,7 @@ class EvalResult:
         precision = Precision.from_str(config.get("model_dtype"))

         # Get model and org
-        org_and_model = config.get("model_name", None)
+        org_and_model = config.get("model_name", config.get("model_args", None))
         org_and_model = org_and_model.split("/", 1)

         if len(org_and_model) == 1:
@@ -96,18 +96,26 @@ class EvalResult:
         results = {}
         for task in Tasks:
             task = task.value
-
-
-
-
-
-
+
+            # Some truthfulQA values are NaNs
+            if task.benchmark == "truthfulqa:mc" and "harness|truthfulqa:mc|0" in data["results"]:
+                if math.isnan(float(data["results"]["harness|truthfulqa:mc|0"][task.metric])):
+                    results[task.benchmark] = 0.0
+                    continue
+
+            # New tasks have been added, we need to skip them if not exists
+            if task.benchmark in ["ko_winogrande", "ko_gsm8k", "ko_eq_bench", "ko_inst_follow", "kor_nat_cka", "kor_nat_sva", "ko_harmlessness", "ko_helpfulness"]:
+                accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
                 if accs.size == 0 or any([acc is None for acc in accs]):
+                    results[task.benchmark] = 0.0
                     continue
-
-
-
+
+            # We average all scores of a given metric (mostly for mmlu)
+            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
+            if accs.size == 0 or any([acc is None for acc in accs]):
+                continue
+
+            mean_acc = np.mean(accs) * 100.0
             results[task.benchmark] = mean_acc

         return self(
@@ -143,7 +151,27 @@ class EvalResult:
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""

-
+        # Skip the new tasks for now
+        # TODO: safely remove this code when the task results are all added
+        skip_avg_len = 0
+        if self.results['ko_winogrande'] == 0.0:
+            skip_avg_len += 1
+        if self.results['ko_gsm8k'] == 0.0:
+            skip_avg_len += 1
+        if self.results['ko_eq_bench'] == 0.0:
+            skip_avg_len += 1
+        if self.results['ko_inst_follow'] == 0.0:
+            skip_avg_len += 1
+        if self.results['kor_nat_cka'] == 0.0:
+            skip_avg_len += 1
+        if self.results['kor_nat_sva'] == 0.0:
+            skip_avg_len += 1
+        if self.results['ko_harmlessness'] == 0.0:
+            skip_avg_len += 1
+        if self.results['ko_helpfulness'] == 0.0:
+            skip_avg_len += 1
+
+        average = sum([v for v in self.results.values() if v is not None]) / (len(Tasks) - skip_avg_len)

         data_dict = {
             "eval_name": self.eval_name,  # not a column, just a save name,
src/submission/submit.py
CHANGED
@@ -1,7 +1,6 @@
 import json
 import os
 from datetime import datetime, timezone
-import pandas as pd

 from src.display.formatting import styled_error, styled_message, styled_warning
 from src.envs import API, EVAL_REQUESTS_PATH, H4_TOKEN, QUEUE_REPO, RATE_LIMIT_PERIOD, RATE_LIMIT_QUOTA
@@ -13,7 +12,6 @@ from src.submission.check_validity import (
     is_model_on_hub,
     user_submission_permission,
 )
-from src.populate import get_evaluation_queue_df

 REQUESTED_MODELS = None
 USERS_TO_SUBMISSION_DATES = None
@@ -40,7 +38,10 @@ def add_new_eval(

     precision = precision.split(" ")[0]
     current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
-
+    # 리더보드 종료 (leaderboard closed)
+    if True:
+        return styled_error("The current Season 1 will conclude on Friday, August 2, and the new season will commence on August 12.")
+
     if model_type is None or model_type == "":
         return styled_error("Please select a model type.")
@@ -99,9 +100,6 @@ def add_new_eval(

     # Seems good, creating the eval
     print("Adding new eval")
-    # dfs = get_evaluation_queue_df(EVAL_REQUESTS_PATH, cols=["job_id"])
-    # dfs = pd.concat(dfs).reset_index(drop=True)
-    # max_job_id = max([int(c) for c in dfs["job_id"].values])

     eval_entry = {
         "model": model,
@@ -116,7 +114,6 @@ def add_new_eval(
         "likes": model_info.likes,
         "params": model_size,
         "license": license,
-        # "job_id": max_job_id+1
     }

     # Check for duplicate submission
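Note: the last hunk ends at the duplicate-submission check. A hypothetical sketch of how such a guard commonly works in these leaderboard Spaces; `REQUESTED_MODELS` and the key format are assumptions based on the fields visible in `eval_entry`, not the Space's verbatim code:

```python
# Assumed shape: keys built from the pending/finished request files.
REQUESTED_MODELS = {"org/model-a_main_float16"}

def is_duplicate(model: str, revision: str, precision: str) -> bool:
    return f"{model}_{revision}_{precision}" in REQUESTED_MODELS

if is_duplicate("org/model-a", "main", "float16"):
    print("This model has already been submitted.")
```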