openfree committed on
Commit
ded31ce
·
verified ·
1 Parent(s): 4f57346

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +98 -255
app.py CHANGED
@@ -1,264 +1,107 @@
1
- #모델명과 url 변경: "src/display/formatting.py" 그리고 src/leaderboard/read_evals.py
2
- #평가 항목명 변경: "src/about.py"
3
-
4
  import gradio as gr
5
- from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
6
  import pandas as pd
7
- from apscheduler.schedulers.background import BackgroundScheduler
8
- from huggingface_hub import snapshot_download
9
-
10
- from src.about import (
11
- CITATION_BUTTON_LABEL,
12
- CITATION_BUTTON_TEXT,
13
- EVALUATION_QUEUE_TEXT,
14
- INTRODUCTION_TEXT,
15
- LLM_BENCHMARKS_TEXT,
16
- TITLE,
17
- )
18
- from src.display.css_html_js import custom_css
19
- from src.display.utils import (
20
- BENCHMARK_COLS,
21
- COLS,
22
- EVAL_COLS,
23
- EVAL_TYPES,
24
- AutoEvalColumn,
25
- ModelType,
26
- fields,
27
- WeightType,
28
- Precision
29
- )
30
- from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
31
- from src.populate import get_evaluation_queue_df, get_leaderboard_df
32
- from src.submission.submit import add_new_eval
33
-
34
- def debug_model_names(df, label="๋””๋ฒ„๊ทธ"):
35
- """
36
- 데이터프레임에서 모델 이름 관련 열을 디버깅하기 위한 함수
37
- """
38
- print(f"===== {label} ๋””๋ฒ„๊น… =====")
39
- if df is None or df.empty:
40
- print("๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„์ด ๋น„์–ด์žˆ์Šต๋‹ˆ๋‹ค.")
41
- return
42
-
43
- model_cols = [col for col in df.columns if 'model' in col.lower()]
44
- if not model_cols:
45
- print("๋ชจ๋ธ ๊ด€๋ จ ์—ด์ด ์—†์Šต๋‹ˆ๋‹ค.")
46
- return
47
-
48
- for col in model_cols:
49
- print(f"์ปฌ๋Ÿผ: {col}")
50
- print(df[col].head())
51
- print("\n")
52
-
53
- print("==================\n")
54
-
55
- def restart_space():
56
- API.restart_space(repo_id=REPO_ID)
57
-
58
- ### Space initialisation
59
- try:
60
- print(EVAL_REQUESTS_PATH)
61
- snapshot_download(
62
- repo_id=QUEUE_REPO,
63
- local_dir=EVAL_REQUESTS_PATH,
64
- repo_type="dataset",
65
- tqdm_class=None,
66
- etag_timeout=30,
67
- token=TOKEN
68
- )
69
- except Exception:
70
- restart_space()
71
- try:
72
- print(EVAL_RESULTS_PATH)
73
- snapshot_download(
74
- repo_id=RESULTS_REPO,
75
- local_dir=EVAL_RESULTS_PATH,
76
- repo_type="dataset",
77
- tqdm_class=None,
78
- etag_timeout=30,
79
- token=TOKEN
80
- )
81
- except Exception:
82
- restart_space()
83
-
84
-
85
- LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
86
- # 디버깅을 위한 코드 (필요시 주석 해제)
87
- # debug_model_names(LEADERBOARD_DF, "Leaderboard ๋ฐ์ดํ„ฐ")
88
-
89
- # 변환 매핑 정의
90
- benchmark_mapping = {
91
- "ANLI": "Korean Bar Exam (Lawyer)",
92
- "LogiQA": "Senior Civil Service Examination(๊ตญ๊ฐ€์ง 5๊ธ‰)"
93
  }
94
 
95
- # Leaderboard์— ํ‘œ์‹œ๋  ๋ชจ๋ธ ํƒ€์ž… ์ปฌ๋Ÿผ ๋ณ€ํ™˜ (์ปฌ๋Ÿผ ์ด๋ฆ„์€ AutoEvalColumn.model_type.name๋กœ ๊ฐ€์ •)
96
- model_type_column = AutoEvalColumn.model_type.name
97
- if model_type_column in LEADERBOARD_DF.columns:
98
- LEADERBOARD_DF[model_type_column] = LEADERBOARD_DF[model_type_column].apply(lambda s: benchmark_mapping.get(s, s))
99
 
100
- (
101
- finished_eval_queue_df,
102
- running_eval_queue_df,
103
- pending_eval_queue_df,
104
- ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
105
- # 디버깅을 위한 코드 (필요시 주석 해제)
106
- # debug_model_names(finished_eval_queue_df, "์™„๋ฃŒ๋œ ํ‰๊ฐ€ ํ")
107
- # debug_model_names(running_eval_queue_df, "์‹คํ–‰ ์ค‘์ธ ํ‰๊ฐ€ ํ")
108
- # debug_model_names(pending_eval_queue_df, "๋Œ€๊ธฐ ์ค‘์ธ ํ‰๊ฐ€ ํ")
109
 
110
- def init_leaderboard(dataframe):
111
- if dataframe is None or dataframe.empty:
112
- raise ValueError("Leaderboard DataFrame is empty or None.")
113
- return Leaderboard(
114
- value=dataframe,
115
- datatype=[c.type for c in fields(AutoEvalColumn)],
116
- select_columns=SelectColumns(
117
- default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
118
- cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
119
- label="Select Columns to Display:",
120
- ),
121
- search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
122
- hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
123
- filter_columns=[
124
- ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
125
- ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
126
- ColumnFilter(
127
- AutoEvalColumn.params.name,
128
- type="slider",
129
- min=0.01,
130
- max=150,
131
- label="Select the number of parameters (B)",
132
- ),
133
- ColumnFilter(
134
- AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
135
- ),
136
- ],
137
- bool_checkboxgroup_label="Hide models",
138
- interactive=False,
139
- )
140
 
141
- def get_model_type_display(enum_obj):
142
- """
143
- ModelType enum ๊ฐ์ฒด๋ฅผ ๋ฐ›์•„์„œ ๋ณ€ํ™˜ ๋งคํ•‘์— ์žˆ์œผ๋ฉด ํ•ด๋‹น ๊ฐ’์„ ๋ฐ˜ํ™˜,
144
- ์—†์œผ๋ฉด ๊ธฐ๋ณธ to_str(" : ") ๊ฒฐ๊ณผ๋ฅผ ๋ฐ˜ํ™˜.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
  """
146
- try:
147
- key = enum_obj.name
148
- except AttributeError:
149
- key = enum_obj.to_str(" : ")
150
- return benchmark_mapping.get(key, enum_obj.to_str(" : "))
151
-
152
- demo = gr.Blocks(css=custom_css)
153
- with demo:
154
- gr.HTML(TITLE)
155
- gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
156
-
157
- with gr.Tabs(elem_classes="tab-buttons") as tabs:
158
- with gr.TabItem("๐Ÿ… LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
159
- leaderboard = init_leaderboard(LEADERBOARD_DF)
160
-
161
- with gr.TabItem("๐Ÿ“ About", elem_id="llm-benchmark-tab-table", id=2):
162
- gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
163
-
164
- with gr.TabItem("๐Ÿš€ Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
165
- with gr.Column():
166
- with gr.Row():
167
- gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
168
-
169
- with gr.Column():
170
- with gr.Accordion(
171
- f"โœ… Finished Evaluations ({len(finished_eval_queue_df)})",
172
- open=False,
173
- ):
174
- with gr.Row():
175
- finished_eval_table = gr.components.Dataframe(
176
- value=finished_eval_queue_df,
177
- headers=EVAL_COLS,
178
- datatype=EVAL_TYPES,
179
- row_count=5,
180
- )
181
- with gr.Accordion(
182
- f"๐Ÿ”„ Running Evaluation Queue ({len(running_eval_queue_df)})",
183
- open=False,
184
- ):
185
- with gr.Row():
186
- running_eval_table = gr.components.Dataframe(
187
- value=running_eval_queue_df,
188
- headers=EVAL_COLS,
189
- datatype=EVAL_TYPES,
190
- row_count=5,
191
- )
192
-
193
- with gr.Accordion(
194
- f"โณ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
195
- open=False,
196
- ):
197
- with gr.Row():
198
- pending_eval_table = gr.components.Dataframe(
199
- value=pending_eval_queue_df,
200
- headers=EVAL_COLS,
201
- datatype=EVAL_TYPES,
202
- row_count=5,
203
- )
204
- with gr.Row():
205
- gr.Markdown("# โœ‰๏ธโœจ Submit your model here!", elem_classes="markdown-text")
206
-
207
- with gr.Row():
208
- with gr.Column():
209
- model_name_textbox = gr.Textbox(label="Model name")
210
- revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
211
- model_type = gr.Dropdown(
212
- choices=[get_model_type_display(t) for t in ModelType if t != ModelType.Unknown],
213
- label="Model type",
214
- multiselect=False,
215
- value=None,
216
- interactive=True,
217
- )
218
-
219
- with gr.Column():
220
- precision = gr.Dropdown(
221
- choices=[i.value.name for i in Precision if i != Precision.Unknown],
222
- label="Precision",
223
- multiselect=False,
224
- value="float16",
225
- interactive=True,
226
- )
227
- weight_type = gr.Dropdown(
228
- choices=[i.value.name for i in WeightType],
229
- label="Weights type",
230
- multiselect=False,
231
- value="Original",
232
- interactive=True,
233
- )
234
- base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
235
-
236
- submit_button = gr.Button("Submit Eval")
237
- submission_result = gr.Markdown()
238
- submit_button.click(
239
- add_new_eval,
240
- [
241
- model_name_textbox,
242
- base_model_name_textbox,
243
- revision_name_textbox,
244
- precision,
245
- weight_type,
246
- model_type,
247
- ],
248
- submission_result,
249
- )
250
-
251
- with gr.Row():
252
- with gr.Accordion("๐Ÿ“™ Citation", open=False):
253
- citation_button = gr.Textbox(
254
- value=CITATION_BUTTON_TEXT,
255
- label=CITATION_BUTTON_LABEL,
256
- lines=20,
257
- elem_id="citation-button",
258
- show_copy_button=True,
259
- )
260
 
261
- scheduler = BackgroundScheduler()
262
- scheduler.add_job(restart_space, "interval", seconds=1800)
263
- scheduler.start()
264
- demo.queue(default_concurrency_limit=40).launch()
 
 
 
 
1
  import gradio as gr
 
2
  import pandas as pd
3
+ import numpy as np
4
+
5
+ # 데이터 정의 (하드코딩)
6
+ data = {
7
+ "Company/Model": [
8
+ "Anthropic/Claude 3 Opus",
9
+ "OpenAI/GPT-4",
10
+ "Google/Gemini Ultra",
11
+ "Cohere/Command R+",
12
+ "Naver/HyperCLOVA X",
13
+ "Kakao/KoGPT"
14
+ ],
15
+ "URL": [
16
+ "https://www.anthropic.com/claude",
17
+ "https://openai.com/gpt-4",
18
+ "https://deepmind.google/technologies/gemini/",
19
+ "https://cohere.com/models/command-r-plus",
20
+ "https://clova.ai/hyperclova",
21
+ "https://kogpt.ai/"
22
+ ],
23
+ "Korean Bar Exam (๋ณ€ํ˜ธ์‚ฌ)": [85, 82, 80, 75, 79, 77],
24
+ "Senior Civil Service Examination (๊ตญ๊ฐ€์ง 5๊ธ‰)": [88, 84, 83, 76, 81, 78]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  }
26
 
27
+ # DataFrame 생성
28
+ df = pd.DataFrame(data)
 
 
29
 
30
+ # Average 점수 계산
31
+ exam_columns = ["Korean Bar Exam (๋ณ€ํ˜ธ์‚ฌ)", "Senior Civil Service Examination (๊ตญ๊ฐ€์ง 5๊ธ‰)"]
32
+ df["Average"] = df[exam_columns].mean(axis=1).round(1)
 
 
 
 
 
 
33
 
34
+ # 열 순서 재배치 (Company/Model, URL, Average, 그 다음 각 시험)
35
+ cols = ["Company/Model", "URL", "Average"] + exam_columns
36
+ df = df[cols]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
+ # HTML๋กœ ๋ Œ๋”๋งํ•˜๊ธฐ ์œ„ํ•œ ํ•จ์ˆ˜ (URL์„ ํด๋ฆญ ๊ฐ€๋Šฅํ•œ ๋งํฌ๋กœ ๋ณ€ํ™˜)
39
+ def format_df_as_html(df):
40
+ # DataFrame 복사본 생성
41
+ display_df = df.copy()
42
+
43
+ # URL 열을 클릭 가능한 링크로 변환
44
+ for i, url in enumerate(display_df["URL"]):
45
+ model_name = display_df.iloc[i]["Company/Model"]
46
+ display_df.at[i, "Company/Model"] = f'<a href="{url}" target="_blank">{model_name}</a>'
47
+
48
+ # URL ์—ด ์ œ๊ฑฐ (์ด๋ฏธ Company/Model์— ๋งํฌ๋กœ ํ†ตํ•ฉ)
49
+ display_df = display_df.drop("URL", axis=1)
50
+
51
+ # 표 스타일 추가
52
+ styled_html = """
53
+ <style>
54
+ table {
55
+ width: 100%;
56
+ border-collapse: collapse;
57
+ font-family: Arial, sans-serif;
58
+ }
59
+ th {
60
+ background-color: #4CAF50;
61
+ color: white;
62
+ font-weight: bold;
63
+ text-align: left;
64
+ padding: 12px;
65
+ }
66
+ td {
67
+ padding: 10px;
68
+ border-bottom: 1px solid #ddd;
69
+ }
70
+ tr:nth-child(even) {
71
+ background-color: #f2f2f2;
72
+ }
73
+ tr:hover {
74
+ background-color: #ddd;
75
+ }
76
+ .header {
77
+ text-align: center;
78
+ font-size: 24px;
79
+ font-weight: bold;
80
+ margin-bottom: 20px;
81
+ color: #333;
82
+ }
83
+ </style>
84
+ <div class="header">Korean Exam Leaderboard</div>
85
  """
86
+
87
+ # DataFrame์„ HTML๋กœ ๋ณ€ํ™˜ํ•˜๊ณ  ์Šคํƒ€์ผ ์ ์šฉ
88
+ html_table = display_df.to_html(index=False, escape=False)
89
+ return styled_html + html_table
90
+
91
+ # Gradio 인터페이스
92
+ def show_leaderboard():
93
+ html_content = format_df_as_html(df)
94
+ return html_content
95
+
96
+ # 인터페이스 생성
97
+ demo = gr.Interface(
98
+ fn=show_leaderboard,
99
+ inputs=None,
100
+ outputs=gr.HTML(),
101
+ title="Korean Exam Leaderboard",
102
+ description="성능 비교: 한국 법학 및 행정고시 시험에서의 AI 모델 점수"
103
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
 
105
+ # 앱 실행
106
+ if __name__ == "__main__":
107
+ demo.launch()