Commit 6981fa7 · lixuejing committed · 1 parent: dba0a90

update

Files changed:
- app.py +22 -68
- src/display/utils.py +1 -1
- src/leaderboard/read_evals.py +2 -0
- src/populate.py +3 -3
app.py CHANGED

@@ -32,7 +32,7 @@ from src.display.utils import (
     BENCHMARK_QUOTACOLS
 )
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, DYNAMIC_INFO_REPO, DYNAMIC_INFO_FILE_PATH, DYNAMIC_INFO_PATH, IS_PUBLIC, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
-from src.populate import get_evaluation_queue_df, get_leaderboard_df
+from src.populate import get_evaluation_queue_df, get_leaderboard_df, get_leaderboard_df_quota
 from src.submission.submit import add_new_eval
 from src.scripts.update_all_request_files import update_dynamic_files
 from src.tools.collections import update_collections
@@ -77,6 +77,16 @@ def init_space():
     #update_collections(original_df.copy())
     leaderboard_df = original_df.copy()

+    raw_data_quota, original_df_quota = get_leaderboard_df(
+        results_path=EVAL_RESULTS_PATH,
+        requests_path=EVAL_REQUESTS_PATH,
+        dynamic_path=DYNAMIC_INFO_FILE_PATH,
+        cols=list(set(QUOTACOLS+COLS)),
+        benchmark_cols=list(set(BENCHMARK_QUOTACOLS+BENCHMARK_COLS))
+    )
+    #update_collections(original_df.copy())
+    leaderboard_df_quota = original_df_quota.copy()
+
     #plot_df = create_plot_df(create_scores_df(raw_data))

     (
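For context, the new init_space code builds the sampled-quota leaderboard by merging the quota and standard column lists with list(set(...)). A toy illustration (stand-in values, not the repo's real lists): set() drops duplicates but returns the columns in arbitrary order; the display order is fixed later when app.py selects columns explicitly for the Dataframe components.

COLS = ["T", "Model", "Average ⬆️", "#Params (B)"]              # toy stand-in for the real list
QUOTACOLS = ["T", "Model", "AverageSampled ⬆️", "#Params (B)"]  # toy stand-in; real lists live in src/display/utils.py

merged = list(set(QUOTACOLS + COLS))   # de-duplicated union, arbitrary order
print(sorted(merged))
# ['#Params (B)', 'Average ⬆️', 'AverageSampled ⬆️', 'Model', 'T']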
@@ -86,12 +96,10 @@ def init_space():
     ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)

     #return leaderboard_df, original_df, plot_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
-    return leaderboard_df, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
+    return leaderboard_df, original_df, leaderboard_df_quota, original_df_quota,finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df

-leaderboard_df, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = init_space()
-#return leaderboard_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
+leaderboard_df, original_df, leaderboard_df_quota, original_df_quota, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = init_space()

-#leaderboard_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = init_space()


 # Searching and filtering
@@ -231,6 +239,13 @@ leaderboard_df = filter_models(
     hide_models=[], # Deleted, merges, flagged, MoEs
 )

+leaderboard_df_quota = filter_models(
+    df=leaderboard_df_quota,
+    type_query=[t.to_str(" : ") for t in ModelType],
+    size_query=list(NUMERIC_INTERVALS.keys()),
+    precision_query=[i.value.name for i in Precision],
+    hide_models=[], # Deleted, merges, flagged, MoEs
+)


 demo = gr.Blocks(css=custom_css)
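leaderboard_df_quota is passed through the same filter_models helper as the main leaderboard, with every type, size and precision selected and nothing hidden. The helper itself is not part of this commit; the sketch below only illustrates the kind of filtering its call signature implies, with column names assumed.

import pandas as pd

# Hypothetical sketch of a filter_models-style helper, inferred from the call
# site above; the real implementation (and its column names) lives elsewhere
# in this repo.
def filter_models_sketch(df: pd.DataFrame, type_query, size_query,
                         precision_query, hide_models) -> pd.DataFrame:
    filtered = df[df["Type"].isin(type_query)]                        # assumed column name
    filtered = filtered[filtered["Precision"].isin(precision_query)]  # assumed column name
    # size_query would restrict by parameter-count bucket, and hide_models
    # would drop deleted/merged/flagged/MoE entries in the real helper.
    return filtered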
@@ -265,36 +280,6 @@ with demo:
                 elem_id="column-select",
                 interactive=True,
             )
-            #with gr.Row():
-            #    hide_models = gr.CheckboxGroup(
-            #        label="Hide models",
-            #        choices = ["Private or deleted", "Contains a merge/moerge", "Flagged", "MoE"],
-            #        value=[],
-            #        interactive=True
-            #    )
-            #with gr.Column(min_width=320):
-            #    #with gr.Box(elem_id="box-filter"):
-            #    filter_columns_type = gr.CheckboxGroup(
-            #        label="Model types",
-            #        choices=[t.to_str() for t in ModelType],
-            #        value=[t.to_str() for t in ModelType],
-            #        interactive=True,
-            #        elem_id="filter-columns-type",
-            #    )
-            #    filter_columns_precision = gr.CheckboxGroup(
-            #        label="Precision",
-            #        choices=[i.value.name for i in Precision],
-            #        value=[i.value.name for i in Precision],
-            #        interactive=True,
-            #        elem_id="filter-columns-precision",
-            #    )
-            #    filter_columns_size = gr.CheckboxGroup(
-            #        label="Model sizes (in billions of parameters)",
-            #        choices=list(NUMERIC_INTERVALS.keys()),
-            #        value=list(NUMERIC_INTERVALS.keys()),
-            #        interactive=True,
-            #        elem_id="filter-columns-size",
-            #    )


             leaderboard_table = gr.components.Dataframe(
@@ -382,40 +367,10 @@ with demo:
                 elem_id="column-select",
                 interactive=True,
             )
-            #with gr.Row():
-            #    hide_models = gr.CheckboxGroup(
-            #        label="Hide models",
-            #        choices = ["Private or deleted", "Contains a merge/moerge", "Flagged", "MoE"],
-            #        value=[],
-            #        interactive=True
-            #    )
-            #with gr.Column(min_width=320):
-            #    #with gr.Box(elem_id="box-filter"):
-            #    filter_columns_type = gr.CheckboxGroup(
-            #        label="Model types",
-            #        choices=[t.to_str() for t in ModelType],
-            #        value=[t.to_str() for t in ModelType],
-            #        interactive=True,
-            #        elem_id="filter-columns-type",
-            #    )
-            #    filter_columns_precision = gr.CheckboxGroup(
-            #        label="Precision",
-            #        choices=[i.value.name for i in Precision],
-            #        value=[i.value.name for i in Precision],
-            #        interactive=True,
-            #        elem_id="filter-columns-precision",
-            #    )
-            #    filter_columns_size = gr.CheckboxGroup(
-            #        label="Model sizes (in billions of parameters)",
-            #        choices=list(NUMERIC_INTERVALS.keys()),
-            #        value=list(NUMERIC_INTERVALS.keys()),
-            #        interactive=True,
-            #        elem_id="filter-columns-size",
-            #    )


             leaderboard_table = gr.components.Dataframe(
-                value=
+                value=leaderboard_df_quota[
                     [c.name for c in fields(AutoEvalColumnQuota) if c.never_hidden]
                     + shown_columns.value
                     + [AutoEvalColumnQuota.dummy.name]
@@ -430,8 +385,7 @@ with demo:

             # Dummy leaderboard for handling the case when the user uses backspace key
             hidden_leaderboard_table_for_search = gr.components.Dataframe(
-                value=
-                #value=leaderboard_df[QUOTACOLS],
+                value=original_df_quota[QUOTACOLS],
                 headers=QUOTACOLS,
                 datatype=QUOTATYPES,
                 visible=False,
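With these changes both Dataframe components read from the quota frames: the visible table shows the user-selected columns of leaderboard_df_quota, and the hidden one holds the full original_df_quota[QUOTACOLS] so the search callback can always re-filter from unfiltered data (for example after the user backspaces the query). A minimal, self-contained sketch of that visible/hidden pairing, using toy data in place of the real frames:

import gradio as gr
import pandas as pd

toy = pd.DataFrame({"T": ["🟢"], "Model": ["demo-model"], "AverageSampled ⬆️": [68.0]})

with gr.Blocks() as sketch:
    visible_table = gr.components.Dataframe(
        value=toy,                               # columns the user actually sees
        headers=list(toy.columns),
        datatype=["str", "markdown", "number"],
        interactive=False,
    )
    # Hidden, never-filtered copy: a search/filter callback can always start
    # from the complete data instead of the currently displayed subset.
    hidden_table = gr.components.Dataframe(
        value=toy,
        headers=list(toy.columns),
        datatype=["str", "markdown", "number"],
        visible=False,
    )

sketch.launch()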
src/display/utils.py CHANGED

@@ -51,7 +51,7 @@ auto_eval_column_quota_dict = []
 auto_eval_column_quota_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_quota_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 #Scores
-auto_eval_column_quota_dict.append(["average_quota", ColumnContent, ColumnContent("
+auto_eval_column_quota_dict.append(["average_quota", ColumnContent, ColumnContent("AverageSampled ⬆️", "number", True)])
 for task in Quotas:
     auto_eval_column_quota_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
 # Model information
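The sampled average now gets its own column, headed AverageSampled ⬆️. In leaderboard codebases of this family, the [name, type, default] triples collected here are typically fed to dataclasses.make_dataclass so columns can be addressed as AutoEvalColumnQuota.average_quota.name (as the read_evals.py and populate.py changes below do); a self-contained sketch of that pattern, with ColumnContent's exact fields assumed:

from dataclasses import dataclass, make_dataclass

@dataclass(frozen=True)
class ColumnContent:                 # assumed field layout
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

auto_eval_column_quota_dict = []
auto_eval_column_quota_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
auto_eval_column_quota_dict.append(["average_quota", ColumnContent, ColumnContent("AverageSampled ⬆️", "number", True)])

AutoEvalColumnQuota = make_dataclass("AutoEvalColumnQuota", auto_eval_column_quota_dict, frozen=True)
print(AutoEvalColumnQuota.average_quota.name)   # AverageSampled ⬆️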
src/leaderboard/read_evals.py CHANGED

@@ -168,6 +168,8 @@ class EvalResult:
         else:
             average_quota = average_quota/nums

+        print("AutoEvalColumn.average.name",AutoEvalColumn.average.name, average)
+        print("AutoEvalColumnQuota.average_quota.name",AutoEvalColumnQuota.average_quota.name,average_quota)
         data_dict = {
             "eval_name": self.eval_name, # not a column, just a save name,
             AutoEvalColumn.precision.name: self.precision.value.name,
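The two added print statements trace the overall average and the sampled average just before data_dict is assembled. The quantity involved is a per-task mean with a guard for the case where no benchmark produced a result (the else branch above); an illustrative, self-contained version of that computation, with names that are not the repo's own:

def mean_score(task_scores: dict) -> float:
    # Illustrative only: average the benchmark scores that exist, guarding
    # against division by zero when nothing has been evaluated yet.
    nums = len(task_scores)
    if nums == 0:
        return 0.0
    return sum(task_scores.values()) / nums

print(mean_score({"task_a": 71.2, "task_b": 64.8}))   # 68.0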
src/populate.py CHANGED

@@ -18,8 +18,8 @@ def get_leaderboard_df(results_path: str, requests_path: str, dynamic_path: str,
     #all_data_json.append(baseline_row)
     filter_models_flags(all_data_json)
     df = pd.DataFrame.from_records(all_data_json)
-
-
+    print("AutoEvalColumn.average.name",AutoEvalColumn.average.name)
+    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
     df = df[cols].round(decimals=2)

     # filter out if any of the benchmarks have not been produced

@@ -37,7 +37,7 @@ def get_leaderboard_df_quota(results_path: str, requests_path: str, dynamic_path
     filter_models_flags(all_data_json)
     df = pd.DataFrame.from_records(all_data_json)
     print("AutoEvalColumn.average.name",AutoEvalColumn.average.name)
-    df = df.sort_values(by=[AutoEvalColumnQuota.
+    df = df.sort_values(by=[AutoEvalColumnQuota.average_quota.name], ascending=False)
     df = df[cols].round(decimals=2)

     # filter out if any of the benchmarks have not been produced
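Both loaders now sort descending by their respective average column (AutoEvalColumn.average for the main table, AutoEvalColumnQuota.average_quota for the quota table) before trimming to the requested columns and rounding. A toy pandas illustration of that sort-then-round step:

import pandas as pd

df = pd.DataFrame({
    "Model": ["model-a", "model-b", "model-c"],
    "AverageSampled ⬆️": [61.237, 74.951, 68.402],
})

df = df.sort_values(by=["AverageSampled ⬆️"], ascending=False)  # best model first
df = df[["Model", "AverageSampled ⬆️"]].round(decimals=2)
print(df)   # rows now ordered: model-b (74.95), model-c (68.40), model-a (61.24)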