lixuejing committed
Commit 6981fa7 · 1 Parent(s): dba0a90
app.py CHANGED
@@ -32,7 +32,7 @@ from src.display.utils import (
  BENCHMARK_QUOTACOLS
  )
  from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, DYNAMIC_INFO_REPO, DYNAMIC_INFO_FILE_PATH, DYNAMIC_INFO_PATH, IS_PUBLIC, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
- from src.populate import get_evaluation_queue_df, get_leaderboard_df
+ from src.populate import get_evaluation_queue_df, get_leaderboard_df, get_leaderboard_df_quota
  from src.submission.submit import add_new_eval
  from src.scripts.update_all_request_files import update_dynamic_files
  from src.tools.collections import update_collections
@@ -77,6 +77,16 @@ def init_space():
  #update_collections(original_df.copy())
  leaderboard_df = original_df.copy()

+ raw_data_quota, original_df_quota = get_leaderboard_df(
+ results_path=EVAL_RESULTS_PATH,
+ requests_path=EVAL_REQUESTS_PATH,
+ dynamic_path=DYNAMIC_INFO_FILE_PATH,
+ cols=list(set(QUOTACOLS+COLS)),
+ benchmark_cols=list(set(BENCHMARK_QUOTACOLS+BENCHMARK_COLS))
+ )
+ #update_collections(original_df.copy())
+ leaderboard_df_quota = original_df_quota.copy()
+
  #plot_df = create_plot_df(create_scores_df(raw_data))

  (
@@ -86,12 +96,10 @@ def init_space():
  ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)

  #return leaderboard_df, original_df, plot_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
- return leaderboard_df, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
+ return leaderboard_df, original_df, leaderboard_df_quota, original_df_quota, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df

- leaderboard_df, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = init_space()
- #return leaderboard_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
+ leaderboard_df, original_df, leaderboard_df_quota, original_df_quota, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = init_space()

- #leaderboard_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = init_space()


  # Searching and filtering
@@ -231,6 +239,13 @@ leaderboard_df = filter_models(
  hide_models=[], # Deleted, merges, flagged, MoEs
  )

+ leaderboard_df_quota = filter_models(
+ df=leaderboard_df_quota,
+ type_query=[t.to_str(" : ") for t in ModelType],
+ size_query=list(NUMERIC_INTERVALS.keys()),
+ precision_query=[i.value.name for i in Precision],
+ hide_models=[], # Deleted, merges, flagged, MoEs
+ )


  demo = gr.Blocks(css=custom_css)
@@ -265,36 +280,6 @@ with demo:
  elem_id="column-select",
  interactive=True,
  )
- #with gr.Row():
- # hide_models = gr.CheckboxGroup(
- # label="Hide models",
- # choices = ["Private or deleted", "Contains a merge/moerge", "Flagged", "MoE"],
- # value=[],
- # interactive=True
- # )
- #with gr.Column(min_width=320):
- # #with gr.Box(elem_id="box-filter"):
- # filter_columns_type = gr.CheckboxGroup(
- # label="Model types",
- # choices=[t.to_str() for t in ModelType],
- # value=[t.to_str() for t in ModelType],
- # interactive=True,
- # elem_id="filter-columns-type",
- # )
- # filter_columns_precision = gr.CheckboxGroup(
- # label="Precision",
- # choices=[i.value.name for i in Precision],
- # value=[i.value.name for i in Precision],
- # interactive=True,
- # elem_id="filter-columns-precision",
- # )
- # filter_columns_size = gr.CheckboxGroup(
- # label="Model sizes (in billions of parameters)",
- # choices=list(NUMERIC_INTERVALS.keys()),
- # value=list(NUMERIC_INTERVALS.keys()),
- # interactive=True,
- # elem_id="filter-columns-size",
- # )


  leaderboard_table = gr.components.Dataframe(
@@ -382,40 +367,10 @@ with demo:
  elem_id="column-select",
  interactive=True,
  )
- #with gr.Row():
- # hide_models = gr.CheckboxGroup(
- # label="Hide models",
- # choices = ["Private or deleted", "Contains a merge/moerge", "Flagged", "MoE"],
- # value=[],
- # interactive=True
- # )
- #with gr.Column(min_width=320):
- # #with gr.Box(elem_id="box-filter"):
- # filter_columns_type = gr.CheckboxGroup(
- # label="Model types",
- # choices=[t.to_str() for t in ModelType],
- # value=[t.to_str() for t in ModelType],
- # interactive=True,
- # elem_id="filter-columns-type",
- # )
- # filter_columns_precision = gr.CheckboxGroup(
- # label="Precision",
- # choices=[i.value.name for i in Precision],
- # value=[i.value.name for i in Precision],
- # interactive=True,
- # elem_id="filter-columns-precision",
- # )
- # filter_columns_size = gr.CheckboxGroup(
- # label="Model sizes (in billions of parameters)",
- # choices=list(NUMERIC_INTERVALS.keys()),
- # value=list(NUMERIC_INTERVALS.keys()),
- # interactive=True,
- # elem_id="filter-columns-size",
- # )


  leaderboard_table = gr.components.Dataframe(
- value=leaderboard_df[
+ value=leaderboard_df_quota[
  [c.name for c in fields(AutoEvalColumnQuota) if c.never_hidden]
  + shown_columns.value
  + [AutoEvalColumnQuota.dummy.name]
@@ -430,8 +385,7 @@ with demo:

  # Dummy leaderboard for handling the case when the user uses backspace key
  hidden_leaderboard_table_for_search = gr.components.Dataframe(
- value=original_df[QUOTACOLS],
- #value=leaderboard_df[QUOTACOLS],
+ value=original_df_quota[QUOTACOLS],
  headers=QUOTACOLS,
  datatype=QUOTATYPES,
  visible=False,
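Note on the new init_space code above: list(set(QUOTACOLS+COLS)) de-duplicates the column names but does not preserve their order, so the column list passed to get_leaderboard_df can come out in a different order between runs. If a stable order matters here, an order-preserving merge is a small alternative. This is only a sketch using names already imported in app.py, not part of the commit:

# Order-preserving de-duplication: keeps the first occurrence of each name
# (dict insertion order is guaranteed in Python 3.7+).
merged_cols = list(dict.fromkeys(QUOTACOLS + COLS))
merged_benchmark_cols = list(dict.fromkeys(BENCHMARK_QUOTACOLS + BENCHMARK_COLS))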
src/display/utils.py CHANGED
@@ -51,7 +51,7 @@ auto_eval_column_quota_dict = []
  auto_eval_column_quota_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
  auto_eval_column_quota_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
  #Scores
- auto_eval_column_quota_dict.append(["average_quota", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
+ auto_eval_column_quota_dict.append(["average_quota", ColumnContent, ColumnContent("AverageSampled ⬆️", "number", True)])
  for task in Quotas:
  auto_eval_column_quota_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
  # Model information
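For context, the change above only renames the visible header of the quota average column; the registry key "average_quota" is unchanged, so downstream code keyed on the attribute name is unaffected while anything keyed on the old header string would need the new one. A minimal check, assuming AutoEvalColumnQuota is exported from src.display.utils as the app.py import block suggests:

from src.display.utils import AutoEvalColumnQuota  # assumed export path

# Only the display string changes; the attribute key stays "average_quota".
print(AutoEvalColumnQuota.average_quota.name)  # expected: "AverageSampled ⬆️"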
src/leaderboard/read_evals.py CHANGED
@@ -168,6 +168,8 @@ class EvalResult:
  else:
  average_quota = average_quota/nums

+ print("AutoEvalColumn.average.name",AutoEvalColumn.average.name, average)
+ print("AutoEvalColumnQuota.average_quota.name",AutoEvalColumnQuota.average_quota.name,average_quota)
  data_dict = {
  "eval_name": self.eval_name, # not a column, just a save name,
  AutoEvalColumn.precision.name: self.precision.value.name,
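If the two diagnostic print calls added above are meant to stay, the standard logging module gives the same information while keeping stdout clean. A sketch only, assuming AutoEvalColumn and AutoEvalColumnQuota come from src.display.utils and that average and average_quota are the local values computed just above:

import logging

from src.display.utils import AutoEvalColumn, AutoEvalColumnQuota  # assumed import path

logger = logging.getLogger(__name__)

# Same diagnostics as the added prints, but switchable via the logging level.
logger.debug("%s = %s", AutoEvalColumn.average.name, average)
logger.debug("%s = %s", AutoEvalColumnQuota.average_quota.name, average_quota)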
src/populate.py CHANGED
@@ -18,8 +18,8 @@ def get_leaderboard_df(results_path: str, requests_path: str, dynamic_path: str,
  #all_data_json.append(baseline_row)
  filter_models_flags(all_data_json)
  df = pd.DataFrame.from_records(all_data_json)
- #print("AutoEvalColumn.average.name",AutoEvalColumn.average.name)
- #df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
+ print("AutoEvalColumn.average.name",AutoEvalColumn.average.name)
+ df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
  df = df[cols].round(decimals=2)

  # filter out if any of the benchmarks have not been produced
@@ -37,7 +37,7 @@ def get_leaderboard_df_quota(results_path: str, requests_path: str, dynamic_path
  filter_models_flags(all_data_json)
  df = pd.DataFrame.from_records(all_data_json)
  print("AutoEvalColumn.average.name",AutoEvalColumn.average.name)
- df = df.sort_values(by=[AutoEvalColumnQuota.average.name], ascending=False)
+ df = df.sort_values(by=[AutoEvalColumnQuota.average_quota.name], ascending=False)
  df = df[cols].round(decimals=2)

  # filter out if any of the benchmarks have not been produced
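Taken together with the src/display/utils.py change, get_leaderboard_df now sorts on the "Average ⬆️" column while get_leaderboard_df_quota sorts on the renamed "AverageSampled ⬆️" column. A self-contained illustration of that ordering with made-up numbers, assuming the DataFrame columns are keyed by these display names as the sort calls imply:

import pandas as pd

# Made-up scores, for illustration only.
df = pd.DataFrame({
    "Model": ["model-a", "model-b"],
    "Average ⬆️": [61.275, 73.541],
    "AverageSampled ⬆️": [70.192, 58.007],
})

# Mirrors the two sort_values calls above, followed by the round(decimals=2) step.
overall = df.sort_values(by=["Average ⬆️"], ascending=False).round(decimals=2)
sampled = df.sort_values(by=["AverageSampled ⬆️"], ascending=False).round(decimals=2)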