tathagataraha committed
Commit 094d4db · 1 Parent(s): 20dad4a

[FIX] Read evals

Files changed (3)
  1. app.py +113 -112
  2. src/envs.py +1 -1
  3. src/leaderboard/read_evals.py +27 -15
app.py CHANGED
@@ -262,118 +262,6 @@ with demo:
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🏅 Closed Ended Evaluation", elem_id="llm-benchmark-tab-table", id=0):
-            with gr.Row():
-                with gr.Column():
-                    with gr.Row():
-                        search_bar = gr.Textbox(
-                            placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
-                            show_label=False,
-                            elem_id="search-bar",
-                        )
-                    with gr.Row():
-                        shown_columns = gr.CheckboxGroup(
-                            choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.dataset_task_col)],
-                            value=[
-                                c.name
-                                for c in fields(AutoEvalColumn)
-                                if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.dataset_task_col)
-                            ],
-                            label="Select columns to show",
-                            elem_id="column-select",
-                            interactive=True,
-                        )
-                    # with gr.Row():
-                    #     deleted_models_visibility = gr.Checkbox(
-                    #         value=False, label="Show gated/private/deleted models", interactive=True
-                    #     )
-                with gr.Column(min_width=320):
-                    # with gr.Box(elem_id="box-filter"):
-                    filter_columns_type = gr.CheckboxGroup(
-                        label="Model Types",
-                        choices=[t.to_str() for t in ModelType],
-                        value=[t.to_str() for t in ModelType],
-                        interactive=True,
-                        elem_id="filter-columns-type",
-                    )
-                    # filter_columns_architecture = gr.CheckboxGroup(
-                    #     label="Architecture Types",
-                    #     choices=[i.value.name for i in ModelArch],
-                    #     value=[i.value.name for i in ModelArch],
-                    #     interactive=True,
-                    #     elem_id="filter-columns-architecture",
-                    # )
-                    filter_domain_specific = gr.CheckboxGroup(
-                        label="Domain Specificity",
-                        choices=["🏥 Clinical models", "Generic models"],
-                        value=["🏥 Clinical models", "Generic models"],
-                        interactive=True,
-                        elem_id="filter-columns-type",
-                    )
-                    filter_columns_size = gr.CheckboxGroup(
-                        label="Model sizes (in billions of parameters)",
-                        choices=list(NUMERIC_INTERVALS.keys()),
-                        value=list(NUMERIC_INTERVALS.keys()),
-                        interactive=True,
-                        elem_id="filter-columns-size",
-                    )
-
-            datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="datasets")
-
-            leaderboard_table = gr.components.Dataframe(
-                value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
-                headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
-                datatype=TYPES,
-                elem_id="leaderboard-table",
-                interactive=False,
-                visible=True,
-            )
-
-            # Dummy leaderboard for handling the case when the user uses backspace key
-            hidden_leaderboard_table_for_search = gr.components.Dataframe(
-                value=datasets_original_df[DATASET_COLS],
-                headers=DATASET_COLS,
-                datatype=TYPES,
-                visible=False,
-            )
-
-
-            search_bar.submit(
-                update_table,
-                [
-                    hidden_leaderboard_table_for_search,
-                    shown_columns,
-                    search_bar,
-                    filter_columns_type,
-                    filter_domain_specific,
-                    filter_columns_size
-                    # filter_columns_architecture
-                ],
-                leaderboard_table,
-            )
-            for selector in [
-                shown_columns,
-                filter_columns_type,
-                filter_domain_specific,
-                # filter_columns_architecture,
-                filter_columns_size,
-                # deleted_models_visibility,
-            ]:
-                selector.change(
-                    update_table,
-                    [
-                        hidden_leaderboard_table_for_search,
-                        shown_columns,
-                        search_bar,
-                        filter_columns_type,
-                        filter_domain_specific,
-                        filter_columns_size
-                        # filter_columns_architecture,
-                    ],
-                    leaderboard_table,
-                    queue=True,
-                )
-
         with gr.TabItem("🏅 Open Ended Evaluation", elem_id="llm-benchmark-tab-table", id=1):
             with gr.Row():
                 with gr.Column():
@@ -1065,6 +953,119 @@ with demo:
                     leaderboard_table,
                     queue=True,
                 )
+        with gr.TabItem("🏅 Closed Ended Evaluation", elem_id="llm-benchmark-tab-table", id=0):
+            with gr.Row():
+                with gr.Column():
+                    with gr.Row():
+                        search_bar = gr.Textbox(
+                            placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
+                            show_label=False,
+                            elem_id="search-bar",
+                        )
+                    with gr.Row():
+                        shown_columns = gr.CheckboxGroup(
+                            choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.dataset_task_col)],
+                            value=[
+                                c.name
+                                for c in fields(AutoEvalColumn)
+                                if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.dataset_task_col)
+                            ],
+                            label="Select columns to show",
+                            elem_id="column-select",
+                            interactive=True,
+                        )
+                    # with gr.Row():
+                    #     deleted_models_visibility = gr.Checkbox(
+                    #         value=False, label="Show gated/private/deleted models", interactive=True
+                    #     )
+                with gr.Column(min_width=320):
+                    # with gr.Box(elem_id="box-filter"):
+                    filter_columns_type = gr.CheckboxGroup(
+                        label="Model Types",
+                        choices=[t.to_str() for t in ModelType],
+                        value=[t.to_str() for t in ModelType],
+                        interactive=True,
+                        elem_id="filter-columns-type",
+                    )
+                    # filter_columns_architecture = gr.CheckboxGroup(
+                    #     label="Architecture Types",
+                    #     choices=[i.value.name for i in ModelArch],
+                    #     value=[i.value.name for i in ModelArch],
+                    #     interactive=True,
+                    #     elem_id="filter-columns-architecture",
+                    # )
+                    filter_domain_specific = gr.CheckboxGroup(
+                        label="Domain Specificity",
+                        choices=["🏥 Clinical models", "Generic models"],
+                        value=["🏥 Clinical models", "Generic models"],
+                        interactive=True,
+                        elem_id="filter-columns-type",
+                    )
+                    filter_columns_size = gr.CheckboxGroup(
+                        label="Model sizes (in billions of parameters)",
+                        choices=list(NUMERIC_INTERVALS.keys()),
+                        value=list(NUMERIC_INTERVALS.keys()),
+                        interactive=True,
+                        elem_id="filter-columns-size",
+                    )
+
+            datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="datasets")
+
+            leaderboard_table = gr.components.Dataframe(
+                value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
+                headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
+                datatype=TYPES,
+                elem_id="leaderboard-table",
+                interactive=False,
+                visible=True,
+            )
+
+            # Dummy leaderboard for handling the case when the user uses backspace key
+            hidden_leaderboard_table_for_search = gr.components.Dataframe(
+                value=datasets_original_df[DATASET_COLS],
+                headers=DATASET_COLS,
+                datatype=TYPES,
+                visible=False,
+            )
+
+
+            search_bar.submit(
+                update_table,
+                [
+                    hidden_leaderboard_table_for_search,
+                    shown_columns,
+                    search_bar,
+                    filter_columns_type,
+                    filter_domain_specific,
+                    filter_columns_size
+                    # filter_columns_architecture
+                ],
+                leaderboard_table,
+            )
+            for selector in [
+                shown_columns,
+                filter_columns_type,
+                filter_domain_specific,
+                # filter_columns_architecture,
+                filter_columns_size,
+                # deleted_models_visibility,
+            ]:
+                selector.change(
+                    update_table,
+                    [
+                        hidden_leaderboard_table_for_search,
+                        shown_columns,
+                        search_bar,
+                        filter_columns_type,
+                        filter_domain_specific,
+                        filter_columns_size
+                        # filter_columns_architecture,
+                    ],
+                    leaderboard_table,
+                    queue=True,
+                )
+
+
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=5):
             gr.Markdown(LLM_BENCHMARKS_TEXT_1, elem_classes="markdown-text")
             gr.HTML(FIVE_PILLAR_DIAGRAM)
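For orientation, the block moved above follows the usual Gradio leaderboard wiring: one search textbox and several CheckboxGroup filters all route into a single update function that re-renders the Dataframe. A minimal, self-contained sketch of that pattern is below; the toy DataFrame, column names, and filter_table() are illustrative stand-ins, not the app's actual update_table() with its type/domain/size filters.

    # Minimal sketch of the search-bar / checkbox-filter wiring pattern (illustrative only).
    import gradio as gr
    import pandas as pd

    df = pd.DataFrame({"Model": ["model-a", "model-b", "model-c"], "Score": [71.2, 65.4, 80.1]})

    def filter_table(query, columns):
        # Keep rows whose model name matches any ';'-separated query,
        # then restrict the view to the selected columns.
        out = df
        if query:
            patterns = [q.strip() for q in query.split(";") if q.strip()]
            out = out[out["Model"].str.contains("|".join(patterns), case=False)]
        return out[columns]

    with gr.Blocks() as demo:
        search_bar = gr.Textbox(show_label=False, placeholder="Search (use ';' for multiple queries)")
        shown_columns = gr.CheckboxGroup(choices=list(df.columns), value=list(df.columns), label="Columns")
        table = gr.Dataframe(value=df, interactive=False)

        # Same idea as in app.py: submit on the search bar, change on each selector,
        # all pointing at the same update function and the same output table.
        search_bar.submit(filter_table, [search_bar, shown_columns], table)
        shown_columns.change(filter_table, [search_bar, shown_columns], table)

    if __name__ == "__main__":
        demo.launch()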
src/envs.py CHANGED
@@ -8,7 +8,7 @@ TOKEN = os.environ.get("TOKEN") # A read/write token for your org
 
 OWNER = "m42-health" # Change to your org - don't forget to create a results and request dataset, with the correct format!
 # ----------------------------------
-PRIVATE_REPO = True
+PRIVATE_REPO = False
 
 
 if PRIVATE_REPO:
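The only change here flips PRIVATE_REPO off. A flag like this is typically used to switch which Hub repo IDs the rest of envs.py points at; a minimal sketch of that pattern follows, with repo names as placeholders rather than this module's exact values.

    # Illustrative only: how a PRIVATE_REPO switch commonly selects repo IDs.
    # The QUEUE_REPO / RESULTS_REPO names are assumptions, not confirmed from this diff.
    OWNER = "m42-health"
    PRIVATE_REPO = False

    if PRIVATE_REPO:
        QUEUE_REPO = f"{OWNER}/requests-private"
        RESULTS_REPO = f"{OWNER}/results-private"
    else:
        QUEUE_REPO = f"{OWNER}/requests"
        RESULTS_REPO = f"{OWNER}/results"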
src/leaderboard/read_evals.py CHANGED
@@ -54,7 +54,8 @@ class EvalResult:
         except:
             breakpoint()
 
-
+        # if "deepseek-ai/DeepSeek-R1-Distill-Llama-70B" in json_filepath:
+        #     breakpoint()
         config = data.get("config")
 
         # Precision
@@ -113,7 +114,8 @@ class EvalResult:
         if open_ended_results["ELO_intervals"] is not None and open_ended_results["Score_intervals"] is not None:
             open_ended_results["ELO_intervals"] = "+" + str(open_ended_results["ELO_intervals"][1]) + "/-" + str(abs(open_ended_results["ELO_intervals"][0]))
             open_ended_results["Score_intervals"] = "+" + str(open_ended_results["Score_intervals"][1]) + "/-" + str(abs(open_ended_results["Score_intervals"][0]))
-        # breakpoint()
+        # if "deepseek-ai/DeepSeek-R1-Distill-Llama-70B" in json_filepath:
+        #     breakpoint()
         # changes to be made here
         med_safety_results = {}
         if "med-safety" in data["results"]:
@@ -178,12 +180,12 @@ class EvalResult:
                 continue
             mean_acc = np.mean(accs) # * 100.0
             closed_ended_arabic_results[task.benchmark] = mean_acc
-        if open_ended_results == {} or med_safety_results == {} or medical_summarization_results == {} or aci_results == {} or soap_results == {}:
-            open_ended_results = {}
-            med_safety_results = {}
-            medical_summarization_results = {}
-            aci_results = {}
-            soap_results = {}
+        # if open_ended_results == {} or med_safety_results == {} or medical_summarization_results == {} or aci_results == {} or soap_results == {}:
+        #     open_ended_results = {}
+        #     med_safety_results = {}
+        #     medical_summarization_results = {}
+        #     aci_results = {}
+        #     soap_results = {}
         # types_results = {}
         # for clinical_type in ClinicalTypes:
         #     clinical_type = clinical_type.value
@@ -195,7 +197,8 @@ class EvalResult:
 
         #     mean_acc = np.mean(accs) # * 100.0
         #     types_results[clinical_type.benchmark] = mean_acc
-
+        # if "deepseek-ai/DeepSeek-R1-Distill-Llama-70B" in json_filepath:
+        #     breakpoint()
         return self(
             eval_name=result_key,
             full_model=full_model,
@@ -337,6 +340,14 @@ def get_request_file_for_model(requests_path, model_name, precision):
                 request_file = tmp_request_file
     return request_file
 
+def update_results(result1, result2):
+    # breakpoint()
+    for key in dir(result1):
+        if key.endswith("_results"):
+            if getattr(result1, key) == {}:
+                setattr(result1, key, getattr(result2, key))
+    # breakpoint()
+    return result1
 
 def get_raw_eval_results(results_path: str, requests_path: str, evaluation_metric: str) -> list[EvalResult]:
     """From the path of the results folder root, extract all needed info for results"""
@@ -355,7 +366,7 @@ def get_raw_eval_results(results_path: str, requests_path: str, evaluation_metri
 
         for file in files:
            model_result_filepaths.append(os.path.join(root, file))
-
+    # breakpoint()
     eval_results = {}
     for model_result_filepath in model_result_filepaths:
         # Creation of result
@@ -364,11 +375,12 @@ def get_raw_eval_results(results_path: str, requests_path: str, evaluation_metri
 
         # Store results of same eval together
         eval_name = eval_result.eval_name
-        # if eval_name in eval_results.keys():
-        #     eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
-        # else:
-        eval_results[eval_name] = eval_result
-
+        if eval_name in eval_results.keys():
+            eval_results[eval_name] = update_results(eval_results[eval_name], eval_result)
+            # eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
+        else:
+            eval_results[eval_name] = eval_result
+        # breakpoint()
     results = []
     # clinical_type_results = []
     for v in eval_results.values():
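The behavioral change in this file is easiest to see in isolation: when two result files produce the same eval_name, the new update_results helper fills any still-empty "*_results" attribute of the existing entry from the newer one instead of overwriting the whole entry. A small self-contained sketch follows; PartialResult is a toy stand-in for EvalResult, and the values are made-up example data.

    # Toy illustration of the merge logic added in update_results().
    # Only the attribute naming convention (fields ending in "_results") matters here.
    class PartialResult:
        def __init__(self, open_ended_results=None, med_safety_results=None):
            self.open_ended_results = open_ended_results or {}
            self.med_safety_results = med_safety_results or {}

    def update_results(result1, result2):
        # Copy over any "*_results" attribute that is still empty on result1.
        for key in dir(result1):
            if key.endswith("_results"):
                if getattr(result1, key) == {}:
                    setattr(result1, key, getattr(result2, key))
        return result1

    a = PartialResult(open_ended_results={"ELO": 1000.0})   # from one results file
    b = PartialResult(med_safety_results={"score": 0.5})    # from another file with the same eval_name
    merged = update_results(a, b)
    print(merged.open_ended_results, merged.med_safety_results)
    # -> {'ELO': 1000.0} {'score': 0.5}

Note that the helper mutates and returns its first argument, so the entry already stored in eval_results is updated in place rather than replaced.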