tathagataraha committed
Commit 0da5ee3 · 1 Parent(s): b5701cc

[ADD] Open-ended evaluation

Files changed (5):
  1. app.py +125 -65
  2. src/about.py +4 -8
  3. src/display/utils.py +17 -8
  4. src/leaderboard/read_evals.py +56 -46
  5. src/populate.py +5 -2
app.py CHANGED
@@ -21,9 +21,9 @@ from src.about import (
 from src.display.css_html_js import custom_css
 from src.display.utils import (
     DATASET_BENCHMARK_COLS,
-    TYPES_BENCHMARK_COLS,
+    OPEN_ENDED_BENCHMARK_COLS,
     DATASET_COLS,
-    Clinical_TYPES_COLS,
+    OPEN_ENDED_COLS,
     EVAL_COLS,
     EVAL_TYPES,
     NUMERIC_INTERVALS,
@@ -64,9 +64,10 @@ except Exception:
 _, harness_datasets_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, DATASET_COLS, DATASET_BENCHMARK_COLS, "accuracy", "datasets")
 harness_datasets_leaderboard_df = harness_datasets_original_df.copy()
 
-# _, span_based_types_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, Clinical_TYPES_COLS, TYPES_BENCHMARK_COLS, "SpanBasedWithPartialOverlap", "clinical_types")
-# span_based_types_leaderboard_df = span_based_types_original_df.copy()
+_, open_ended_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OPEN_ENDED_COLS, OPEN_ENDED_BENCHMARK_COLS, "score", "open_ended")
+open_ended_leaderboard_df = open_ended_original_df.copy()
 
+# breakpoint()
 # # Token based results
 # _, token_based_datasets_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, DATASET_COLS, DATASET_BENCHMARK_COLS, "TokenBasedWithMacroAverage", "datasets")
 # token_based_datasets_leaderboard_df = token_based_datasets_original_df.copy()
@@ -83,8 +84,12 @@ harness_datasets_leaderboard_df = harness_datasets_original_df.copy()
 
 
 def update_df(shown_columns, subset="datasets"):
-    leaderboard_table_df = harness_datasets_leaderboard_df.copy()
-    hidden_leader_board_df = harness_datasets_original_df
+    if subset == "datasets":
+        leaderboard_table_df = harness_datasets_leaderboard_df.copy()
+        hidden_leader_board_df = harness_datasets_original_df
+    elif subset == "open_ended":
+        leaderboard_table_df = open_ended_leaderboard_df.copy()
+        hidden_leader_board_df = open_ended_original_df
     # else:
     #     match evaluation_metric:
     #         case "Span Based":
@@ -98,7 +103,7 @@ def update_df(shown_columns, subset="datasets"):
 
 
     value_cols = [c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns
-
+    # breakpoint()
     return leaderboard_table_df[value_cols], hidden_leader_board_df
 
 
@@ -196,60 +201,6 @@ def filter_models(
 
     return filtered_df
 
-def change_submit_request_form(model_architecture):
-    match model_architecture:
-        case "Encoder":
-            return (
-                gr.Textbox(label="Threshold for gliner models", visible=False),
-                gr.Radio(
-                    choices=["True", "False"],
-                    label="Load GLiNER Tokenizer",
-                    visible=False
-                ),
-                gr.Dropdown(
-                    choices=[prompt_template.value for prompt_template in PromptTemplateName],
-                    label="Prompt for generation",
-                    multiselect=False,
-                    # value="HTML Highlighted Spans",
-                    interactive=True,
-                    visible=False
-                )
-            )
-        case "Decoder":
-            return (
-                gr.Textbox(label="Threshold for gliner models", visible=False),
-                gr.Radio(
-                    choices=["True", "False"],
-                    label="Load GLiNER Tokenizer",
-                    visible=False
-                ),
-                gr.Dropdown(
-                    choices=[prompt_template.value for prompt_template in PromptTemplateName],
-                    label="Prompt for generation",
-                    multiselect=False,
-                    # value="HTML Highlighted Spans",
-                    interactive=True,
-                    visible=True
-                )
-            )
-        case "GLiNER Encoder":
-            return (
-                gr.Textbox(label="Threshold for gliner models", visible=True),
-                gr.Radio(
-                    choices=["True", "False"],
-                    label="Load GLiNER Tokenizer",
-                    visible=True
-                ),
-                gr.Dropdown(
-                    choices=[prompt_template.value for prompt_template in PromptTemplateName],
-                    label="Prompt for generation",
-                    multiselect=False,
-                    # value="HTML Highlighted Spans",
-                    interactive=True,
-                    visible=False
-                )
-            )
-
 
 demo = gr.Blocks(css=custom_css)
 with demo:
@@ -269,11 +220,11 @@ with demo:
                         )
                 with gr.Row():
                     shown_columns = gr.CheckboxGroup(
-                        choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and not c.clinical_type_col],
+                        choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.dataset_task_col)],
                         value=[
                             c.name
                             for c in fields(AutoEvalColumn)
-                            if c.displayed_by_default and not c.hidden and not c.never_hidden and not c.clinical_type_col
+                            if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.dataset_task_col)
                         ],
                         label="Select columns to show",
                         elem_id="column-select",
@@ -371,8 +322,117 @@ with demo:
             )
 
         with gr.TabItem("🏅 Open Ended Evaluation", elem_id="llm-benchmark-tab-table", id=1):
-            gr.Markdown("# Coming Soon!!!", elem_classes="markdown-text")
-            pass
+            with gr.Row():
+                with gr.Column():
+                    with gr.Row():
+                        search_bar = gr.Textbox(
+                            placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
+                            show_label=False,
+                            elem_id="search-bar",
+                        )
+                    with gr.Row():
+                        shown_columns = gr.CheckboxGroup(
+                            choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.open_ended_col)],
+                            value=[
+                                c.name
+                                for c in fields(AutoEvalColumn)
+                                if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.open_ended_col)
+                            ],
+                            label="Select columns to show",
+                            elem_id="column-select",
+                            interactive=True,
+                        )
+                    # with gr.Row():
+                    #     deleted_models_visibility = gr.Checkbox(
+                    #         value=False, label="Show gated/private/deleted models", interactive=True
+                    #     )
+                with gr.Column(min_width=320):
+                    # with gr.Box(elem_id="box-filter"):
+                    filter_columns_type = gr.CheckboxGroup(
+                        label="Model Types",
+                        choices=[t.to_str() for t in ModelType],
+                        value=[t.to_str() for t in ModelType],
+                        interactive=True,
+                        elem_id="filter-columns-type",
+                    )
+                    # filter_columns_architecture = gr.CheckboxGroup(
+                    #     label="Architecture Types",
+                    #     choices=[i.value.name for i in ModelArch],
+                    #     value=[i.value.name for i in ModelArch],
+                    #     interactive=True,
+                    #     elem_id="filter-columns-architecture",
+                    # )
+                    filter_domain_specific = gr.CheckboxGroup(
+                        label="Domain specific models",
+                        choices=["Yes", "No"],
+                        value=["Yes", "No"],
+                        interactive=True,
+                        elem_id="filter-columns-type",
+                    )
+                    filter_columns_size = gr.CheckboxGroup(
+                        label="Model sizes (in billions of parameters)",
+                        choices=list(NUMERIC_INTERVALS.keys()),
+                        value=list(NUMERIC_INTERVALS.keys()),
+                        interactive=True,
+                        elem_id="filter-columns-size",
+                    )
+
+            datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="open_ended")
+
+            leaderboard_table = gr.components.Dataframe(
+                value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
+                headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
+                datatype=TYPES,
+                elem_id="leaderboard-table",
+                interactive=False,
+                visible=True,
+            )
+
+            # Dummy leaderboard for handling the case when the user uses backspace key
+            hidden_leaderboard_table_for_search = gr.components.Dataframe(
+                value=datasets_original_df[OPEN_ENDED_COLS],
+                headers=OPEN_ENDED_COLS,
+                datatype=TYPES,
+                visible=False,
+            )
+
+
+            search_bar.submit(
+                update_table,
+                [
+                    hidden_leaderboard_table_for_search,
+                    shown_columns,
+                    search_bar,
+                    filter_columns_type,
+                    filter_domain_specific,
+                    filter_columns_size
+                    # filter_columns_architecture
+                ],
+                leaderboard_table,
+            )
+            for selector in [
+                shown_columns,
+                filter_columns_type,
+                filter_domain_specific,
+                # filter_columns_architecture,
+                filter_columns_size,
+                # deleted_models_visibility,
+            ]:
+                selector.change(
+                    update_table,
+                    [
+                        hidden_leaderboard_table_for_search,
+                        shown_columns,
+                        search_bar,
+                        filter_columns_type,
+                        filter_domain_specific,
+                        filter_columns_size
+                        # filter_columns_architecture,
+                    ],
+                    leaderboard_table,
+                    queue=True,
+                )
+
         with gr.TabItem("🏅 Med Safety", elem_id="llm-benchmark-tab-table", id=2):
             gr.Markdown("# Coming Soon!!!", elem_classes="markdown-text")
             pass
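The new tab reuses the same table-building flow as the datasets tab, keyed by a `subset` argument. Below is a minimal, self-contained sketch of that dispatch; the dataframes and column names ("MedQA", "ELO", "Score") are toy stand-ins for what `get_leaderboard_df` and `AutoEvalColumn` provide in the real app:

```python
import pandas as pd

# Toy stand-ins for the frames the app builds once at startup via get_leaderboard_df.
harness_datasets_df = pd.DataFrame({"Model": ["m1", "m2"], "Average": [71.2, 65.4], "MedQA": [70.0, 60.1]})
open_ended_df = pd.DataFrame({"Model": ["m1", "m2"], "ELO": [1042.0, 987.0], "Score": [7.8, 6.9]})

def update_df(shown_columns, subset="datasets"):
    """Pick the cached frame for the requested tab, then keep only the selected columns."""
    if subset == "datasets":
        table = harness_datasets_df.copy()
        hidden = harness_datasets_df
    elif subset == "open_ended":
        table = open_ended_df.copy()
        hidden = open_ended_df
    else:
        raise ValueError(f"unknown subset: {subset}")
    # "Model" stands in for the never-hidden columns of the real leaderboard.
    value_cols = ["Model"] + [c for c in shown_columns if c in table.columns]
    return table[value_cols], hidden

shown, hidden = update_df(["ELO", "Score"], subset="open_ended")
print(shown)  # Model / ELO / Score rows for the open-ended tab
```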
src/about.py CHANGED
@@ -27,19 +27,15 @@ class HarnessTasks(Enum):
     # task6 = Task("", "f1", "")
 
 @dataclass
-class ClinicalType:
+class OpenEndedColumn:
     benchmark: str
     metric: str
     col_name: str
 
-class ClinicalTypes(Enum):
+class OpenEndedColumns(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    type0 = ClinicalType("condition", "f1", "CONDITION")
-    type1 = ClinicalType("measurement", "f1", "MEASUREMENT")
-    type2 = ClinicalType("drug", "f1", "DRUG")
-    type3 = ClinicalType("procedure", "f1", "PROCEDURE")
-    type4 = ClinicalType("gene", "f1", "GENE")
-    type5 = ClinicalType("gene variant", "f1", "GENE VARIANT")
+    column0 = OpenEndedColumn("ELO", "score", "ELO")
+    column1 = OpenEndedColumn("Score", "score", "Score")
 
 
 NUM_FEWSHOT = 0 # Change with your few shot
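For reference, a standalone sketch of the dataclass-in-Enum pattern that `OpenEndedColumn`/`OpenEndedColumns` follow; the iteration at the end mirrors how the display layer reads `.value.col_name`:

```python
from dataclasses import dataclass
from enum import Enum

@dataclass
class OpenEndedColumn:
    benchmark: str  # key in the results json
    metric: str     # metric key in the results json
    col_name: str   # name displayed in the leaderboard

class OpenEndedColumns(Enum):
    column0 = OpenEndedColumn("ELO", "score", "ELO")
    column1 = OpenEndedColumn("Score", "score", "Score")

# Downstream code iterates the Enum and reads the dataclass through .value
print([c.value.col_name for c in OpenEndedColumns])  # ['ELO', 'Score']
```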
src/display/utils.py CHANGED
@@ -3,8 +3,7 @@ from enum import Enum
 
 import pandas as pd
 
-from src.about import HarnessTasks
-from src.about import ClinicalTypes
+from src.about import HarnessTasks, OpenEndedColumns
 
 
 def fields(raw_class):
@@ -20,9 +19,12 @@ class ColumnContent:
     type: str
     displayed_by_default: bool
     hidden: bool = False
+    invariant: bool = True
     never_hidden: bool = False
     dataset_task_col: bool = False
-    clinical_type_col: bool = False
+    open_ended_col: bool = False
+    med_safety_col: bool = False
+    cross_examination_col: bool = False
 
 
 ## Leaderboard columns
@@ -32,9 +34,11 @@ auto_eval_column_dict = []
 auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, True)])
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average", "number", True)])
+auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average", "number", True, False, dataset_task_col=True, invariant=False)])
 for task in HarnessTasks:
-    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True, False, dataset_task_col=True)])
+    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True, False, dataset_task_col=True, invariant=False)])
+for column in OpenEndedColumns:
+    auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, open_ended_col=True, invariant=False)])
 auto_eval_column_dict.append(["is_domain_specific", ColumnContent, ColumnContent("Is Domain Specific", "bool", False)])
 auto_eval_column_dict.append(["use_chat_template", ColumnContent, ColumnContent("Uses Chat Template", "bool", False)])
 auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
@@ -181,8 +185,11 @@ class EvaluationMetrics(Enum):
 
 
 # Column selection
-DATASET_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.clinical_type_col]
-Clinical_TYPES_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.dataset_task_col]
+DATASET_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.open_ended_col and not c.med_safety_col and not c.cross_examination_col]
+OPEN_ENDED_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.dataset_task_col and not c.med_safety_col and not c.cross_examination_col]
+MED_SAFETY_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.open_ended_col and not c.dataset_task_col and not c.cross_examination_col]
+CROSS_EXAMINATION_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.open_ended_col and not c.med_safety_col and not c.dataset_task_col]
+
 TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
 COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
 TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
@@ -191,7 +198,9 @@ EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
 DATASET_BENCHMARK_COLS = [t.value.col_name for t in HarnessTasks]
-TYPES_BENCHMARK_COLS = [t.value.col_name for t in ClinicalTypes]
+OPEN_ENDED_BENCHMARK_COLS = [t.value.col_name for t in OpenEndedColumns]
+# MED_SAFETY_BENCHMARK_COLS = [t.value.col_name for t in MedSafetyTasks]
+# CROSS_EXAMINATION_BENCHMARK_COLS = [t.value.col_name for t in CrossExaminationTasks]
 
 NUMERIC_INTERVALS = {
     "?": pd.Interval(-1, 0, closed="right"),
src/leaderboard/read_evals.py CHANGED
@@ -8,7 +8,7 @@ import dateutil
 import numpy as np
 
 from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, ModelType, ModelArch, Precision, HarnessTasks, WeightType, ClinicalTypes
+from src.display.utils import AutoEvalColumn, ModelType, ModelArch, Precision, HarnessTasks, WeightType, OpenEndedColumns
 from src.submission.check_validity import is_model_on_hub
 
 
@@ -22,6 +22,9 @@ class EvalResult:
     model: str
     revision: str # commit hash, "" if main
     dataset_results: dict
+    open_ended_results: dict
+    med_safety_results: dict
+    cross_examination_results: dict
     is_domain_specific: bool
     use_chat_template: bool
     # clinical_type_results:dict
@@ -90,6 +93,19 @@ class EvalResult:
                 continue
             mean_acc = np.mean(accs) # * 100.0
             harness_results[task.benchmark] = mean_acc
+        open_ended_results = {}
+        if "open-ended" in data["results"]:
+            for task in OpenEndedColumns:
+                task = task.value
+                # We average all scores of a given metric (not all metrics are present in all files)
+                accs = np.array([v for k, v in data["results"]["open-ended"]["overall"].items() if task.benchmark == k])
+                if accs.size == 0 or any([acc is None for acc in accs]):
+                    continue
+                mean_acc = np.mean(accs) # * 100.0
+                open_ended_results[task.benchmark] = mean_acc
+        # breakpoint()
+        med_safety_results = {}
+        cross_examination_results = {}
         # types_results = {}
         # for clinical_type in ClinicalTypes:
         #     clinical_type = clinical_type.value
@@ -109,6 +125,9 @@ class EvalResult:
             model=model,
             revision=config.get("revision", ""),
             dataset_results=harness_results,
+            open_ended_results=open_ended_results,
+            med_safety_results=med_safety_results,
+            cross_examination_results=cross_examination_results,
             is_domain_specific=config.get("is_domain_specific", False), # Assuming a default value
             use_chat_template=config.get("use_chat_template", False), # Assuming a default value
             precision=precision,
@@ -146,60 +165,51 @@ class EvalResult:
 
     def to_dict(self, subset):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
+        data_dict = {
+            "eval_name": self.eval_name, # not a column, just a save name,
+            AutoEvalColumn.precision.name: self.precision.value.name,
+            AutoEvalColumn.model_type.name: self.model_type.value.name,
+            AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol + (" 🏥" if self.is_domain_specific else ""),
+            AutoEvalColumn.weight_type.name: self.weight_type.value.name,
+            # AutoEvalColumn.architecture.name: self.architecture.value.name,
+            # AutoEvalColumn.backbone.name: self.backbone,
+            AutoEvalColumn.model.name: make_clickable_model(self.full_model),
+            AutoEvalColumn.is_domain_specific.name: self.is_domain_specific,
+            AutoEvalColumn.use_chat_template.name: self.use_chat_template,
+            AutoEvalColumn.revision.name: self.revision,
+            AutoEvalColumn.license.name: self.license,
+            AutoEvalColumn.likes.name: self.likes,
+            AutoEvalColumn.params.name: self.num_params,
+            AutoEvalColumn.still_on_hub.name: self.still_on_hub,
+            AutoEvalColumn.date.name: self.date,
+            "display_result" : self.display_result,
+        }
+
         if subset == "datasets":
             average = sum([v for v in self.dataset_results.values() if v is not None]) / len(HarnessTasks)
-            data_dict = {
-                "eval_name": self.eval_name, # not a column, just a save name,
-                AutoEvalColumn.precision.name: self.precision.value.name,
-                AutoEvalColumn.model_type.name: self.model_type.value.name,
-                AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol + (" 🏥" if self.is_domain_specific else ""),
-                AutoEvalColumn.weight_type.name: self.weight_type.value.name,
-                # AutoEvalColumn.architecture.name: self.architecture.value.name,
-                # AutoEvalColumn.backbone.name: self.backbone,
-                AutoEvalColumn.model.name: make_clickable_model(self.full_model),
-                AutoEvalColumn.is_domain_specific.name: self.is_domain_specific,
-                AutoEvalColumn.use_chat_template.name: self.use_chat_template,
-                AutoEvalColumn.revision.name: self.revision,
-                AutoEvalColumn.average.name: average,
-                AutoEvalColumn.license.name: self.license,
-                AutoEvalColumn.likes.name: self.likes,
-                AutoEvalColumn.params.name: self.num_params,
-                AutoEvalColumn.still_on_hub.name: self.still_on_hub,
-                AutoEvalColumn.date.name: self.date,
-                "display_result" : self.display_result,
-            }
+            data_dict[AutoEvalColumn.average.name] = average
             if len(self.dataset_results) > 0:
                 for task in HarnessTasks:
                     data_dict[task.value.col_name] = self.dataset_results[task.value.benchmark]
-
             return data_dict
 
-        if subset == "clinical_types":
-            average = sum([v for v in self.clinical_type_results.values() if v is not None]) / len(ClinicalTypes)
-            data_dict = {
-                "eval_name": self.eval_name, # not a column, just a save name,
-                AutoEvalColumn.precision.name: self.precision.value.name,
-                AutoEvalColumn.model_type.name: self.model_type.value.name,
-                AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
-                AutoEvalColumn.weight_type.name: self.weight_type.value.name,
-                AutoEvalColumn.architecture.name: self.architecture.value.name,
-                AutoEvalColumn.backbone.name: self.backbone,
-                AutoEvalColumn.model.name: make_clickable_model(self.full_model),
-                AutoEvalColumn.revision.name: self.revision,
-                AutoEvalColumn.average.name: average,
-                AutoEvalColumn.license.name: self.license,
-                AutoEvalColumn.likes.name: self.likes,
-                AutoEvalColumn.params.name: self.num_params,
-                AutoEvalColumn.still_on_hub.name: self.still_on_hub,
-                "display_result" : self.display_result,
-            }
-
-            for clinical_type in ClinicalTypes:
-                data_dict[clinical_type.value.col_name] = self.clinical_type_results[clinical_type.value.benchmark]
-
+        if subset == "open_ended":
+            if len(self.open_ended_results) > 0:
+                for task in OpenEndedColumns:
+                    data_dict[task.value.col_name] = self.open_ended_results[task.value.benchmark]
             return data_dict
 
-
+        # if subset == "med_safety":
+        #     if len(self.med_safety_results) > 0:
+        #         for task in MedSafetyTasks:
+        #             data_dict[task.value.col_name] = self.med_safety_results[task.value.benchmark]
+        #     return data_dict
+
+        # if subset == "cross_examination":
+        #     if len(self.cross_examination_results) > 0:
+        #         for task in CrossExaminationTasks:
+        #             data_dict[task.value.col_name] = self.cross_examination_results[task.value.benchmark]
+        #     return data_dict
 
 def get_request_file_for_model(requests_path, model_name, precision):
     """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
src/populate.py CHANGED
@@ -4,7 +4,7 @@ import os
 import pandas as pd
 
 from src.display.formatting import has_no_nan_values, make_clickable_model
-from src.display.utils import AutoEvalColumn, EvalQueueColumn
+from src.display.utils import AutoEvalColumn, EvalQueueColumn, OpenEndedColumns
 from src.leaderboard.read_evals import get_raw_eval_results
 
 
@@ -16,7 +16,10 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     all_data_json = [v.to_dict(subset=subset) for v in raw_data]
 
     df = pd.DataFrame.from_records(all_data_json)
-    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
+    if subset == "datasets":
+        df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
+    elif subset == "open_ended":
+        df = df.sort_values(by=["ELO"], ascending=False)
     cols = list(set(df.columns).intersection(set(cols)))
     df = df[cols].round(decimals=2)
 
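A quick sketch of the subset-dependent sort introduced here, using a literal "Average" column name where the real code reads `AutoEvalColumn.average.name`:

```python
import pandas as pd

def sort_for_subset(df: pd.DataFrame, subset: str) -> pd.DataFrame:
    """Order leaderboard rows the way get_leaderboard_df now does for each subset."""
    if subset == "datasets":
        return df.sort_values(by=["Average"], ascending=False)
    elif subset == "open_ended":
        return df.sort_values(by=["ELO"], ascending=False)
    return df

df = pd.DataFrame({"Model": ["m1", "m2"], "ELO": [987.0, 1042.0], "Score": [6.9, 7.8]})
print(sort_for_subset(df, "open_ended"))  # m2 (ELO 1042.0) ranks first
```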