tathagataraha committed on
Commit
2a7ac72
·
1 Parent(s): 12f8259

[MODIFY] Med-Safety: Average -> Harmfulness Score

Browse files
src/display/utils.py CHANGED
@@ -38,8 +38,9 @@ auto_eval_column_dict = []
38
  auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
39
  auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
40
  auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, True)])
41
- auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average", "number", True, False, dataset_task_col=True, med_safety_col=True, invariant=False)])
42
  auto_eval_column_dict.append(["overall", ColumnContent, ColumnContent("Overall Score", "number", True, False, medical_summarization_col=True, aci_col=True, soap_col=True, invariant=False)])
 
43
  for task in HarnessTasks:
44
  auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True, False, dataset_task_col=True, invariant=False)])
45
  for column in OpenEndedColumns:
 
38
  auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
39
  auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
40
  auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, True)])
41
+ auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average", "number", True, False, dataset_task_col=True, invariant=False)])
42
  auto_eval_column_dict.append(["overall", ColumnContent, ColumnContent("Overall Score", "number", True, False, medical_summarization_col=True, aci_col=True, soap_col=True, invariant=False)])
43
+ auto_eval_column_dict.append(["harmfulness", ColumnContent, ColumnContent("Harmfulness Score", "number", True, False, med_safety_col=True, invariant=False)])
44
  for task in HarnessTasks:
45
  auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True, False, dataset_task_col=True, invariant=False)])
46
  for column in OpenEndedColumns:
src/leaderboard/read_evals.py CHANGED
@@ -265,7 +265,7 @@ class EvalResult:
265
  # changes to be made here
266
  if subset == "med_safety":
267
  average = sum([v for v in self.med_safety_results.values() if v is not None]) / len(MedSafetyColumns)
268
- data_dict[AutoEvalColumn.average.name] = average
269
  if len(self.med_safety_results) > 0:
270
  for task in MedSafetyColumns:
271
  data_dict[task.value.col_name] = self.med_safety_results[task.value.benchmark]
 
265
  # changes to be made here
266
  if subset == "med_safety":
267
  average = sum([v for v in self.med_safety_results.values() if v is not None]) / len(MedSafetyColumns)
268
+ data_dict[AutoEvalColumn.harmfulness.name] = average
269
  if len(self.med_safety_results) > 0:
270
  for task in MedSafetyColumns:
271
  data_dict[task.value.col_name] = self.med_safety_results[task.value.benchmark]
src/populate.py CHANGED
@@ -21,15 +21,15 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
21
  if subset == "datasets":
22
  df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
23
  elif subset == "med_safety":
24
- df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=True)
25
  elif subset == "open_ended":
26
  df = df.sort_values(by=["ELO"], ascending=False)
27
  elif subset == "medical_summarization":
28
- df = df.sort_values(by=["Overall Score"], ascending=False)
29
  elif subset == "aci":
30
- df = df.sort_values(by=["Overall Score"], ascending=False)
31
  elif subset == "soap":
32
- df = df.sort_values(by=["Overall Score"], ascending=False)
33
  cols = list(set(df.columns).intersection(set(cols)))
34
  df = df[cols].round(decimals=2)
35
  # filter out if any of the benchmarks have not been produced
 
21
  if subset == "datasets":
22
  df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
23
  elif subset == "med_safety":
24
+ df = df.sort_values(by=[AutoEvalColumn.harmfulness.name], ascending=True)
25
  elif subset == "open_ended":
26
  df = df.sort_values(by=["ELO"], ascending=False)
27
  elif subset == "medical_summarization":
28
+ df = df.sort_values(by=[AutoEvalColumn.overall.name], ascending=False)
29
  elif subset == "aci":
30
+ df = df.sort_values(by=[AutoEvalColumn.overall.name], ascending=False)
31
  elif subset == "soap":
32
+ df = df.sort_values(by=[AutoEvalColumn.overall.name], ascending=False)
33
  cols = list(set(df.columns).intersection(set(cols)))
34
  df = df[cols].round(decimals=2)
35
  # filter out if any of the benchmarks have not been produced