Spaces:
Running
Running
Commit · c92b14d
1 Parent(s): 7d6aad6
[MODIFY] Metrics for medical summarization, aci bench and soap notes
Browse files
- app.py +4 -4
- src/about.py +2 -2
- src/display/utils.py +1 -0
- src/leaderboard/read_evals.py +11 -0
- src/populate.py +3 -3
app.py
CHANGED
|
@@ -704,11 +704,11 @@ with demo:
|
|
| 704 |
)
|
| 705 |
with gr.Row():
|
| 706 |
shown_columns = gr.CheckboxGroup(
|
| 707 |
-
choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.… [rest of removed line lost in extraction]
|
| 708 |
value=[
|
| 709 |
c.name
|
| 710 |
for c in fields(AutoEvalColumn)
|
| 711 |
-
if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.… [rest of removed line lost in extraction]
|
| 712 |
],
|
| 713 |
label="Select columns to show",
|
| 714 |
elem_id="column-select",
|
|
@@ -814,11 +814,11 @@ with demo:
|
|
| 814 |
)
|
| 815 |
with gr.Row():
|
| 816 |
shown_columns = gr.CheckboxGroup(
|
| 817 |
-
choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.… [rest of removed line lost in extraction]
|
| 818 |
value=[
|
| 819 |
c.name
|
| 820 |
for c in fields(AutoEvalColumn)
|
| 821 |
-
if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.… [rest of removed line lost in extraction]
|
| 822 |
],
|
| 823 |
label="Select columns to show",
|
| 824 |
elem_id="column-select",
|
|
|
|
| 704 |
)
|
| 705 |
with gr.Row():
|
| 706 |
shown_columns = gr.CheckboxGroup(
|
| 707 |
+
choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.aci_col)],
|
| 708 |
value=[
|
| 709 |
c.name
|
| 710 |
for c in fields(AutoEvalColumn)
|
| 711 |
+
if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.aci_col)
|
| 712 |
],
|
| 713 |
label="Select columns to show",
|
| 714 |
elem_id="column-select",
|
|
|
|
| 814 |
)
|
| 815 |
with gr.Row():
|
| 816 |
shown_columns = gr.CheckboxGroup(
|
| 817 |
+
choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.soap_col)],
|
| 818 |
value=[
|
| 819 |
c.name
|
| 820 |
for c in fields(AutoEvalColumn)
|
| 821 |
+
if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.soap_col)
|
| 822 |
],
|
| 823 |
label="Select columns to show",
|
| 824 |
elem_id="column-select",
|
src/about.py
CHANGED
|
@@ -79,7 +79,7 @@ class ACIColumns(Enum):
|
|
| 79 |
aci_column0 = ACIColumn("coverage", "score", "Coverage")
|
| 80 |
aci_column1 = ACIColumn("conform", "score", "Conformity")
|
| 81 |
aci_column2 = ACIColumn("fact", "score", "Consistency")
|
| 82 |
-
aci_column3 = ACIColumn("brief", "score", "Conciseness")
|
| 83 |
|
| 84 |
@dataclass
|
| 85 |
class SOAPColumn:
|
|
@@ -91,7 +91,7 @@ class SOAPColumns(Enum):
|
|
| 91 |
soap_column0 = SOAPColumn("coverage", "score", "Coverage")
|
| 92 |
soap_column1 = SOAPColumn("conform", "score", "Conformity")
|
| 93 |
soap_column2 = SOAPColumn("fact", "score", "Consistency")
|
| 94 |
-
soap_column3 = SOAPColumn("brief", "score", "Conciseness")
|
| 95 |
|
| 96 |
NUM_FEWSHOT = 0 # Change with your few shot
|
| 97 |
# ---------------------------------------------------
|
|
|
|
| 79 |
aci_column0 = ACIColumn("coverage", "score", "Coverage")
|
| 80 |
aci_column1 = ACIColumn("conform", "score", "Conformity")
|
| 81 |
aci_column2 = ACIColumn("fact", "score", "Consistency")
|
| 82 |
+
# aci_column3 = ACIColumn("brief", "score", "Conciseness")
|
| 83 |
|
| 84 |
@dataclass
|
| 85 |
class SOAPColumn:
|
|
|
|
| 91 |
soap_column0 = SOAPColumn("coverage", "score", "Coverage")
|
| 92 |
soap_column1 = SOAPColumn("conform", "score", "Conformity")
|
| 93 |
soap_column2 = SOAPColumn("fact", "score", "Consistency")
|
| 94 |
+
# soap_column3 = SOAPColumn("brief", "score", "Conciseness")
|
| 95 |
|
| 96 |
NUM_FEWSHOT = 0 # Change with your few shot
|
| 97 |
# ---------------------------------------------------
|
src/display/utils.py
CHANGED
|
@@ -39,6 +39,7 @@ auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent(
|
|
| 39 |
auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
|
| 40 |
auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, True)])
|
| 41 |
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average", "number", True, False, dataset_task_col=True, med_safety_col=True, invariant=False)])
|
|
|
|
| 42 |
for task in HarnessTasks:
|
| 43 |
auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True, False, dataset_task_col=True, invariant=False)])
|
| 44 |
for column in OpenEndedColumns:
|
|
|
|
| 39 |
auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
|
| 40 |
auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, True)])
|
| 41 |
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average", "number", True, False, dataset_task_col=True, med_safety_col=True, invariant=False)])
|
| 42 |
+
auto_eval_column_dict.append(["overall", ColumnContent, ColumnContent("Overall Score", "number", True, False, medical_summarization_col=True, aci_col=True, soap_col=True, invariant=False)])
|
| 43 |
for task in HarnessTasks:
|
| 44 |
auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True, False, dataset_task_col=True, invariant=False)])
|
| 45 |
for column in OpenEndedColumns:
|
src/leaderboard/read_evals.py
CHANGED
|
@@ -272,15 +272,26 @@ class EvalResult:
|
|
| 272 |
return data_dict
|
| 273 |
if subset == "medical_summarization":
|
| 274 |
if len(self.medical_summarization_results) > 0:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 275 |
for task in MedicalSummarizationColumns:
|
| 276 |
data_dict[task.value.col_name] = self.medical_summarization_results[task.value.benchmark]
|
| 277 |
return data_dict
|
| 278 |
if subset == "aci":
|
|
|
|
|
|
|
| 279 |
if len(self.aci_results) > 0:
|
| 280 |
for task in ACIColumns:
|
| 281 |
data_dict[task.value.col_name] = self.aci_results[task.value.benchmark]
|
| 282 |
return data_dict
|
| 283 |
if subset == "soap":
|
|
|
|
|
|
|
| 284 |
if len(self.soap_results) > 0:
|
| 285 |
for task in SOAPColumns:
|
| 286 |
data_dict[task.value.col_name] = self.soap_results[task.value.benchmark]
|
|
|
|
| 272 |
return data_dict
|
| 273 |
if subset == "medical_summarization":
|
| 274 |
if len(self.medical_summarization_results) > 0:
|
| 275 |
+
adjusted_conciseness = max(0, self.medical_summarization_results["brief"])
|
| 276 |
+
coverage = self.medical_summarization_results["coverage"]
|
| 277 |
+
hm = 2 / (1/coverage + 1/adjusted_conciseness) if not (adjusted_conciseness == 0 or coverage == 0) else 0
|
| 278 |
+
conformity = self.medical_summarization_results["conform"]
|
| 279 |
+
consistency = self.medical_summarization_results["fact"]
|
| 280 |
+
overall = sum([hm, conformity, consistency]) / 3
|
| 281 |
+
data_dict[AutoEvalColumn.overall.name] = overall
|
| 282 |
for task in MedicalSummarizationColumns:
|
| 283 |
data_dict[task.value.col_name] = self.medical_summarization_results[task.value.benchmark]
|
| 284 |
return data_dict
|
| 285 |
if subset == "aci":
|
| 286 |
+
overall = sum([v for v in self.aci_results.values() if v is not None]) / len(ACIColumns)
|
| 287 |
+
data_dict[AutoEvalColumn.overall.name] = overall
|
| 288 |
if len(self.aci_results) > 0:
|
| 289 |
for task in ACIColumns:
|
| 290 |
data_dict[task.value.col_name] = self.aci_results[task.value.benchmark]
|
| 291 |
return data_dict
|
| 292 |
if subset == "soap":
|
| 293 |
+
overall = sum([v for v in self.soap_results.values() if v is not None]) / len(SOAPColumns)
|
| 294 |
+
data_dict[AutoEvalColumn.overall.name] = overall
|
| 295 |
if len(self.soap_results) > 0:
|
| 296 |
for task in SOAPColumns:
|
| 297 |
data_dict[task.value.col_name] = self.soap_results[task.value.benchmark]
|
src/populate.py
CHANGED
|
@@ -25,11 +25,11 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
|
|
| 25 |
elif subset == "open_ended":
|
| 26 |
df = df.sort_values(by=["ELO"], ascending=False)
|
| 27 |
elif subset == "medical_summarization":
|
| 28 |
-
df = df.sort_values(by=["… [rest of removed line lost in extraction]
|
| 29 |
elif subset == "aci":
|
| 30 |
-
df = df.sort_values(by=["… [rest of removed line lost in extraction]
|
| 31 |
elif subset == "soap":
|
| 32 |
-
df = df.sort_values(by=["… [rest of removed line lost in extraction]
|
| 33 |
cols = list(set(df.columns).intersection(set(cols)))
|
| 34 |
df = df[cols].round(decimals=2)
|
| 35 |
# filter out if any of the benchmarks have not been produced
|
|
|
|
| 25 |
elif subset == "open_ended":
|
| 26 |
df = df.sort_values(by=["ELO"], ascending=False)
|
| 27 |
elif subset == "medical_summarization":
|
| 28 |
+
df = df.sort_values(by=["Overall Score"], ascending=False)
|
| 29 |
elif subset == "aci":
|
| 30 |
+
df = df.sort_values(by=["Overall Score"], ascending=False)
|
| 31 |
elif subset == "soap":
|
| 32 |
+
df = df.sort_values(by=["Overall Score"], ascending=False)
|
| 33 |
cols = list(set(df.columns).intersection(set(cols)))
|
| 34 |
df = df[cols].round(decimals=2)
|
| 35 |
# filter out if any of the benchmarks have not been produced
|