Commit 3c09632
Parent(s): faceee1

[MODIFY] Documentation

Files changed:
- app.py +3 -0
- src/about.py +6 -0
- src/leaderboard/read_evals.py +7 -0
app.py
CHANGED
@@ -11,6 +11,7 @@ from src.about import (
     EVALUATION_QUEUE_TEXT,
     INTRODUCTION_TEXT,
     LLM_BENCHMARKS_TEXT_1,
+    CROSS_EVALUATION_METRICS,
     # EVALUATION_EXAMPLE_IMG,
     # LLM_BENCHMARKS_TEXT_2,
     # ENTITY_DISTRIBUTION_IMG,
@@ -580,6 +581,7 @@ with demo:
         )
 
         with gr.TabItem("π Medical Summarization", elem_id="llm-benchmark-tab-table", id=3):
+            gr.Markdown(CROSS_EVALUATION_METRICS, elem_classes="markdown-text")
             with gr.Row():
                 with gr.Column():
                     with gr.Row():
@@ -689,6 +691,7 @@ with demo:
             queue=True,
         )
         with gr.TabItem("π Note generation", elem_id="llm-benchmark-tab-table", id=4):
+            gr.Markdown(CROSS_EVALUATION_METRICS, elem_classes="markdown-text")
             with gr.Tabs(elem_classes="tab-buttons2") as tabs:
                 with gr.TabItem("ACI Bench", elem_id="llm-benchmark-tab-table2", id=0):
                     with gr.Row():
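Both app.py hunks apply the same pattern: render the shared metrics legend as the first element inside a generation tab, before the result widgets. Below is a minimal, self-contained sketch of that pattern, not the Space's actual layout; the placeholder legend string and the table stub are invented for illustration, and only the gr.TabItem/gr.Markdown usage mirrors the diff.

```python
import gradio as gr

# Stand-in for the CROSS_EVALUATION_METRICS string defined in src/about.py.
CROSS_EVALUATION_METRICS = "- **Coverage**: ...\n- **Conformity**: ..."

with gr.Blocks() as demo:
    with gr.TabItem("Medical Summarization", elem_id="llm-benchmark-tab-table", id=3):
        # As in the hunks above: the legend is rendered before the results.
        gr.Markdown(CROSS_EVALUATION_METRICS, elem_classes="markdown-text")
        with gr.Row():
            gr.Markdown("(leaderboard table goes here)")  # placeholder

if __name__ == "__main__":
    demo.launch()
```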
src/about.py
CHANGED
@@ -175,6 +175,12 @@ Select this option if your model uses a chat template. The chat template will be
 Upon successful submission of your request, your model's result would be updated on the leaderboard within 5 working days!
 """
 
+CROSS_EVALUATION_METRICS = """
+- **Coverage**: Measures how thoroughly the summary covers the original document. A higher score means the summary includes more details from the original.
+- **Conformity**: Also called the non-contradiction score, this checks if the summary avoids contradicting the original document. A higher score means the summary aligns better with the original.
+- **Consistency**: Measures the level of non-hallucination, or how much the summary sticks to the facts in the document. A higher score means the summary is more factual and accurate.
+- **Conciseness**: Measures how brief the summary is. A higher score means the summary is more concise. A negative score means the summary is longer than the original document.
+"""
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
 CITATION_BUTTON_TEXT = r"""
 @misc{kanithi2024mediccomprehensiveframeworkevaluating,
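The commit documents what each score means but not how it is computed. Purely as an illustration of the Conciseness description (higher means shorter; negative when the summary is longer than the source), one formula with exactly those properties is a relative length reduction. The function below is an assumption for the sake of the example, not the leaderboard's actual metric.

```python
def conciseness(document: str, summary: str) -> float:
    """Hypothetical conciseness score: 1.0 for an empty summary,
    0.0 for equal length, negative if the summary is longer."""
    doc_len = len(document.split())
    if doc_len == 0:
        return 0.0  # degenerate case: nothing to summarize
    return 1.0 - len(summary.split()) / doc_len

print(conciseness("word " * 100, "word " * 20))   # 0.8  (much shorter)
print(conciseness("word " * 100, "word " * 150))  # -0.5 (longer than source)
```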
src/leaderboard/read_evals.py
CHANGED
@@ -161,6 +161,13 @@ class EvalResult:
                 continue
             mean_acc = np.mean(accs)  # * 100.0
             soap_results[task.benchmark] = mean_acc
+        if harness_results == {} or open_ended_results == {} or med_safety_results == {} or medical_summarization_results == {} or aci_results == {} or soap_results == {}:
+            harness_results = {}
+            open_ended_results = {}
+            med_safety_results = {}
+            medical_summarization_results = {}
+            aci_results = {}
+            soap_results = {}
         # types_results = {}
         # for clinical_type in ClinicalTypes:
         #     clinical_type = clinical_type.value
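The read_evals.py change is an all-or-nothing guard: if any one benchmark group produced no results, every group is blanked, so a model only appears on the leaderboard with a complete row. Below is a standalone, behaviorally equivalent restatement of that guard, not part of the commit; the group names and scores are dummy values for illustration.

```python
# Dummy per-group results, keyed the way the diff's variables suggest.
results = {
    "harness": {"medqa": 0.71},
    "open_ended": {},  # one missing group hides all of them
    "med_safety": {"harmfulness": 0.93},
    "medical_summarization": {"coverage": 0.85},
    "aci": {"aci_bench": 0.64},
    "soap": {"soap_notes": 0.58},
}

# Equivalent to the chain of `== {}` checks above: an empty dict is falsy,
# so all() fails as soon as any benchmark group is empty.
if not all(results.values()):
    results = {group: {} for group in results}

print(results)  # every group is now {}
```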