Commit 3c09632
Parent(s): faceee1

[MODIFY] Documentation

Files changed:
- app.py +3 -0
- src/about.py +6 -0
- src/leaderboard/read_evals.py +7 -0
app.py
CHANGED
@@ -11,6 +11,7 @@ from src.about import (
     EVALUATION_QUEUE_TEXT,
     INTRODUCTION_TEXT,
     LLM_BENCHMARKS_TEXT_1,
+    CROSS_EVALUATION_METRICS,
     # EVALUATION_EXAMPLE_IMG,
     # LLM_BENCHMARKS_TEXT_2,
     # ENTITY_DISTRIBUTION_IMG,
@@ -580,6 +581,7 @@ with demo:
         )
 
         with gr.TabItem("π Medical Summarization", elem_id="llm-benchmark-tab-table", id=3):
+            gr.Markdown(CROSS_EVALUATION_METRICS, elem_classes="markdown-text")
             with gr.Row():
                 with gr.Column():
                     with gr.Row():
@@ -689,6 +691,7 @@ with demo:
             queue=True,
         )
         with gr.TabItem("π Note generation", elem_id="llm-benchmark-tab-table", id=4):
+            gr.Markdown(CROSS_EVALUATION_METRICS, elem_classes="markdown-text")
             with gr.Tabs(elem_classes="tab-buttons2") as tabs:
                 with gr.TabItem("ACI Bench", elem_id="llm-benchmark-tab-table2", id=0):
                     with gr.Row():
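Both app.py hunks apply the same pattern: render the shared metrics legend as the first element inside a generation tab, before the result widgets. Below is a minimal, self-contained sketch of that pattern, not the Space's actual layout; the placeholder legend string and the table stub are invented for illustration, and only the gr.TabItem/gr.Markdown usage mirrors the diff.

```python
import gradio as gr

# Stand-in for the CROSS_EVALUATION_METRICS string defined in src/about.py.
CROSS_EVALUATION_METRICS = "- **Coverage**: ...\n- **Conformity**: ..."

with gr.Blocks() as demo:
    with gr.TabItem("Medical Summarization", elem_id="llm-benchmark-tab-table", id=3):
        # As in the hunks above: the legend is rendered before the results.
        gr.Markdown(CROSS_EVALUATION_METRICS, elem_classes="markdown-text")
        with gr.Row():
            gr.Markdown("(leaderboard table goes here)")  # placeholder

if __name__ == "__main__":
    demo.launch()
```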
src/about.py
CHANGED
@@ -175,6 +175,12 @@ Select this option if your model uses a chat template. The chat template will be
 Upon successful submission of your request, your model's result would be updated on the leaderboard within 5 working days!
 """
 
+CROSS_EVALUATION_METRICS = """
+- **Coverage**: Measures how thoroughly the summary covers the original document. A higher score means the summary includes more details from the original.
+- **Conformity**: Also called the non-contradiction score, this checks if the summary avoids contradicting the original document. A higher score means the summary aligns better with the original.
+- **Consistency**: Measures the level of non-hallucination, or how much the summary sticks to the facts in the document. A higher score means the summary is more factual and accurate.
+- **Conciseness**: Measures how brief the summary is. A higher score means the summary is more concise. A negative score means the summary is longer than the original document.
+"""
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
 CITATION_BUTTON_TEXT = r"""
 @misc{kanithi2024mediccomprehensiveframeworkevaluating,
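The commit documents what each score means but not how it is computed. Purely as an illustration of the Conciseness description (higher means shorter; negative when the summary is longer than the source), one formula with exactly those properties is a relative length reduction. The function below is an assumption for the sake of the example, not the leaderboard's actual metric.

```python
def conciseness(document: str, summary: str) -> float:
    """Hypothetical conciseness score: 1.0 for an empty summary,
    0.0 for equal length, negative if the summary is longer."""
    doc_len = len(document.split())
    if doc_len == 0:
        return 0.0  # degenerate case: nothing to summarize
    return 1.0 - len(summary.split()) / doc_len

print(conciseness("word " * 100, "word " * 20))   # 0.8  (much shorter)
print(conciseness("word " * 100, "word " * 150))  # -0.5 (longer than source)
```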
src/leaderboard/read_evals.py
CHANGED
@@ -161,6 +161,13 @@ class EvalResult:
                 continue
             mean_acc = np.mean(accs)  # * 100.0
             soap_results[task.benchmark] = mean_acc
+        if harness_results == {} or open_ended_results == {} or med_safety_results == {} or medical_summarization_results == {} or aci_results == {} or soap_results == {}:
+            harness_results = {}
+            open_ended_results = {}
+            med_safety_results = {}
+            medical_summarization_results = {}
+            aci_results = {}
+            soap_results = {}
         # types_results = {}
         # for clinical_type in ClinicalTypes:
         #     clinical_type = clinical_type.value
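The read_evals.py change is an all-or-nothing guard: if any one benchmark group produced no results, every group is blanked, so a model only appears on the leaderboard with a complete row. Below is a standalone, behaviorally equivalent restatement of that guard, not part of the commit; the group names and scores are dummy values for illustration.

```python
# Dummy per-group results, keyed the way the diff's variables suggest.
results = {
    "harness": {"medqa": 0.71},
    "open_ended": {},  # one missing group hides all of them
    "med_safety": {"harmfulness": 0.93},
    "medical_summarization": {"coverage": 0.85},
    "aci": {"aci_bench": 0.64},
    "soap": {"soap_notes": 0.58},
}

# Equivalent to the chain of `== {}` checks above: an empty dict is falsy,
# so all() fails as soon as any benchmark group is empty.
if not all(results.values()):
    results = {group: {} for group in results}

print(results)  # every group is now {}
```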