Commit 20dad4a · 1 Parent(s): 2e9477a
[ADD] Closed ended arabic
- app.py +124 -1
- src/about.py +13 -0
- src/display/utils.py +13 -2
- src/leaderboard/read_evals.py +25 -2
- src/populate.py +10 -1
app.py
CHANGED
@@ -31,12 +31,14 @@ from src.display.utils import (
     MEDICAL_SUMMARIZATION_BENCHMARK_COLS,
     ACI_BENCHMARK_COLS,
     SOAP_BENCHMARK_COLS,
+    CLOSED_ENDED_ARABIC_BENCHMARK_COLS,
     DATASET_COLS,
     OPEN_ENDED_COLS,
     MED_SAFETY_COLS,
     MEDICAL_SUMMARIZATION_COLS,
     ACI_COLS,
     SOAP_COLS,
+    CLOSED_ENDED_ARABIC_COLS,
     EVAL_COLS,
     EVAL_TYPES,
     NUMERIC_INTERVALS,
@@ -94,6 +96,10 @@ aci_leaderboard_df = aci_original_df.copy()
 _, soap_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, SOAP_COLS, SOAP_BENCHMARK_COLS, "score", "soap")
 soap_leaderboard_df = soap_original_df.copy()
 
+if PRIVATE_REPO:
+    _, closed_ended_arabic_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, CLOSED_ENDED_ARABIC_COLS, CLOSED_ENDED_ARABIC_BENCHMARK_COLS, "score", "closed_ended_arabic")
+    closed_ended_arabic_leaderboard_df = closed_ended_arabic_original_df.copy()
+
 # breakpoint()
 # # Token based results
 # _, token_based_datasets_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, DATASET_COLS, DATASET_BENCHMARK_COLS, "TokenBasedWithMacroAverage", "datasets")
@@ -130,6 +136,9 @@ def update_df(shown_columns, subset="datasets"):
     elif subset == "soap":
         leaderboard_table_df = soap_leaderboard_df.copy()
         hidden_leader_board_df = soap_original_df
+    elif PRIVATE_REPO and subset == "closed-ended-arabic":
+        leaderboard_table_df = closed_ended_arabic_leaderboard_df.copy()
+        hidden_leader_board_df = closed_ended_arabic_original_df
     # else:
     #     match evaluation_metric:
     #         case "Span Based":
@@ -941,7 +950,121 @@ with demo:
             with gr.Accordion("Question generation", open=False):
                 system_prompt, user_prompt = render_generation_templates(task="ce", generation_type="question_generation")
             with gr.Accordion("Cross Examination", open=False):
-                system_prompt, user_prompt = render_generation_templates(task="ce", generation_type="cross_examination")
+                system_prompt, user_prompt = render_generation_templates(task="ce", generation_type="cross_examination")
+        if PRIVATE_REPO:
+            with gr.TabItem("Dev Evals", elem_id="llm-benchmark-tab-table", id=100):
+                with gr.Tabs(elem_classes="tab-buttons2") as tabs:
+                    with gr.TabItem("🏅 Arabic Closed Ended Evaluation", elem_id="llm-benchmark-tab-table100", id=0):
+                        with gr.Row():
+                            with gr.Column():
+                                with gr.Row():
+                                    search_bar = gr.Textbox(
+                                        placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
+                                        show_label=False,
+                                        elem_id="search-bar",
+                                    )
+                                with gr.Row():
+                                    shown_columns = gr.CheckboxGroup(
+                                        choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.closed_ended_arabic_col)],
+                                        value=[
+                                            c.name
+                                            for c in fields(AutoEvalColumn)
+                                            if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.closed_ended_arabic_col)
+                                        ],
+                                        label="Select columns to show",
+                                        elem_id="column-select",
+                                        interactive=True,
+                                    )
+                                # with gr.Row():
+                                #     deleted_models_visibility = gr.Checkbox(
+                                #         value=False, label="Show gated/private/deleted models", interactive=True
+                                #     )
+                            with gr.Column(min_width=320):
+                                # with gr.Box(elem_id="box-filter"):
+                                filter_columns_type = gr.CheckboxGroup(
+                                    label="Model Types",
+                                    choices=[t.to_str() for t in ModelType],
+                                    value=[t.to_str() for t in ModelType],
+                                    interactive=True,
+                                    elem_id="filter-columns-type",
+                                )
+                                # filter_columns_architecture = gr.CheckboxGroup(
+                                #     label="Architecture Types",
+                                #     choices=[i.value.name for i in ModelArch],
+                                #     value=[i.value.name for i in ModelArch],
+                                #     interactive=True,
+                                #     elem_id="filter-columns-architecture",
+                                # )
+                                filter_domain_specific = gr.CheckboxGroup(
+                                    label="Domain Specificity",
+                                    choices=["🏥 Clinical models", "Generic models"],
+                                    value=["🏥 Clinical models", "Generic models"],
+                                    interactive=True,
+                                    elem_id="filter-columns-type",
+                                )
+                                filter_columns_size = gr.CheckboxGroup(
+                                    label="Model sizes (in billions of parameters)",
+                                    choices=list(NUMERIC_INTERVALS.keys()),
+                                    value=list(NUMERIC_INTERVALS.keys()),
+                                    interactive=True,
+                                    elem_id="filter-columns-size",
+                                )
+
+                        closed_ended_arabic_leaderboard_df, closed_ended_arabic_original_df = update_df(shown_columns.value, subset="closed-ended-arabic")
+
+                        leaderboard_table = gr.components.Dataframe(
+                            value=closed_ended_arabic_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
+                            headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
+                            datatype=TYPES,
+                            elem_id="leaderboard-table",
+                            interactive=False,
+                            visible=True,
+                        )
+
+                        # Dummy leaderboard for handling the case when the user uses backspace key
+                        hidden_leaderboard_table_for_search = gr.components.Dataframe(
+                            value=closed_ended_arabic_original_df[CLOSED_ENDED_ARABIC_COLS],
+                            headers=CLOSED_ENDED_ARABIC_COLS,
+                            datatype=TYPES,
+                            visible=False,
+                        )
+
+
+                        search_bar.submit(
+                            update_table,
+                            [
+                                hidden_leaderboard_table_for_search,
+                                shown_columns,
+                                search_bar,
+                                filter_columns_type,
+                                filter_domain_specific,
+                                filter_columns_size
+                                # filter_columns_architecture
+                            ],
+                            leaderboard_table,
+                        )
+                        for selector in [
+                            shown_columns,
+                            filter_columns_type,
+                            filter_domain_specific,
+                            # filter_columns_architecture,
+                            filter_columns_size,
+                            # deleted_models_visibility,
+                        ]:
+                            selector.change(
+                                update_table,
+                                [
+                                    hidden_leaderboard_table_for_search,
+                                    shown_columns,
+                                    search_bar,
+                                    filter_columns_type,
+                                    filter_domain_specific,
+                                    filter_columns_size
+                                    # filter_columns_architecture,
+                                ],
+                                leaderboard_table,
+                                queue=True,
+                            )
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=5):
             gr.Markdown(LLM_BENCHMARKS_TEXT_1, elem_classes="markdown-text")
             gr.HTML(FIVE_PILLAR_DIAGRAM)
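Editor's note: everything this commit adds is gated on PRIVATE_REPO, imported from src.envs. That module is not part of the diff; a minimal sketch of how such a flag is commonly derived from a Space environment variable (the variable name and parsing here are assumptions, not the repo's confirmed code):

import os

# Hypothetical sketch: read a PRIVATE_REPO env var as a boolean toggle so the
# "Dev Evals" tab and the Arabic dataframes are only built in the private deployment.
PRIVATE_REPO = os.environ.get("PRIVATE_REPO", "False").lower() in ("1", "true", "yes")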
src/about.py
CHANGED
@@ -96,6 +96,19 @@ class SOAPColumns(Enum):
     soap_column2 = SOAPColumn("fact", "score", "Consistency")
     # soap_column3 = SOAPColumn("brief", "score", "Conciseness")
 
+@dataclass
+class ClosedEndedArabicColumn:
+    benchmark: str
+    metric: str
+    col_name: str
+
+class ClosedEndedArabicColumns(Enum):
+    arabictask0 = ClosedEndedArabicColumn("MMLU-Arabic", "accuracy", "MMLU-Arabic")
+    arabictask2 = ClosedEndedArabicColumn("MedMCQA-Arabic", "accuracy", "MedMCQA-Arabic")
+    arabictask3 = ClosedEndedArabicColumn("MedQA-Arabic", "accuracy", "MedQA-Arabic")
+    arabictask5 = ClosedEndedArabicColumn("PubMedQA-Arabic", "accuracy", "PubMedQA-Arabic")
+
+
 NUM_FEWSHOT = 0  # Change with your few shot
 # ---------------------------------------------------
 
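Editor's note: this Enum-of-dataclass registry mirrors the existing SOAPColumns pattern. Each member wraps a ClosedEndedArabicColumn naming the benchmark key in the results JSON, the metric to read, and the display column. A short usage sketch, consistent with how src/display/utils.py consumes it below:

from src.about import ClosedEndedArabicColumns

# Each Enum member's .value exposes the wrapped dataclass fields.
benchmark_cols = [t.value.col_name for t in ClosedEndedArabicColumns]
# -> ['MMLU-Arabic', 'MedMCQA-Arabic', 'MedQA-Arabic', 'PubMedQA-Arabic']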
src/display/utils.py
CHANGED
@@ -4,7 +4,8 @@ from enum import Enum
 import pandas as pd
 
 # changes to be made here
-from src.about import HarnessTasks, OpenEndedColumns, MedSafetyColumns, MedicalSummarizationColumns, ACIColumns, SOAPColumns
+from src.about import HarnessTasks, OpenEndedColumns, MedSafetyColumns, MedicalSummarizationColumns, ACIColumns, SOAPColumns, ClosedEndedArabicColumns
+from src.envs import PRIVATE_REPO
 import json
 import gradio as gr
 
@@ -30,6 +31,7 @@ class ColumnContent:
     medical_summarization_col: bool = False
     aci_col: bool = False
     soap_col: bool = False
+    closed_ended_arabic_col: bool = False
 
 
 ## Leaderboard columns
@@ -39,7 +41,7 @@ auto_eval_column_dict = []
 auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, True)])
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average", "number", True, False, dataset_task_col=True, invariant=False)])
+auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average", "number", True, False, dataset_task_col=True, closed_ended_arabic_col=True, invariant=False)])
 auto_eval_column_dict.append(["overall", ColumnContent, ColumnContent("Overall Score", "number", True, False, medical_summarization_col=True, aci_col=True, soap_col=True, invariant=False)])
 for task in HarnessTasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True, False, dataset_task_col=True, invariant=False)])
@@ -57,6 +59,9 @@ for column in ACIColumns:
     auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, aci_col=True, invariant=False)])
 for column in SOAPColumns:
     auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, soap_col=True, invariant=False)])
+# if PRIVATE_REPO:
+for column in ClosedEndedArabicColumns:
+    auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, closed_ended_arabic_col=True, invariant=False)])
 auto_eval_column_dict.append(["is_domain_specific", ColumnContent, ColumnContent("Is Domain Specific", "bool", False)])
 auto_eval_column_dict.append(["use_chat_template", ColumnContent, ColumnContent("Uses Chat Template", "bool", False)])
 auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
@@ -89,6 +94,8 @@ class EvalQueueColumn:  # Queue column
     med_safety_status = ColumnContent("med_safety_status", "str", True)
     medical_summarization_status = ColumnContent("medical_summarization_status", "str", True)
     note_generation_status = ColumnContent("note_generation_status", "str", True)
+    if PRIVATE_REPO:
+        closed_ended_arabic_status = ColumnContent("closed_ended_arabic_status", "str", True)
 
 ## All the model information that we might need
 @dataclass
@@ -214,6 +221,8 @@ MED_SAFETY_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.med_safety_col or c.invariant)]
 MEDICAL_SUMMARIZATION_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.medical_summarization_col or c.invariant)]
 ACI_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.aci_col or c.invariant)]
 SOAP_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.soap_col or c.invariant)]
+# if PRIVATE_REPO:
+CLOSED_ENDED_ARABIC_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.closed_ended_arabic_col or c.invariant)]
 # CROSS_EXAMINATION_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.cross_examination_col or c.invariant)]
 # DATASET_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.open_ended_col and not c.med_safety_col and not c.cross_examination_col]
 # OPEN_ENDED_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.dataset_task_col and not c.med_safety_col and not c.cross_examination_col]
@@ -234,6 +243,8 @@ MED_SAFETY_BENCHMARK_COLS = [t.value.col_name for t in MedSafetyColumns]
 MEDICAL_SUMMARIZATION_BENCHMARK_COLS = [t.value.col_name for t in MedicalSummarizationColumns]
 ACI_BENCHMARK_COLS = [t.value.col_name for t in ACIColumns]
 SOAP_BENCHMARK_COLS = [t.value.col_name for t in SOAPColumns]
+# if PRIVATE_REPO:
+CLOSED_ENDED_ARABIC_BENCHMARK_COLS = [t.value.col_name for t in ClosedEndedArabicColumns]
 # CROSS_EXAMINATION_BENCHMARK_COLS = [t.value.col_name for t in CrossExaminationTasks]
 
 NUMERIC_INTERVALS = {
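Editor's note: two of the `if PRIVATE_REPO:` guards in this file are left commented out, so the Arabic columns are registered on AutoEvalColumn and the COLS lists are built unconditionally; only closed_ended_arabic_status is actually gated. For reference, AutoEvalColumn itself is presumably assembled from auto_eval_column_dict with the standard leaderboard-template pattern (not shown in this diff, so treat this as a sketch):

from dataclasses import make_dataclass

# Sketch: each accumulated [name, type, default] triple becomes a frozen
# dataclass field, so fields(AutoEvalColumn) later yields one entry per
# registered column, including the new closed_ended_arabic_col ones.
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)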
src/leaderboard/read_evals.py
CHANGED
@@ -9,8 +9,9 @@ import numpy as np
 
 from src.display.formatting import make_clickable_model
 # changes to be made here
-from src.display.utils import AutoEvalColumn, ModelType, ModelArch, Precision, HarnessTasks, WeightType, OpenEndedColumns, MedSafetyColumns, MedicalSummarizationColumns, ACIColumns, SOAPColumns
+from src.display.utils import AutoEvalColumn, ModelType, ModelArch, Precision, HarnessTasks, WeightType, OpenEndedColumns, MedSafetyColumns, MedicalSummarizationColumns, ACIColumns, SOAPColumns, ClosedEndedArabicColumns
 from src.submission.check_validity import is_model_on_hub
+from src.envs import PRIVATE_REPO
 
 
 @dataclass
@@ -29,6 +30,7 @@ class EvalResult:
     medical_summarization_results: dict
     aci_results: dict
     soap_results: dict
+    closed_ended_arabic_results: dict
     is_domain_specific: bool
     use_chat_template: bool
     # clinical_type_results:dict
@@ -162,6 +164,20 @@ class EvalResult:
                     continue
                 mean_acc = np.mean(accs)  # * 100.0
                 soap_results[task.benchmark] = mean_acc
+        closed_ended_arabic_results = {}
+        if PRIVATE_REPO and "closed-ended-arabic" in data["results"]:
+            for task in ClosedEndedArabicColumns:
+                task = task.value
+                # We average all scores of a given metric (not all metrics are present in all files)
+                try:
+                    accs = np.array([v.get(task.metric, None) for k, v in data["results"]["closed-ended-arabic"].items() if task.benchmark == k])
+                except:
+                    # breakpoint()
+                    accs = np.array([])
+                if accs.size == 0 or any([acc is None for acc in accs]):
+                    continue
+                mean_acc = np.mean(accs)  # * 100.0
+                closed_ended_arabic_results[task.benchmark] = mean_acc
         if open_ended_results == {} or med_safety_results == {} or medical_summarization_results == {} or aci_results == {} or soap_results == {}:
             open_ended_results = {}
             med_safety_results = {}
@@ -192,6 +208,7 @@ class EvalResult:
             medical_summarization_results=medical_summarization_results,
             aci_results=aci_results,
             soap_results=soap_results,
+            closed_ended_arabic_results=closed_ended_arabic_results,
             is_domain_specific=config.get("is_domain_specific", False),  # Assuming a default value
             use_chat_template=config.get("use_chat_template", False),  # Assuming a default value
             precision=precision,
@@ -294,7 +311,13 @@ class EvalResult:
             for task in SOAPColumns:
                 data_dict[task.value.col_name] = self.soap_results[task.value.benchmark]
             return data_dict
-
+        if PRIVATE_REPO and subset == "closed_ended_arabic":
+            average = sum([v for v in self.closed_ended_arabic_results.values() if v is not None]) / len(ClosedEndedArabicColumns)
+            data_dict[AutoEvalColumn.average.name] = average
+            if len(self.closed_ended_arabic_results) > 0:
+                for task in ClosedEndedArabicColumns:
+                    data_dict[task.value.col_name] = self.closed_ended_arabic_results[task.value.benchmark]
+            return data_dict
 
 def get_request_file_for_model(requests_path, model_name, precision):
     """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
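Editor's note on the new average: it divides by len(ClosedEndedArabicColumns), which is always 4, rather than by the number of benchmarks actually present in closed_ended_arabic_results, so a missing benchmark effectively counts as a score of 0. A minimal illustration of that behavior:

# Three of the four Arabic benchmarks present; the divisor stays 4:
results = {"MMLU-Arabic": 0.6, "MedMCQA-Arabic": 0.5, "MedQA-Arabic": 0.7}
average = sum(v for v in results.values() if v is not None) / 4
# average == 0.45, while the mean over present benchmarks would be 0.6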
src/populate.py
CHANGED
@@ -5,8 +5,9 @@ import pandas as pd
 
 from src.display.formatting import has_no_nan_values, make_clickable_model
 # changes to be made here
-from src.display.utils import AutoEvalColumn, EvalQueueColumn, OpenEndedColumns, MedSafetyColumns, MedicalSummarizationColumns, ACIColumns, SOAPColumns
+from src.display.utils import AutoEvalColumn, EvalQueueColumn, OpenEndedColumns, MedSafetyColumns, MedicalSummarizationColumns, ACIColumns, SOAPColumns, ClosedEndedArabicColumns
 from src.leaderboard.read_evals import get_raw_eval_results
+from src.envs import PRIVATE_REPO
 
 
 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list, evaluation_metric:str, subset:str) -> pd.DataFrame:
@@ -30,6 +31,8 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list, evaluation_metric:str, subset:str) -> pd.DataFrame:
         df = df.sort_values(by=[AutoEvalColumn.overall.name], ascending=False)
     elif subset == "soap":
         df = df.sort_values(by=[AutoEvalColumn.overall.name], ascending=False)
+    elif subset == "closed_ended_arabic":
+        df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
     cols = list(set(df.columns).intersection(set(cols)))
     df = df[cols].round(decimals=2)
     # filter out if any of the benchmarks have not been produced
@@ -54,6 +57,8 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
             data[EvalQueueColumn.med_safety_status.name] = data["status"]["med-safety"]
             data[EvalQueueColumn.medical_summarization_status.name] = data["status"]["medical-summarization"]
             data[EvalQueueColumn.note_generation_status.name] = data["status"]["note-generation"]
+            if PRIVATE_REPO:
+                data[EvalQueueColumn.closed_ended_arabic_status.name] = data["status"]["closed-ended-arabic"]
             all_evals.append(data)
         elif ".md" not in entry:
             # this is a folder
@@ -70,6 +75,8 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
                 data[EvalQueueColumn.med_safety_status.name] = data["status"]["med-safety"]
                 data[EvalQueueColumn.medical_summarization_status.name] = data["status"]["medical-summarization"]
                 data[EvalQueueColumn.note_generation_status.name] = data["status"]["note-generation"]
+                if PRIVATE_REPO:
+                    data[EvalQueueColumn.closed_ended_arabic_status.name] = data["status"]["closed-ended-arabic"]
                 all_evals.append(data)
     # breakpoint()
     pending_list = []
@@ -78,6 +85,8 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
     for run in all_evals:
         # changes to be made here
         status_list = [run["status"]["closed-ended"], run["status"]["open-ended"], run["status"]["med-safety"], run["status"]["medical-summarization"], run["status"]["note-generation"]]
+        if PRIVATE_REPO:
+            status_list.append(run["status"]["closed-ended-arabic"])
         # status_list = status_list
         if "RUNNING" in status_list:
             running_list.append(run)
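Editor's note: taken together, these changes assume two JSON layouts, neither of which is shown in this diff, so the sketches below are illustrative (values are made up). Eval-queue request files carry a per-task "status" map read by get_evaluation_queue_df, and result files carry per-benchmark metrics read by read_evals.py under data["results"]["closed-ended-arabic"]:

# Request-file sketch (other task keys omitted for brevity):
request = {"status": {"note-generation": "FINISHED", "closed-ended-arabic": "RUNNING"}}

# Result-file sketch: benchmark keys match ClosedEndedArabicColumn.benchmark,
# and the inner dict holds the metric named by ClosedEndedArabicColumn.metric.
result = {"results": {"closed-ended-arabic": {"MMLU-Arabic": {"accuracy": 0.62}}}}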