tathagataraha committed
Commit 20dad4a · 1 Parent(s): 2e9477a

[ADD] Closed ended arabic
Files changed (5)
  1. app.py +124 -1
  2. src/about.py +13 -0
  3. src/display/utils.py +13 -2
  4. src/leaderboard/read_evals.py +25 -2
  5. src/populate.py +10 -1
app.py CHANGED
@@ -31,12 +31,14 @@ from src.display.utils import (
     MEDICAL_SUMMARIZATION_BENCHMARK_COLS,
     ACI_BENCHMARK_COLS,
     SOAP_BENCHMARK_COLS,
+    CLOSED_ENDED_ARABIC_BENCHMARK_COLS,
     DATASET_COLS,
     OPEN_ENDED_COLS,
     MED_SAFETY_COLS,
     MEDICAL_SUMMARIZATION_COLS,
     ACI_COLS,
     SOAP_COLS,
+    CLOSED_ENDED_ARABIC_COLS,
     EVAL_COLS,
     EVAL_TYPES,
     NUMERIC_INTERVALS,
@@ -94,6 +96,10 @@ aci_leaderboard_df = aci_original_df.copy()
 _, soap_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, SOAP_COLS, SOAP_BENCHMARK_COLS, "score", "soap")
 soap_leaderboard_df = soap_original_df.copy()
 
+if PRIVATE_REPO:
+    _, closed_ended_arabic_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, CLOSED_ENDED_ARABIC_COLS, CLOSED_ENDED_ARABIC_BENCHMARK_COLS, "score", "closed_ended_arabic")
+    closed_ended_arabic_leaderboard_df = closed_ended_arabic_original_df.copy()
+
 # breakpoint()
 # # Token based results
 # _, token_based_datasets_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, DATASET_COLS, DATASET_BENCHMARK_COLS, "TokenBasedWithMacroAverage", "datasets")
@@ -130,6 +136,9 @@ def update_df(shown_columns, subset="datasets"):
     elif subset == "soap":
         leaderboard_table_df = soap_leaderboard_df.copy()
         hidden_leader_board_df = soap_original_df
+    elif PRIVATE_REPO and subset == "closed-ended-arabic":
+        leaderboard_table_df = closed_ended_arabic_leaderboard_df.copy()
+        hidden_leader_board_df = closed_ended_arabic_original_df
     # else:
     #     match evaluation_metric:
     #         case "Span Based":
@@ -941,7 +950,121 @@ with demo:
                 with gr.Accordion("Question generation", open=False):
                     system_prompt, user_prompt = render_generation_templates(task="ce", generation_type="question_generation")
                 with gr.Accordion("Cross Examination", open=False):
-                    system_prompt, user_prompt = render_generation_templates(task="ce", generation_type="cross_examination")
+                    system_prompt, user_prompt = render_generation_templates(task="ce", generation_type="cross_examination")
+        if PRIVATE_REPO:
+            with gr.TabItem("Dev Evals", elem_id="llm-benchmark-tab-table", id=100):
+                with gr.Tabs(elem_classes="tab-buttons2") as tabs:
+                    with gr.TabItem("🏅 Arabic Closed Ended Evaluation", elem_id="llm-benchmark-tab-table100", id=0):
+                        with gr.Row():
+                            with gr.Column():
+                                with gr.Row():
+                                    search_bar = gr.Textbox(
+                                        placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
+                                        show_label=False,
+                                        elem_id="search-bar",
+                                    )
+                                with gr.Row():
+                                    shown_columns = gr.CheckboxGroup(
+                                        choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.closed_ended_arabic_col)],
+                                        value=[
+                                            c.name
+                                            for c in fields(AutoEvalColumn)
+                                            if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.closed_ended_arabic_col)
+                                        ],
+                                        label="Select columns to show",
+                                        elem_id="column-select",
+                                        interactive=True,
+                                    )
+                                # with gr.Row():
+                                #     deleted_models_visibility = gr.Checkbox(
+                                #         value=False, label="Show gated/private/deleted models", interactive=True
+                                #     )
+                            with gr.Column(min_width=320):
+                                # with gr.Box(elem_id="box-filter"):
+                                filter_columns_type = gr.CheckboxGroup(
+                                    label="Model Types",
+                                    choices=[t.to_str() for t in ModelType],
+                                    value=[t.to_str() for t in ModelType],
+                                    interactive=True,
+                                    elem_id="filter-columns-type",
+                                )
+                                # filter_columns_architecture = gr.CheckboxGroup(
+                                #     label="Architecture Types",
+                                #     choices=[i.value.name for i in ModelArch],
+                                #     value=[i.value.name for i in ModelArch],
+                                #     interactive=True,
+                                #     elem_id="filter-columns-architecture",
+                                # )
+                                filter_domain_specific = gr.CheckboxGroup(
+                                    label="Domain Specificity",
+                                    choices=["🏥 Clinical models", "Generic models"],
+                                    value=["🏥 Clinical models", "Generic models"],
+                                    interactive=True,
+                                    elem_id="filter-columns-type",
+                                )
+                                filter_columns_size = gr.CheckboxGroup(
+                                    label="Model sizes (in billions of parameters)",
+                                    choices=list(NUMERIC_INTERVALS.keys()),
+                                    value=list(NUMERIC_INTERVALS.keys()),
+                                    interactive=True,
+                                    elem_id="filter-columns-size",
+                                )
+
+                        closed_ended_arabic_leaderboard_df, closed_ended_arabic_original_df = update_df(shown_columns.value, subset="closed-ended-arabic")
+
+                        leaderboard_table = gr.components.Dataframe(
+                            value=closed_ended_arabic_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
+                            headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
+                            datatype=TYPES,
+                            elem_id="leaderboard-table",
+                            interactive=False,
+                            visible=True,
+                        )
+
+                        # Dummy leaderboard for handling the case when the user uses backspace key
+                        hidden_leaderboard_table_for_search = gr.components.Dataframe(
+                            value=closed_ended_arabic_original_df[CLOSED_ENDED_ARABIC_COLS],
+                            headers=CLOSED_ENDED_ARABIC_COLS,
+                            datatype=TYPES,
+                            visible=False,
+                        )
+
+                        search_bar.submit(
+                            update_table,
+                            [
+                                hidden_leaderboard_table_for_search,
+                                shown_columns,
+                                search_bar,
+                                filter_columns_type,
+                                filter_domain_specific,
+                                filter_columns_size
+                                # filter_columns_architecture
+                            ],
+                            leaderboard_table,
+                        )
+                        for selector in [
+                            shown_columns,
+                            filter_columns_type,
+                            filter_domain_specific,
+                            # filter_columns_architecture,
+                            filter_columns_size,
+                            # deleted_models_visibility,
+                        ]:
+                            selector.change(
+                                update_table,
+                                [
+                                    hidden_leaderboard_table_for_search,
+                                    shown_columns,
+                                    search_bar,
+                                    filter_columns_type,
+                                    filter_domain_specific,
+                                    filter_columns_size
+                                    # filter_columns_architecture,
+                                ],
+                                leaderboard_table,
+                                queue=True,
+                            )
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=5):
             gr.Markdown(LLM_BENCHMARKS_TEXT_1, elem_classes="markdown-text")
             gr.HTML(FIVE_PILLAR_DIAGRAM)
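Every new code path above is gated on PRIVATE_REPO, imported from src.envs, which is not touched by this commit. A minimal sketch of that flag, assuming it is read from an environment variable (the variable name and default are illustrative, not taken from the repo):

# Hypothetical sketch of the flag in src/envs.py (not part of this diff).
import os

# Treat the PRIVATE_REPO environment variable as a boolean switch for dev-only evals.
PRIVATE_REPO = os.environ.get("PRIVATE_REPO", "false").lower() in ("1", "true", "yes")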
src/about.py CHANGED
@@ -96,6 +96,19 @@ class SOAPColumns(Enum):
     soap_column2 = SOAPColumn("fact", "score", "Consistency")
     # soap_column3 = SOAPColumn("brief", "score", "Conciseness")
 
+@dataclass
+class ClosedEndedArabicColumn:
+    benchmark: str
+    metric: str
+    col_name: str
+
+class ClosedEndedArabicColumns(Enum):
+    arabictask0 = ClosedEndedArabicColumn("MMLU-Arabic", "accuracy", "MMLU-Arabic")
+    arabictask2 = ClosedEndedArabicColumn("MedMCQA-Arabic", "accuracy", "MedMCQA-Arabic")
+    arabictask3 = ClosedEndedArabicColumn("MedQA-Arabic", "accuracy", "MedQA-Arabic")
+    arabictask5 = ClosedEndedArabicColumn("PubMedQA-Arabic", "accuracy", "PubMedQA-Arabic")
+
+
 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
 
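The new enum follows the same benchmark/metric/col_name pattern as the other column enums in src/about.py. An illustrative check (run from the Space root) of how it resolves to the four Arabic benchmarks and their shared accuracy metric, mirroring how src/display/utils.py consumes it:

# Illustrative only.
from src.about import ClosedEndedArabicColumns

print([t.value.col_name for t in ClosedEndedArabicColumns])
# ['MMLU-Arabic', 'MedMCQA-Arabic', 'MedQA-Arabic', 'PubMedQA-Arabic']
print({t.value.benchmark: t.value.metric for t in ClosedEndedArabicColumns})
# every benchmark is scored with 'accuracy'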
src/display/utils.py CHANGED
@@ -4,7 +4,8 @@ from enum import Enum
 import pandas as pd
 
 # changes to be made here
-from src.about import HarnessTasks, OpenEndedColumns, MedSafetyColumns, MedicalSummarizationColumns, ACIColumns, SOAPColumns
+from src.about import HarnessTasks, OpenEndedColumns, MedSafetyColumns, MedicalSummarizationColumns, ACIColumns, SOAPColumns, ClosedEndedArabicColumns
+from src.envs import PRIVATE_REPO
 import json
 import gradio as gr
 
@@ -30,6 +31,7 @@ class ColumnContent:
     medical_summarization_col: bool = False
     aci_col: bool = False
     soap_col: bool = False
+    closed_ended_arabic_col: bool = False
 
 
 ## Leaderboard columns
@@ -39,7 +41,7 @@ auto_eval_column_dict = []
 auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, True)])
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average", "number", True, False, dataset_task_col=True, invariant=False)])
+auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average", "number", True, False, dataset_task_col=True, closed_ended_arabic_col=True, invariant=False)])
 auto_eval_column_dict.append(["overall", ColumnContent, ColumnContent("Overall Score", "number", True, False, medical_summarization_col=True, aci_col=True, soap_col=True, invariant=False)])
 for task in HarnessTasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True, False, dataset_task_col=True, invariant=False)])
@@ -57,6 +59,9 @@ for column in ACIColumns:
     auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, aci_col=True, invariant=False)])
 for column in SOAPColumns:
     auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, soap_col=True, invariant=False)])
+# if PRIVATE_REPO:
+for column in ClosedEndedArabicColumns:
+    auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, closed_ended_arabic_col=True, invariant=False)])
 auto_eval_column_dict.append(["is_domain_specific", ColumnContent, ColumnContent("Is Domain Specific", "bool", False)])
 auto_eval_column_dict.append(["use_chat_template", ColumnContent, ColumnContent("Uses Chat Template", "bool", False)])
 auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
@@ -89,6 +94,8 @@ class EvalQueueColumn:  # Queue column
     med_safety_status = ColumnContent("med_safety_status", "str", True)
     medical_summarization_status = ColumnContent("medical_summarization_status", "str", True)
     note_generation_status = ColumnContent("note_generation_status", "str", True)
+    if PRIVATE_REPO:
+        closed_ended_arabic_status = ColumnContent("closed_ended_arabic_status", "str", True)
 
 ## All the model information that we might need
 @dataclass
@@ -214,6 +221,8 @@ MED_SAFETY_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.med_safety_col or c.invariant)]
 MEDICAL_SUMMARIZATION_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.medical_summarization_col or c.invariant)]
 ACI_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.aci_col or c.invariant)]
 SOAP_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.soap_col or c.invariant)]
+# if PRIVATE_REPO:
+CLOSED_ENDED_ARABIC_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.closed_ended_arabic_col or c.invariant)]
 # CROSS_EXAMINATION_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.cross_examination_col or c.invariant)]
 # DATASET_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.open_ended_col and not c.med_safety_col and not c.cross_examination_col]
 # OPEN_ENDED_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.dataset_task_col and not c.med_safety_col and not c.cross_examination_col]
@@ -234,6 +243,8 @@ MED_SAFETY_BENCHMARK_COLS = [t.value.col_name for t in MedSafetyColumns]
 MEDICAL_SUMMARIZATION_BENCHMARK_COLS = [t.value.col_name for t in MedicalSummarizationColumns]
 ACI_BENCHMARK_COLS = [t.value.col_name for t in ACIColumns]
 SOAP_BENCHMARK_COLS = [t.value.col_name for t in SOAPColumns]
+# if PRIVATE_REPO:
+CLOSED_ENDED_ARABIC_BENCHMARK_COLS = [t.value.col_name for t in ClosedEndedArabicColumns]
 # CROSS_EXAMINATION_BENCHMARK_COLS = [t.value.col_name for t in CrossExaminationTasks]
 
 NUMERIC_INTERVALS = {
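A quick way to sanity-check the new column lists (illustrative; the exact invariant columns depend on the rest of AutoEvalColumn, which is unchanged in this commit):

# Illustrative check, run from the Space root.
from src.display.utils import CLOSED_ENDED_ARABIC_BENCHMARK_COLS, CLOSED_ENDED_ARABIC_COLS

print(CLOSED_ENDED_ARABIC_BENCHMARK_COLS)
# Expected: ['MMLU-Arabic', 'MedMCQA-Arabic', 'MedQA-Arabic', 'PubMedQA-Arabic']
print(CLOSED_ENDED_ARABIC_COLS)
# Expected: the invariant columns plus 'Average' and the four Arabic task columns.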
src/leaderboard/read_evals.py CHANGED
@@ -9,8 +9,9 @@ import numpy as np
 
 from src.display.formatting import make_clickable_model
 # changes to be made here
-from src.display.utils import AutoEvalColumn, ModelType, ModelArch, Precision, HarnessTasks, WeightType, OpenEndedColumns, MedSafetyColumns, MedicalSummarizationColumns, ACIColumns, SOAPColumns
+from src.display.utils import AutoEvalColumn, ModelType, ModelArch, Precision, HarnessTasks, WeightType, OpenEndedColumns, MedSafetyColumns, MedicalSummarizationColumns, ACIColumns, SOAPColumns, ClosedEndedArabicColumns
 from src.submission.check_validity import is_model_on_hub
+from src.envs import PRIVATE_REPO
 
 
 @dataclass
@@ -29,6 +30,7 @@ class EvalResult:
     medical_summarization_results: dict
     aci_results: dict
     soap_results: dict
+    closed_ended_arabic_results: dict
     is_domain_specific: bool
     use_chat_template: bool
     # clinical_type_results:dict
@@ -162,6 +164,20 @@ class EvalResult:
                     continue
                 mean_acc = np.mean(accs) # * 100.0
                 soap_results[task.benchmark] = mean_acc
+        closed_ended_arabic_results = {}
+        if PRIVATE_REPO and "closed-ended-arabic" in data["results"]:
+            for task in ClosedEndedArabicColumns:
+                task = task.value
+                # We average all scores of a given metric (not all metrics are present in all files)
+                try:
+                    accs = np.array([v.get(task.metric, None) for k, v in data["results"]["closed-ended-arabic"].items() if task.benchmark == k])
+                except:
+                    # breakpoint()
+                    accs = np.array([])
+                if accs.size == 0 or any([acc is None for acc in accs]):
+                    continue
+                mean_acc = np.mean(accs) # * 100.0
+                closed_ended_arabic_results[task.benchmark] = mean_acc
         if open_ended_results == {} or med_safety_results == {} or medical_summarization_results == {} or aci_results == {} or soap_results == {}:
             open_ended_results = {}
             med_safety_results = {}
@@ -192,6 +208,7 @@ class EvalResult:
             medical_summarization_results=medical_summarization_results,
             aci_results=aci_results,
             soap_results=soap_results,
+            closed_ended_arabic_results=closed_ended_arabic_results,
             is_domain_specific=config.get("is_domain_specific", False), # Assuming a default value
             use_chat_template=config.get("use_chat_template", False), # Assuming a default value
             precision=precision,
@@ -294,7 +311,13 @@ class EvalResult:
            for task in SOAPColumns:
                data_dict[task.value.col_name] = self.soap_results[task.value.benchmark]
            return data_dict
-
+        if PRIVATE_REPO and subset == "closed_ended_arabic":
+            average = sum([v for v in self.closed_ended_arabic_results.values() if v is not None]) / len(ClosedEndedArabicColumns)
+            data_dict[AutoEvalColumn.average.name] = average
+            if len(self.closed_ended_arabic_results) > 0:
+                for task in ClosedEndedArabicColumns:
+                    data_dict[task.value.col_name] = self.closed_ended_arabic_results[task.value.benchmark]
+            return data_dict
 
 def get_request_file_for_model(requests_path, model_name, precision):
     """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
src/populate.py CHANGED
@@ -5,8 +5,9 @@ import pandas as pd
 
 from src.display.formatting import has_no_nan_values, make_clickable_model
 # changes to be made here
-from src.display.utils import AutoEvalColumn, EvalQueueColumn, OpenEndedColumns, MedSafetyColumns, MedicalSummarizationColumns, ACIColumns, SOAPColumns
+from src.display.utils import AutoEvalColumn, EvalQueueColumn, OpenEndedColumns, MedSafetyColumns, MedicalSummarizationColumns, ACIColumns, SOAPColumns, ClosedEndedArabicColumns
 from src.leaderboard.read_evals import get_raw_eval_results
+from src.envs import PRIVATE_REPO
 
 
 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list, evaluation_metric:str, subset:str) -> pd.DataFrame:
@@ -30,6 +31,8 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list, evaluation_metric:str, subset:str) -> pd.DataFrame:
         df = df.sort_values(by=[AutoEvalColumn.overall.name], ascending=False)
     elif subset == "soap":
         df = df.sort_values(by=[AutoEvalColumn.overall.name], ascending=False)
+    elif subset == "closed_ended_arabic":
+        df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
     cols = list(set(df.columns).intersection(set(cols)))
     df = df[cols].round(decimals=2)
     # filter out if any of the benchmarks have not been produced
@@ -54,6 +57,8 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
             data[EvalQueueColumn.med_safety_status.name] = data["status"]["med-safety"]
             data[EvalQueueColumn.medical_summarization_status.name] = data["status"]["medical-summarization"]
             data[EvalQueueColumn.note_generation_status.name] = data["status"]["note-generation"]
+            if PRIVATE_REPO:
+                data[EvalQueueColumn.closed_ended_arabic_status.name] = data["status"]["closed-ended-arabic"]
             all_evals.append(data)
         elif ".md" not in entry:
             # this is a folder
@@ -70,6 +75,8 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
                 data[EvalQueueColumn.med_safety_status.name] = data["status"]["med-safety"]
                 data[EvalQueueColumn.medical_summarization_status.name] = data["status"]["medical-summarization"]
                 data[EvalQueueColumn.note_generation_status.name] = data["status"]["note-generation"]
+                if PRIVATE_REPO:
+                    data[EvalQueueColumn.closed_ended_arabic_status.name] = data["status"]["closed-ended-arabic"]
                 all_evals.append(data)
     # breakpoint()
     pending_list = []
@@ -78,6 +85,8 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
     for run in all_evals:
         # changes to be made here
        status_list = [run["status"]["closed-ended"], run["status"]["open-ended"], run["status"]["med-safety"], run["status"]["medical-summarization"], run["status"]["note-generation"]]
+        if PRIVATE_REPO:
+            status_list.append(run["status"]["closed-ended-arabic"])
        # status_list = status_list
        if "RUNNING" in status_list:
            running_list.append(run)
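Correspondingly, when PRIVATE_REPO is set the queue code now expects a "closed-ended-arabic" entry in each request file's status dictionary. A sketch of such a request entry, with keys mirroring the lookups above and status values chosen only for illustration:

# Assumed request-file fragment; values are illustrative.
example_request = {
    "status": {
        "closed-ended": "FINISHED",
        "open-ended": "FINISHED",
        "med-safety": "FINISHED",
        "medical-summarization": "RUNNING",
        "note-generation": "PENDING",
        "closed-ended-arabic": "PENDING",  # only read when PRIVATE_REPO is enabled
    }
}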