Commit · 34c150d
1 Parent(s): d86ca68

[FIX] handled cases where one of the results are not present

src/leaderboard/read_evals.py  (+17 -17)  CHANGED

@@ -77,19 +77,19 @@ class EvalResult:
 
         # Extract results available in this file (some results are split in several files)
         harness_results = {}
- [13 lines removed]
+        if "closed-ended" in data["results"]:
+            for task in HarnessTasks:
+                task = task.value
+                # We average all scores of a given metric (not all metrics are present in all files)
+                try:
+                    accs = np.array([v.get(task.metric, None) for k, v in data["results"]["closed-ended"].items() if task.benchmark == k])
+                except:
+                    # breakpoint()
+                    accs = np.array([])
+                if accs.size == 0 or any([acc is None for acc in accs]):
+                    continue
+                mean_acc = np.mean(accs) # * 100.0
+                harness_results[task.benchmark] = mean_acc
         # types_results = {}
         # for clinical_type in ClinicalTypes:
         #     clinical_type = clinical_type.value
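
This hunk wraps the harness-score extraction in a presence check plus a try/except, so result files that are missing the "closed-ended" block, a benchmark, or a metric are skipped instead of raising. Below is a minimal, self-contained sketch of the same guarded-averaging pattern; Task, DemoTasks, and the sample payloads are illustrative stand-ins for HarnessTasks and the real result files (not part of this repository), and the commit's try/except is omitted here for brevity.

from dataclasses import dataclass
from enum import Enum

import numpy as np


@dataclass(frozen=True)
class Task:
    benchmark: str
    metric: str


class DemoTasks(Enum):  # hypothetical stand-in for HarnessTasks
    medqa = Task("medqa", "acc")
    pubmedqa = Task("pubmedqa", "acc")


def extract_harness_results(data: dict) -> dict:
    # Average each task's metric, skipping anything absent from this file.
    harness_results = {}
    if "closed-ended" in data["results"]:
        closed = data["results"]["closed-ended"]
        for task in DemoTasks:
            task = task.value
            accs = np.array([v.get(task.metric) for k, v in closed.items() if task.benchmark == k])
            if accs.size == 0 or any(acc is None for acc in accs):
                continue  # this benchmark or metric is not present in this file
            harness_results[task.benchmark] = float(np.mean(accs))
    return harness_results


# A file that only contains one of the two benchmarks:
print(extract_harness_results({"results": {"closed-ended": {"medqa": {"acc": 0.61}}}}))
# -> {'medqa': 0.61}

# A file with no closed-ended block at all no longer raises a KeyError:
print(extract_harness_results({"results": {}}))
# -> {}
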
@@ -168,9 +168,9 @@ class EvalResult:
             AutoEvalColumn.date.name: self.date,
             "display_result" : self.display_result,
         }
- [3 lines removed]
+        if len(self.dataset_results) > 0:
+            for task in HarnessTasks:
+                data_dict[task.value.col_name] = self.dataset_results[task.value.benchmark]
 
         return data_dict
 
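
This hunk only fills the per-benchmark columns when the entry actually has dataset results, so a model without evaluations still produces a row with the base columns. A rough, self-contained sketch of that behaviour follows; Task, DemoTasks, and to_row are hypothetical stand-ins for the repository's HarnessTasks and EvalResult.to_dict.

from dataclasses import dataclass
from enum import Enum


@dataclass(frozen=True)
class Task:
    benchmark: str
    col_name: str


class DemoTasks(Enum):  # hypothetical stand-in for HarnessTasks
    medqa = Task("medqa", "MedQA")
    pubmedqa = Task("pubmedqa", "PubMedQA")


def to_row(base_columns: dict, dataset_results: dict) -> dict:
    # Start from the base columns and add per-benchmark scores only when any exist.
    row = dict(base_columns)
    if len(dataset_results) > 0:
        for task in DemoTasks:
            row[task.value.col_name] = dataset_results[task.value.benchmark]
    return row


# An entry that was never scored keeps only its base columns instead of failing:
print(to_row({"model": "demo"}, {}))
# -> {'model': 'demo'}

# A scored entry gains one column per benchmark:
print(to_row({"model": "demo"}, {"medqa": 0.61, "pubmedqa": 0.74}))
# -> {'model': 'demo', 'MedQA': 0.61, 'PubMedQA': 0.74}

As in the committed code, a non-empty dataset_results is assumed to contain every benchmark; an entry with only some of them would still raise a KeyError on the missing one.
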
@@ -261,5 +261,5 @@ def get_raw_eval_results(results_path: str, requests_path: str, evaluation_metri
             results.append(v)
         except KeyError: # not all eval values present
             continue
- [1 line removed]
+    # breakpoint()
     return results