Commit 34c150d · Parent(s): d86ca68

[FIX] handled cases where one of the results are not present

src/leaderboard/read_evals.py CHANGED (+17 -17)
@@ -77,19 +77,19 @@ class EvalResult:
 
         # Extract results available in this file (some results are split in several files)
         harness_results = {}
-
-task
-
-
-
-
-
-
-
-
-
-
-
+        if "closed-ended" in data["results"]:
+            for task in HarnessTasks:
+                task = task.value
+                # We average all scores of a given metric (not all metrics are present in all files)
+                try:
+                    accs = np.array([v.get(task.metric, None) for k, v in data["results"]["closed-ended"].items() if task.benchmark == k])
+                except:
+                    # breakpoint()
+                    accs = np.array([])
+                if accs.size == 0 or any([acc is None for acc in accs]):
+                    continue
+                mean_acc = np.mean(accs) # * 100.0
+                harness_results[task.benchmark] = mean_acc
         # types_results = {}
         # for clinical_type in ClinicalTypes:
         #     clinical_type = clinical_type.value
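For orientation, here is a minimal, self-contained sketch of what this hunk now does: skip harness extraction entirely when a results file has no "closed-ended" section, and skip any individual task whose metric values are missing instead of crashing. The Task and HarnessTasks definitions and the sample data dicts are illustrative assumptions, not the repo's actual classes, and the sketch uses except Exception where the commit keeps a bare except.

from dataclasses import dataclass
from enum import Enum

import numpy as np

@dataclass(frozen=True)
class Task:                # assumed shape of the repo's task values
    benchmark: str         # key under data["results"]["closed-ended"]
    metric: str            # metric name inside that benchmark's entry

class HarnessTasks(Enum):  # hypothetical stand-in members
    medqa = Task("medqa", "acc")
    pubmedqa = Task("pubmedqa", "acc")

def extract_harness_results(data: dict) -> dict:
    harness_results = {}
    if "closed-ended" in data["results"]:  # the whole section may be absent
        for task in HarnessTasks:
            task = task.value
            try:
                accs = np.array([v.get(task.metric, None)
                                 for k, v in data["results"]["closed-ended"].items()
                                 if task.benchmark == k])
            except Exception:  # e.g. an entry is not a dict
                accs = np.array([])
            if accs.size == 0 or any(acc is None for acc in accs):
                continue  # this task's result is missing or incomplete
            harness_results[task.benchmark] = np.mean(accs)
    return harness_results

print(extract_harness_results({"results": {}}))                # {}: no closed-ended section
print(extract_harness_results(
    {"results": {"closed-ended": {"medqa": {"acc": 0.61}}}}))  # medqa mean only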
@@ -168,9 +168,9 @@ class EvalResult:
             AutoEvalColumn.date.name: self.date,
             "display_result" : self.display_result,
         }
-
-
-
+        if len(self.dataset_results) > 0:
+            for task in HarnessTasks:
+                data_dict[task.value.col_name] = self.dataset_results[task.value.benchmark]
 
         return data_dict
 
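This guard pairs with the next hunk: the per-task columns are added only when dataset results exist at all, but a single missing benchmark still raises KeyError on the dict lookup, and it is exactly that KeyError which get_raw_eval_results catches to drop incomplete models. A hedged sketch of the tail of to_dict(), with HarnessTasks reduced to hypothetical (benchmark, col_name) pairs:

def to_dict_tail(data_dict: dict, dataset_results: dict) -> dict:
    if len(dataset_results) > 0:  # mirrors the guard added above
        # stand-in for: for task in HarnessTasks, using task.value.benchmark / task.value.col_name
        for benchmark, col_name in [("medqa", "MedQA"), ("pubmedqa", "PubMedQA")]:
            data_dict[col_name] = dataset_results[benchmark]  # KeyError if a benchmark is absent
    return data_dict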
@@ -261,5 +261,5 @@ def get_raw_eval_results(results_path: str, requests_path: str, evaluation_metri
             results.append(v)
         except KeyError: # not all eval values present
             continue
-
+    # breakpoint()
     return results
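The added line is just a commented-out debugging hook; the real filtering is the except KeyError / continue above. For context, the surrounding loop presumably looks something like the sketch below (the v.to_dict() completeness check and the loop structure are inferred, not shown in this hunk):

def collect_complete(eval_results: dict) -> list:
    results = []
    for v in eval_results.values():
        try:
            v.to_dict()  # assumed completeness check: raises KeyError when a column is missing
            results.append(v)
        except KeyError:  # not all eval values present
            continue
    # breakpoint()
    return results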