tathagataraha committed on
Commit 34c150d · 1 Parent(s): d86ca68

[FIX] handled cases where one of the results is not present

Files changed (1)
  1. src/leaderboard/read_evals.py +17 -17
src/leaderboard/read_evals.py CHANGED
@@ -77,19 +77,19 @@ class EvalResult:
 
         # Extract results available in this file (some results are split in several files)
         harness_results = {}
-        for task in HarnessTasks:
-            task = task.value
-
-            # We average all scores of a given metric (not all metrics are present in all files)
-            try:
-                accs = np.array([v.get(task.metric, None) for k, v in data["results"]["closed-ended"].items() if task.benchmark == k])
-            except:
-                breakpoint()
-            if accs.size == 0 or any([acc is None for acc in accs]):
-                continue
-
-            mean_acc = np.mean(accs) # * 100.0
-            harness_results[task.benchmark] = mean_acc
+        if "closed-ended" in data["results"]:
+            for task in HarnessTasks:
+                task = task.value
+                # We average all scores of a given metric (not all metrics are present in all files)
+                try:
+                    accs = np.array([v.get(task.metric, None) for k, v in data["results"]["closed-ended"].items() if task.benchmark == k])
+                except:
+                    # breakpoint()
+                    accs = np.array([])
+                if accs.size == 0 or any([acc is None for acc in accs]):
+                    continue
+                mean_acc = np.mean(accs) # * 100.0
+                harness_results[task.benchmark] = mean_acc
         # types_results = {}
         # for clinical_type in ClinicalTypes:
         #     clinical_type = clinical_type.value
@@ -168,9 +168,9 @@ class EvalResult:
             AutoEvalColumn.date.name: self.date,
             "display_result" : self.display_result,
         }
-
-        for task in HarnessTasks:
-            data_dict[task.value.col_name] = self.dataset_results[task.value.benchmark]
+        if len(self.dataset_results) > 0:
+            for task in HarnessTasks:
+                data_dict[task.value.col_name] = self.dataset_results[task.value.benchmark]
 
         return data_dict
 
@@ -261,5 +261,5 @@ def get_raw_eval_results(results_path: str, requests_path: str, evaluation_metri
             results.append(v)
         except KeyError: # not all eval values present
             continue
-
+    # breakpoint()
     return results
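In effect, the commit replaces hard failures on missing result sections with empty-result fallbacks. Below is a minimal sketch of that guard pattern on a toy results payload; `parse_harness_results`, the `"medqa"` benchmark key, and the `"acc"` metric name are hypothetical stand-ins, since the real code iterates `HarnessTasks` and reads `task.metric` / `task.benchmark` from each task definition.

```python
import numpy as np

def parse_harness_results(data: dict, benchmark: str, metric: str) -> dict:
    """Hypothetical stand-in for the parsing loop in read_evals.py."""
    harness_results = {}
    # Guard: a results file may not contain a "closed-ended" section at all.
    if "closed-ended" in data.get("results", {}):
        try:
            accs = np.array([
                v.get(metric, None)
                for k, v in data["results"]["closed-ended"].items()
                if k == benchmark
            ])
        except Exception:
            # Fall back to an empty array instead of dropping into a debugger.
            accs = np.array([])
        # Skip the benchmark if no scores were found or any score is missing.
        if accs.size > 0 and not any(acc is None for acc in accs):
            harness_results[benchmark] = float(np.mean(accs))
    return harness_results

# A file lacking the "closed-ended" section now yields {} instead of raising:
print(parse_harness_results({"results": {}}, "medqa", "acc"))  # {}
print(parse_harness_results(
    {"results": {"closed-ended": {"medqa": {"acc": 0.61}}}}, "medqa", "acc"
))  # {'medqa': 0.61}
```

The `to_dict` change follows the same idea downstream: task columns are only populated when `dataset_results` is non-empty, so a file that produced no harness results no longer triggers a `KeyError` when the leaderboard row is built.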