Update src/leaderboard/read_evals.py
src/leaderboard/read_evals.py CHANGED
@@ -103,13 +103,6 @@ class EvalResult:
                 results[task.benchmark] = 0.0
                 continue
 
-            # New tasks have been added, we need to skip them if not exists
-            if task.benchmark in ["ko_winogrande", "ko_gsm8k", "ko_eq_bench", "ko_inst_follow", "kor_nat_cka", "kor_nat_sva", "ko_harmlessness", "ko_helpfulness"]:
-                accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
-                if accs.size == 0 or any([acc is None for acc in accs]):
-                    results[task.benchmark] = 0.0
-                    continue
-
             # We average all scores of a given metric (mostly for mmlu)
             accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
             if accs.size == 0 or any([acc is None for acc in accs]):
@@ -154,23 +147,6 @@ class EvalResult:
         # Skip the new tasks for now
         # TODO: safely remove this code when the task results are all added
         skip_avg_len = 0
-        if self.results['ko_winogrande'] == 0.0:
-            skip_avg_len += 1
-        if self.results['ko_gsm8k'] == 0.0:
-            skip_avg_len += 1
-        if self.results['ko_eq_bench'] == 0.0:
-            skip_avg_len += 1
-        if self.results['ko_inst_follow'] == 0.0:
-            skip_avg_len += 1
-        if self.results['kor_nat_cka'] == 0.0:
-            skip_avg_len += 1
-        if self.results['kor_nat_sva'] == 0.0:
-            skip_avg_len += 1
-        if self.results['ko_harmlessness'] == 0.0:
-            skip_avg_len += 1
-        if self.results['ko_helpfulness'] == 0.0:
-            skip_avg_len += 1
-
         average = sum([v for v in self.results.values() if v is not None]) / (len(Tasks) - skip_avg_len)
 
         data_dict = {
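For readers following the change, the aggregation path that remains after this commit can be summarized with a short standalone sketch. The Tasks enum and the parsed results JSON are stubbed out below as plain dictionaries; those stand-ins and the sample scores are assumptions for illustration only, not the repo's actual definitions.

import numpy as np

# Hypothetical stand-ins for the repo's Tasks enum and parsed results JSON.
TASKS = {"ko_arc": "acc_norm", "ko_mmlu": "acc", "ko_winogrande": "acc"}
data = {
    "results": {
        "ko_arc_challenge": {"acc_norm": 0.41},
        "ko_mmlu_abstract_algebra": {"acc": 0.30},
        "ko_mmlu_anatomy": {"acc": 0.50},
        # No ko_winogrande entries uploaded yet -> benchmark falls back to 0.0.
    },
}

results = {}
for benchmark, metric in TASKS.items():
    # Average every sub-task score whose key contains the benchmark name.
    accs = np.array([v.get(metric, None) for k, v in data["results"].items() if benchmark in k])
    if accs.size == 0 or any(acc is None for acc in accs):
        results[benchmark] = 0.0
        continue
    results[benchmark] = np.mean(accs)

# With the per-benchmark increments removed, skip_avg_len stays 0, so the
# denominator is always the full task count; benchmarks without results
# now pull the average toward zero instead of being excluded.
skip_avg_len = 0
average = sum(v for v in results.values() if v is not None) / (len(TASKS) - skip_avg_len)
print(results, round(average, 4))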