Update src/leaderboard/read_evals.py
src/leaderboard/read_evals.py
CHANGED
@@ -103,13 +103,6 @@ class EvalResult:
                     results[task.benchmark] = 0.0
                     continue
 
-            # New tasks have been added, we need to skip them if not exists
-            if task.benchmark in ["ko_winogrande", "ko_gsm8k", "ko_eq_bench", "ko_inst_follow", "kor_nat_cka", "kor_nat_sva", "ko_harmlessness", "ko_helpfulness"]:
-                accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
-                if accs.size == 0 or any([acc is None for acc in accs]):
-                    results[task.benchmark] = 0.0
-                    continue
-
             # We average all scores of a given metric (mostly for mmlu)
             accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
             if accs.size == 0 or any([acc is None for acc in accs]):
@@ -154,23 +147,6 @@ class EvalResult:
         # Skip the new tasks for now
         # TODO: safely remove this code when the task results are all added
         skip_avg_len = 0
-        if self.results['ko_winogrande'] == 0.0:
-            skip_avg_len += 1
-        if self.results['ko_gsm8k'] == 0.0:
-            skip_avg_len += 1
-        if self.results['ko_eq_bench'] == 0.0:
-            skip_avg_len += 1
-        if self.results['ko_inst_follow'] == 0.0:
-            skip_avg_len += 1
-        if self.results['kor_nat_cka'] == 0.0:
-            skip_avg_len += 1
-        if self.results['kor_nat_sva'] == 0.0:
-            skip_avg_len += 1
-        if self.results['ko_harmlessness'] == 0.0:
-            skip_avg_len += 1
-        if self.results['ko_helpfulness'] == 0.0:
-            skip_avg_len += 1
-
         average = sum([v for v in self.results.values() if v is not None]) / (len(Tasks) - skip_avg_len)
 
         data_dict = {
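With this change, the eight newly added Korean benchmarks (ko_winogrande, ko_gsm8k, ko_eq_bench, ko_inst_follow, kor_nat_cka, kor_nat_sva, ko_harmlessness, ko_helpfulness) are no longer special-cased: every task goes through the same lookup that averages all scores of task.metric across matching keys in data["results"]. A minimal standalone sketch of that retained pattern, using a hypothetical data payload and a Task stand-in (the real file takes these from the eval JSON and the Tasks enum):

import numpy as np

# Hypothetical eval payload; the real data is loaded from the results JSON.
data = {"results": {"harness|ko_gsm8k|5": {"acc": 0.41}}}

class Task:              # stand-in for one entry of the Tasks enum
    benchmark = "ko_gsm8k"
    metric = "acc"

task = Task()
results = {}

# Collect every value of task.metric whose result key contains the benchmark name
# (the source comment notes this averaging mostly matters for mmlu-style tasks).
accs = np.array([v.get(task.metric, None)
                 for k, v in data["results"].items()
                 if task.benchmark in k])

if accs.size == 0 or any(acc is None for acc in accs):
    # Missing or incomplete results fall back to 0.0, as in the surrounding code.
    results[task.benchmark] = 0.0
else:
    # Shown here as a plain mean; the full file may also rescale (e.g. to percent).
    results[task.benchmark] = float(np.mean(accs))

print(results)  # {'ko_gsm8k': 0.41}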
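On the averaging side, the removed skip_avg_len increments were the counterpart of the zero-fill above: a new benchmark that scored 0.0 was dropped from the denominator so it would not pull the average down. With the increments gone, skip_avg_len stays at 0 and the average divides by len(Tasks). A small example with made-up scores, assuming a missing benchmark still ends up as 0.0 in self.results:

# Hypothetical per-task scores for illustration only; the real values live in self.results.
results = {
    "ko_arc": 60.0,
    "ko_hellaswag": 55.0,
    "ko_gsm8k": 0.0,   # newly added benchmark with no result yet
}
num_tasks = len(results)        # stands in for len(Tasks)
new_benchmarks = ("ko_gsm8k",)  # subset of the eight new tasks, for the example

# Before this commit: zero-scored new tasks were excluded from the denominator.
skip_avg_len = sum(1 for name in new_benchmarks if results[name] == 0.0)
avg_before = sum(v for v in results.values() if v is not None) / (num_tasks - skip_avg_len)

# After this commit: skip_avg_len is left at 0, so every task counts.
avg_after = sum(v for v in results.values() if v is not None) / num_tasks

print(round(avg_before, 2))  # 57.5
print(round(avg_after, 2))   # 38.33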