Commit 5639a81
Parent(s): 03f7287
Add new tasks and make leaderboard work without new tasks evals
Changed files:
- src/display/utils.py +1 -1
- src/leaderboard/read_evals.py +18 -3
- src/populate.py +1 -1
- src/submission/check_validity.py +0 -1
- src/tools/plots.py +2 -0
- tasks_config/pt_config.yaml +42 -18
src/display/utils.py
CHANGED
@@ -65,7 +65,7 @@ auto_eval_column_dict.append(["eval_time", ColumnContent, ColumnContent("Evaluat
 auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("Model Name", "str", False, dummy=True)])
 if GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS:
     auto_eval_column_dict.append(["original_benchmark_average", ColumnContent, ColumnContent("🤗 Leaderboard Average", "number", False)])
-
+auto_eval_column_dict.append(["npm", ColumnContent, ColumnContent("NPM (Average) ⬆️", "number", False)])
 
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
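This change registers NPM as one more dynamically generated leaderboard column. The minimal sketch below shows how `make_dataclass` turns the `[attribute, type, default]` entries of `auto_eval_column_dict` into the `AutoEvalColumn` class the rest of the code reads from; the `ColumnContent` here is a simplified stand-in (the real dataclass carries extra flags such as `hidden` and `dummy`), so treat it as an illustration rather than the repo's exact definition.

```python
from dataclasses import dataclass, make_dataclass

# Simplified stand-in for the leaderboard's ColumnContent (assumption: frozen, three fields).
@dataclass(frozen=True)
class ColumnContent:
    name: str                  # header shown in the dataframe
    type: str                  # "str", "number", ...
    displayed_by_default: bool

# Each entry is [attribute_name, annotation, default instance], the shape make_dataclass accepts.
auto_eval_column_dict = [
    ["original_benchmark_average", ColumnContent, ColumnContent("🤗 Leaderboard Average", "number", False)],
    ["npm", ColumnContent, ColumnContent("NPM (Average) ⬆️", "number", False)],  # column added by this commit
]

AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

print(AutoEvalColumn.npm.name)   # NPM (Average) ⬆️
print(AutoEvalColumn.npm.type)   # number
```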
src/leaderboard/read_evals.py
CHANGED
@@ -3,6 +3,7 @@ import json
 import math
 import os
 from dataclasses import dataclass
+from typing import List
 
 import dateutil
 import numpy as np
@@ -155,7 +156,19 @@ class EvalResult:
 
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
-        average =
+        average = []
+        npm = []
+        for task in Tasks:
+            if task.value.benchmark not in self.results:
+                continue
+            res = self.results[task.value.benchmark]
+            if res is None or np.isnan(res) or not (isinstance(res, float) or isinstance(res, int)):
+                continue
+            average.append(res)
+            npm.append((res-task.value.baseline)*100.0 / (100.0-task.value.baseline))
+        average = sum(average)/len(average)
+        npm = sum(npm)/len(npm)
+
         data_dict = {
             "eval_name": self.eval_name,  # not a column, just a save name,
             AutoEvalColumn.precision.name: self.precision.value.name,
@@ -174,11 +187,13 @@ class EvalResult:
             AutoEvalColumn.merged.name: "merge" in self.tags if self.tags else False,
             AutoEvalColumn.moe.name: ("moe" in self.tags if self.tags else False) or "moe" in self.full_model.lower(),
             AutoEvalColumn.flagged.name: self.flagged,
-            AutoEvalColumn.eval_time.name: self.eval_time
+            AutoEvalColumn.eval_time.name: self.eval_time,
+            AutoEvalColumn.npm.name: npm
         }
 
         for task in Tasks:
-            data_dict[task.value.col_name] = self.results[task.value.benchmark]
+            if task.value.benchmark in self.results:
+                data_dict[task.value.col_name] = self.results[task.value.benchmark]
 
         if GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS:
             data_dict[AutoEvalColumn.original_benchmark_average.name] = self.original_benchmark_average
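The bulk of this change computes the new NPM column: each available benchmark score is rescaled against that task's `baseline` from the config, so the baseline maps to 0 and a perfect score maps to 100, and the rescaled values are averaged alongside the plain average. A standalone sketch of the arithmetic (the helper name is hypothetical; the real code inlines this in `to_dict`, and the baselines quoted come from this commit's pt_config.yaml):

```python
def normalized_score(res: float, baseline: float) -> float:
    # Rescale so the task baseline maps to 0 and a perfect score of 100 maps to 100.
    return (res - baseline) * 100.0 / (100.0 - baseline)

# tweetsentbr has baseline 32.8, so a raw f1_macro of 66.4 sits halfway between
# "baseline" and "perfect":
print(normalized_score(66.4, 32.8))   # ~50.0
# A score below its baseline becomes negative (portuguese_hate_speech baseline is 47.9):
print(normalized_score(40.0, 47.9))   # ~-15.2
```

Because NPM is averaged only over the benchmarks a model actually has results for, models evaluated before the new tasks existed still get a value instead of a NaN.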
src/populate.py
CHANGED
@@ -21,7 +21,7 @@ def get_leaderboard_df(results_path: str, requests_path: str, dynamic_path: str,
     df = df[cols].round(decimals=2)
 
     # filter out if any of the benchmarks have not been produced
-    df = df[has_no_nan_values(df, benchmark_cols)]
+    #df = df[has_no_nan_values(df, benchmark_cols)]
     return raw_data, df
 
 
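Commenting out this filter is what lets the leaderboard display models that have not yet been evaluated on the new tasks: previously any row with a missing benchmark score was dropped. A small pandas sketch of the effect, assuming `has_no_nan_values` behaves roughly like the `notna().all(axis=1)` mask below (its exact implementation is not shown in this diff) and using made-up scores purely for illustration:

```python
import numpy as np
import pandas as pd

benchmark_cols = ["HateBR Offensive", "PT Hate Speech", "tweetSentBR"]
df = pd.DataFrame(
    {
        "model": ["model-a", "model-b"],
        "HateBR Offensive": [71.3, np.nan],  # model-b has no result for the new task yet
        "PT Hate Speech": [65.0, 58.2],
        "tweetSentBR": [60.1, 55.4],
    }
)

mask = df[benchmark_cols].notna().all(axis=1)
print(df[mask])  # with the old filter: only model-a survives
print(df)        # with the filter commented out: both rows stay, missing scores show as NaN
```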
src/submission/check_validity.py
CHANGED
@@ -22,7 +22,6 @@ def check_model_card(repo_id: str) -> tuple[bool, str]:
     except huggingface_hub.utils.EntryNotFoundError:
         return False, "Please add a model card to your model to explain how you trained/fine-tuned it.", None
     except Exception as e:
-        traceback.print_exc()
         return False, f"Error while loading the model card. Exception: {str(e)}", None
 
     # Enforce license metadata
src/tools/plots.py
CHANGED
@@ -41,6 +41,8 @@ def create_scores_df(raw_data: list[EvalResult]) -> pd.DataFrame:
         if task.benchmark == "Average":
             current_score = np.mean(list(row["results"].values()))
         else:
+            if task.benchmark not in row["results"]:
+                continue
             current_score = row["results"][task.benchmark]
 
         if current_score > current_max:
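The added guard matters because evaluation entries recorded before this commit simply do not contain the new benchmarks in their `results` dict, so indexing them directly would raise a `KeyError` while building the score-history plots. A tiny illustration (the entry below is made up for the example):

```python
# Illustrative old entry: it has a score for portuguese_hate_speech but not tweetsentbr.
row = {"results": {"portuguese_hate_speech": 58.2}}

benchmark = "tweetsentbr"
if benchmark not in row["results"]:
    print(f"skipping {benchmark} for this entry")  # the new code path
else:
    current_score = row["results"][benchmark]      # would KeyError without the guard
```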
tasks_config/pt_config.yaml
CHANGED
@@ -1,4 +1,4 @@
-version: 1.
+version: 1.1.0
 config:
   REPO_ID: "eduagarcia/open_pt_llm_leaderboard"
   QUEUE_REPO: eduagarcia-temp/llm_pt_leaderboard_requests
@@ -160,25 +160,49 @@ tasks:
       entailment task between a question and its possible answers."
     link: https://huggingface.co/datasets/ruanchaves/faquad-nli
     sources: ["https://github.com/liafacom/faquad/"]
-
-    benchmark:
-    col_name:
+  hatebr_offensive:
+    benchmark: hatebr_offensive
+    col_name: HateBR Offensive
     task_list:
-      -
-      - sparrow_hate-2019-fortuna-por
-      - sparrow_sentiment-2016-mozetic-por
-      - sparrow_sentiment-2018-brum-por
+      - hatebr_offensive
     metric: f1_macro
     few_shot: 25
-
-    baseline: 29.5 #random baseline [3.3, 48.8, 33.1, 33.0]
+    baseline: 50.0
     human_baseline: null
     expert_human_baseline: null
-    description: "
-
-
-
-
-
-
-
+    description: "HateBR is the first large-scale expert annotated dataset of Brazilian Instagram comments for abusive language detection
+      on the web and social media. The HateBR was collected from Brazilian Instagram comments of politicians and manually annotated
+      by specialists. It is composed of 7,000 documents annotated with a binary classification (offensive
+      versus non-offensive comments)."
+    link: https://huggingface.co/datasets/eduagarcia/portuguese_benchmark
+    sources: ["https://github.com/franciellevargas/HateBR", "https://huggingface.co/datasets/ruanchaves/hatebr"]
+  portuguese_hate_speech:
+    benchmark: portuguese_hate_speech
+    col_name: PT Hate Speech
+    task_list:
+      - portuguese_hate_speech
+    metric: f1_macro
+    few_shot: 25
+    baseline: 47.9
+    human_baseline: null
+    expert_human_baseline: null
+    description: "Portuguese dataset for hate speech detection composed of 5,668 tweets with binary annotations (i.e. 'hate' vs. 'no-hate')"
+    link: https://huggingface.co/datasets/eduagarcia/portuguese_benchmark
+    sources: ["https://github.com/paulafortuna/Portuguese-Hate-Speech-Dataset", "https://huggingface.co/datasets/hate_speech_portuguese"]
+  tweetsentbr:
+    benchmark: tweetsentbr
+    col_name: tweetSentBR
+    task_list:
+      - tweetsentbr
+    metric: f1_macro
+    few_shot: 25
+    baseline: 32.8
+    human_baseline: null
+    expert_human_baseline: null
+    description: "TweetSentBR is a corpus of Tweets in Brazilian Portuguese.
+      It was labeled by several annotators following steps stablished on the literature for
+      improving reliability on the task of Sentiment Analysis. Each Tweet was annotated
+      in one of the three following classes: Positive, Negative, Neutral."
+    link: https://bitbucket.org/HBrum/tweetsentbr
+    sources: ["https://bitbucket.org/HBrum/tweetsentbr", "https://arxiv.org/abs/1712.08917"]
+
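The three new task entries carry exactly the fields read_evals.py relies on (`benchmark`, `col_name`, `metric`, `baseline`). A quick sanity-check sketch for the config, assuming it is read with PyYAML from the repository root and that task entries live under the top-level `tasks:` key shown in the hunk header:

```python
import yaml  # PyYAML

with open("tasks_config/pt_config.yaml") as f:
    config = yaml.safe_load(f)

# Print the key fields of the tasks added in this commit.
for key in ("hatebr_offensive", "portuguese_hate_speech", "tweetsentbr"):
    task = config["tasks"][key]
    print(key, "|", task["col_name"], "|", task["metric"], "| baseline:", task["baseline"])
```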