SCORE leaderboard v1
- app.py +15 -15
- src/about.py +45 -5
- src/envs.py +2 -1
- src/leaderboard/read_evals.py +18 -14
- src/populate.py +1 -0
- src/submission/check_validity.py +1 -0
app.py
CHANGED
@@ -33,20 +33,20 @@ def restart_space():
     API.restart_space(repo_id=REPO_ID)
 
 ### Space initialisation
-try:
-    print(EVAL_REQUESTS_PATH)
-    snapshot_download(
-        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
-except Exception:
-    restart_space()
-try:
-    print(EVAL_RESULTS_PATH)
-    snapshot_download(
-        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
-except Exception:
-    restart_space()
+# try:
+#     print(EVAL_REQUESTS_PATH)
+#     snapshot_download(
+#         repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+#     )
+# except Exception:
+#     restart_space()
+# try:
+#     print(EVAL_RESULTS_PATH)
+#     snapshot_download(
+#         repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+#     )
+# except Exception:
+#     restart_space()
 
 
 LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
@@ -199,6 +199,6 @@ with demo:
     )
 
 scheduler = BackgroundScheduler()
-scheduler.add_job(restart_space, "interval", seconds=1800)
+# scheduler.add_job(restart_space, "interval", seconds=1800)
 scheduler.start()
 demo.queue(default_concurrency_limit=40).launch()
src/about.py
CHANGED
@@ -12,20 +12,60 @@ class Task:
 # ---------------------------------------------------
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("anli_r1", "acc", "ANLI")
-    task1 = Task("logiqa", "acc_norm", "LogiQA")
+    # task0 = Task("anli_r1", "acc", "ANLI")
+    # task1 = Task("logiqa", "acc_norm", "LogiQA")
+
+    # task0 = Task('agieval', 'accuracy', 'AGIEval_acc')
+    # task1 = Task('agieval', 'consistency', 'AGIEval_CR')
+    # task2 = Task('mmlu_pro', 'accuracy', 'MMLU-Pro_acc')
+    # task3 = Task('mmlu_pro', 'consistency', 'MMLU-Pro_CR')
+    # task4 = Task('math', 'accuracy', 'Math_acc')
+    # task5 = Task('math', 'consistency', 'Math_CR')
+
+    task0 = Task('agieval-acc', 'accuracy', 'AGIEval Mean (Min, Max)')
+    task1 = Task('agieval-cr', 'consistency', 'AGIEval CR')
+    task2 = Task('mmlu_pro-acc', 'accuracy', 'MMLU-Pro Mean (Min, Max)')
+    task3 = Task('mmlu_pro-cr', 'consistency', 'MMLU-Pro CR')
+    task4 = Task('math-acc', 'accuracy', 'Math Mean (Min, Max)')
+    task5 = Task('math-cr', 'consistency', 'Math CR')
+
 
 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
 
 
-
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">
+TITLE = """<h1 align="center" id="space-title">SCORE Leaderboard</h1>"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-
+We introduce <b>SCORE</b> - an open and holistic evaluation framework for LLMs centered on robustness, i.e., the ability to produce consistent responses when the input is rephrased
+or presented in a slightly different way. Prediction consistency is particularly crucial for factual questions where an objective answer exists. Note that the predictions are only
+expected to be equivalent, not necessarily correct. Models are evaluated multiple times in equivalent setups, and the accuracy range is reported along with the prediction
+consistency rate. In contrast to the single accuracy metric (often derived from an optimized setup) reported during model releases, this better simulates human
+interaction setups and provides a better estimate of real-world performance. Furthermore, all models are evaluated using the same setup, which makes direct model comparison possible.
+
+<h1 align="center" id="space-title">Tasks</h1>
+<b>Prompt Robustness</b> - Models are evaluated on ten different prompts. For multiple choice question (MCQ) datasets, prompts ask the model to choose the right option
+letter. For MATH, prompts ask the model to solve the problem. The prompt set is diverse enough to cover the various content and formatting styles a model may encounter
+in real life; the prompts are not adversarial or tuned in any way. Prompts are semantically close but vary in instruction wording and level of response detail. Prompts end with final
+answer formatting instructions. We include both CoT and non-CoT prompts and vary the placement of the question in the prompt to be either in the beginning, in the middle,
+or at the end of the prompt.
+
+<b>Non-Greedy Inference</b> - We study the effect of the random seed during non-greedy inference. For factual questions, the model's underlying distribution should be sharp enough
+to be independent of the random seed used for next-token sampling. There is an inherent randomness in the answer generation process, which may affect the "path" the model takes to arrive at an answer.
+
+<b>Choice Order Robustness</b> - We test models against changes in the order of choices for MCQ datasets. We swap the order of choices and ensure the correct answer
+is always the same option (all correct answers are A, or all are B, etc.). Changing the order of choices does not change the input's semantics, and it is expected that models
+will be robust against such a minimal change.
+
+<h1 align="center" id="space-title">Datasets</h1>
+<b>MMLU Pro</b> - text? <br>
+<b>AGIEval</b> - text? <br>
+<b>MATH</b> - text <br>
+
+<h1 align="center" id="space-title">Metrics</h1>
+
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
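Note on the metrics named above: the Metrics section of INTRODUCTION_TEXT is still a placeholder, so here is a minimal sketch of the aggregation the task names imply - an accuracy summary formatted as "mean, (min, max)" over equivalent runs plus a consistency rate (CR). The CR definition used here (fraction of questions on which all runs agree) and both helper names are assumptions for illustration, not taken from this commit.

import numpy as np

def accuracy_summary(run_accuracies: list[float]) -> str:
    """Format per-run accuracies as 'mean, (min, max)', mirroring the display string built in read_evals.py."""
    accs = np.asarray(run_accuracies)
    return f"{accs.mean():.1f}, ({accs.min():.1f}, {accs.max():.1f})"

def consistency_rate(predictions_per_run: list[list[str]]) -> float:
    """Assumed CR: fraction of questions for which every equivalent run produced the same answer."""
    per_question = list(zip(*predictions_per_run))  # transpose: one tuple of answers per question
    agreements = [len(set(answers)) == 1 for answers in per_question]
    return sum(agreements) / len(agreements)

# Example: three prompt variants answering the same four questions
runs = [["A", "B", "C", "D"],
        ["A", "B", "C", "A"],
        ["A", "B", "D", "D"]]
print(accuracy_summary([61.2, 58.9, 60.4]))  # -> "60.2, (58.9, 61.2)"
print(consistency_rate(runs))                # -> 0.5 (only the first two questions are answered consistently)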
src/envs.py
CHANGED
@@ -6,7 +6,8 @@ from huggingface_hub import HfApi
 # ----------------------------------
 TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
 
-OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format!
+# OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format!
+OWNER = "nvidia" # Change to your org - don't forget to create a results and request dataset, with the correct format!
 # ----------------------------------
 
 REPO_ID = f"{OWNER}/leaderboard"
src/leaderboard/read_evals.py
CHANGED
@@ -18,14 +18,14 @@ class EvalResult:
     """
     eval_name: str # org_model_precision (uid)
     full_model: str # org/model (path on hub)
-    org: str
+    org: str
     model: str
     revision: str # commit hash, "" if main
     results: dict
     precision: Precision = Precision.Unknown
     model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
     weight_type: WeightType = WeightType.Original # Original or Adapter
-    architecture: str = "Unknown"
+    architecture: str = "Unknown"
     license: str = "?"
     likes: int = 0
     num_params: int = 0
@@ -35,6 +35,7 @@ class EvalResult:
     @classmethod
     def init_from_json_file(self, json_filepath):
         """Inits the result from the specific model result file"""
+
         with open(json_filepath) as fp:
             data = json.load(fp)
 
@@ -58,7 +59,7 @@ class EvalResult:
         full_model = "/".join(org_and_model)
 
         still_on_hub, _, model_config = is_model_on_hub(
-            full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
+            full_model, config.get("model_sha", "main"), token=os.getenv('HF_TOKEN', None), trust_remote_code=True, test_tokenizer=False
         )
         architecture = "?"
         if model_config is not None:
@@ -72,20 +73,23 @@ class EvalResult:
             task = task.value
 
             # We average all scores of a given metric (not all metrics are present in all files)
-            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
-            if accs.size == 0 or any([acc is None for acc in accs]):
-                continue
-
-            mean_acc = np.mean(accs) * 100.0
-            results[task.benchmark] = mean_acc
-
+            # accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
+            # if accs.size == 0 or any([acc is None for acc in accs]):
+            #     continue
+
+            # mean_acc = np.mean(accs) * 100.0
+            # results[task.benchmark] = [mean_acc, 100]
+            metric = data["results"][task.benchmark.split('-')[0]].get(task.metric, None)
+            if task.benchmark.endswith("acc"):
+                metric = f"{metric[0]}, ({metric[1]}, {metric[2]})"
+            results[task.benchmark] = metric
         return self(
             eval_name=result_key,
             full_model=full_model,
             org=org,
             model=model,
             results=results,
-            precision=precision,
+            precision=precision,
             revision= config.get("model_sha", ""),
             still_on_hub=still_on_hub,
             architecture=architecture
@@ -109,7 +113,7 @@ class EvalResult:
 
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
-        average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
+        # average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
         data_dict = {
             "eval_name": self.eval_name, # not a column, just a save name,
             AutoEvalColumn.precision.name: self.precision.value.name,
@@ -119,7 +123,8 @@ class EvalResult:
             AutoEvalColumn.architecture.name: self.architecture,
             AutoEvalColumn.model.name: make_clickable_model(self.full_model),
             AutoEvalColumn.revision.name: self.revision,
-            AutoEvalColumn.average.name: average,
+            AutoEvalColumn.average.name: 0,
+            # AutoEvalColumn.average.name: average,
             AutoEvalColumn.license.name: self.license,
             AutoEvalColumn.likes.name: self.likes,
             AutoEvalColumn.params.name: self.num_params,
@@ -184,7 +189,6 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
             eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
         else:
            eval_results[eval_name] = eval_result
-
    results = []
    for v in eval_results.values():
        try:
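For readability, here is a hypothetical example of the per-model results JSON that the updated init_from_json_file parsing above appears to expect: each benchmark key holds an "accuracy" triple (the mean/min/max ordering is inferred from the "Mean (Min, Max)" column names in src/about.py) and a scalar "consistency". The concrete numbers and the exact file shape are illustrative assumptions, not taken from real result files.

data = {
    "config": {"model_name": "org/model", "model_sha": "main"},
    "results": {
        "agieval":  {"accuracy": [55.3, 52.1, 57.8], "consistency": 0.81},
        "mmlu_pro": {"accuracy": [48.7, 46.2, 50.9], "consistency": 0.77},
        "math":     {"accuracy": [31.4, 28.8, 33.5], "consistency": 0.64},
    },
}

# Same lookup the new code performs for Task('agieval-acc', 'accuracy', ...):
benchmark, metric_key = "agieval-acc", "accuracy"
metric = data["results"][benchmark.split("-")[0]].get(metric_key, None)
if benchmark.endswith("acc"):
    metric = f"{metric[0]}, ({metric[1]}, {metric[2]})"
print(metric)  # -> "55.3, (52.1, 57.8)"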
src/populate.py
CHANGED
@@ -19,6 +19,7 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
 
     # filter out if any of the benchmarks have not been produced
     df = df[has_no_nan_values(df, benchmark_cols)]
+    print(df)
     return df
 
 
src/submission/check_validity.py
CHANGED
@@ -33,6 +33,7 @@ def check_model_card(repo_id: str) -> tuple[bool, str]:
 
 def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str]:
     """Checks if the model model_name is on the hub, and whether it (and its tokenizer) can be loaded with AutoClasses."""
+    config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
     try:
         config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
         if test_tokenizer:
|