Grigor-N committed
Commit 28cf97d · 1 Parent(s): 3e5a104

SCORE leaderboard v1

app.py CHANGED
@@ -33,20 +33,20 @@ def restart_space():
     API.restart_space(repo_id=REPO_ID)
 
 ### Space initialisation
-try:
-    print(EVAL_REQUESTS_PATH)
-    snapshot_download(
-        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
-except Exception:
-    restart_space()
-try:
-    print(EVAL_RESULTS_PATH)
-    snapshot_download(
-        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
-except Exception:
-    restart_space()
+# try:
+#     print(EVAL_REQUESTS_PATH)
+#     snapshot_download(
+#         repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+#     )
+# except Exception:
+#     restart_space()
+# try:
+#     print(EVAL_RESULTS_PATH)
+#     snapshot_download(
+#         repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+#     )
+# except Exception:
+#     restart_space()
 
 
 LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
@@ -199,6 +199,6 @@ with demo:
         )
 
 scheduler = BackgroundScheduler()
-scheduler.add_job(restart_space, "interval", seconds=1800)
+# scheduler.add_job(restart_space, "interval", seconds=1800)
 scheduler.start()
 demo.queue(default_concurrency_limit=40).launch()
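
For context, the initialisation block commented out in this hunk is the Space's startup sync of the requests/results datasets. A minimal local sketch of the same huggingface_hub call, where the repo id and target directory are placeholders rather than values from this Space:

from huggingface_hub import snapshot_download

try:
    # Pull an evaluation-results dataset into a local folder.
    # repo_id and local_dir below are placeholders for illustration only.
    snapshot_download(
        repo_id="my-org/results",
        local_dir="./eval-results",
        repo_type="dataset",
        etag_timeout=30,
        token=None,  # or a read token for a private dataset
    )
except Exception as err:
    # The Space falls back to restart_space() here; locally we just report.
    print(f"snapshot_download failed: {err}")
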
src/about.py CHANGED
@@ -12,20 +12,60 @@ class Task:
 # ---------------------------------------------------
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("anli_r1", "acc", "ANLI")
-    task1 = Task("logiqa", "acc_norm", "LogiQA")
+    # task0 = Task("anli_r1", "acc", "ANLI")
+    # task1 = Task("logiqa", "acc_norm", "LogiQA")
+
+    # task0 = Task('agieval', 'accuracy', 'AGIEval_acc')
+    # task1 = Task('agieval', 'consistency', 'AGIEval_CR')
+    # task2 = Task('mmlu_pro', 'accuracy', 'MMLU-Pro_acc')
+    # task3 = Task('mmlu_pro', 'consistency', 'MMLU-Pro_CR')
+    # task4 = Task('math', 'accuracy', 'Math_acc')
+    # task5 = Task('math', 'consistency', 'Math_CR')
+
+    task0 = Task('agieval-acc', 'accuracy', 'AGIEval Mean (Min, Max)')
+    task1 = Task('agieval-cr', 'consistency', 'AGIEval CR')
+    task2 = Task('mmlu_pro-acc', 'accuracy', 'MMLU-Pro Mean (Min, Max)')
+    task3 = Task('mmlu_pro-cr', 'consistency', 'MMLU-Pro CR')
+    task4 = Task('math-acc', 'accuracy', 'Math Mean (Min, Max)')
+    task5 = Task('math-cr', 'consistency', 'Math CR')
+
 
 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
 
 
-
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""
+TITLE = """<h1 align="center" id="space-title">SCORE Leaderboard</h1>"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-Intro text
+We introduce <b>SCORE</b> - an open and holistic evaluation framework for LLMs centered on robustness, i.e. the ability to produce consistent responses when the input is rephrased
+or presented in a slightly different way. Prediction consistency is particularly crucial for factual questions where an objective answer exists. Note that predictions are only
+expected to be equivalent, not necessarily correct. Models are evaluated multiple times in equivalent setups, and the accuracy range is reported along with the prediction
+consistency rate. In contrast to a single accuracy metric (often derived from an optimized setup) reported during model releases, this better simulates human
+interaction setups and provides a better estimate of real-world performance. Furthermore, all models are evaluated using the same setup, which makes model comparison possible.
+
+<h1 align="center" id="space-title">Tasks</h1>
+<b>Prompt Robustness</b> - Models are evaluated on ten different prompts. For multiple-choice question (MCQ) datasets, prompts ask the model to choose the right option
+letter. For MATH, prompts ask the model to solve the problem. The prompt set is diverse enough to cover various content and formatting styles that the model may encounter
+in real life; the prompts are not adversarial or tuned in any way. Prompts are semantically close but vary by instruction and level of response detail. Prompts end with final
+answer formatting instructions. We include both CoT and non-CoT prompts and vary the placement of the question to be either at the beginning, in the middle,
+or at the end of the prompt.
+
+<b>Non-Greedy Inference</b> - We study the effect of the random seed during non-greedy inference. For factual questions, the model's underlying distribution should be sharp enough
+to be independent of the random seed used for next-token sampling. There is an inherent randomness in the answer generation process, which may affect the "path" the model takes to arrive at an answer.
+
+<b>Choice Order Robustness</b> - We test models against changes in the order of choices for MCQ datasets. We swap the order of choices and ensure the correct answer
+is always the same option (all correct answers are A, or all are B, etc.). Changing the order of choices does not change the input's semantics, and it is expected that models
+will be robust against such a minimal change.
+
+<h1 align="center" id="space-title">Datasets</h1>
+<b>MMLU Pro</b> - text? <br>
+<b>AGIEval</b> - text? <br>
+<b>MATH</b> - text <br>
+
+<h1 align="center" id="space-title">Metrics</h1>
+
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
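
The accuracy range and prediction consistency rate (CR) described in the INTRODUCTION_TEXT above can be sketched in a few lines. This is a minimal illustration only, assuming each model is run in several equivalent setups over the same ordered questions; `accuracy` and `consistency_rate` are hypothetical helpers, not code from this repository, and CR is taken here as pairwise agreement between runs:

from itertools import combinations

def accuracy(predictions, references):
    # Fraction of questions answered correctly in one run.
    return sum(p == r for p, r in zip(predictions, references)) / len(references)

def consistency_rate(runs):
    # Fraction of per-question prediction pairs that agree across equivalent runs.
    agree, total = 0, 0
    for answers in zip(*runs):  # answers to one question across all runs
        for a, b in combinations(answers, 2):
            agree += int(a == b)
            total += 1
    return agree / total

# Three equivalent setups (e.g. three prompt variants) over the same three questions.
runs = [["A", "B", "C"], ["A", "B", "D"], ["A", "C", "C"]]
references = ["A", "B", "C"]
accs = [accuracy(r, references) for r in runs]
print(f"accuracy mean={sum(accs)/len(accs):.2f} (min={min(accs):.2f}, max={max(accs):.2f})")
print(f"consistency rate={consistency_rate(runs):.2f}")

The "Mean (Min, Max)" and "CR" column names defined in the Tasks enum correspond to the two printed lines.
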
src/envs.py CHANGED
@@ -6,7 +6,8 @@ from huggingface_hub import HfApi
 # ----------------------------------
 TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
 
-OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format!
+# OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format!
+OWNER = "nvidia" # Change to your org - don't forget to create a results and request dataset, with the correct format!
 # ----------------------------------
 
 REPO_ID = f"{OWNER}/leaderboard"
src/leaderboard/read_evals.py CHANGED
@@ -18,14 +18,14 @@ class EvalResult:
     """
     eval_name: str # org_model_precision (uid)
     full_model: str # org/model (path on hub)
-    org: str
+    org: str
     model: str
     revision: str # commit hash, "" if main
     results: dict
     precision: Precision = Precision.Unknown
     model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
     weight_type: WeightType = WeightType.Original # Original or Adapter
-    architecture: str = "Unknown"
+    architecture: str = "Unknown"
     license: str = "?"
     likes: int = 0
     num_params: int = 0
@@ -35,6 +35,7 @@ class EvalResult:
     @classmethod
     def init_from_json_file(self, json_filepath):
         """Inits the result from the specific model result file"""
+
         with open(json_filepath) as fp:
             data = json.load(fp)
 
@@ -58,7 +59,7 @@ class EvalResult:
         full_model = "/".join(org_and_model)
 
         still_on_hub, _, model_config = is_model_on_hub(
-            full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
+            full_model, config.get("model_sha", "main"), token=os.getenv('HF_TOKEN', None), trust_remote_code=True, test_tokenizer=False
         )
         architecture = "?"
         if model_config is not None:
@@ -72,20 +73,23 @@ class EvalResult:
             task = task.value
 
             # We average all scores of a given metric (not all metrics are present in all files)
-            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
-            if accs.size == 0 or any([acc is None for acc in accs]):
-                continue
-
-            mean_acc = np.mean(accs) * 100.0
-            results[task.benchmark] = mean_acc
-
+            # accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
+            # if accs.size == 0 or any([acc is None for acc in accs]):
+            #     continue
+
+            # mean_acc = np.mean(accs) * 100.0
+            # results[task.benchmark] = [mean_acc, 100]
+            metric = data["results"][task.benchmark.split('-')[0]].get(task.metric, None)
+            if task.benchmark.endswith("acc"):
+                metric = f"{metric[0]}, ({metric[1]}, {metric[2]})"
+            results[task.benchmark] = metric
         return self(
             eval_name=result_key,
            full_model=full_model,
             org=org,
             model=model,
             results=results,
-            precision=precision,
+            precision=precision,
             revision= config.get("model_sha", ""),
             still_on_hub=still_on_hub,
             architecture=architecture
@@ -109,7 +113,7 @@ class EvalResult:
 
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
-        average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
+        # average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
         data_dict = {
             "eval_name": self.eval_name, # not a column, just a save name,
             AutoEvalColumn.precision.name: self.precision.value.name,
@@ -119,7 +123,8 @@ class EvalResult:
             AutoEvalColumn.architecture.name: self.architecture,
             AutoEvalColumn.model.name: make_clickable_model(self.full_model),
             AutoEvalColumn.revision.name: self.revision,
-            AutoEvalColumn.average.name: average,
+            AutoEvalColumn.average.name: 0,
+            # AutoEvalColumn.average.name: average,
             AutoEvalColumn.license.name: self.license,
             AutoEvalColumn.likes.name: self.likes,
             AutoEvalColumn.params.name: self.num_params,
@@ -184,7 +189,6 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
             eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
         else:
             eval_results[eval_name] = eval_result
-
     results = []
     for v in eval_results.values():
         try:
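
The new lookup in `init_from_json_file` splits the benchmark name on '-' and formats accuracy entries as "mean, (min, max)", which suggests each results file stores a three-element list per accuracy metric. A minimal sketch of that assumed layout; the numbers, ordering, and dictionary shape are illustrative guesses, not taken from real result files:

# Assumed shape of data["results"] after this change; values are made up.
example_results = {
    "agieval":  {"accuracy": [55.2, 51.8, 58.4], "consistency": 71.3},
    "mmlu_pro": {"accuracy": [44.0, 41.5, 46.9], "consistency": 65.0},
    "math":     {"accuracy": [30.1, 27.4, 33.0], "consistency": 58.2},
}

# Mirror of the new parsing for one Task: "agieval-acc" -> results["agieval"]["accuracy"],
# displayed as "mean, (min, max)"; "*-cr" tasks keep the raw consistency value.
benchmark, metric_key = "agieval-acc", "accuracy"
metric = example_results[benchmark.split("-")[0]].get(metric_key, None)
if benchmark.endswith("acc"):
    metric = f"{metric[0]}, ({metric[1]}, {metric[2]})"
print(metric)  # 55.2, (51.8, 58.4)
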
src/populate.py CHANGED
@@ -19,6 +19,7 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
 
     # filter out if any of the benchmarks have not been produced
     df = df[has_no_nan_values(df, benchmark_cols)]
+    print(df)
     return df
 
 
src/submission/check_validity.py CHANGED
@@ -33,6 +33,7 @@ def check_model_card(repo_id: str) -> tuple[bool, str]:
 
 def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str]:
     """Checks if the model model_name is on the hub, and whether it (and its tokenizer) can be loaded with AutoClasses."""
+    config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
     try:
         config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
         if test_tokenizer: