rima-shahbazyan committed
Commit 70627cf · 1 Parent(s): 28cf97d

customized leaderboard to be SCORE specific

Files changed (4)
  1. app.py +6 -93
  2. src/about.py +28 -19
  3. src/display/utils.py +10 -10
  4. src/leaderboard/read_evals.py +14 -13
app.py CHANGED
@@ -68,11 +68,11 @@ def init_leaderboard(dataframe):
68
  cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
69
  label="Select Columns to Display:",
70
  ),
71
- search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
72
  hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
73
  filter_columns=[
74
- ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
75
- ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
76
  ColumnFilter(
77
  AutoEvalColumn.params.name,
78
  type="slider",
@@ -80,9 +80,9 @@ def init_leaderboard(dataframe):
80
  max=150,
81
  label="Select the number of parameters (B)",
82
  ),
83
- ColumnFilter(
84
- AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
85
- ),
86
  ],
87
  bool_checkboxgroup_label="Hide models",
88
  interactive=False,
@@ -101,93 +101,6 @@ with demo:
101
  with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
102
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
103
 
104
- with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
105
- with gr.Column():
106
- with gr.Row():
107
- gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
108
-
109
- with gr.Column():
110
- with gr.Accordion(
111
- f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
112
- open=False,
113
- ):
114
- with gr.Row():
115
- finished_eval_table = gr.components.Dataframe(
116
- value=finished_eval_queue_df,
117
- headers=EVAL_COLS,
118
- datatype=EVAL_TYPES,
119
- row_count=5,
120
- )
121
- with gr.Accordion(
122
- f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
123
- open=False,
124
- ):
125
- with gr.Row():
126
- running_eval_table = gr.components.Dataframe(
127
- value=running_eval_queue_df,
128
- headers=EVAL_COLS,
129
- datatype=EVAL_TYPES,
130
- row_count=5,
131
- )
132
-
133
- with gr.Accordion(
134
- f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
135
- open=False,
136
- ):
137
- with gr.Row():
138
- pending_eval_table = gr.components.Dataframe(
139
- value=pending_eval_queue_df,
140
- headers=EVAL_COLS,
141
- datatype=EVAL_TYPES,
142
- row_count=5,
143
- )
144
- with gr.Row():
145
- gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
146
-
147
- with gr.Row():
148
- with gr.Column():
149
- model_name_textbox = gr.Textbox(label="Model name")
150
- revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
151
- model_type = gr.Dropdown(
152
- choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
153
- label="Model type",
154
- multiselect=False,
155
- value=None,
156
- interactive=True,
157
- )
158
-
159
- with gr.Column():
160
- precision = gr.Dropdown(
161
- choices=[i.value.name for i in Precision if i != Precision.Unknown],
162
- label="Precision",
163
- multiselect=False,
164
- value="float16",
165
- interactive=True,
166
- )
167
- weight_type = gr.Dropdown(
168
- choices=[i.value.name for i in WeightType],
169
- label="Weights type",
170
- multiselect=False,
171
- value="Original",
172
- interactive=True,
173
- )
174
- base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
175
-
176
- submit_button = gr.Button("Submit Eval")
177
- submission_result = gr.Markdown()
178
- submit_button.click(
179
- add_new_eval,
180
- [
181
- model_name_textbox,
182
- base_model_name_textbox,
183
- revision_name_textbox,
184
- precision,
185
- weight_type,
186
- model_type,
187
- ],
188
- submission_result,
189
- )
190
-
191
  with gr.Row():
192
  with gr.Accordion("📙 Citation", open=False):
193
  citation_button = gr.Textbox(
 
68
  cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
69
  label="Select Columns to Display:",
70
  ),
71
+ search_columns=[AutoEvalColumn.model.name], #, AutoEvalColumn.license.name],
72
  hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
73
  filter_columns=[
74
+ # ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
75
+ # ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
76
  ColumnFilter(
77
  AutoEvalColumn.params.name,
78
  type="slider",
 
80
  max=150,
81
  label="Select the number of parameters (B)",
82
  ),
83
+ # ColumnFilter(
84
+ # AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
85
+ # ),
86
  ],
87
  bool_checkboxgroup_label="Hide models",
88
  interactive=False,
 
101
  with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
102
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
103
 
104
  with gr.Row():
105
  with gr.Accordion("📙 Citation", open=False):
106
  citation_button = gr.Textbox(
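
For readability, the resulting `init_leaderboard` after this commit condenses to roughly the sketch below. It assumes the `gradio_leaderboard` components (`Leaderboard`, `SelectColumns`, `ColumnFilter`) and the `fields`/`AutoEvalColumn` helpers from the standard leaderboard template; the slider `min`, the `datatype` line, and `default_selection` are not visible in the hunk and are assumptions here.

```python
# Condensed sketch of init_leaderboard as it stands after this commit.
# Assumes the gradio_leaderboard components and the template's fields()/AutoEvalColumn
# helpers; the slider min, the datatype line and default_selection are assumptions.
from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns

from src.display.utils import AutoEvalColumn, fields


def init_leaderboard(dataframe):
    return Leaderboard(
        value=dataframe,
        datatype=[c.type for c in fields(AutoEvalColumn)],  # assumption: unchanged from template
        select_columns=SelectColumns(
            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],  # assumption
            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
            label="Select Columns to Display:",
        ),
        search_columns=[AutoEvalColumn.model.name],  # license search removed in this commit
        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
        filter_columns=[
            # model type / precision / still-on-hub filters are commented out by this commit
            ColumnFilter(
                AutoEvalColumn.params.name,
                type="slider",
                min=0.01,  # assumption: not visible in the hunk
                max=150,
                label="Select the number of parameters (B)",
            ),
        ],
        bool_checkboxgroup_label="Hide models",
        interactive=False,
    )
```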
src/about.py CHANGED
@@ -11,7 +11,7 @@ class Task:
11
  # Select your tasks here
12
  # ---------------------------------------------------
13
  class Tasks(Enum):
14
- # task_key in the json file, metric_key in the json file, name to display in the leaderboard
15
  # task0 = Task("anli_r1", "acc", "ANLI")
16
  # task1 = Task("logiqa", "acc_norm", "LogiQA")
17
 
@@ -39,42 +39,51 @@ TITLE = """<h1 align="center" id="space-title">SCORE Leaderboard</h1>"""
39
 
40
  # What does your leaderboard evaluate?
41
  INTRODUCTION_TEXT = """
42
- We introduce <b>SCORE</b> - an open and holistic evaluation framework for LLMs centered on robustness i.e. the ability to produce consistent responses when the input is rephrased
43
- or presented in a slightly different way. Prediction consistency is particularly crucial for factual questions where an objective answer exists. Note that it is expected
44
  that the predictions are equivalent and not necessarily correct. Models are evaluated multiple times in equivalent setups and accuracy range along with prediction
45
- consistency rate is reported. Contrary to a single accuracy metrics (often derived from an optimized setup) reported during model releases, this better simulates human
46
- interaction setups and provides better estimate of real world performance. Furthermore, models are evaluated using the same setup which makes model comparison possible.
47
 
48
  <h1 align="center" id="space-title">Tasks</h1>
49
- <b>Prompt Robustness</b> - Models are evaluated on ten different prompts. For multiple choice question (MCQ) datasets, prompts ask the model to choose the right option
50
  letter. For MATH, prompts ask the model to solve the problem. The prompt set is diverse enough to cover various content and formatting styles that the model may encounter
51
- in real life, they are not adversarial or tuned in any way. Prompts are semantically close, vary by instruction and level of response details. Prompts end with final
52
  answer formatting instructions. We include both CoT and non-CoT prompts and vary the placement of the question in the prompt to be either in the beginning, in the middle,
53
  or at the end of the prompt.
54
 
55
  <b>Non Greedy Inference</b> - We study the effect of random seed during non-greedy inference. For factual questions the model's underlying distribution should be sharp enough
56
- to be independent of the random seed for the next token sampling. There is an inherent randomness in the answer generation process, which may affect the "path" model takes to arrive at an answer.
57
 
58
- <b>Choice Order Robustness</b> - We test models against changes in the order of choices for MCQ datasets. We swap the order of choices and ensure the correct answer
59
- is always the same option (all correct answers are A or B, etc). Changing the order of choices does not change the input's semantics, and it is expected that the models
60
- will be robust against such minimal change.
61
 
62
- <h1 align="center" id="space-title">Datasets</h1>
63
- <b>MMLU Pro</b> - text? <br>
64
- <b>AGIEval</b> - text? <br>
65
- <b>MATH</b> - text <br>
66
 
67
  <h1 align="center" id="space-title">Metrics</h1>
68
-
 
 
 
69
  """
70
 
71
  # Which evaluations are you running? how can people reproduce what you have?
72
  LLM_BENCHMARKS_TEXT = f"""
73
- ## How it works
74
 
75
- ## Reproducibility
76
- To reproduce our results, here is the commands you can run:
77
 
 
78
  """
79
 
80
  EVALUATION_QUEUE_TEXT = """
 
11
  # Select your tasks here
12
  # ---------------------------------------------------
13
  class Tasks(Enum):
14
+ # task_key in the json file, metric_key in the json file, name to display in the leaderboard
15
  # task0 = Task("anli_r1", "acc", "ANLI")
16
  # task1 = Task("logiqa", "acc_norm", "LogiQA")
17
 
 
39
 
40
  # What does your leaderboard evaluate?
41
  INTRODUCTION_TEXT = """
42
+ We introduce <b>SCORE</b> - an open and holistic evaluation framework for LLMs centered on robustness, i.e., the ability to produce consistent responses when the input is rephrased
43
+ or presented in a slightly different way. Prediction consistency is particularly crucial for factual questions where an objective answer exists. Note that it is expected
44
  that the predictions are equivalent and not necessarily correct. Models are evaluated multiple times in equivalent setups and accuracy range along with prediction
45
+ consistency rate is reported. Contrary to a single accuracy metric (often derived from an optimized setup) reported during model releases, this better simulates human
46
+ interaction setups and provides a better estimate of real-world performance. Furthermore, models are evaluated using the same setup, which makes model comparison possible.
47
 
48
  <h1 align="center" id="space-title">Tasks</h1>
49
+ <b>Prompt Robustness</b> - Models are evaluated on ten different prompts. For multiple choice question (MCQ) datasets, prompts ask the model to choose the right option
50
  letter. For MATH, prompts ask the model to solve the problem. The prompt set is diverse enough to cover various content and formatting styles that the model may encounter
51
+ in real life; they are not adversarial or tuned in any way. Prompts are semantically close and vary by instruction and level of response detail. Prompts end with final
52
  answer formatting instructions. We include both CoT and non-CoT prompts and vary the placement of the question in the prompt to be either in the beginning, in the middle,
53
  or at the end of the prompt.
54
 
55
  <b>Non Greedy Inference</b> - We study the effect of random seed during non-greedy inference. For factual questions the model's underlying distribution should be sharp enough
56
+ to be independent of the random seed for next-token sampling. There is an inherent randomness in the answer generation process, which may affect the "path" the model takes to arrive at an answer.
57
 
58
+ <b>Choice Order Robustness</b> - We test models against changes in the order of choices for MCQ datasets. We swap the order of choices and ensure the correct answer
59
+ is always the same option (all correct answers are A or B, etc). Changing the order of choices does not change the input's semantics, and it is expected that the models
60
+ will be robust against such a minimal change.
61
 
62
+ <h1 align="center" id="space-title">Datasets</h1>
63
+ <b>MMLU Pro</b> - Massive multi-task understanding dataset tailored to more rigorously benchmark large language models' capabilities. <br>
64
+ <b>AGIEval</b> - Dataset specifically designed to assess foundation models in the context of human-centric standardized exams, such as college entrance exams, law school admission tests, math competitions, and lawyer qualification tests. <br>
65
+ <b>MATH</b> - Challenging competition mathematics problems. <br>
66
 
67
  <h1 align="center" id="space-title">Metrics</h1>
68
+ <b>Accuracy</b> - We report macro accuracy for MMLU Pro and micro accuracy for AGIEval and MATH.
69
+ For all datasets, the average (minimum, maximum) accuracy across all experiments is reported.<br>
70
+ <b>Consistency Rate</b> - We use the consistency rate (CR) to measure the stability of model predictions.
71
+ CR calculates the proportion of consistent prediction pairs for each data point.
72
  """
73
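
The Metrics text above defines the consistency rate only in words. A minimal sketch of that computation, assuming CR is the share of agreeing prediction pairs per data point averaged over the dataset (the helper name and aggregation details are illustrative, not the official scorer):

```python
from itertools import combinations


def consistency_rate(predictions_per_item):
    """Share of agreeing prediction pairs per data point, averaged over items.

    `predictions_per_item` is a list of per-item answer lists, one answer per
    equivalent experiment (prompt variant, seed, choice order, ...).
    Sketch of the CR described above, not the official scorer.
    """
    per_item = []
    for answers in predictions_per_item:
        pairs = list(combinations(answers, 2))
        if pairs:
            per_item.append(sum(a == b for a, b in pairs) / len(pairs))
    return sum(per_item) / len(per_item)


# e.g. three questions, each answered under three equivalent setups
print(consistency_rate([["A", "A", "A"], ["B", "C", "B"], ["D", "D", "D"]]))  # ~0.78
```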
 
74
  # Which evaluations are you running? how can people reproduce what you have?
75
  LLM_BENCHMARKS_TEXT = f"""
76
+ ## How to Evaluate on SCORE?
77
+
78
+ To evaluate your model on the SCORE benchmark, you can use [LM-EVALUATION-HARNESS](https://github.com/EleutherAI/lm-evaluation-harness).
79
+ The tasks are available under the following groups:
80
+ * score_robustness_mmlu_pro
81
+ * score_robustness_agieval
82
+ * score_robustness_math
83
 
84
+ The numbers in the leaderboard are the average across tasks for each dataset.
 
85
 
86
+ More details can be found in the [README of the SCORE task](https://github.com/EleutherAI/lm-evaluation-harness/tree/main/lm_eval/tasks/score) as well as in the [official repository](https://github.com/EleutherAI/lm-evaluation-harness/tree/main).
87
  """
88
 
89
  EVALUATION_QUEUE_TEXT = """
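
The updated LLM_BENCHMARKS_TEXT points readers to lm-evaluation-harness for reproducing the scores. Below is a minimal sketch of running one of the listed task groups through the harness's Python entry point; the task group names come from the text above, while the exact API surface (`lm_eval.simple_evaluate` here) and the example model are assumptions that may vary by harness version.

```python
import lm_eval

# Sketch only: evaluate a (hypothetical) model on one SCORE task group.
results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=meta-llama/Llama-3.1-8B-Instruct",  # hypothetical example model
    tasks=["score_robustness_mmlu_pro"],  # or score_robustness_agieval / score_robustness_math
    batch_size=8,
)
print(results["results"])
```

The same evaluation can also be launched from the `lm_eval` command line; the linked README documents the supported options.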
src/display/utils.py CHANGED
@@ -23,22 +23,22 @@ class ColumnContent:
23
  ## Leaderboard columns
24
  auto_eval_column_dict = []
25
  # Init
26
- auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
27
  auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
28
  #Scores
29
- auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
30
  for task in Tasks:
31
  auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
32
  # Model information
33
- auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
34
- auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
35
- auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
36
- auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
37
- auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
38
  auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
39
- auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
40
- auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
41
- auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
42
 
43
  # We use make dataclass to dynamically fill the scores from Tasks
44
  AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
 
23
  ## Leaderboard columns
24
  auto_eval_column_dict = []
25
  # Init
26
+ # auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
27
  auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
28
  #Scores
29
+ auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average CR⬆️", "number", True)])
30
  for task in Tasks:
31
  auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
32
  # Model information
33
+ # auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
34
+ # auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
35
+ # auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
36
+ # auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
37
+ # auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
38
  auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
39
+ # auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
40
+ # auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
41
+ # auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
42
 
43
  # We use make dataclass to dynamically fill the scores from Tasks
44
  AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
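
After this commit the dynamically built `AutoEvalColumn` keeps only the model, average-CR, per-task, and parameter-count columns. The standalone sketch below illustrates the same `make_dataclass` pattern with a simplified `ColumnContent` and a made-up task column, showing how the class attributes end up usable as `AutoEvalColumn.<field>.name` elsewhere in the code.

```python
from dataclasses import dataclass, make_dataclass


# Simplified stand-in for the template's ColumnContent; field names are assumptions.
@dataclass(frozen=True)
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False


# (field name, field type, default value) triples, as in utils.py
columns = [
    ("model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)),
    ("average", ColumnContent, ColumnContent("Average CR⬆️", "number", True)),
    ("task0", ColumnContent, ColumnContent("MMLU Pro CR", "number", True)),  # made-up task column
    ("params", ColumnContent, ColumnContent("#Params (B)", "number", False)),
]

AutoEvalColumn = make_dataclass("AutoEvalColumn", columns, frozen=True)
print(AutoEvalColumn.model.name)    # -> "Model"
print(AutoEvalColumn.average.name)  # -> "Average CR⬆️"
```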
src/leaderboard/read_evals.py CHANGED
@@ -90,7 +90,7 @@ class EvalResult:
90
  model=model,
91
  results=results,
92
  precision=precision,
93
- revision= config.get("model_sha", ""),
94
  still_on_hub=still_on_hub,
95
  architecture=architecture
96
  )
@@ -113,22 +113,23 @@ class EvalResult:
113
 
114
  def to_dict(self):
115
  """Converts the Eval Result to a dict compatible with our dataframe display"""
116
- # average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
 
117
  data_dict = {
118
  "eval_name": self.eval_name, # not a column, just a save name,
119
- AutoEvalColumn.precision.name: self.precision.value.name,
120
- AutoEvalColumn.model_type.name: self.model_type.value.name,
121
- AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
122
- AutoEvalColumn.weight_type.name: self.weight_type.value.name,
123
- AutoEvalColumn.architecture.name: self.architecture,
124
  AutoEvalColumn.model.name: make_clickable_model(self.full_model),
125
- AutoEvalColumn.revision.name: self.revision,
126
- AutoEvalColumn.average.name: 0,
127
- # AutoEvalColumn.average.name: average,
128
- AutoEvalColumn.license.name: self.license,
129
- AutoEvalColumn.likes.name: self.likes,
130
  AutoEvalColumn.params.name: self.num_params,
131
- AutoEvalColumn.still_on_hub.name: self.still_on_hub,
132
  }
133
 
134
  for task in Tasks:
 
90
  model=model,
91
  results=results,
92
  precision=precision,
93
+ revision=config.get("model_sha", ""),
94
  still_on_hub=still_on_hub,
95
  architecture=architecture
96
  )
 
113
 
114
  def to_dict(self):
115
  """Converts the Eval Result to a dict compatible with our dataframe display"""
116
+ cr_vals = [v for b, v in self.results.items() if b.split("-")[1]=="cr"]
117
+ average = sum(cr_vals) / len(cr_vals)
118
  data_dict = {
119
  "eval_name": self.eval_name, # not a column, just a save name,
120
+ # AutoEvalColumn.precision.name: self.precision.value.name,
121
+ # AutoEvalColumn.model_type.name: self.model_type.value.name,
122
+ # AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
123
+ # AutoEvalColumn.weight_type.name: self.weight_type.value.name,
124
+ # AutoEvalColumn.architecture.name: self.architecture,
125
  AutoEvalColumn.model.name: make_clickable_model(self.full_model),
126
+ # AutoEvalColumn.revision.name: self.revision,
127
+ # AutoEvalColumn.average.name: 0,
128
+ AutoEvalColumn.average.name: average,
129
+ # AutoEvalColumn.license.name: self.license,
130
+ # AutoEvalColumn.likes.name: self.likes,
131
  AutoEvalColumn.params.name: self.num_params,
132
+ # AutoEvalColumn.still_on_hub.name: self.still_on_hub,
133
  }
134
 
135
  for task in Tasks:
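
The new average computation in `to_dict` assumes every key in `self.results` follows a `<dataset>-<metric>` naming convention and picks out the `-cr` entries. A slightly more defensive standalone sketch of that aggregation (the key convention is inferred from the hunk; `average_cr` is an illustrative helper, not part of the repo):

```python
# Standalone sketch of the average-CR aggregation added in to_dict().
# Assumes result keys look like "<dataset>-<metric>", e.g. "mmlu_pro-cr" or "math-acc".
def average_cr(results: dict[str, float]) -> float | None:
    cr_vals = [v for key, v in results.items()
               if key.rsplit("-", 1)[-1] == "cr" and v is not None]
    if not cr_vals:  # avoid ZeroDivisionError when no CR entries are present
        return None
    return sum(cr_vals) / len(cr_vals)


print(average_cr({"mmlu_pro-cr": 0.81, "mmlu_pro-acc": 0.55, "math-cr": 0.73}))  # 0.77
```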