rima-shahbazyan committed
Commit 70627cf · 1 Parent(s): 28cf97d

customized leaderboard to be SCORE specific

Files changed (4)
  1. app.py +6 -93
  2. src/about.py +28 -19
  3. src/display/utils.py +10 -10
  4. src/leaderboard/read_evals.py +14 -13
app.py CHANGED
@@ -68,11 +68,11 @@ def init_leaderboard(dataframe):
68
  cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
69
  label="Select Columns to Display:",
70
  ),
71
- search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
72
  hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
73
  filter_columns=[
74
- ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
75
- ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
76
  ColumnFilter(
77
  AutoEvalColumn.params.name,
78
  type="slider",
@@ -80,9 +80,9 @@ def init_leaderboard(dataframe):
80
  max=150,
81
  label="Select the number of parameters (B)",
82
  ),
83
- ColumnFilter(
84
- AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
85
- ),
86
  ],
87
  bool_checkboxgroup_label="Hide models",
88
  interactive=False,
@@ -101,93 +101,6 @@ with demo:
101
  with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
102
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
103
 
104
- with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
105
- with gr.Column():
106
- with gr.Row():
107
- gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
108
-
109
- with gr.Column():
110
- with gr.Accordion(
111
- f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
112
- open=False,
113
- ):
114
- with gr.Row():
115
- finished_eval_table = gr.components.Dataframe(
116
- value=finished_eval_queue_df,
117
- headers=EVAL_COLS,
118
- datatype=EVAL_TYPES,
119
- row_count=5,
120
- )
121
- with gr.Accordion(
122
- f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
123
- open=False,
124
- ):
125
- with gr.Row():
126
- running_eval_table = gr.components.Dataframe(
127
- value=running_eval_queue_df,
128
- headers=EVAL_COLS,
129
- datatype=EVAL_TYPES,
130
- row_count=5,
131
- )
132
-
133
- with gr.Accordion(
134
- f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
135
- open=False,
136
- ):
137
- with gr.Row():
138
- pending_eval_table = gr.components.Dataframe(
139
- value=pending_eval_queue_df,
140
- headers=EVAL_COLS,
141
- datatype=EVAL_TYPES,
142
- row_count=5,
143
- )
144
- with gr.Row():
145
- gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
146
-
147
- with gr.Row():
148
- with gr.Column():
149
- model_name_textbox = gr.Textbox(label="Model name")
150
- revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
151
- model_type = gr.Dropdown(
152
- choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
153
- label="Model type",
154
- multiselect=False,
155
- value=None,
156
- interactive=True,
157
- )
158
-
159
- with gr.Column():
160
- precision = gr.Dropdown(
161
- choices=[i.value.name for i in Precision if i != Precision.Unknown],
162
- label="Precision",
163
- multiselect=False,
164
- value="float16",
165
- interactive=True,
166
- )
167
- weight_type = gr.Dropdown(
168
- choices=[i.value.name for i in WeightType],
169
- label="Weights type",
170
- multiselect=False,
171
- value="Original",
172
- interactive=True,
173
- )
174
- base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
175
-
176
- submit_button = gr.Button("Submit Eval")
177
- submission_result = gr.Markdown()
178
- submit_button.click(
179
- add_new_eval,
180
- [
181
- model_name_textbox,
182
- base_model_name_textbox,
183
- revision_name_textbox,
184
- precision,
185
- weight_type,
186
- model_type,
187
- ],
188
- submission_result,
189
- )
190
-
191
  with gr.Row():
192
  with gr.Accordion("📙 Citation", open=False):
193
  citation_button = gr.Textbox(
 
68
  cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
69
  label="Select Columns to Display:",
70
  ),
71
+ search_columns=[AutoEvalColumn.model.name], #, AutoEvalColumn.license.name],
72
  hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
73
  filter_columns=[
74
+ # ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
75
+ # ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
76
  ColumnFilter(
77
  AutoEvalColumn.params.name,
78
  type="slider",
 
80
  max=150,
81
  label="Select the number of parameters (B)",
82
  ),
83
+ # ColumnFilter(
84
+ # AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
85
+ # ),
86
  ],
87
  bool_checkboxgroup_label="Hide models",
88
  interactive=False,
 
101
  with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
102
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
103
 
104
  with gr.Row():
105
  with gr.Accordion("📙 Citation", open=False):
106
  citation_button = gr.Textbox(
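
For readability, the resulting `init_leaderboard` after this commit condenses to roughly the sketch below. It assumes the `gradio_leaderboard` components (`Leaderboard`, `SelectColumns`, `ColumnFilter`) and the `fields`/`AutoEvalColumn` helpers from the standard leaderboard template; the slider `min`, the `datatype` line, and `default_selection` are not visible in the hunk and are assumptions here.

```python
# Condensed sketch of init_leaderboard as it stands after this commit.
# Assumes the gradio_leaderboard components and the template's fields()/AutoEvalColumn
# helpers; the slider min, the datatype line and default_selection are assumptions.
from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns

from src.display.utils import AutoEvalColumn, fields


def init_leaderboard(dataframe):
    return Leaderboard(
        value=dataframe,
        datatype=[c.type for c in fields(AutoEvalColumn)],  # assumption: unchanged from template
        select_columns=SelectColumns(
            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],  # assumption
            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
            label="Select Columns to Display:",
        ),
        search_columns=[AutoEvalColumn.model.name],  # license search removed in this commit
        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
        filter_columns=[
            # model type / precision / still-on-hub filters are commented out by this commit
            ColumnFilter(
                AutoEvalColumn.params.name,
                type="slider",
                min=0.01,  # assumption: not visible in the hunk
                max=150,
                label="Select the number of parameters (B)",
            ),
        ],
        bool_checkboxgroup_label="Hide models",
        interactive=False,
    )
```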
src/about.py CHANGED
@@ -11,7 +11,7 @@ class Task:
11
  # Select your tasks here
12
  # ---------------------------------------------------
13
  class Tasks(Enum):
14
- # task_key in the json file, metric_key in the json file, name to display in the leaderboard
15
  # task0 = Task("anli_r1", "acc", "ANLI")
16
  # task1 = Task("logiqa", "acc_norm", "LogiQA")
17
 
@@ -39,42 +39,51 @@ TITLE = """<h1 align="center" id="space-title">SCORE Leaderboard</h1>"""
39
 
40
  # What does your leaderboard evaluate?
41
  INTRODUCTION_TEXT = """
42
- We introduce <b>SCORE</b> - an open and holistic evaluation framework for LLMs centered on robustness i.e. the ability to produce consistent responses when the input is rephrased
43
- or presented in a slightly different way. Prediction consistency is particularly crucial for factual questions where an objective answer exists. Note that it is expected
44
  that the predictions are equivalent and not necessarily correct. Models are evaluated multiple times in equivalent setups and accuracy range along with prediction
45
- consistency rate is reported. Contrary to a single accuracy metrics (often derived from an optimized setup) reported during model releases, this better simulates human
46
- interaction setups and provides better estimate of real world performance. Furthermore, models are evaluated using the same setup which makes model comparison possible.
47
 
48
  <h1 align="center" id="space-title">Tasks</h1>
49
- <b>Prompt Robustness</b> - Models are evaluated on ten different prompts. For multiple choice question (MCQ) datasets, prompts ask the model to choose the right option
50
  letter. For MATH, prompts ask the model to solve the problem. The prompt set is diverse enough to cover various content and formatting styles that the model may encounter
51
- in real life, they are not adversarial or tuned in any way. Prompts are semantically close, vary by instruction and level of response details. Prompts end with final
52
  answer formatting instructions. We include both CoT and non-CoT prompts and vary the placement of the question in the prompt to be either in the beginning, in the middle,
53
  or at the end of the prompt.
54
 
55
  <b>Non Greedy Inference</b> - We study the effect of random seed during non-greedy inference. For factual questions the model's underlying distribution should be sharp enough
56
- to be independent of the random seed for the next token sampling. There is an inherent randomness in the answer generation process, which may affect the "path" model takes to arrive at an answer.
57
 
58
- <b>Choice Order Robustness</b> - We test models against changes in the order of choices for MCQ datasets. We swap the order of choices and ensure the correct answer
59
- is always the same option (all correct answers are A or B, etc). Changing the order of choices does not change the input's semantics, and it is expected that the models
60
- will be robust against such minimal change.
61
 
62
- <h1 align="center" id="space-title">Datasets</h1>
63
- <b>MMLU Pro</b> - text? <br>
64
- <b>AGIEval</b> - text? <br>
65
- <b>MATH</b> - text <br>
66
 
67
  <h1 align="center" id="space-title">Metrics</h1>
68
-
 
 
 
69
  """
70
 
71
  # Which evaluations are you running? how can people reproduce what you have?
72
  LLM_BENCHMARKS_TEXT = f"""
73
- ## How it works
74
 
75
- ## Reproducibility
76
- To reproduce our results, here is the commands you can run:
77
 
 
78
  """
79
 
80
  EVALUATION_QUEUE_TEXT = """
 
11
  # Select your tasks here
12
  # ---------------------------------------------------
13
  class Tasks(Enum):
14
+ # task_key in the json file, metric_key in the json file, name to display in the leaderboard
15
  # task0 = Task("anli_r1", "acc", "ANLI")
16
  # task1 = Task("logiqa", "acc_norm", "LogiQA")
17
 
 
39
 
40
  # What does your leaderboard evaluate?
41
  INTRODUCTION_TEXT = """
42
+ We introduce <b>SCORE</b> - an open and holistic evaluation framework for LLMs centered on robustness, i.e., the ability to produce consistent responses when the input is rephrased
43
+ or presented in a slightly different way. Prediction consistency is particularly crucial for factual questions where an objective answer exists. Note that it is expected
44
  that the predictions are equivalent and not necessarily correct. Models are evaluated multiple times in equivalent setups and accuracy range along with prediction
45
+ consistency rate is reported. Contrary to a single accuracy metric (often derived from an optimized setup) reported during model releases, this better simulates human
46
+ interaction setups and provides a better estimate of real-world performance. Furthermore, models are evaluated using the same setup, which makes model comparison possible.
47
 
48
  <h1 align="center" id="space-title">Tasks</h1>
49
+ <b>Prompt Robustness</b> - Models are evaluated on ten different prompts. For multiple choice question (MCQ) datasets, prompts ask the model to choose the right option
50
  letter. For MATH, prompts ask the model to solve the problem. The prompt set is diverse enough to cover various content and formatting styles that the model may encounter
51
+ in real life; they are not adversarial or tuned in any way. Prompts are semantically close and vary by instruction and level of response detail. Prompts end with final
52
  answer formatting instructions. We include both CoT and non-CoT prompts and vary the placement of the question in the prompt to be either in the beginning, in the middle,
53
  or at the end of the prompt.
54
 
55
  <b>Non Greedy Inference</b> - We study the effect of random seed during non-greedy inference. For factual questions the model's underlying distribution should be sharp enough
56
+ to be independent of the random seed for next-token sampling. There is an inherent randomness in the answer generation process, which may affect the "path" the model takes to arrive at an answer.
57
 
58
+ <b>Choice Order Robustness</b> - We test models against changes in the order of choices for MCQ datasets. We swap the order of choices and ensure the correct answer
59
+ is always the same option (all correct answers are A or B, etc). Changing the order of choices does not change the input's semantics, and it is expected that the models
60
+ will be robust against such a minimal change.
61
 
62
+ <h1 align="center" id="space-title">Datasets</h1>
63
+ <b>MMLU Pro</b> - Massive multi-task understanding dataset tailored to more rigorously benchmark large language models' capabilities. <br>
64
+ <b>AGIEval</b> - Dataset specifically designed to assess foundation models in the context of human-centric standardized exams, such as college entrance exams, law school admission tests, math competitions, and lawyer qualification tests. <br>
65
+ <b>MATH</b> - Challenging competition mathematics problems. <br>
66
 
67
  <h1 align="center" id="space-title">Metrics</h1>
68
+ <b>Accuracy</b> - We report macro accuracy for MMLU Pro and micro accuracy for AGIEval and MATH.
69
+ For all datasets, the average (minimum, maximum) accuracy across all experiments is reported.<br>
70
+ <b>Consistency Rate</b> - We use the consistency rate (CR) to measure the stability of model predictions.
71
+ CR calculates the proportion of consistent prediction pairs for each data point.
72
  """
73
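
The Metrics text above defines the consistency rate only in words. A minimal sketch of that computation, assuming CR is the share of agreeing prediction pairs per data point averaged over the dataset (the helper name and aggregation details are illustrative, not the official scorer):

```python
from itertools import combinations


def consistency_rate(predictions_per_item):
    """Share of agreeing prediction pairs per data point, averaged over items.

    `predictions_per_item` is a list of per-item answer lists, one answer per
    equivalent experiment (prompt variant, seed, choice order, ...).
    Sketch of the CR described above, not the official scorer.
    """
    per_item = []
    for answers in predictions_per_item:
        pairs = list(combinations(answers, 2))
        if pairs:
            per_item.append(sum(a == b for a, b in pairs) / len(pairs))
    return sum(per_item) / len(per_item)


# e.g. three questions, each answered under three equivalent setups
print(consistency_rate([["A", "A", "A"], ["B", "C", "B"], ["D", "D", "D"]]))  # ~0.78
```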
 
74
  # Which evaluations are you running? how can people reproduce what you have?
75
  LLM_BENCHMARKS_TEXT = f"""
76
+ ## How to Evaluate on SCORE?
77
+
78
+ To evaluate your model on the SCORE benchmark, you can use [LM-EVALUATION-HARNESS](https://github.com/EleutherAI/lm-evaluation-harness).
79
+ The tasks are available under the following groups:
80
+ * score_robustness_mmlu_pro
81
+ * score_robustness_agieval
82
+ * score_robustness_math
83
 
84
+ The numbers in the leaderboard are the average across tasks for each dataset.
 
85
 
86
+ More details can be found in the [README of the SCORE task](https://github.com/EleutherAI/lm-evaluation-harness/tree/main/lm_eval/tasks/score) as well as in the [official repository](https://github.com/EleutherAI/lm-evaluation-harness/tree/main).
87
  """
88
 
89
  EVALUATION_QUEUE_TEXT = """
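
The updated LLM_BENCHMARKS_TEXT points readers to lm-evaluation-harness for reproducing the scores. Below is a minimal sketch of running one of the listed task groups through the harness's Python entry point; the task group names come from the text above, while the exact API surface (`lm_eval.simple_evaluate` here) and the example model are assumptions that may vary by harness version.

```python
import lm_eval

# Sketch only: evaluate a (hypothetical) model on one SCORE task group.
results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=meta-llama/Llama-3.1-8B-Instruct",  # hypothetical example model
    tasks=["score_robustness_mmlu_pro"],  # or score_robustness_agieval / score_robustness_math
    batch_size=8,
)
print(results["results"])
```

The same evaluation can also be launched from the `lm_eval` command line; the linked README documents the supported options.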
src/display/utils.py CHANGED
@@ -23,22 +23,22 @@ class ColumnContent:
23
  ## Leaderboard columns
24
  auto_eval_column_dict = []
25
  # Init
26
- auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
27
  auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
28
  #Scores
29
- auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
30
  for task in Tasks:
31
  auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
32
  # Model information
33
- auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
34
- auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
35
- auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
36
- auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
37
- auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
38
  auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
39
- auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
40
- auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
41
- auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
42
 
43
  # We use make dataclass to dynamically fill the scores from Tasks
44
  AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
 
23
  ## Leaderboard columns
24
  auto_eval_column_dict = []
25
  # Init
26
+ # auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
27
  auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
28
  #Scores
29
+ auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average CR⬆️", "number", True)])
30
  for task in Tasks:
31
  auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
32
  # Model information
33
+ # auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
34
+ # auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
35
+ # auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
36
+ # auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
37
+ # auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
38
  auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
39
+ # auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
40
+ # auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
41
+ # auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
42
 
43
  # We use make dataclass to dynamically fill the scores from Tasks
44
  AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
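
After this commit the dynamically built `AutoEvalColumn` keeps only the model, average-CR, per-task, and parameter-count columns. The standalone sketch below illustrates the same `make_dataclass` pattern with a simplified `ColumnContent` and a made-up task column, showing how the class attributes end up usable as `AutoEvalColumn.<field>.name` elsewhere in the code.

```python
from dataclasses import dataclass, make_dataclass


# Simplified stand-in for the template's ColumnContent; field names are assumptions.
@dataclass(frozen=True)
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False


# (field name, field type, default value) triples, as in utils.py
columns = [
    ("model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)),
    ("average", ColumnContent, ColumnContent("Average CR⬆️", "number", True)),
    ("task0", ColumnContent, ColumnContent("MMLU Pro CR", "number", True)),  # made-up task column
    ("params", ColumnContent, ColumnContent("#Params (B)", "number", False)),
]

AutoEvalColumn = make_dataclass("AutoEvalColumn", columns, frozen=True)
print(AutoEvalColumn.model.name)    # -> "Model"
print(AutoEvalColumn.average.name)  # -> "Average CR⬆️"
```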
src/leaderboard/read_evals.py CHANGED
@@ -90,7 +90,7 @@ class EvalResult:
90
  model=model,
91
  results=results,
92
  precision=precision,
93
- revision= config.get("model_sha", ""),
94
  still_on_hub=still_on_hub,
95
  architecture=architecture
96
  )
@@ -113,22 +113,23 @@ class EvalResult:
113
 
114
  def to_dict(self):
115
  """Converts the Eval Result to a dict compatible with our dataframe display"""
116
- # average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
 
117
  data_dict = {
118
  "eval_name": self.eval_name, # not a column, just a save name,
119
- AutoEvalColumn.precision.name: self.precision.value.name,
120
- AutoEvalColumn.model_type.name: self.model_type.value.name,
121
- AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
122
- AutoEvalColumn.weight_type.name: self.weight_type.value.name,
123
- AutoEvalColumn.architecture.name: self.architecture,
124
  AutoEvalColumn.model.name: make_clickable_model(self.full_model),
125
- AutoEvalColumn.revision.name: self.revision,
126
- AutoEvalColumn.average.name: 0,
127
- # AutoEvalColumn.average.name: average,
128
- AutoEvalColumn.license.name: self.license,
129
- AutoEvalColumn.likes.name: self.likes,
130
  AutoEvalColumn.params.name: self.num_params,
131
- AutoEvalColumn.still_on_hub.name: self.still_on_hub,
132
  }
133
 
134
  for task in Tasks:
 
90
  model=model,
91
  results=results,
92
  precision=precision,
93
+ revision=config.get("model_sha", ""),
94
  still_on_hub=still_on_hub,
95
  architecture=architecture
96
  )
 
113
 
114
  def to_dict(self):
115
  """Converts the Eval Result to a dict compatible with our dataframe display"""
116
+ cr_vals = [v for b, v in self.results.items() if b.split("-")[1]=="cr"]
117
+ average = sum(cr_vals) / len(cr_vals)
118
  data_dict = {
119
  "eval_name": self.eval_name, # not a column, just a save name,
120
+ # AutoEvalColumn.precision.name: self.precision.value.name,
121
+ # AutoEvalColumn.model_type.name: self.model_type.value.name,
122
+ # AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
123
+ # AutoEvalColumn.weight_type.name: self.weight_type.value.name,
124
+ # AutoEvalColumn.architecture.name: self.architecture,
125
  AutoEvalColumn.model.name: make_clickable_model(self.full_model),
126
+ # AutoEvalColumn.revision.name: self.revision,
127
+ # AutoEvalColumn.average.name: 0,
128
+ AutoEvalColumn.average.name: average,
129
+ # AutoEvalColumn.license.name: self.license,
130
+ # AutoEvalColumn.likes.name: self.likes,
131
  AutoEvalColumn.params.name: self.num_params,
132
+ # AutoEvalColumn.still_on_hub.name: self.still_on_hub,
133
  }
134
 
135
  for task in Tasks:
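
The new average computation in `to_dict` assumes every key in `self.results` follows a `<dataset>-<metric>` naming convention and picks out the `-cr` entries. A slightly more defensive standalone sketch of that aggregation (the key convention is inferred from the hunk; `average_cr` is an illustrative helper, not part of the repo):

```python
# Standalone sketch of the average-CR aggregation added in to_dict().
# Assumes result keys look like "<dataset>-<metric>", e.g. "mmlu_pro-cr" or "math-acc".
def average_cr(results: dict[str, float]) -> float | None:
    cr_vals = [v for key, v in results.items()
               if key.rsplit("-", 1)[-1] == "cr" and v is not None]
    if not cr_vals:  # avoid ZeroDivisionError when no CR entries are present
        return None
    return sum(cr_vals) / len(cr_vals)


print(average_cr({"mmlu_pro-cr": 0.81, "mmlu_pro-acc": 0.55, "math-cr": 0.73}))  # 0.77
```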