Grigor-N committed
Commit 28cf97d · 1 Parent(s): 3e5a104

SCORE leaderboard v1

app.py CHANGED
@@ -33,20 +33,20 @@ def restart_space():
     API.restart_space(repo_id=REPO_ID)
 
 ### Space initialisation
-try:
-    print(EVAL_REQUESTS_PATH)
-    snapshot_download(
-        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
-except Exception:
-    restart_space()
-try:
-    print(EVAL_RESULTS_PATH)
-    snapshot_download(
-        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
-except Exception:
-    restart_space()
+# try:
+#     print(EVAL_REQUESTS_PATH)
+#     snapshot_download(
+#         repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+#     )
+# except Exception:
+#     restart_space()
+# try:
+#     print(EVAL_RESULTS_PATH)
+#     snapshot_download(
+#         repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+#     )
+# except Exception:
+#     restart_space()
 
 
 LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
@@ -199,6 +199,6 @@ with demo:
         )
 
 scheduler = BackgroundScheduler()
-scheduler.add_job(restart_space, "interval", seconds=1800)
+# scheduler.add_job(restart_space, "interval", seconds=1800)
 scheduler.start()
 demo.queue(default_concurrency_limit=40).launch()
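
For context, the initialisation block commented out in this hunk is the Space's startup sync of the requests/results datasets. A minimal local sketch of the same huggingface_hub call, where the repo id and target directory are placeholders rather than values from this Space:

from huggingface_hub import snapshot_download

try:
    # Pull an evaluation-results dataset into a local folder.
    # repo_id and local_dir below are placeholders for illustration only.
    snapshot_download(
        repo_id="my-org/results",
        local_dir="./eval-results",
        repo_type="dataset",
        etag_timeout=30,
        token=None,  # or a read token for a private dataset
    )
except Exception as err:
    # The Space falls back to restart_space() here; locally we just report.
    print(f"snapshot_download failed: {err}")
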
src/about.py CHANGED
@@ -12,20 +12,60 @@ class Task:
 # ---------------------------------------------------
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("anli_r1", "acc", "ANLI")
-    task1 = Task("logiqa", "acc_norm", "LogiQA")
+    # task0 = Task("anli_r1", "acc", "ANLI")
+    # task1 = Task("logiqa", "acc_norm", "LogiQA")
+
+    # task0 = Task('agieval', 'accuracy', 'AGIEval_acc')
+    # task1 = Task('agieval', 'consistency', 'AGIEval_CR')
+    # task2 = Task('mmlu_pro', 'accuracy', 'MMLU-Pro_acc')
+    # task3 = Task('mmlu_pro', 'consistency', 'MMLU-Pro_CR')
+    # task4 = Task('math', 'accuracy', 'Math_acc')
+    # task5 = Task('math', 'consistency', 'Math_CR')
+
+    task0 = Task('agieval-acc', 'accuracy', 'AGIEval Mean (Min, Max)')
+    task1 = Task('agieval-cr', 'consistency', 'AGIEval CR')
+    task2 = Task('mmlu_pro-acc', 'accuracy', 'MMLU-Pro Mean (Min, Max)')
+    task3 = Task('mmlu_pro-cr', 'consistency', 'MMLU-Pro CR')
+    task4 = Task('math-acc', 'accuracy', 'Math Mean (Min, Max)')
+    task5 = Task('math-cr', 'consistency', 'Math CR')
+
 
 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
 
 
-
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""
+TITLE = """<h1 align="center" id="space-title">SCORE Leaderboard</h1>"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-Intro text
+We introduce <b>SCORE</b> - an open and holistic evaluation framework for LLMs centered on robustness, i.e. the ability to produce consistent responses when the input is rephrased
+or presented in a slightly different way. Prediction consistency is particularly crucial for factual questions where an objective answer exists. Note that predictions are only
+expected to be equivalent, not necessarily correct. Models are evaluated multiple times in equivalent setups, and the accuracy range is reported along with the prediction
+consistency rate. In contrast to a single accuracy metric (often derived from an optimized setup) reported during model releases, this better simulates human
+interaction setups and provides a better estimate of real-world performance. Furthermore, all models are evaluated using the same setup, which makes model comparison possible.
+
+<h1 align="center" id="space-title">Tasks</h1>
+<b>Prompt Robustness</b> - Models are evaluated on ten different prompts. For multiple-choice question (MCQ) datasets, prompts ask the model to choose the right option
+letter. For MATH, prompts ask the model to solve the problem. The prompt set is diverse enough to cover various content and formatting styles that the model may encounter
+in real life; the prompts are not adversarial or tuned in any way. Prompts are semantically close but vary by instruction and level of response detail. Prompts end with final
+answer formatting instructions. We include both CoT and non-CoT prompts and vary the placement of the question to be either at the beginning, in the middle,
+or at the end of the prompt.
+
+<b>Non-Greedy Inference</b> - We study the effect of the random seed during non-greedy inference. For factual questions, the model's underlying distribution should be sharp enough
+to be independent of the random seed used for next-token sampling. There is an inherent randomness in the answer generation process, which may affect the "path" the model takes to arrive at an answer.
+
+<b>Choice Order Robustness</b> - We test models against changes in the order of choices for MCQ datasets. We swap the order of choices and ensure the correct answer
+is always the same option (all correct answers are A, or all are B, etc.). Changing the order of choices does not change the input's semantics, and it is expected that models
+will be robust against such a minimal change.
+
+<h1 align="center" id="space-title">Datasets</h1>
+<b>MMLU Pro</b> - text? <br>
+<b>AGIEval</b> - text? <br>
+<b>MATH</b> - text <br>
+
+<h1 align="center" id="space-title">Metrics</h1>
+
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
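
The accuracy range and prediction consistency rate (CR) described in the INTRODUCTION_TEXT above can be sketched in a few lines. This is a minimal illustration only, assuming each model is run in several equivalent setups over the same ordered questions; `accuracy` and `consistency_rate` are hypothetical helpers, not code from this repository, and CR is taken here as pairwise agreement between runs:

from itertools import combinations

def accuracy(predictions, references):
    # Fraction of questions answered correctly in one run.
    return sum(p == r for p, r in zip(predictions, references)) / len(references)

def consistency_rate(runs):
    # Fraction of per-question prediction pairs that agree across equivalent runs.
    agree, total = 0, 0
    for answers in zip(*runs):  # answers to one question across all runs
        for a, b in combinations(answers, 2):
            agree += int(a == b)
            total += 1
    return agree / total

# Three equivalent setups (e.g. three prompt variants) over the same three questions.
runs = [["A", "B", "C"], ["A", "B", "D"], ["A", "C", "C"]]
references = ["A", "B", "C"]
accs = [accuracy(r, references) for r in runs]
print(f"accuracy mean={sum(accs)/len(accs):.2f} (min={min(accs):.2f}, max={max(accs):.2f})")
print(f"consistency rate={consistency_rate(runs):.2f}")

The "Mean (Min, Max)" and "CR" column names defined in the Tasks enum correspond to the two printed lines.
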
src/envs.py CHANGED
@@ -6,7 +6,8 @@ from huggingface_hub import HfApi
 # ----------------------------------
 TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
 
-OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format!
+# OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format!
+OWNER = "nvidia" # Change to your org - don't forget to create a results and request dataset, with the correct format!
 # ----------------------------------
 
 REPO_ID = f"{OWNER}/leaderboard"
src/leaderboard/read_evals.py CHANGED
@@ -18,14 +18,14 @@ class EvalResult:
     """
     eval_name: str # org_model_precision (uid)
     full_model: str # org/model (path on hub)
-    org: str
+    org: str
     model: str
     revision: str # commit hash, "" if main
     results: dict
     precision: Precision = Precision.Unknown
     model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
     weight_type: WeightType = WeightType.Original # Original or Adapter
-    architecture: str = "Unknown"
+    architecture: str = "Unknown"
     license: str = "?"
     likes: int = 0
     num_params: int = 0
@@ -35,6 +35,7 @@ class EvalResult:
     @classmethod
     def init_from_json_file(self, json_filepath):
         """Inits the result from the specific model result file"""
+
         with open(json_filepath) as fp:
             data = json.load(fp)
 
@@ -58,7 +59,7 @@ class EvalResult:
         full_model = "/".join(org_and_model)
 
         still_on_hub, _, model_config = is_model_on_hub(
-            full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
+            full_model, config.get("model_sha", "main"), token=os.getenv('HF_TOKEN', None), trust_remote_code=True, test_tokenizer=False
         )
         architecture = "?"
         if model_config is not None:
@@ -72,20 +73,23 @@ class EvalResult:
             task = task.value
 
             # We average all scores of a given metric (not all metrics are present in all files)
-            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
-            if accs.size == 0 or any([acc is None for acc in accs]):
-                continue
-
-            mean_acc = np.mean(accs) * 100.0
-            results[task.benchmark] = mean_acc
-
+            # accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
+            # if accs.size == 0 or any([acc is None for acc in accs]):
+            #     continue
+
+            # mean_acc = np.mean(accs) * 100.0
+            # results[task.benchmark] = [mean_acc, 100]
+            metric = data["results"][task.benchmark.split('-')[0]].get(task.metric, None)
+            if task.benchmark.endswith("acc"):
+                metric = f"{metric[0]}, ({metric[1]}, {metric[2]})"
+            results[task.benchmark] = metric
         return self(
             eval_name=result_key,
            full_model=full_model,
             org=org,
             model=model,
             results=results,
-            precision=precision,
+            precision=precision,
             revision= config.get("model_sha", ""),
             still_on_hub=still_on_hub,
             architecture=architecture
@@ -109,7 +113,7 @@ class EvalResult:
 
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
-        average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
+        # average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
         data_dict = {
             "eval_name": self.eval_name, # not a column, just a save name,
             AutoEvalColumn.precision.name: self.precision.value.name,
@@ -119,7 +123,8 @@ class EvalResult:
             AutoEvalColumn.architecture.name: self.architecture,
             AutoEvalColumn.model.name: make_clickable_model(self.full_model),
             AutoEvalColumn.revision.name: self.revision,
-            AutoEvalColumn.average.name: average,
+            AutoEvalColumn.average.name: 0,
+            # AutoEvalColumn.average.name: average,
             AutoEvalColumn.license.name: self.license,
             AutoEvalColumn.likes.name: self.likes,
             AutoEvalColumn.params.name: self.num_params,
@@ -184,7 +189,6 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
             eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
         else:
             eval_results[eval_name] = eval_result
-
     results = []
     for v in eval_results.values():
         try:
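
The new lookup in `init_from_json_file` splits the benchmark name on '-' and formats accuracy entries as "mean, (min, max)", which suggests each results file stores a three-element list per accuracy metric. A minimal sketch of that assumed layout; the numbers, ordering, and dictionary shape are illustrative guesses, not taken from real result files:

# Assumed shape of data["results"] after this change; values are made up.
example_results = {
    "agieval":  {"accuracy": [55.2, 51.8, 58.4], "consistency": 71.3},
    "mmlu_pro": {"accuracy": [44.0, 41.5, 46.9], "consistency": 65.0},
    "math":     {"accuracy": [30.1, 27.4, 33.0], "consistency": 58.2},
}

# Mirror of the new parsing for one Task: "agieval-acc" -> results["agieval"]["accuracy"],
# displayed as "mean, (min, max)"; "*-cr" tasks keep the raw consistency value.
benchmark, metric_key = "agieval-acc", "accuracy"
metric = example_results[benchmark.split("-")[0]].get(metric_key, None)
if benchmark.endswith("acc"):
    metric = f"{metric[0]}, ({metric[1]}, {metric[2]})"
print(metric)  # 55.2, (51.8, 58.4)
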
src/populate.py CHANGED
@@ -19,6 +19,7 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
 
     # filter out if any of the benchmarks have not been produced
     df = df[has_no_nan_values(df, benchmark_cols)]
+    print(df)
     return df
 
 
src/submission/check_validity.py CHANGED
@@ -33,6 +33,7 @@ def check_model_card(repo_id: str) -> tuple[bool, str]:
 
 def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str]:
     """Checks if the model model_name is on the hub, and whether it (and its tokenizer) can be loaded with AutoClasses."""
+    config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
     try:
         config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
         if test_tokenizer: