Commit f86eaae · Alina Lozovskaia committed
Parent(s): 87e47c2

Fixing WIP

Files changed:
- src/display/utils.py              +21 -0
- src/leaderboard/filter_models.py   +0 -3
- src/leaderboard/read_evals.py     +39 -36
src/display/utils.py CHANGED

@@ -1,9 +1,30 @@
 from dataclasses import dataclass, make_dataclass
 from enum import Enum
 import json
+import logging
+from datetime import datetime
 import pandas as pd


+# Configure logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+def parse_datetime(datetime_str):
+    formats = [
+        "%Y-%m-%dT%H-%M-%S.%f", # Format with dashes
+        "%Y-%m-%dT%H:%M:%S.%f", # Standard format with colons
+        "%Y-%m-%dT%H %M %S.%f", # Spaces as separator
+    ]
+
+    for fmt in formats:
+        try:
+            return datetime.strptime(datetime_str, fmt)
+        except ValueError:
+            continue
+    # in rare cases set unix start time for files with incorrect time (legacy files)
+    logging.error(f"No valid date format found for: {datetime_str}")
+    return datetime(1970, 1, 1)
+
 def load_json_data(file_path):
     """Safely load JSON data from a file."""
     try:
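Note: a minimal sketch of how the parse_datetime helper added above behaves; the timestamp strings are illustrative and not taken from this commit.

from src.display.utils import parse_datetime

# Both separator styles in the time part resolve to the same datetime.
print(parse_datetime("2024-02-27T14-30-05.123456"))  # dashes
print(parse_datetime("2024-02-27T14:30:05.123456"))  # colons

# Strings matching none of the formats fall back to the Unix epoch and log an error.
print(parse_datetime("not-a-date"))  # 1970-01-01 00:00:00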
src/leaderboard/filter_models.py CHANGED

@@ -1,8 +1,6 @@
-import logging
 from src.display.formatting import model_hyperlink
 from src.display.utils import AutoEvalColumn

-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

 # Models which have been flagged by users as being problematic for a reason or another
 # (Model name to forum discussion link)
@@ -141,7 +139,6 @@ def flag_models(leaderboard_data: list[dict]):
         else:
             flag_key = model_data[AutoEvalColumn.fullname.name]
         if flag_key in FLAGGED_MODELS:
-            # logging.info(f"Flagged model: {flag_key}") # Do we need to print out the list of flagged models?
             issue_num = FLAGGED_MODELS[flag_key].split("/")[-1]
             issue_link = model_hyperlink(
                 FLAGGED_MODELS[flag_key],
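Aside: FLAGGED_MODELS maps a model's full name to its forum discussion link, and the code above takes the last path segment of that link as the issue number. A small sketch with a made-up entry (not a real flag, and the URL is only illustrative):

# Made-up mapping entry in the shape FLAGGED_MODELS uses (model name -> discussion URL).
FLAGGED_MODELS = {
    "some-org/some-model": "https://huggingface.co/spaces/example-org/example-space/discussions/123",
}

flag_key = "some-org/some-model"
issue_num = FLAGGED_MODELS[flag_key].split("/")[-1]
print(issue_num)  # 123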
src/leaderboard/read_evals.py CHANGED

@@ -1,6 +1,5 @@
 import json
 from pathlib import Path
-from datetime import datetime
 from json import JSONDecodeError
 import logging
 import math
@@ -14,7 +13,7 @@ from tqdm.contrib.logging import logging_redirect_tqdm
 import numpy as np

 from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, ModelType, Precision, Tasks, WeightType
+from src.display.utils import AutoEvalColumn, ModelType, Precision, Tasks, WeightType, parse_datetime

 # Configure logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
@@ -54,7 +53,14 @@ class EvalResult:
         org_and_model = config.get("model_name", "").split("/", 1)
         org = org_and_model[0] if len(org_and_model) > 1 else None
         model = org_and_model[-1]
-
+        if len(org_and_model) == 1:
+            org = None
+            model = org_and_model[0]
+            result_key = f"{model}_{precision.value.name}"
+        else:
+            org = org_and_model[0]
+            model = org_and_model[1]
+            result_key = f"{org}_{model}_{precision.value.name}"
         full_model = "/".join(org_and_model)

         results = cls.extract_results(data)  # Properly call the method to extract results
@@ -71,26 +77,39 @@ class EvalResult:

     @staticmethod
     def extract_results(data: Dict) -> Dict[str, float]:
+        """
+        Extracts and computes average scores from test result data for different benchmarks.
+        Skips entries based on specific conditions and handles NaN values appropriately.
+        Returns a dictionary with benchmarks as keys and their averaged scores as values in percentage.
+
+        Parameters:
+        - data (Dict): Input data with 'versions' and 'results'.
+
+        Returns:
+        - Dict[str, float]: A dictionary with benchmark names and their computed average scores.
+        """
         results = {}
         for task in Tasks:
-
+            task = task.value

-
-
+            # We skip old mmlu entries
+            if task.benchmark == "hendrycksTest":
+                for mmlu_k in ["harness|hendrycksTest-abstract_algebra|5", "hendrycksTest-abstract_algebra"]:
+                    if mmlu_k in data["versions"] and data["versions"][mmlu_k] == 0:
+                        continue
+
+            # Some truthfulQA values are NaNs
+            if task.benchmark == "truthfulqa:mc" and "harness|truthfulqa:mc|0" in data["results"]:
+                if math.isnan(float(data["results"]["harness|truthfulqa:mc|0"][task.metric])):
+                    results[task.benchmark] = 0.0
                    continue

-
-
-
-
-            if math.isnan(float(task_metric_value)):
-                results[task_value.benchmark] = 0.0
-                continue
+            # We average all scores of a given metric (mostly for mmlu)
+            accs = [v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k]
+            if accs or any([acc is None for acc in accs]):
+                continue

-
-            if accs:
-                mean_acc = np.mean(accs) * 100.0
-                results[task_value.benchmark] = mean_acc
+            results[task.benchmark] = np.mean(accs) * 100.0

         return results

@@ -192,23 +211,6 @@ def get_request_file_for_model(requests_path, model_name, precision):
     return request_file


-def parse_datetime(datetime_str):
-    formats = [
-        "%Y-%m-%dT%H-%M-%S.%f", # Format with dashes
-        "%Y-%m-%dT%H:%M:%S.%f", # Standard format with colons
-        "%Y-%m-%dT%H %M %S.%f", # Spaces as separator
-    ]
-
-    for fmt in formats:
-        try:
-            return datetime.strptime(datetime_str, fmt)
-        except ValueError:
-            continue
-    # in rare cases set unix start time for files with incorrect time (legacy files)
-    logging.error(f"No valid date format found for: {datetime_str}")
-    return datetime(1970, 1, 1)
-
-
 def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: str) -> list[EvalResult]:
     """From the path of the results folder root, extract all needed info for results"""
     with open(dynamic_path) as f:
@@ -246,7 +248,8 @@ def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: str) -> list[EvalResult]:
             v.to_dict() # we test if the dict version is complete
             results.append(v)
         except KeyError as e:
-            logging.error(f"Error while checking model {k}
+            logging.error(f"Error while checking model {k} {v.date} json, no key: {e}") # not all eval values present
             continue

-    return results
+    return results
+
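Note: two small self-contained sketches of what the restored read_evals.py logic does; every model name, result key, and score below is invented for illustration.

import numpy as np

# 1) Building the result key from an invented model name, mirroring the
#    len(org_and_model) == 1 / else branch restored in EvalResult above.
precision_name = "bfloat16"  # stand-in for precision.value.name
org_and_model = "some-org/some-model".split("/", 1)
if len(org_and_model) == 1:
    result_key = f"{org_and_model[0]}_{precision_name}"
else:
    org, model = org_and_model
    result_key = f"{org}_{model}_{precision_name}"
print(result_key)  # some-org_some-model_bfloat16

# 2) Averaging sub-task scores into a percentage, mirroring
#    results[task.benchmark] = np.mean(accs) * 100.0 in extract_results.
harness_results = {
    "harness|hendrycksTest-abstract_algebra|5": {"acc_norm": 0.25},
    "harness|hendrycksTest-anatomy|5": {"acc_norm": 0.75},
    "harness|arc:challenge|25": {"acc_norm": 0.61},
}
benchmark, metric = "hendrycksTest", "acc_norm"
accs = [v.get(metric) for k, v in harness_results.items() if benchmark in k]
print(np.mean(accs) * 100.0)  # 50.0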