import glob
import json
import os
from dataclasses import dataclass
from functools import lru_cache

import numpy as np

from app.display.formatting import make_clickable_model
from app.display.utils import AutoEvalColumn, ModelType, Precision, Tasks
from app.submission.check_validity import is_model_on_hub


# Cache hub checks to avoid repeated network calls for the same model/revision
@lru_cache(maxsize=256)
def cached_is_model_on_hub(full_model, revision):
    """Cached version of is_model_on_hub to avoid repeated network calls."""
    return is_model_on_hub(full_model, revision, trust_remote_code=True, test_tokenizer=False)


@dataclass
class EvalResult:
    """Represents one full evaluation. Built from a combination of the result and request file for a given run."""

    eval_name: str  # org_model_precision (uid)
    full_model: str  # org/model (path on hub)
    org: str
    model: str
    revision: str  # commit hash, "" if main
    results: dict
    precision: Precision = Precision.Unknown
    model_type: ModelType = ModelType.Unknown  # Pretrained, fine tuned, ...
    architecture: str = "Unknown"
    license: str = "?"
    likes: int = 0
    num_params: int = 0
    date: str = ""  # submission date of request file
    still_on_hub: bool = False
    reasoning: bool = False  # Whether reasoning is enabled for this model
    note: str = ""  # Extra information about the model (e.g., thinking budget, warnings)

    @classmethod
    def init_from_new_format_json_file(cls, json_filepath):
        """Inits the result from the new format model result file"""
        with open(json_filepath) as fp:
            data = json.load(fp)

        results = data.get("results", {})

        full_model = data.get("config_general", {}).get("model_name", "").strip()
        result_key = full_model.replace("/", "_")
        org, model = full_model.split("/", 1) if "/" in full_model else ("", full_model)

        still_on_hub, _, model_config = cached_is_model_on_hub(full_model, "main")
        architecture = "?"
        if model_config is not None:
            architectures = getattr(model_config, "architectures", None)
            if architectures:
                architecture = ";".join(architectures)

        # Extract the scores available in this file, averaged per benchmark
        score_results = {}
        for task in Tasks:
            task = task.value
            benchmark_id = task.benchmark
            metric = task.metric

            scores = [
                results[key][metric]
                for key in results
                if "|" in key and benchmark_id.startswith(key.split("|")[1].removeprefix("icelandic_evals:"))
            ]
            if len(scores) == 0:
                continue

            mean_acc = np.mean(scores) * 100.0
            score_results[benchmark_id] = mean_acc

        return cls(
            eval_name=result_key,
            full_model=full_model,
            org=org,
            model=model,
            results=score_results,
            revision="",
            still_on_hub=still_on_hub,
            architecture=architecture,
        )

    def update_with_request_file(self, requests_path):
        """Finds the relevant request file for the current model and updates info with it"""
        request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)

        try:
            with open(request_file, "r") as f:
                request = json.load(f)
            self.model_type = ModelType.from_str(request.get("model_type", ""))
            self.license = request.get("license", "?")
            self.likes = request.get("likes", 0)
            self.num_params = request.get("params", 0)
            self.date = request.get("submitted_time", "")
            # Reasoning counts as enabled if the flag is set or a reasoning_effort was requested
            self.reasoning = bool(
                request.get("reasoning", False) or (request.get("gen_kwargs") or {}).get("reasoning_effort")
            )
            self.note = request.get("note", "")  # Default to empty string if missing
        except FileNotFoundError:
            print(
                f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}"
            )

    def to_dict(self):
        """Converts the Eval Result to a dict compatible with our dataframe display"""
        average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
        data_dict = {
            "eval_name": self.eval_name,  # not a column, just a save name
            AutoEvalColumn.precision.name: self.precision.value.name,
            AutoEvalColumn.model_type.name: self.model_type.value.name,
            AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
            AutoEvalColumn.architecture.name: self.architecture,
            AutoEvalColumn.model.name: make_clickable_model(self.full_model),
            AutoEvalColumn.revision.name: self.revision,
            AutoEvalColumn.average.name: average,
            AutoEvalColumn.license.name: self.license,
            AutoEvalColumn.likes.name: self.likes,
            AutoEvalColumn.params.name: self.num_params,
            AutoEvalColumn.still_on_hub.name: self.still_on_hub,
            AutoEvalColumn.reasoning.name: self.reasoning,
            AutoEvalColumn.note.name: self.note,
        }

        for task in Tasks:
            if task.value.benchmark in self.results.keys():
                data_dict[task.value.col_name] = self.results[task.value.benchmark]
            else:
                data_dict[task.value.col_name] = None

        return data_dict


def get_request_file_for_model(requests_path, model_name, precision):
    """Selects the correct request file for a given model, matching on precision
    (or accepting files with no precision set)."""
    request_files = os.path.join(
        requests_path,
        f"{model_name}_eval_request_*.json",
    )
    request_files = glob.glob(request_files)

    if len(request_files) == 1:
        return request_files[0]

    # Select correct request file (precision)
    request_file = ""
    request_files = sorted(request_files, reverse=True)
    for tmp_request_file in request_files:
        with open(tmp_request_file, "r") as f:
            req_content = json.load(f)
            if req_content["precision"] == precision.split(".")[-1] or req_content["precision"] is None:
                request_file = tmp_request_file
    return request_file


def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
    """From the path of the results folder root, extract all needed info for results"""
    model_result_filepaths = []

    # Collect all JSON files first
    for root, _, files in os.walk(results_path):
        # We should only have json files in model results
        json_files = [f for f in files if f.endswith(".json")]
        if len(json_files) == 0:
            continue

        # Sort result files by the timestamp embedded in the filename (newest last)
        try:
            json_files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
        except (ValueError, IndexError):
            # If sorting fails, just use the files as-is or take the last one
            json_files = [json_files[-1]] if json_files else []

        for file in json_files:
            model_result_filepaths.append(os.path.join(root, file))

    eval_results = {}
    for model_result_filepath in model_result_filepaths:
        try:
            # Creation of result
            eval_result = EvalResult.init_from_new_format_json_file(model_result_filepath)
            eval_result.update_with_request_file(requests_path)

            # Store results of same eval together
            eval_name = eval_result.eval_name
            if eval_name in eval_results:
                # Update with newer scores
                eval_results[eval_name].results.update(
                    {k: v for k, v in eval_result.results.items() if v is not None}
                )
            else:
                eval_results[eval_name] = eval_result
        except Exception as e:
            # Log error but continue processing other files
            print(f"Error processing {model_result_filepath}: {e}")
            continue

    results = []
    for v in eval_results.values():
        try:
            v.to_dict()  # we test if the dict version is complete
            results.append(v)
        except KeyError:  # not all eval values present
            continue

    return results
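

if __name__ == "__main__":
    # Minimal smoke-check sketch for running this module directly: it parses whatever
    # result/request files are present locally and prints a short summary. The "results"
    # and "requests" directory names below are assumptions for local testing, not part of
    # the app's configuration; substitute the paths your deployment actually syncs into.
    raw_results = get_raw_eval_results(results_path="results", requests_path="requests")
    print(f"Parsed {len(raw_results)} complete eval result(s)")
    for res in raw_results:
        row = res.to_dict()
        print(res.eval_name, row[AutoEvalColumn.average.name])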