import os
import json
import logging
import re
from typing import List, Dict, Any
from pathlib import Path

from huggingface_hub import snapshot_download
from fastapi import HTTPException

from app.config import (
    QUEUE_REPO,
    RESULTS_REPO,
    EVAL_REQUESTS_PATH,
    EVAL_RESULTS_PATH,
    HF_TOKEN,
)
from app.core.cache import cache_config

logger = logging.getLogger(__name__)

try:
    from app.leaderboard.read_evals import get_raw_eval_results
    from app.populate import get_leaderboard_df
    from app.display.utils import COLS, BENCHMARK_COLS, Tasks
except ImportError as e:
    # Fallback for development without the mounted volume
    logger.warning(f"Could not import original modules: {e}")

    # Define minimal fallbacks
    COLS = [
        "Model", "Average ⬆️", "Type", "Precision", "Architecture",
        "Hub License", "Hub ❤️", "#Params (B)", "Available on the hub",
        "Model sha",
    ]
    BENCHMARK_COLS = [
        "WinoGrande-IS (3-shot)", "GED", "Inflection (1-shot)",
        "Belebele (IS)", "ARC-Challenge-IS", "WikiQA-IS",
    ]

    class MockTask:
        def __init__(self, name, col_name):
            self.name = name
            self.col_name = col_name

    class Tasks:
        task0 = MockTask("winogrande_is", "WinoGrande-IS (3-shot)")
        task1 = MockTask("ged", "GED")
        task2 = MockTask("inflection", "Inflection (1-shot)")
        task5 = MockTask("belebele_is", "Belebele (IS)")
        task6 = MockTask("arc_challenge_is", "ARC-Challenge-IS")
        task7 = MockTask("wiki_qa_is", "WikiQA-IS")


class IcelandicLeaderboardService:
    def __init__(self):
        self.results_path = EVAL_RESULTS_PATH
        self.requests_path = EVAL_REQUESTS_PATH

    async def _ensure_data_available(self):
        """Ensure evaluation data is available locally."""
        try:
            # Download results if the directory does not exist or is empty
            if not os.path.exists(self.results_path) or not os.listdir(self.results_path):
                logger.info(f"Downloading results to {self.results_path}")
                snapshot_download(
                    repo_id=RESULTS_REPO,
                    local_dir=self.results_path,
                    repo_type="dataset",
                    token=HF_TOKEN,
                    tqdm_class=None,
                    etag_timeout=30,
                )
            # Download requests if the directory does not exist or is empty
            if not os.path.exists(self.requests_path) or not os.listdir(self.requests_path):
                logger.info(f"Downloading requests to {self.requests_path}")
                snapshot_download(
                    repo_id=QUEUE_REPO,
                    local_dir=self.requests_path,
                    repo_type="dataset",
                    token=HF_TOKEN,
                    tqdm_class=None,
                    etag_timeout=30,
                )
        except Exception as e:
            logger.error(f"Failed to download data: {e}")
            raise HTTPException(status_code=500, detail=f"Failed to download data: {str(e)}")

    async def fetch_raw_data(self) -> List[Dict[str, Any]]:
        """Fetch raw leaderboard data using the original Icelandic processing logic."""
        try:
            await self._ensure_data_available()
            logger.info("Processing Icelandic leaderboard data")

            # Try to use the original processing logic if it imported successfully
            try:
                raw_data, df = get_leaderboard_df(
                    self.results_path,
                    self.requests_path,
                    COLS,
                    BENCHMARK_COLS,
                )
                # Convert the DataFrame to a list of dictionaries
                data = df.to_dict("records")
                logger.info(f"Processed {len(data)} Icelandic leaderboard entries")
                return data
            except NameError:
                # Fallback: return mock data for testing
                logger.warning("Using mock data - original processing modules not available")
                return self._generate_mock_data()
        except Exception as e:
            logger.error(f"Failed to fetch Icelandic leaderboard data: {e}")
            raise HTTPException(status_code=500, detail=str(e))

    def _generate_mock_data(self) -> List[Dict[str, Any]]:
        """Generate mock data for testing when the original modules aren't available."""
        return [
            {
                "Model": "test-model/icelandic-gpt-7b",
                "Average ⬆️": 85.5,
                "Type": "fine-tuned",
                "T": "🔶",
                "Precision": "bfloat16",
                "Architecture": "LlamaForCausalLM",
                "Hub License": "apache-2.0",
❤️": 42, "#Params (B)": 7.0, "Available on the hub": True, "Model sha": "abc123def456", "WinoGrande-IS (3-shot)": 78.5, "GED": 92.3, "Inflection (1-shot)": 85.1, "Belebele (IS)": 80.7, "ARC-Challenge-IS": 76.2, "WikiQA-IS": 89.4, "Reasoning": False, "Note": "" }, { "Model": "test-model/icelandic-llama-13b", "Average ⬆️": 88.2, "Type": "instruction-tuned", "T": "⭕", "Precision": "float16", "Architecture": "LlamaForCausalLM", "Hub License": "mit", "Hub ❤️": 156, "#Params (B)": 13.0, "Available on the hub": True, "Model sha": "def456abc789", "WinoGrande-IS (3-shot)": 82.1, "GED": 94.8, "Inflection (1-shot)": 87.9, "Belebele (IS)": 85.3, "ARC-Challenge-IS": 79.8, "WikiQA-IS": 91.2, "Reasoning": True, "Note": "reasoning model with 32k thinking budget" } ] async def get_formatted_data(self) -> List[Dict[str, Any]]: """Get formatted leaderboard data compatible with React frontend""" try: raw_data = await self.fetch_raw_data() formatted_data = [] for item in raw_data: try: formatted_item = await self.transform_data(item) formatted_data.append(formatted_item) except Exception as e: logger.error(f"Failed to format entry: {e}") continue logger.info(f"Formatted {len(formatted_data)} entries for frontend") return formatted_data except Exception as e: logger.error(f"Failed to format leaderboard data: {e}") raise HTTPException(status_code=500, detail=str(e)) async def transform_data(self, data: Dict[str, Any]) -> Dict[str, Any]: """Transform Icelandic leaderboard data into format expected by React frontend""" # Create unique ID and clean model name raw_model_name = data.get("Model", "Unknown") # Extract clean model name from HTML if present if '([^<]+)', raw_model_name) model_name = match.group(1) if match else raw_model_name else: model_name = raw_model_name precision = data.get("Precision", "Unknown") revision = data.get("Model sha", "Unknown") unique_id = f"{model_name}_{precision}_{revision}" # Map Icelandic tasks to evaluations format evaluations = {} task_mapping = { "WinoGrande-IS (3-shot)": "winogrande_is", "GED": "ged", "Inflection (1-shot)": "inflection", "Belebele (IS)": "belebele_is", "ARC-Challenge-IS": "arc_challenge_is", "WikiQA-IS": "wiki_qa_is" } for task_display_name, task_key in task_mapping.items(): if task_display_name in data: evaluations[task_key] = { "name": task_display_name, "value": data.get(task_display_name, 0), "normalized_score": data.get(task_display_name, 0) } # Extract model type and clean it model_type_symbol = data.get("T", "") model_type_name = data.get("Type", "Unknown") # Map Icelandic model types to frontend format type_mapping = { "pretrained": "pretrained", "fine-tuned": "fine-tuned", "instruction-tuned": "instruction-tuned", "RL-tuned": "RL-tuned" } clean_model_type = type_mapping.get(model_type_name, model_type_name) features = { "is_not_available_on_hub": not data.get("Available on the hub", True), "is_merged": False, # Not tracked in Icelandic leaderboard "is_moe": False, # Not tracked in Icelandic leaderboard "is_flagged": False, # Not tracked in Icelandic leaderboard "is_official_provider": False # Not tracked in Icelandic leaderboard } metadata = { "upload_date": None, # Not available in Icelandic data "submission_date": None, # Not available in Icelandic data "generation": None, # Not available in Icelandic data "base_model": None, # Not available in Icelandic data "hub_license": data.get("Hub License", ""), "hub_hearts": data.get("Hub ❤️", 0), "params_billions": data.get("#Params (B)", 0), "co2_cost": 0 # Not tracked in Icelandic leaderboard } 
        transformed_data = {
            "id": unique_id,
            "model": {
                "name": model_name,
                "sha": revision,
                "precision": precision,
                "type": clean_model_type,
                "weight_type": None,  # Not available in the Icelandic data
                "architecture": data.get("Architecture", "Unknown"),
                "average_score": data.get("Average ⬆️", 0),
                "has_chat_template": False,  # Not tracked in the Icelandic leaderboard
                "reasoning": data.get("Reasoning", False),  # Reasoning-enabled flag
                "note": data.get("Note", ""),  # Extra model information
            },
            "evaluations": evaluations,
            "features": features,
            "metadata": metadata,
        }

        return transformed_data
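

# ---------------------------------------------------------------------------
# Usage sketch (not part of the original module): a minimal, hypothetical
# smoke test showing how the service is expected to be driven, e.g. from a
# FastAPI route or an ad-hoc script. It assumes the app.config repos/paths
# resolve (snapshot_download needs network access) or that the mock fallback
# above is active.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import asyncio

    async def _demo():
        service = IcelandicLeaderboardService()
        entries = await service.get_formatted_data()
        # Print the first transformed entry so its shape can be inspected.
        if entries:
            print(json.dumps(entries[0], ensure_ascii=False, indent=2))

    asyncio.run(_demo())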