"""Lazy, retrying access to the leaderboard parquet datasets."""

import logging
import os
import time
from functools import lru_cache
from typing import Dict, Optional

import pandas as pd
import requests
from huggingface_hub import snapshot_download
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

from config import CONFIG

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class _TimeoutHTTPAdapter(HTTPAdapter):
    """HTTPAdapter that supplies a default timeout when the caller gives none."""

    # Keep the extra attribute when the adapter is pickled/unpickled.
    __attrs__ = HTTPAdapter.__attrs__ + ["_default_timeout"]

    def __init__(self, *args, timeout=None, **kwargs):
        self._default_timeout = timeout
        super().__init__(*args, **kwargs)

    def send(self, request, **kwargs):
        """Send *request*, falling back to the adapter's default timeout."""
        # An explicit per-request timeout always wins over the default.
        if kwargs.get("timeout") is None:
            kwargs["timeout"] = self._default_timeout
        return super().send(request, **kwargs)


# Configure requests with retries
def create_retry_session(
    retries=5,
    backoff_factor=0.5,
    status_forcelist=(500, 502, 503, 504),
    timeout=30,
):
    """Create a requests session with retry capabilities.

    Args:
        retries: Maximum number of total, read and connect retries.
        backoff_factor: Backoff factor handed to urllib3's ``Retry``.
        status_forcelist: HTTP status codes that trigger a retry.
        timeout: Default timeout in seconds for requests sent through the
            session when the caller does not pass ``timeout=`` explicitly.

    Returns:
        A ``requests.Session`` with the retrying adapter mounted for both
        ``http://`` and ``https://``.
    """
    session = requests.Session()
    retry = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    # requests never reads a ``timeout`` attribute on the session, so the
    # default has to be enforced in the transport adapter instead.
    adapter = _TimeoutHTTPAdapter(max_retries=retry, timeout=timeout)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    # Kept so existing code that reads ``session.timeout`` keeps working.
    session.timeout = timeout
    return session


class DataManager:
    """Load the leaderboard, responses and section-results datasets on demand.

    Each dataset is read the first time its property is accessed and then
    kept in memory until ``refresh_datasets`` clears it.
    """

    def __init__(self):
        self._leaderboard_data: Optional[pd.DataFrame] = None
        self._responses_data: Optional[pd.DataFrame] = None
        self._section_results_data: Optional[pd.DataFrame] = None
        self._session = create_retry_session()
        self._max_retries = 3
        self._retry_delay = 2  # seconds

    def _load_dataset(self, path: str) -> pd.DataFrame:
        """Load dataset with retries.

        Args:
            path: Location handed to ``pd.read_parquet``.

        Returns:
            The loaded frame, or a placeholder frame when every attempt fails.
            The placeholder is chosen by looking for "leaderboard",
            "responses" or "section_results" in ``path``; an empty frame is
            returned when none of them matches.
        """
        last_error = None
        for attempt in range(1, self._max_retries + 1):
            try:
                logger.info(f"Attempting to load dataset from {path} (attempt {attempt}/{self._max_retries})")
                return pd.read_parquet(path)
            except Exception as e:
                last_error = e
                if attempt >= self._max_retries:
                    # Nothing left to retry: do not sleep or announce a retry.
                    logger.warning(f"Error loading dataset from {path}: {e}.")
                    break
                logger.warning(f"Error loading dataset from {path}: {e}. Retrying in {self._retry_delay} seconds...")
                time.sleep(self._retry_delay)

        # If we get here, all attempts failed
        logger.error(f"Failed to load dataset after {self._max_retries} attempts: {last_error}")

        # Return empty fallback dataframe with appropriate columns
        if "leaderboard" in path:
            return self._create_fallback_leaderboard()
        elif "responses" in path:
            return self._create_fallback_responses()
        elif "section_results" in path:
            return self._create_fallback_section_results()
        else:
            return pd.DataFrame()

    def _create_fallback_leaderboard(self) -> pd.DataFrame:
        """Create a fallback leaderboard dataframe when loading fails."""
        logger.info("Creating fallback leaderboard data")
        return pd.DataFrame({
            "model": ["Example Model"],
            "family": ["Example"],
            "quantization_level": ["None"],
            "score": [0.0],
            "timestamp": [pd.Timestamp.now()]
        })

    def _create_fallback_responses(self) -> pd.DataFrame:
        """Create a fallback responses dataframe when loading fails."""
        logger.info("Creating fallback responses data")
        return pd.DataFrame({
            "bolum": ["Example"],
            "soru": ["Example question"],
            "cevap": ["Example answer"],
            "Example_Model_cevap": ["Example model response"]
        })

    def _create_fallback_section_results(self) -> pd.DataFrame:
        """Create a fallback section results dataframe when loading fails."""
        logger.info("Creating fallback section results data")
        return pd.DataFrame({
            "section": ["Example Section"],
            "score": [0.0]
        })

    def refresh_datasets(self) -> None:
        """Refresh all datasets from source.

        Downloads a snapshot into the configured cache directory and, on
        success, drops the in-memory frames so the next property access
        reloads them. Any failure is logged and the cached frames are kept.
        """
        try:
            logger.info("Starting dataset refresh...")
            # ``max_retries`` / ``retry_delay_seconds`` are not parameters of
            # ``snapshot_download``; passing them raised a TypeError that the
            # handler below swallowed, so the refresh could never succeed.
            # NOTE(review): repo_id "alibayram" has no "<owner>/<name>" form;
            # confirm the intended dataset repository.
            snapshot_download(
                repo_id="alibayram",
                repo_type="dataset",
                local_dir=CONFIG["dataset"].cache_dir,
            )
            # Clear cached data to force reload
            self._leaderboard_data = None
            self._responses_data = None
            self._section_results_data = None
            logger.info("Datasets refreshed successfully")
        except Exception as e:
            logger.error(f"Error refreshing datasets: {e}")

    @property
    def leaderboard_data(self) -> pd.DataFrame:
        """Leaderboard frame, loaded from the configured path on first access."""
        if self._leaderboard_data is None:
            self._leaderboard_data = self._load_dataset(CONFIG["dataset"].leaderboard_path)
        return self._leaderboard_data

    @property
    def responses_data(self) -> pd.DataFrame:
        """Responses frame, loaded from the configured path on first access."""
        if self._responses_data is None:
            self._responses_data = self._load_dataset(CONFIG["dataset"].responses_path)
        return self._responses_data

    @property
    def section_results_data(self) -> pd.DataFrame:
        """Section-results frame, loaded from the configured path on first access."""
        if self._section_results_data is None:
            self._section_results_data = self._load_dataset(CONFIG["dataset"].section_results_path)
        return self._section_results_data


# Global instance
data_manager = DataManager()