# turkish_mmlu_leaderboard / data_manager.py
# Author: alibayram — commit 3ce2f84:
# "Implement robust data loading with retry logic, enhance error handling in
#  Gradio app, and improve user experience with fallback data for leaderboard
#  and responses. Update configuration for request timeouts and retries."
from typing import Optional, Dict
import pandas as pd
from functools import lru_cache
from huggingface_hub import snapshot_download
import logging
import time
import os
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from config import CONFIG
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Configure requests with retries
class _TimeoutHTTPAdapter(HTTPAdapter):
    """HTTPAdapter that applies a default timeout to every request.

    Assigning ``session.timeout = x`` is silently ignored by requests —
    Session never reads such an attribute — so a default timeout has to be
    injected at the adapter level, in ``send()``.
    """

    def __init__(self, *args, timeout=None, **kwargs):
        self._timeout = timeout
        super().__init__(*args, **kwargs)

    def send(self, request, **kwargs):
        # Respect an explicit per-request timeout; otherwise use the default.
        if kwargs.get("timeout") is None:
            kwargs["timeout"] = self._timeout
        return super().send(request, **kwargs)


def create_retry_session(
    retries=5,
    backoff_factor=0.5,
    status_forcelist=(500, 502, 503, 504),
    timeout=30
):
    """Create a requests session with retry capabilities.

    Args:
        retries: Maximum attempts for the total/read/connect retry counters.
        backoff_factor: Exponential backoff multiplier between retries.
        status_forcelist: HTTP status codes that trigger an automatic retry.
        timeout: Default per-request timeout in seconds, enforced by the
            mounted adapter (not by the ``session.timeout`` attribute).

    Returns:
        A configured ``requests.Session``.
    """
    session = requests.Session()
    retry = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    adapter = _TimeoutHTTPAdapter(max_retries=retry, timeout=timeout)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    # Kept for backward compatibility in case callers introspect it;
    # requests itself never reads this attribute.
    session.timeout = timeout
    return session
class DataManager:
    """Lazy loader and cache for the Turkish-MMLU leaderboard datasets.

    Each dataset is read from the parquet path in ``CONFIG["dataset"]`` on
    first property access and memoized. ``refresh_datasets()`` re-downloads
    the snapshot and clears the cache so the next access reloads from disk.
    Load failures degrade to small placeholder DataFrames instead of raising,
    so the Gradio UI always has something to render.
    """

    def __init__(self):
        # Memoized datasets; None means "not loaded yet".
        self._leaderboard_data: Optional[pd.DataFrame] = None
        self._responses_data: Optional[pd.DataFrame] = None
        self._section_results_data: Optional[pd.DataFrame] = None
        # NOTE(review): this session is never used below — pd.read_parquet
        # handles its own I/O. Kept in case external code relies on it.
        self._session = create_retry_session()
        self._max_retries = 3
        self._retry_delay = 2  # seconds between load attempts

    def _load_dataset(self, path: str) -> pd.DataFrame:
        """Load a parquet dataset with retries.

        Returns the loaded DataFrame, or — after all attempts fail — a
        fallback placeholder frame chosen by substring-matching ``path``.
        """
        last_error = None
        for attempt in range(1, self._max_retries + 1):
            try:
                logger.info(f"Attempting to load dataset from {path} (attempt {attempt}/{self._max_retries})")
                return pd.read_parquet(path)
            except Exception as e:
                last_error = e
                # Only sleep (and promise a retry) when another attempt is
                # actually coming; previously we also slept uselessly after
                # the final failure before returning fallback data.
                if attempt < self._max_retries:
                    logger.warning(f"Error loading dataset from {path}: {e}. Retrying in {self._retry_delay} seconds...")
                    time.sleep(self._retry_delay)
                else:
                    logger.warning(f"Error loading dataset from {path}: {e}.")
        # All attempts failed — log and hand back a placeholder frame so the
        # UI keeps working.
        logger.error(f"Failed to load dataset after {self._max_retries} attempts: {last_error}")
        if "leaderboard" in path:
            return self._create_fallback_leaderboard()
        elif "responses" in path:
            return self._create_fallback_responses()
        elif "section_results" in path:
            return self._create_fallback_section_results()
        else:
            return pd.DataFrame()

    def _create_fallback_leaderboard(self) -> pd.DataFrame:
        """Create a fallback leaderboard dataframe when loading fails."""
        logger.info("Creating fallback leaderboard data")
        return pd.DataFrame({
            "model": ["Example Model"],
            "family": ["Example"],
            "quantization_level": ["None"],
            "score": [0.0],
            "timestamp": [pd.Timestamp.now()]
        })

    def _create_fallback_responses(self) -> pd.DataFrame:
        """Create a fallback responses dataframe when loading fails."""
        logger.info("Creating fallback responses data")
        return pd.DataFrame({
            "bolum": ["Example"],
            "soru": ["Example question"],
            "cevap": ["Example answer"],
            "Example_Model_cevap": ["Example model response"]
        })

    def _create_fallback_section_results(self) -> pd.DataFrame:
        """Create a fallback section results dataframe when loading fails."""
        logger.info("Creating fallback section results data")
        return pd.DataFrame({
            "section": ["Example Section"],
            "score": [0.0]
        })

    def refresh_datasets(self) -> None:
        """Refresh all datasets from source.

        Best-effort: errors are logged, never raised, so a failed refresh
        leaves the previously cached data intact only if the download failed
        before the caches were cleared.
        """
        try:
            logger.info("Starting dataset refresh...")
            # NOTE(review): repo_id is a bare user namespace with no dataset
            # name — confirm this is the intended repo. Also confirm the
            # installed huggingface_hub accepts max_retries /
            # retry_delay_seconds; unknown kwargs would raise TypeError here
            # and be swallowed by the except below.
            snapshot_download(
                repo_id="alibayram",
                repo_type="dataset",
                local_dir=CONFIG["dataset"].cache_dir,
                max_retries=5,
                retry_delay_seconds=2
            )
            # Clear cached data to force reload on next property access.
            self._leaderboard_data = None
            self._responses_data = None
            self._section_results_data = None
            logger.info("Datasets refreshed successfully")
        except Exception as e:
            logger.error(f"Error refreshing datasets: {e}")

    @property
    def leaderboard_data(self) -> pd.DataFrame:
        # Lazy-load on first access, then serve from cache.
        if self._leaderboard_data is None:
            self._leaderboard_data = self._load_dataset(CONFIG["dataset"].leaderboard_path)
        return self._leaderboard_data

    @property
    def responses_data(self) -> pd.DataFrame:
        # Lazy-load on first access, then serve from cache.
        if self._responses_data is None:
            self._responses_data = self._load_dataset(CONFIG["dataset"].responses_path)
        return self._responses_data

    @property
    def section_results_data(self) -> pd.DataFrame:
        # Lazy-load on first access, then serve from cache.
        if self._section_results_data is None:
            self._section_results_data = self._load_dataset(CONFIG["dataset"].section_results_path)
        return self._section_results_data
# Global instance
# Module-level singleton created at import time; importers share this one
# DataManager (and its dataset caches) across the whole app.
data_manager = DataManager()