import logging
import os
import time
from functools import lru_cache
from typing import Optional, Dict

import pandas as pd
import requests
from huggingface_hub import snapshot_download
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

from config import CONFIG

# Configure the root logger at import time so INFO-level messages are emitted.
# basicConfig() does nothing if the root logger already has handlers, so an
# application that configured logging earlier keeps its own setup.
logging.basicConfig(level=logging.INFO)

# Module-level logger used by every function and method in this file.
logger = logging.getLogger(__name__)

class _TimeoutHTTPAdapter(HTTPAdapter):
    """HTTPAdapter that applies a default timeout when the caller gives none."""

    # Class-level default keeps send() safe even if __init__ was bypassed
    # (e.g. after unpickling, which only restores HTTPAdapter's own attrs).
    _default_timeout = None

    def __init__(self, *args, timeout=None, **kwargs):
        self._default_timeout = timeout
        super().__init__(*args, **kwargs)

    def send(self, request, stream=False, timeout=None, verify=True,
             cert=None, proxies=None):
        # Session.request() always forwards ``timeout`` (None when the caller
        # did not set one), so None is the signal to fall back to the default.
        if timeout is None:
            timeout = self._default_timeout
        return super().send(
            request,
            stream=stream,
            timeout=timeout,
            verify=verify,
            cert=cert,
            proxies=proxies,
        )


def create_retry_session(
    retries=5,
    backoff_factor=0.5,
    status_forcelist=(500, 502, 503, 504),
    timeout=30
):
    """Create a requests session with retries and a default timeout.

    Args:
        retries: Maximum number of retries, used for the total, read and
            connect budgets of the urllib3 ``Retry`` policy.
        backoff_factor: Backoff factor passed to ``Retry``.
        status_forcelist: HTTP status codes that trigger a retry.
        timeout: Default timeout in seconds applied to every request sent
            through the session that does not pass its own ``timeout``.

    Returns:
        A ``requests.Session`` with the retrying adapter mounted for both
        ``http://`` and ``https://``.
    """
    session = requests.Session()
    retry = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    # requests ignores a ``timeout`` attribute set on the Session, so the
    # default has to be enforced in the transport adapter instead.
    adapter = _TimeoutHTTPAdapter(max_retries=retry, timeout=timeout)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    # Kept for backward compatibility with code that reads session.timeout;
    # requests itself never consults this attribute.
    session.timeout = timeout
    return session

class DataManager:
    """Lazily load and cache the leaderboard, responses and section-results tables.

    Each table is read from the parquet path configured under
    ``CONFIG["dataset"]`` the first time its property is accessed and is then
    cached on the instance. If loading fails after all retries, a small
    placeholder frame is returned and cached instead, so callers always get a
    DataFrame. ``refresh_datasets`` clears the caches after a successful
    download so the next access re-reads from disk.
    """

    def __init__(self):
        # Cached frames; None means "not loaded yet".
        self._leaderboard_data: Optional[pd.DataFrame] = None
        self._responses_data: Optional[pd.DataFrame] = None
        self._section_results_data: Optional[pd.DataFrame] = None
        self._session = create_retry_session()
        # Retry policy used by _load_dataset.
        self._max_retries = 3
        self._retry_delay = 2

    def _load_dataset(self, path: str) -> pd.DataFrame:
        """Read a parquet file, retrying on failure and falling back to a stub.

        Args:
            path: Location handed to ``pd.read_parquet``.

        Returns:
            The loaded frame, or a placeholder frame chosen by which of
            "leaderboard", "responses" or "section_results" appears in
            ``path`` (an empty frame if none does) when every attempt fails.
        """
        last_error = None

        for attempt in range(1, self._max_retries + 1):
            try:
                logger.info(f"Attempting to load dataset from {path} (attempt {attempt}/{self._max_retries})")
                return pd.read_parquet(path)
            except Exception as e:
                last_error = e
                # Only wait (and announce a retry) when another attempt will
                # actually follow; the final failure is reported below.
                if attempt < self._max_retries:
                    logger.warning(f"Error loading dataset from {path}: {e}. Retrying in {self._retry_delay} seconds...")
                    time.sleep(self._retry_delay)

        logger.error(f"Failed to load dataset after {self._max_retries} attempts: {last_error}")

        # Substring match on the whole path, checked in this order; the first
        # keyword found anywhere in the path decides the placeholder.
        if "leaderboard" in path:
            return self._create_fallback_leaderboard()
        elif "responses" in path:
            return self._create_fallback_responses()
        elif "section_results" in path:
            return self._create_fallback_section_results()
        else:
            return pd.DataFrame()

    def _create_fallback_leaderboard(self) -> pd.DataFrame:
        """Create a fallback leaderboard dataframe when loading fails."""
        logger.info("Creating fallback leaderboard data")
        return pd.DataFrame({
            "model": ["Example Model"],
            "family": ["Example"],
            "quantization_level": ["None"],
            "score": [0.0],
            "timestamp": [pd.Timestamp.now()]
        })

    def _create_fallback_responses(self) -> pd.DataFrame:
        """Create a fallback responses dataframe when loading fails."""
        logger.info("Creating fallback responses data")
        return pd.DataFrame({
            "bolum": ["Example"],
            "soru": ["Example question"],
            "cevap": ["Example answer"],
            "Example_Model_cevap": ["Example model response"]
        })

    def _create_fallback_section_results(self) -> pd.DataFrame:
        """Create a fallback section results dataframe when loading fails."""
        logger.info("Creating fallback section results data")
        return pd.DataFrame({
            "section": ["Example Section"],
            "score": [0.0]
        })

    def _download_snapshot(self, max_retries: int = 5, retry_delay_seconds: int = 2) -> None:
        """Download the dataset snapshot, retrying on any error.

        ``snapshot_download`` has no retry parameters of its own, so the
        retry policy is implemented here. The last error is re-raised once
        the attempts are exhausted.
        """
        for attempt in range(1, max_retries + 1):
            try:
                # NOTE(review): "alibayram" looks like a bare namespace rather
                # than a "<namespace>/<dataset>" id - confirm the intended repo.
                snapshot_download(
                    repo_id="alibayram",
                    repo_type="dataset",
                    local_dir=CONFIG["dataset"].cache_dir,
                )
                return
            except Exception:
                if attempt >= max_retries:
                    raise
                time.sleep(retry_delay_seconds)

    def refresh_datasets(self) -> None:
        """Refresh all datasets from source.

        On success the cached frames are dropped so the next property access
        reloads them. Failures are logged and otherwise ignored, leaving the
        existing caches untouched.
        """
        try:
            logger.info("Starting dataset refresh...")
            self._download_snapshot(max_retries=5, retry_delay_seconds=2)

            self._leaderboard_data = None
            self._responses_data = None
            self._section_results_data = None
            logger.info("Datasets refreshed successfully")
        except Exception as e:
            # Best-effort refresh: report the problem but keep serving the
            # data that is already cached.
            logger.error(f"Error refreshing datasets: {e}")

    @property
    def leaderboard_data(self) -> pd.DataFrame:
        """Leaderboard table, loaded on first access and cached afterwards."""
        if self._leaderboard_data is None:
            self._leaderboard_data = self._load_dataset(CONFIG["dataset"].leaderboard_path)
        return self._leaderboard_data

    @property
    def responses_data(self) -> pd.DataFrame:
        """Responses table, loaded on first access and cached afterwards."""
        if self._responses_data is None:
            self._responses_data = self._load_dataset(CONFIG["dataset"].responses_path)
        return self._responses_data

    @property
    def section_results_data(self) -> pd.DataFrame:
        """Section-results table, loaded on first access and cached afterwards."""
        if self._section_results_data is None:
            self._section_results_data = self._load_dataset(CONFIG["dataset"].section_results_path)
        return self._section_results_data

# Shared module-level instance. Construction only builds the HTTP session and
# empty caches; the datasets themselves are loaded on first property access.
data_manager = DataManager()