import logging
import os
import time
from functools import lru_cache
from typing import Dict, Optional

import pandas as pd
import requests
from huggingface_hub import snapshot_download
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

from config import CONFIG

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Configure requests with retries
def create_retry_session(
    retries=5,
    backoff_factor=0.5,
    status_forcelist=(500, 502, 503, 504),
    timeout=30
):
    """Create a requests session with retry capabilities."""
    session = requests.Session()
    retry = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    # Note: requests does not read a `timeout` attribute from the Session object;
    # the value is only stored here so callers can pass it explicitly per request.
    session.timeout = timeout
    return session
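
# Usage sketch (illustrative; this helper is not called with a URL elsewhere in
# this module): the mounted adapter retries transient 5xx responses with
# exponential backoff, and the timeout still has to be passed on each request.
# The URL below is a hypothetical placeholder.
#
#   session = create_retry_session(retries=3, backoff_factor=1.0)
#   response = session.get("https://example.com/data.parquet", timeout=session.timeout)
#   response.raise_for_status()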


class DataManager:
    def __init__(self):
        self._leaderboard_data: Optional[pd.DataFrame] = None
        self._responses_data: Optional[pd.DataFrame] = None
        self._section_results_data: Optional[pd.DataFrame] = None
        self._session = create_retry_session()
        self._max_retries = 3
        self._retry_delay = 2  # seconds

    def _load_dataset(self, path: str) -> pd.DataFrame:
        """Load dataset with retries."""
        attempts = 0
        last_error = None
        while attempts < self._max_retries:
            try:
                logger.info(f"Attempting to load dataset from {path} (attempt {attempts + 1}/{self._max_retries})")
                return pd.read_parquet(path)
            except Exception as e:
                last_error = e
                logger.warning(f"Error loading dataset from {path}: {e}. Retrying in {self._retry_delay} seconds...")
                attempts += 1
                time.sleep(self._retry_delay)

        # If we get here, all attempts failed
        logger.error(f"Failed to load dataset after {self._max_retries} attempts: {last_error}")

        # Return an empty fallback dataframe with the appropriate columns
        if "leaderboard" in path:
            return self._create_fallback_leaderboard()
        elif "responses" in path:
            return self._create_fallback_responses()
        elif "section_results" in path:
            return self._create_fallback_section_results()
        else:
            return pd.DataFrame()

    def _create_fallback_leaderboard(self) -> pd.DataFrame:
        """Create a fallback leaderboard dataframe when loading fails."""
        logger.info("Creating fallback leaderboard data")
        return pd.DataFrame({
            "model": ["Example Model"],
            "family": ["Example"],
            "quantization_level": ["None"],
            "score": [0.0],
            "timestamp": [pd.Timestamp.now()]
        })

    def _create_fallback_responses(self) -> pd.DataFrame:
        """Create a fallback responses dataframe when loading fails."""
        logger.info("Creating fallback responses data")
        # Column names follow the Turkish dataset schema:
        # bolum = section, soru = question, cevap = answer.
        return pd.DataFrame({
            "bolum": ["Example"],
            "soru": ["Example question"],
            "cevap": ["Example answer"],
            "Example_Model_cevap": ["Example model response"]
        })

    def _create_fallback_section_results(self) -> pd.DataFrame:
        """Create a fallback section results dataframe when loading fails."""
        logger.info("Creating fallback section results data")
        return pd.DataFrame({
            "section": ["Example Section"],
            "score": [0.0]
        })

    def refresh_datasets(self) -> None:
        """Refresh all datasets from source."""
        # snapshot_download() has no retry keyword arguments of its own, so the
        # whole call is retried here with a fixed delay, mirroring _load_dataset.
        max_retries = 5
        retry_delay = 2  # seconds
        for attempt in range(max_retries):
            try:
                logger.info(f"Starting dataset refresh (attempt {attempt + 1}/{max_retries})...")
                snapshot_download(
                    repo_id="alibayram",
                    repo_type="dataset",
                    local_dir=CONFIG["dataset"].cache_dir
                )
                # Clear cached data to force a reload on next access
                self._leaderboard_data = None
                self._responses_data = None
                self._section_results_data = None
                logger.info("Datasets refreshed successfully")
                return
            except Exception as e:
                logger.error(f"Error refreshing datasets: {e}")
                time.sleep(retry_delay)

    @property
    def leaderboard_data(self) -> pd.DataFrame:
        if self._leaderboard_data is None:
            self._leaderboard_data = self._load_dataset(CONFIG["dataset"].leaderboard_path)
        return self._leaderboard_data

    @property
    def responses_data(self) -> pd.DataFrame:
        if self._responses_data is None:
            self._responses_data = self._load_dataset(CONFIG["dataset"].responses_path)
        return self._responses_data

    @property
    def section_results_data(self) -> pd.DataFrame:
        if self._section_results_data is None:
            self._section_results_data = self._load_dataset(CONFIG["dataset"].section_results_path)
        return self._section_results_data


# Global instance
data_manager = DataManager()
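
# Minimal smoke test (illustrative; assumes CONFIG["dataset"] points at readable
# parquet files, otherwise the fallback frames defined above are returned).
if __name__ == "__main__":
    print(data_manager.leaderboard_data.head())
    print(data_manager.section_results_data.head())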