File size: 5,147 Bytes
1c73b10
 
 
 
 
3ce2f84
 
 
 
 
1c73b10
 
 
 
 
3ce2f84
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1c73b10
 
 
 
 
3ce2f84
 
 
1c73b10
 
3ce2f84
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1c73b10
 
 
 
3ce2f84
1c73b10
 
 
3ce2f84
 
 
1c73b10
3ce2f84
 
 
 
1c73b10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
from typing import Optional, Dict
import pandas as pd
from functools import lru_cache
from huggingface_hub import snapshot_download
import logging
import time
import os
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from config import CONFIG

# Module-wide logging: INFO level on the root config, module-scoped logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Configure requests with retries
def create_retry_session(
    retries=5,
    backoff_factor=0.5,
    status_forcelist=(500, 502, 503, 504),
    timeout=30
):
    """Create a ``requests.Session`` with automatic retries and a default
    per-request timeout.

    Args:
        retries: Maximum connect/read/total retry attempts.
        backoff_factor: urllib3 exponential-backoff factor between retries.
        status_forcelist: HTTP status codes that trigger a retry.
        timeout: Default timeout in seconds applied to every request that
            does not pass its own ``timeout=``.

    Returns:
        A configured ``requests.Session``.
    """

    class _TimeoutHTTPAdapter(HTTPAdapter):
        """HTTPAdapter that injects a default timeout into each send."""

        def __init__(self, *args, default_timeout=None, **kwargs):
            self._default_timeout = default_timeout
            super().__init__(*args, **kwargs)

        def send(self, request, **kwargs):
            # requests always forwards a timeout kwarg (possibly None);
            # only fill it in when the caller did not supply one.
            if kwargs.get("timeout") is None:
                kwargs["timeout"] = self._default_timeout
            return super().send(request, **kwargs)

    session = requests.Session()
    retry = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    adapter = _TimeoutHTTPAdapter(max_retries=retry, default_timeout=timeout)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    # BUG FIX: the original only set ``session.timeout``, which requests
    # ignores (Session has no such attribute), so no timeout was ever
    # applied. The adapter above actually enforces it; the attribute is
    # kept for backward compatibility with any code that reads it.
    session.timeout = timeout
    return session

class DataManager:
    """Lazily loads and caches the leaderboard, responses, and
    section-results parquet datasets.

    Each dataset is loaded on first property access, retried on failure,
    and replaced by a minimal placeholder frame when all attempts fail so
    callers always receive a usable ``pd.DataFrame``.
    """

    def __init__(self):
        # Cached datasets; ``None`` until first property access.
        self._leaderboard_data: Optional[pd.DataFrame] = None
        self._responses_data: Optional[pd.DataFrame] = None
        self._section_results_data: Optional[pd.DataFrame] = None
        # Retry-enabled HTTP session. Not used by the methods below;
        # kept for compatibility with code that may reach into it.
        self._session = create_retry_session()
        self._max_retries = 3
        self._retry_delay = 2  # seconds between load attempts

    def _load_dataset(self, path: str) -> pd.DataFrame:
        """Load a parquet dataset from *path* with retries.

        Returns a fallback placeholder frame (chosen by substring match on
        the path) when every attempt fails, instead of raising.
        """
        last_error = None

        for attempt in range(1, self._max_retries + 1):
            try:
                logger.info(f"Attempting to load dataset from {path} (attempt {attempt}/{self._max_retries})")
                return pd.read_parquet(path)
            except Exception as e:
                last_error = e
                logger.warning(f"Error loading dataset from {path}: {e}. Retrying in {self._retry_delay} seconds...")
                # Fix: only sleep when another attempt will actually follow;
                # the original also slept after the final failure.
                if attempt < self._max_retries:
                    time.sleep(self._retry_delay)

        # All attempts failed.
        logger.error(f"Failed to load dataset after {self._max_retries} attempts: {last_error}")

        # Pick the fallback matching the dataset being requested.
        if "leaderboard" in path:
            return self._create_fallback_leaderboard()
        elif "responses" in path:
            return self._create_fallback_responses()
        elif "section_results" in path:
            return self._create_fallback_section_results()
        else:
            return pd.DataFrame()

    def _create_fallback_leaderboard(self) -> pd.DataFrame:
        """Return a one-row placeholder with the leaderboard schema."""
        logger.info("Creating fallback leaderboard data")
        return pd.DataFrame({
            "model": ["Example Model"],
            "family": ["Example"],
            "quantization_level": ["None"],
            "score": [0.0],
            "timestamp": [pd.Timestamp.now()]
        })

    def _create_fallback_responses(self) -> pd.DataFrame:
        """Return a one-row placeholder with the responses schema.

        Column names are Turkish (bolum=section, soru=question,
        cevap=answer) and must match the real dataset's columns.
        """
        logger.info("Creating fallback responses data")
        return pd.DataFrame({
            "bolum": ["Example"],
            "soru": ["Example question"],
            "cevap": ["Example answer"],
            "Example_Model_cevap": ["Example model response"]
        })

    def _create_fallback_section_results(self) -> pd.DataFrame:
        """Return a one-row placeholder with the section-results schema."""
        logger.info("Creating fallback section results data")
        return pd.DataFrame({
            "section": ["Example Section"],
            "score": [0.0]
        })

    def refresh_datasets(self) -> None:
        """Re-download the dataset snapshot and invalidate local caches.

        Best-effort: failures are logged and the previously cached data
        (if any) remains in use.
        """
        try:
            logger.info("Starting dataset refresh...")
            # NOTE(review): repo_id looks like a bare username; confirm the
            # full "user/dataset" id is intended. Also confirm the pinned
            # huggingface_hub version accepts max_retries/retry_delay_seconds
            # kwargs — current releases of snapshot_download do not document
            # them, which would raise TypeError into the except below.
            snapshot_download(
                repo_id="alibayram",
                repo_type="dataset",
                local_dir=CONFIG["dataset"].cache_dir,
                max_retries=5,
                retry_delay_seconds=2
            )
            # Drop cached frames so the next property access reloads
            # from the freshly downloaded snapshot.
            self._leaderboard_data = None
            self._responses_data = None
            self._section_results_data = None
            logger.info("Datasets refreshed successfully")
        except Exception as e:
            # Swallow deliberately: keep serving stale data over crashing.
            logger.error(f"Error refreshing datasets: {e}")

    @property
    def leaderboard_data(self) -> pd.DataFrame:
        """Leaderboard dataset, loaded lazily on first access."""
        if self._leaderboard_data is None:
            self._leaderboard_data = self._load_dataset(CONFIG["dataset"].leaderboard_path)
        return self._leaderboard_data

    @property
    def responses_data(self) -> pd.DataFrame:
        """Per-question model responses dataset, loaded lazily."""
        if self._responses_data is None:
            self._responses_data = self._load_dataset(CONFIG["dataset"].responses_path)
        return self._responses_data

    @property
    def section_results_data(self) -> pd.DataFrame:
        """Per-section score dataset, loaded lazily."""
        if self._section_results_data is None:
            self._section_results_data = self._load_dataset(CONFIG["dataset"].section_results_path)
        return self._section_results_data

# Global singleton shared by the rest of the application. Note that
# constructing it at import time also builds the retry-enabled HTTP session.
data_manager = DataManager()