Implement robust data loading with retry logic, enhance error handling in Gradio app, and improve user experience with fallback data for leaderboard and responses. Update configuration for request timeouts and retries.
Browse files- Dockerfile +32 -0
- README.md +80 -0
- app.py +51 -14
- config.py +3 -0
- data_manager.py +93 -10
- utils.py +91 -41
Dockerfile
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Base image: slim variant of the official Python 3.10 image.
FROM python:3.10-slim

WORKDIR /app

# Install system dependencies.
# --no-install-recommends keeps optional "recommended" packages out of the image;
# the apt package lists are deleted in the same layer so they never reach the final image.
# NOTE(review): software-properties-common is not obviously needed by a pip-based app
# and may be unavailable on newer Debian bases - confirm before the base image moves.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    curl \
    software-properties-common \
    git \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first so the dependency layer is reused when only application code changes
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY . .

# Create cache directory (matches the default `cache_dir` value "cache" in config.py)
RUN mkdir -p cache

# Set environment variables
ENV PYTHONUNBUFFERED=1
ENV GRADIO_SERVER_NAME=0.0.0.0
ENV GRADIO_SERVER_PORT=7860

# Expose the Gradio port
EXPOSE 7860

# Command to run the application
CMD ["python", "app.py"]
|
README.md
CHANGED
@@ -10,6 +10,86 @@ license: cc-by-nc-4.0
|
|
10 |
short_description: Leaderboard showcasing Turkish MMLU dataset results.
|
11 |
---
|
12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
# Start the configuration
|
14 |
|
15 |
Most of the variables to change for a default leaderboard are in `src/env.py` (replace the path for your leaderboard) and `src/about.py` (for tasks).
|
|
|
10 |
short_description: Leaderboard showcasing Turkish MMLU dataset results.
|
11 |
---
|
12 |
|
13 |
+
# 🏆 Turkish MMLU Leaderboard
|
14 |
+
|
15 |
+
A web application for exploring, evaluating, and comparing AI model performance on the Turkish Massive Multitask Language Understanding (MMLU) benchmark.
|
16 |
+
|
17 |
+
## Features
|
18 |
+
|
19 |
+
- 📊 Interactive leaderboard with filtering capabilities
|
20 |
+
- 🔍 Search through model responses
|
21 |
+
- 📈 Visualize section-wise performance results
|
22 |
+
- ➕ Submit new models for evaluation
|
23 |
+
|
24 |
+
## Local Development
|
25 |
+
|
26 |
+
### Prerequisites
|
27 |
+
|
28 |
+
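- Python 3.9+ (`utils.py` uses the built-in generic annotation `tuple[bool, str]`; the provided Dockerfile uses Python 3.10)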
|
29 |
+
- pip
|
30 |
+
|
31 |
+
### Installation
|
32 |
+
|
33 |
+
1. Clone the repository:
|
34 |
+
```bash
|
35 |
+
git clone https://github.com/yourusername/turkish_mmlu_leaderboard.git
|
36 |
+
cd turkish_mmlu_leaderboard
|
37 |
+
```
|
38 |
+
|
39 |
+
2. Install dependencies:
|
40 |
+
```bash
|
41 |
+
pip install -r requirements.txt
|
42 |
+
```
|
43 |
+
|
44 |
+
3. Run the application:
|
45 |
+
```bash
|
46 |
+
python app.py
|
47 |
+
```
|
48 |
+
|
49 |
+
4. Open your browser and navigate to `http://127.0.0.1:7860`
|
50 |
+
|
51 |
+
## Deploying to Hugging Face Spaces
|
52 |
+
|
53 |
+
### Option 1: Using the Hugging Face UI
|
54 |
+
|
55 |
+
1. Go to [Hugging Face Spaces](https://huggingface.co/spaces)
|
56 |
+
2. Click "Create a new Space"
|
57 |
+
3. Select "Gradio" as the SDK
|
58 |
+
4. Upload your files or connect to your GitHub repository
|
59 |
+
5. The Space will automatically build and deploy
|
60 |
+
|
61 |
+
### Option 2: Using the Dockerfile
|
62 |
+
|
63 |
+
1. Create a new Space on Hugging Face
|
64 |
+
2. Select "Docker" as the SDK
|
65 |
+
3. Upload your files including the Dockerfile
|
66 |
+
4. The Space will build and deploy using your Dockerfile
|
67 |
+
|
68 |
+
### Troubleshooting Hugging Face Deployment
|
69 |
+
|
70 |
+
If you encounter timeout issues when loading datasets:
|
71 |
+
|
72 |
+
1. Check the Space logs for specific error messages
|
73 |
+
2. Increase the timeout values in `config.py`
|
74 |
+
3. Make sure your datasets are accessible from Hugging Face Spaces
|
75 |
+
4. Consider using smaller datasets or pre-caching data
|
76 |
+
|
77 |
+
## Configuration
|
78 |
+
|
79 |
+
The application can be configured by modifying the `config.py` file:
|
80 |
+
|
81 |
+
- `DatasetConfig`: Configure dataset paths, cache settings, refresh intervals, and request timeout/retry settings (`request_timeout`, `max_retries`, `retry_delay`)
|
82 |
+
- `UIConfig`: Customize the UI appearance
|
83 |
+
- `ModelConfig`: Define model-related options
|
84 |
+
|
85 |
+
## Contributing
|
86 |
+
|
87 |
+
Contributions are welcome! Please feel free to submit a Pull Request.
|
88 |
+
|
89 |
+
## License
|
90 |
+
|
91 |
+
This project is licensed under CC BY-NC 4.0, matching the `license: cc-by-nc-4.0` entry in the Space metadata at the top of this README - see the LICENSE file for details.
|
92 |
+
|
93 |
# Start the configuration
|
94 |
|
95 |
Most of the variables to change for a default leaderboard are in `src/env.py` (replace the path for your leaderboard) and `src/about.py` (for tasks).
|
app.py
CHANGED
@@ -2,6 +2,8 @@ import gradio as gr
|
|
2 |
from apscheduler.schedulers.background import BackgroundScheduler
|
3 |
from typing import Optional
|
4 |
import logging
|
|
|
|
|
5 |
|
6 |
from config import CONFIG
|
7 |
from data_manager import data_manager
|
@@ -13,6 +15,29 @@ logger = logging.getLogger(__name__)
|
|
13 |
def create_app() -> gr.Blocks:
|
14 |
"""Create and configure the Gradio application."""
|
15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
with gr.Blocks(css=CONFIG["ui"].css, theme=CONFIG["ui"].theme) as app:
|
17 |
gr.HTML(f"<h1>{CONFIG['ui'].title}</h1>")
|
18 |
gr.Markdown(CONFIG["ui"].description)
|
@@ -22,12 +47,12 @@ def create_app() -> gr.Blocks:
|
|
22 |
with gr.TabItem("📊 Leaderboard"):
|
23 |
with gr.Row():
|
24 |
family_filter = gr.Dropdown(
|
25 |
-
choices=data_manager.leaderboard_data["family"].unique().tolist(),
|
26 |
label="Filter by Family",
|
27 |
multiselect=False
|
28 |
)
|
29 |
quantization_filter = gr.Dropdown(
|
30 |
-
choices=data_manager.leaderboard_data["quantization_level"].unique().tolist(),
|
31 |
label="Filter by Quantization Level"
|
32 |
)
|
33 |
|
@@ -47,7 +72,7 @@ def create_app() -> gr.Blocks:
|
|
47 |
with gr.TabItem("🔍 Model Responses"):
|
48 |
with gr.Row():
|
49 |
model_dropdown = gr.Dropdown(
|
50 |
-
choices=data_manager.leaderboard_data["model"].unique().tolist(),
|
51 |
label="Select Model"
|
52 |
)
|
53 |
query_input = gr.Textbox(
|
@@ -113,18 +138,30 @@ def create_app() -> gr.Blocks:
|
|
113 |
return app
|
114 |
|
115 |
def main():
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
|
|
124 |
|
125 |
-
|
126 |
-
|
127 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
128 |
|
129 |
if __name__ == "__main__":
|
130 |
main()
|
|
|
2 |
from apscheduler.schedulers.background import BackgroundScheduler
|
3 |
from typing import Optional
|
4 |
import logging
|
5 |
+
import sys
|
6 |
+
import time
|
7 |
|
8 |
from config import CONFIG
|
9 |
from data_manager import data_manager
|
|
|
15 |
def create_app() -> gr.Blocks:
|
16 |
"""Create and configure the Gradio application."""
|
17 |
|
18 |
+
# Pre-load data with retries to avoid startup failures
|
19 |
+
def safe_get_data():
    """Touch the leaderboard data up to three times so it is loaded before the UI is built.

    Returns:
        True as soon as one attempt completes without raising, False when
        every attempt raised.
    """
    max_attempts = 3

    def unique_values(column):
        # Same guarded access as the dropdown definitions: an empty frame yields no choices.
        if data_manager.leaderboard_data.empty:
            return []
        return data_manager.leaderboard_data[column].unique().tolist()

    for attempt in range(max_attempts):
        try:
            logger.info(f"Pre-loading data (attempt {attempt+1}/{max_attempts})...")
            # Try to access data to trigger loading
            families = unique_values("family")
            models = unique_values("model")
            logger.info(f"Successfully loaded data with {len(families)} families and {len(models)} models")
            return True
        except Exception as e:
            logger.error(f"Error pre-loading data: {e}")
            if attempt >= max_attempts - 1:
                # Last attempt failed: give up and let the caller carry on.
                logger.warning("Using fallback data due to loading failures")
                break
            logger.info(f"Retrying in {CONFIG['dataset'].retry_delay} seconds...")
            time.sleep(CONFIG["dataset"].retry_delay)
    return False
|
37 |
+
|
38 |
+
# Try to pre-load data
|
39 |
+
safe_get_data()
|
40 |
+
|
41 |
with gr.Blocks(css=CONFIG["ui"].css, theme=CONFIG["ui"].theme) as app:
|
42 |
gr.HTML(f"<h1>{CONFIG['ui'].title}</h1>")
|
43 |
gr.Markdown(CONFIG["ui"].description)
|
|
|
47 |
with gr.TabItem("📊 Leaderboard"):
|
48 |
with gr.Row():
|
49 |
family_filter = gr.Dropdown(
|
50 |
+
choices=data_manager.leaderboard_data["family"].unique().tolist() if not data_manager.leaderboard_data.empty else [],
|
51 |
label="Filter by Family",
|
52 |
multiselect=False
|
53 |
)
|
54 |
quantization_filter = gr.Dropdown(
|
55 |
+
choices=data_manager.leaderboard_data["quantization_level"].unique().tolist() if not data_manager.leaderboard_data.empty else [],
|
56 |
label="Filter by Quantization Level"
|
57 |
)
|
58 |
|
|
|
72 |
with gr.TabItem("🔍 Model Responses"):
|
73 |
with gr.Row():
|
74 |
model_dropdown = gr.Dropdown(
|
75 |
+
choices=data_manager.leaderboard_data["model"].unique().tolist() if not data_manager.leaderboard_data.empty else [],
|
76 |
label="Select Model"
|
77 |
)
|
78 |
query_input = gr.Textbox(
|
|
|
138 |
return app
|
139 |
|
140 |
def main():
    """Start the periodic dataset refresh job and launch the Gradio application.

    The bind address and port are taken from the GRADIO_SERVER_NAME and
    GRADIO_SERVER_PORT environment variables when they are set, falling back
    to the previous hard-coded values ("0.0.0.0" and 7860).

    Exits the process with status 1 if start-up fails.
    """
    import os  # local import: the file's import block does not bring in os

    scheduler = None
    try:
        # Initialize scheduler for data refresh
        scheduler = BackgroundScheduler()
        scheduler.add_job(
            data_manager.refresh_datasets,
            "interval",
            seconds=CONFIG["dataset"].refresh_interval
        )
        scheduler.start()

        # Create and launch app
        app = create_app()
        app.queue(default_concurrency_limit=40).launch(
            inbrowser=True,
            # Explicit arguments take precedence over the environment, so read the
            # environment here to keep the deployment's settings effective.
            server_name=os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0"),
            server_port=int(os.environ.get("GRADIO_SERVER_PORT", "7860")),
            share=False,
            debug=False,
            show_error=True,
            max_threads=40
        )
    except Exception as e:
        # logger.exception keeps the traceback, which a bare message would lose.
        logger.exception("Error starting application: %s", e)
        sys.exit(1)
    finally:
        # Stop the refresh job when the app stops or fails to start.
        if scheduler is not None and scheduler.running:
            scheduler.shutdown(wait=False)
|
165 |
|
166 |
if __name__ == "__main__":
|
167 |
main()
|
config.py
CHANGED
@@ -8,6 +8,9 @@ class DatasetConfig:
|
|
8 |
section_results_path: str = "hf://datasets/alibayram/yapay_zeka_turkce_mmlu_bolum_sonuclari/data/train-00000-of-00001.parquet"
|
9 |
cache_dir: str = "cache"
|
10 |
refresh_interval: int = 1800 # 30 minutes
|
|
|
|
|
|
|
11 |
|
12 |
@dataclass
|
13 |
class UIConfig:
|
|
|
8 |
section_results_path: str = "hf://datasets/alibayram/yapay_zeka_turkce_mmlu_bolum_sonuclari/data/train-00000-of-00001.parquet"
|
9 |
cache_dir: str = "cache"
|
10 |
refresh_interval: int = 1800 # 30 minutes
|
11 |
+
request_timeout: int = 60 # seconds
|
12 |
+
max_retries: int = 5
|
13 |
+
retry_delay: int = 2 # seconds
|
14 |
|
15 |
@dataclass
|
16 |
class UIConfig:
|
data_manager.py
CHANGED
@@ -3,36 +3,119 @@ import pandas as pd
|
|
3 |
from functools import lru_cache
|
4 |
from huggingface_hub import snapshot_download
|
5 |
import logging
|
|
|
|
|
|
|
|
|
|
|
6 |
from config import CONFIG
|
7 |
|
8 |
logging.basicConfig(level=logging.INFO)
|
9 |
logger = logging.getLogger(__name__)
|
10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
class DataManager:
|
12 |
def __init__(self):
|
13 |
self._leaderboard_data: Optional[pd.DataFrame] = None
|
14 |
self._responses_data: Optional[pd.DataFrame] = None
|
15 |
self._section_results_data: Optional[pd.DataFrame] = None
|
|
|
|
|
|
|
16 |
|
17 |
-
@lru_cache(maxsize=1)
|
18 |
def _load_dataset(self, path: str) -> pd.DataFrame:
|
19 |
-
"""Load dataset with
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
|
26 |
def refresh_datasets(self) -> None:
|
27 |
"""Refresh all datasets from source."""
|
28 |
try:
|
|
|
29 |
snapshot_download(
|
30 |
repo_id="alibayram",
|
31 |
repo_type="dataset",
|
32 |
-
local_dir=CONFIG["dataset"].cache_dir
|
|
|
|
|
33 |
)
|
34 |
-
# Clear
|
35 |
-
self.
|
|
|
|
|
36 |
logger.info("Datasets refreshed successfully")
|
37 |
except Exception as e:
|
38 |
logger.error(f"Error refreshing datasets: {e}")
|
|
|
3 |
from functools import lru_cache
|
4 |
from huggingface_hub import snapshot_download
|
5 |
import logging
|
6 |
+
import time
|
7 |
+
import os
|
8 |
+
import requests
|
9 |
+
from requests.adapters import HTTPAdapter
|
10 |
+
from urllib3.util.retry import Retry
|
11 |
from config import CONFIG
|
12 |
|
13 |
logging.basicConfig(level=logging.INFO)
|
14 |
logger = logging.getLogger(__name__)
|
15 |
|
16 |
+
# Configure requests with retries
|
17 |
+
def create_retry_session(
    retries=5,
    backoff_factor=0.5,
    status_forcelist=(500, 502, 503, 504),
    timeout=30
):
    """Create a requests session that retries failed requests and applies a default timeout.

    Args:
        retries: Retry budget used for the total, read and connect counters.
        backoff_factor: Back-off factor handed to urllib3's ``Retry``.
        status_forcelist: HTTP status codes that trigger a retry.
        timeout: Timeout in seconds used for any request that does not pass
            its own ``timeout``.

    Returns:
        A ``requests.Session`` with the retrying adapter mounted for both
        ``http://`` and ``https://``.
    """

    class _DefaultTimeoutAdapter(HTTPAdapter):
        """HTTPAdapter that fills in ``timeout`` when the caller gave none."""

        def send(self, request, **kwargs):
            # requests does not honor a ``timeout`` attribute set on the Session,
            # so the default has to be injected where the request is actually sent.
            if kwargs.get("timeout") is None:
                kwargs["timeout"] = timeout
            return super().send(request, **kwargs)

    retry = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    adapter = _DefaultTimeoutAdapter(max_retries=retry)

    session = requests.Session()
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    # Kept so existing code that reads ``session.timeout`` still finds the value.
    session.timeout = timeout
    return session
|
37 |
+
|
38 |
class DataManager:
|
39 |
def __init__(self):
    """Set up empty dataset caches and the retry settings from the dataset configuration."""
    dataset_cfg = CONFIG["dataset"]

    # Cached frames; None means "not loaded yet" (refresh_datasets resets them to None).
    self._leaderboard_data: Optional[pd.DataFrame] = None
    self._responses_data: Optional[pd.DataFrame] = None
    self._section_results_data: Optional[pd.DataFrame] = None

    # Use the configured values (request_timeout, max_retries, retry_delay)
    # instead of separate hard-coded numbers, so config.py is the single
    # place to tune timeouts and retries.
    self._session = create_retry_session(
        retries=dataset_cfg.max_retries,
        timeout=dataset_cfg.request_timeout,
    )
    self._max_retries = dataset_cfg.max_retries
    self._retry_delay = dataset_cfg.retry_delay  # seconds
|
46 |
|
|
|
47 |
def _load_dataset(self, path: str) -> pd.DataFrame:
    """Load a parquet dataset with retries, falling back to placeholder data.

    Args:
        path: Location passed to ``pd.read_parquet``.

    Returns:
        The loaded frame, or - when every attempt fails - a placeholder frame
        for the matching dataset kind (an empty frame if the kind cannot be
        determined). This method does not raise on load failure.
    """
    last_error = None

    for attempt in range(1, self._max_retries + 1):
        try:
            logger.info(f"Attempting to load dataset from {path} (attempt {attempt}/{self._max_retries})")
            return pd.read_parquet(path)
        except Exception as e:
            last_error = e
            if attempt < self._max_retries:
                logger.warning(f"Error loading dataset from {path}: {e}. Retrying in {self._retry_delay} seconds...")
                time.sleep(self._retry_delay)
            else:
                # No retry follows the final attempt, so neither announce one nor sleep.
                logger.warning(f"Error loading dataset from {path}: {e}")

    # If we get here, all attempts failed
    logger.error(f"Failed to load dataset after {self._max_retries} attempts: {last_error}")

    fallback_factories = (
        ("leaderboard", self._create_fallback_leaderboard),
        ("responses", self._create_fallback_responses),
        ("section_results", self._create_fallback_section_results),
    )

    # First match the path against the configured dataset paths: the configured
    # section-results path does not contain the text "section_results", so a
    # keyword search alone never selects that fallback.
    # NOTE(review): `leaderboard_path` / `responses_path` are assumed to follow the
    # naming of `section_results_path`; getattr keeps this safe if they do not exist.
    dataset_cfg = CONFIG["dataset"]
    for kind, factory in fallback_factories:
        if path == getattr(dataset_cfg, f"{kind}_path", None):
            return factory()

    # Otherwise keep the keyword heuristic for paths that do not come from the config.
    for kind, factory in fallback_factories:
        if kind in path:
            return factory()

    return pd.DataFrame()
|
74 |
+
|
75 |
+
def _create_fallback_leaderboard(self) -> pd.DataFrame:
    """Build a one-row placeholder leaderboard used when the real data cannot be loaded.

    Returns:
        A frame with the columns model, family, quantization_level, score
        and timestamp (the timestamp is the time of the call).
    """
    logger.info("Creating fallback leaderboard data")
    placeholder_columns = {
        "model": ["Example Model"],
        "family": ["Example"],
        "quantization_level": ["None"],
        "score": [0.0],
        "timestamp": [pd.Timestamp.now()],
    }
    return pd.DataFrame(placeholder_columns)
|
85 |
+
|
86 |
+
def _create_fallback_responses(self) -> pd.DataFrame:
    """Create a fallback responses dataframe when loading fails.

    Returns:
        A one-row frame with the columns bolum, soru, cevap and the answer
        column of the placeholder model.
    """
    logger.info("Creating fallback responses data")
    return pd.DataFrame({
        "bolum": ["Example"],
        "soru": ["Example question"],
        "cevap": ["Example answer"],
        # The search code looks answers up as f"{model}_cevap"; the fallback
        # leaderboard names its model "Example Model", so the column must use
        # exactly that name (with the space) to be found.
        "Example Model_cevap": ["Example model response"]
    })
|
95 |
+
|
96 |
+
def _create_fallback_section_results(self) -> pd.DataFrame:
    """Build a one-row placeholder for the section results when the real data cannot be loaded.

    Returns:
        A frame with a ``section`` label column and a ``score`` column.
    """
    logger.info("Creating fallback section results data")
    placeholder_columns = {
        "section": ["Example Section"],
        "score": [0.0],
    }
    return pd.DataFrame(placeholder_columns)
|
103 |
|
104 |
def refresh_datasets(self) -> None:
    """Refresh all datasets from source.

    Downloads a snapshot into the configured cache directory and, on success,
    drops the in-memory frames so they are reloaded on next use. Errors are
    logged and not re-raised; in that case the in-memory frames are kept.
    """
    try:
        logger.info("Starting dataset refresh...")
        # snapshot_download has no `max_retries` / `retry_delay_seconds`
        # parameters; passing them raises TypeError before anything is
        # downloaded. `etag_timeout` is the timeout it does support.
        # NOTE(review): repo_id "alibayram" looks like an account name rather
        # than a "<owner>/<dataset>" id - confirm which repository is intended.
        snapshot_download(
            repo_id="alibayram",
            repo_type="dataset",
            local_dir=CONFIG["dataset"].cache_dir,
            etag_timeout=CONFIG["dataset"].request_timeout,
        )
        # Clear cached data to force reload
        self._leaderboard_data = None
        self._responses_data = None
        self._section_results_data = None
        logger.info("Datasets refreshed successfully")
    except Exception as e:
        # Keep the traceback: this runs from a background job with no caller to report to.
        logger.exception("Error refreshing datasets: %s", e)
|
utils.py
CHANGED
@@ -1,53 +1,97 @@
|
|
1 |
from typing import Optional, Dict
|
2 |
import pandas as pd
|
3 |
import matplotlib.pyplot as plt
|
|
|
4 |
from data_manager import data_manager
|
5 |
|
|
|
|
|
6 |
def filter_leaderboard(
|
7 |
family: Optional[str] = None,
|
8 |
quantization_level: Optional[str] = None
|
9 |
) -> pd.DataFrame:
|
10 |
"""Filter leaderboard data based on criteria."""
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
|
20 |
def search_responses(query: str, model: str) -> pd.DataFrame:
|
21 |
"""Search model responses based on query."""
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
data_manager.responses_data
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
|
32 |
def plot_section_results() -> plt.Figure:
|
33 |
"""Generate section results plot."""
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
|
52 |
def validate_model_submission(
|
53 |
model_name: str,
|
@@ -58,10 +102,16 @@ def validate_model_submission(
|
|
58 |
model_type: str
|
59 |
) -> tuple[bool, str]:
|
60 |
"""Validate model submission parameters."""
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
from typing import Optional, Dict
|
2 |
import pandas as pd
|
3 |
import matplotlib.pyplot as plt
|
4 |
+
import logging
|
5 |
from data_manager import data_manager
|
6 |
|
7 |
+
logger = logging.getLogger(__name__)
|
8 |
+
|
9 |
def filter_leaderboard(
    family: Optional[str] = None,
    quantization_level: Optional[str] = None
) -> pd.DataFrame:
    """Return the leaderboard restricted to the given criteria, best score first.

    Args:
        family: Keep only rows of this family; a falsy value disables the filter.
        quantization_level: Keep only rows with this quantization level; a
            falsy value disables the filter.

    Returns:
        The filtered rows sorted by ``score`` in descending order, or an empty
        frame when no leaderboard data is available or an error occurs.
    """
    try:
        board = data_manager.leaderboard_data.copy()

        if board.empty:
            logger.warning("Leaderboard data is empty, returning empty DataFrame")
            return pd.DataFrame()

        # Apply each requested equality filter in turn.
        criteria = (
            ("family", family),
            ("quantization_level", quantization_level),
        )
        for column, wanted in criteria:
            if wanted:
                board = board[board[column] == wanted]

        return board.sort_values("score", ascending=False)
    except Exception as e:
        logger.error(f"Error filtering leaderboard: {e}")
        return pd.DataFrame()
|
30 |
|
31 |
def search_responses(query: str, model: str) -> pd.DataFrame:
    """Search model responses based on query.

    Args:
        query: Text to look for in the ``bolum`` column (case-insensitive,
            matched literally).
        model: Model name; its answers are read from the ``<model>_cevap`` column.

    Returns:
        Rows whose ``bolum`` contains the query, limited to the columns bolum,
        soru, cevap and the model's answer column, with incomplete rows
        dropped. An empty frame is returned when an input is missing or no
        data is loaded; a one-column ``error`` frame is returned when the
        model has no answer column or an error occurs.
    """
    try:
        if not query or not model:
            return pd.DataFrame()

        df = data_manager.responses_data

        if df.empty:
            logger.warning("Responses data is empty, returning empty DataFrame")
            return pd.DataFrame()

        # Check if model column exists
        model_column = f"{model}_cevap"
        if model_column not in df.columns:
            logger.warning(f"Model column '{model_column}' not found in responses data")
            return pd.DataFrame({"error": [f"Model '{model}' responses not found"]})

        # regex=False: the query is free text typed by a user; treating it as a
        # regular expression makes input such as "(" or "*" fail instead of match.
        matches = df["bolum"].str.contains(query, case=False, na=False, regex=False)

        selected_columns = ["bolum", "soru", "cevap", model_column]
        return df.loc[matches, selected_columns].dropna()
    except Exception as e:
        logger.error(f"Error searching responses: {e}")
        return pd.DataFrame({"error": [f"Error: {str(e)}"]})
|
58 |
|
59 |
def plot_section_results() -> plt.Figure:
    """Generate section results plot.

    Returns:
        A bar chart of the mean of every numeric column in the section
        results. When no data is available, or when plotting fails, a figure
        carrying an explanatory message is returned instead; this function
        does not raise.
    """
    try:
        df = data_manager.section_results_data

        if df.empty:
            logger.warning("Section results data is empty, returning empty plot")
            fig, ax = plt.subplots(figsize=(12, 6))
            ax.text(0.5, 0.5, "No data available", ha='center', va='center', fontsize=14)
            ax.set_title("Section-Wise Performance", pad=20)
            fig.tight_layout()
            return fig

        fig, ax = plt.subplots(figsize=(12, 6))
        avg_scores = df.mean(numeric_only=True)

        avg_scores.plot(kind="bar", ax=ax)

        # Customize plot. Everything goes through this call's own fig/ax rather
        # than pyplot's implicit "current figure", which is shared process-wide
        # and can point at another request's figure when handlers run concurrently.
        ax.set_title("Average Section-Wise Performance", pad=20)
        ax.set_ylabel("Accuracy (%)")
        ax.set_xlabel("Sections")
        for tick_label in ax.get_xticklabels():
            tick_label.set_rotation(45)
            tick_label.set_horizontalalignment('right')
        fig.tight_layout()

        # Add value labels
        for i, v in enumerate(avg_scores):
            ax.text(i, v, f'{v:.1f}%', ha='center', va='bottom')

        return fig
    except Exception as e:
        logger.error(f"Error plotting section results: {e}")
        fig, ax = plt.subplots(figsize=(12, 6))
        ax.text(0.5, 0.5, f"Error generating plot: {str(e)}", ha='center', va='center', fontsize=12)
        fig.tight_layout()
        return fig
|
95 |
|
96 |
def validate_model_submission(
|
97 |
model_name: str,
|
|
|
102 |
model_type: str
|
103 |
) -> tuple[bool, str]:
|
104 |
"""Validate model submission parameters."""
|
105 |
+
try:
|
106 |
+
if not all([model_name, base_model]):
|
107 |
+
return False, "Model name and base model are required."
|
108 |
+
|
109 |
+
# Check if leaderboard data is available
|
110 |
+
if not data_manager.leaderboard_data.empty:
|
111 |
+
if model_name in data_manager.leaderboard_data["model"].values:
|
112 |
+
return False, "Model name already exists."
|
113 |
+
|
114 |
+
return True, "Validation successful"
|
115 |
+
except Exception as e:
|
116 |
+
logger.error(f"Error validating model submission: {e}")
|
117 |
+
return False, f"Validation error: {str(e)}"
|