Spaces:

alibayram
/

turkish_mmlu_leaderboard

Running

App Files Files Community

alibayram committed on Mar 14

Commit

3ce2f84

1 Parent(s): 1c73b10

Implement robust data loading with retry logic, enhance error handling in Gradio app, and improve user experience with fallback data for leaderboard and responses. Update configuration for request timeouts and retries.

Browse files

Files changed (6) hide show

Dockerfile +32 -0
README.md +80 -0
app.py +51 -14
config.py +3 -0
data_manager.py +93 -10
utils.py +91 -41

Dockerfile ADDED Viewed

	@@ -0,0 +1,32 @@

+FROM python:3.10-slim
+WORKDIR /app
+# Install system dependencies. --no-install-recommends skips optional packages,
+# and the apt lists are removed in the same layer so they never reach the image.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    curl \
+    software-properties-common \
+    git \
+    && rm -rf /var/lib/apt/lists/*
+# Copy requirements first so the dependency layer is reused when only code changes
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+# Copy application code
+COPY . .
+# Create the cache directory (DatasetConfig.cache_dir is "cache")
+RUN mkdir -p cache
+# Set environment variables
+# PYTHONUNBUFFERED=1 makes log output appear immediately in the container logs
+ENV PYTHONUNBUFFERED=1
+# app.py passes the same host and port explicitly to launch(); keep these in sync with it
+ENV GRADIO_SERVER_NAME=0.0.0.0
+ENV GRADIO_SERVER_PORT=7860
+# Expose the port the Gradio server listens on
+EXPOSE 7860
+# Command to run the application
+CMD ["python", "app.py"]

README.md CHANGED Viewed

@@ -10,6 +10,86 @@ license: cc-by-nc-4.0
 short_description: Leaderboard showcasing Turkish MMLU dataset results.
 ---
 # Start the configuration
 Most of the variables to change for a default leaderboard are in `src/env.py` (replace the path for your leaderboard) and `src/about.py` (for tasks).

 short_description: Leaderboard showcasing Turkish MMLU dataset results.
 ---
+# 🏆 Turkish MMLU Leaderboard
+A web application for exploring, evaluating, and comparing AI model performance on the Turkish Massive Multitask Language Understanding (MMLU) benchmark.
+## Features
+- 📊 Interactive leaderboard with filtering capabilities
+- 🔍 Search through model responses
+- 📈 Visualize section-wise performance results
+- ➕ Submit new models for evaluation
+## Local Development
+### Prerequisites
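+- Python 3.9+ (the code uses built-in generic annotations such as `tuple[bool, str]`; the Dockerfile targets Python 3.10)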
+- pip
+### Installation
+1. Clone the repository:
+   ```bash
+   git clone https://huggingface.co/spaces/alibayram/turkish_mmlu_leaderboard
+   cd turkish_mmlu_leaderboard
+   ```
+2. Install dependencies:
+   ```bash
+   pip install -r requirements.txt
+   ```
+3. Run the application:
+   ```bash
+   python app.py
+   ```
+4. Open your browser and navigate to `http://127.0.0.1:7860`
+## Deploying to Hugging Face Spaces
+### Option 1: Using the Hugging Face UI
+1. Go to [Hugging Face Spaces](https://huggingface.co/spaces)
+2. Click "Create a new Space"
+3. Select "Gradio" as the SDK
+4. Upload your files or connect to your GitHub repository
+5. The Space will automatically build and deploy
+### Option 2: Using the Dockerfile
+1. Create a new Space on Hugging Face
+2. Select "Docker" as the SDK
+3. Upload your files including the Dockerfile
+4. The Space will build and deploy using your Dockerfile
+### Troubleshooting Hugging Face Deployment
+If you encounter timeout issues when loading datasets:
+1. Check the Space logs for specific error messages
+2. Increase the timeout values in `config.py`
+3. Make sure your datasets are accessible from Hugging Face Spaces
+4. Consider using smaller datasets or pre-caching data
+## Configuration
+The application can be configured by modifying the `config.py` file:
+- `DatasetConfig`: Configure dataset paths, cache settings, and refresh intervals
+- `UIConfig`: Customize the UI appearance
+- `ModelConfig`: Define model-related options
+## Contributing
+Contributions are welcome! Please feel free to submit a Pull Request.
+## License
+This project is licensed under CC BY-NC 4.0, as declared by `license: cc-by-nc-4.0` in the metadata at the top of this README - see the LICENSE file for details.
 # Start the configuration
 Most of the variables to change for a default leaderboard are in `src/env.py` (replace the path for your leaderboard) and `src/about.py` (for tasks).

app.py CHANGED Viewed

@@ -2,6 +2,8 @@ import gradio as gr
 from apscheduler.schedulers.background import BackgroundScheduler
 from typing import Optional
 import logging
 from config import CONFIG
 from data_manager import data_manager
@@ -13,6 +15,29 @@ logger = logging.getLogger(__name__)
 def create_app() -> gr.Blocks:
     """Create and configure the Gradio application."""
     with gr.Blocks(css=CONFIG["ui"].css, theme=CONFIG["ui"].theme) as app:
         gr.HTML(f"<h1>{CONFIG['ui'].title}</h1>")
         gr.Markdown(CONFIG["ui"].description)
@@ -22,12 +47,12 @@ def create_app() -> gr.Blocks:
             with gr.TabItem("📊 Leaderboard"):
                 with gr.Row():
                     family_filter = gr.Dropdown(
-                        choices=data_manager.leaderboard_data["family"].unique().tolist(),
                         label="Filter by Family",
                         multiselect=False
                     )
                     quantization_filter = gr.Dropdown(
-                        choices=data_manager.leaderboard_data["quantization_level"].unique().tolist(),
                         label="Filter by Quantization Level"
                     )
@@ -47,7 +72,7 @@ def create_app() -> gr.Blocks:
             with gr.TabItem("🔍 Model Responses"):
                 with gr.Row():
                     model_dropdown = gr.Dropdown(
-                        choices=data_manager.leaderboard_data["model"].unique().tolist(),
                         label="Select Model"
                     )
                     query_input = gr.Textbox(
@@ -113,18 +138,30 @@ def create_app() -> gr.Blocks:
     return app
 def main():
-    # Initialize scheduler for data refresh
-    scheduler = BackgroundScheduler()
-    scheduler.add_job(
-        data_manager.refresh_datasets,
-        "interval",
-        seconds=CONFIG["dataset"].refresh_interval
-    )
-    scheduler.start()
-    # Create and launch app
-    app = create_app()
-    app.queue(default_concurrency_limit=40).launch()
 if __name__ == "__main__":
     main()

 from apscheduler.schedulers.background import BackgroundScheduler
 from typing import Optional
 import logging
+import sys
+import time
 from config import CONFIG
 from data_manager import data_manager
 def create_app() -> gr.Blocks:
     """Create and configure the Gradio application."""
+    # Pre-load data with retries to avoid startup failures
+    def safe_get_data():
+        max_attempts = 3
+        for attempt in range(max_attempts):
+            try:
+                logger.info(f"Pre-loading data (attempt {attempt+1}/{max_attempts})...")
+                # Try to access data to trigger loading
+                families = data_manager.leaderboard_data["family"].unique().tolist() if not data_manager.leaderboard_data.empty else []
+                models = data_manager.leaderboard_data["model"].unique().tolist() if not data_manager.leaderboard_data.empty else []
+                logger.info(f"Successfully loaded data with {len(families)} families and {len(models)} models")
+                return True
+            except Exception as e:
+                logger.error(f"Error pre-loading data: {e}")
+                if attempt < max_attempts - 1:
+                    logger.info(f"Retrying in {CONFIG['dataset'].retry_delay} seconds...")
+                    time.sleep(CONFIG["dataset"].retry_delay)
+                else:
+                    logger.warning("Using fallback data due to loading failures")
+                    return False
+    # Try to pre-load data
+    safe_get_data()
     with gr.Blocks(css=CONFIG["ui"].css, theme=CONFIG["ui"].theme) as app:
         gr.HTML(f"<h1>{CONFIG['ui'].title}</h1>")
         gr.Markdown(CONFIG["ui"].description)
             with gr.TabItem("📊 Leaderboard"):
                 with gr.Row():
                     family_filter = gr.Dropdown(
+                        choices=data_manager.leaderboard_data["family"].unique().tolist() if not data_manager.leaderboard_data.empty else [],
                         label="Filter by Family",
                         multiselect=False
                     )
                     quantization_filter = gr.Dropdown(
+                        choices=data_manager.leaderboard_data["quantization_level"].unique().tolist() if not data_manager.leaderboard_data.empty else [],
                         label="Filter by Quantization Level"
                     )
             with gr.TabItem("🔍 Model Responses"):
                 with gr.Row():
                     model_dropdown = gr.Dropdown(
+                        choices=data_manager.leaderboard_data["model"].unique().tolist() if not data_manager.leaderboard_data.empty else [],
                         label="Select Model"
                     )
                     query_input = gr.Textbox(
     return app
 def main():
+    try:
+        # Initialize scheduler for data refresh
+        scheduler = BackgroundScheduler()
+        scheduler.add_job(
+            data_manager.refresh_datasets,
+            "interval",
+            seconds=CONFIG["dataset"].refresh_interval
+        )
+        scheduler.start()
+        # Create and launch app
+        app = create_app()
+        app.queue(default_concurrency_limit=40).launch(
+            inbrowser=True,
+            server_name="0.0.0.0",  # Use 0.0.0.0 to listen on all interfaces
+            server_port=7860,
+            share=False,
+            debug=False,
+            show_error=True,
+            max_threads=40
+        )
+    except Exception as e:
+        logger.error(f"Error starting application: {e}")
+        sys.exit(1)
 if __name__ == "__main__":
     main()

config.py CHANGED Viewed

@@ -8,6 +8,9 @@ class DatasetConfig:
     section_results_path: str = "hf://datasets/alibayram/yapay_zeka_turkce_mmlu_bolum_sonuclari/data/train-00000-of-00001.parquet"
     cache_dir: str = "cache"
     refresh_interval: int = 1800  # 30 minutes
 @dataclass
 class UIConfig:

     section_results_path: str = "hf://datasets/alibayram/yapay_zeka_turkce_mmlu_bolum_sonuclari/data/train-00000-of-00001.parquet"
     cache_dir: str = "cache"
     refresh_interval: int = 1800  # 30 minutes
+    request_timeout: int = 60  # seconds
+    max_retries: int = 5
+    retry_delay: int = 2  # seconds
 @dataclass
 class UIConfig:

data_manager.py CHANGED Viewed

@@ -3,36 +3,119 @@ import pandas as pd
 from functools import lru_cache
 from huggingface_hub import snapshot_download
 import logging
 from config import CONFIG
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 class DataManager:
     def __init__(self):
         self._leaderboard_data: Optional[pd.DataFrame] = None
         self._responses_data: Optional[pd.DataFrame] = None
         self._section_results_data: Optional[pd.DataFrame] = None
-    @lru_cache(maxsize=1)
     def _load_dataset(self, path: str) -> pd.DataFrame:
-        """Load dataset with caching."""
-        try:
-            return pd.read_parquet(path)
-        except Exception as e:
-            logger.error(f"Error loading dataset from {path}: {e}")
-            raise RuntimeError(f"Failed to load dataset: {e}")
     def refresh_datasets(self) -> None:
         """Refresh all datasets from source."""
         try:
             snapshot_download(
                 repo_id="alibayram",
                 repo_type="dataset",
-                local_dir=CONFIG["dataset"].cache_dir
             )
-            # Clear cache to force reload
-            self._load_dataset.cache_clear()
             logger.info("Datasets refreshed successfully")
         except Exception as e:
             logger.error(f"Error refreshing datasets: {e}")

 from functools import lru_cache
 from huggingface_hub import snapshot_download
 import logging
+import time
+import os
+import requests
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry
 from config import CONFIG
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
+# Configure requests with retries
+def create_retry_session(
+    retries=5,
+    backoff_factor=0.5,
+    status_forcelist=(500, 502, 503, 504),
+    timeout=30
+):
+    """Create a requests session with retry capabilities"""
+    session = requests.Session()
+    retry = Retry(
+        total=retries,
+        read=retries,
+        connect=retries,
+        backoff_factor=backoff_factor,
+        status_forcelist=status_forcelist,
+    )
+    adapter = HTTPAdapter(max_retries=retry)
+    session.mount('http://', adapter)
+    session.mount('https://', adapter)
+    session.timeout = timeout
+    return session
 class DataManager:
     def __init__(self):
         self._leaderboard_data: Optional[pd.DataFrame] = None
         self._responses_data: Optional[pd.DataFrame] = None
         self._section_results_data: Optional[pd.DataFrame] = None
+        self._session = create_retry_session()
+        self._max_retries = 3
+        self._retry_delay = 2  # seconds
     def _load_dataset(self, path: str) -> pd.DataFrame:
+        """Load dataset with retries."""
+        attempts = 0
+        last_error = None
+        while attempts < self._max_retries:
+            try:
+                logger.info(f"Attempting to load dataset from {path} (attempt {attempts+1}/{self._max_retries})")
+                return pd.read_parquet(path)
+            except Exception as e:
+                last_error = e
+                logger.warning(f"Error loading dataset from {path}: {e}. Retrying in {self._retry_delay} seconds...")
+                attempts += 1
+                time.sleep(self._retry_delay)
+        # If we get here, all attempts failed
+        logger.error(f"Failed to load dataset after {self._max_retries} attempts: {last_error}")
+        # Return empty fallback dataframe with appropriate columns
+        if "leaderboard" in path:
+            return self._create_fallback_leaderboard()
+        elif "responses" in path:
+            return self._create_fallback_responses()
+        elif "section_results" in path:
+            return self._create_fallback_section_results()
+        else:
+            return pd.DataFrame()
+    def _create_fallback_leaderboard(self) -> pd.DataFrame:
+        """Create a fallback leaderboard dataframe when loading fails."""
+        logger.info("Creating fallback leaderboard data")
+        return pd.DataFrame({
+            "model": ["Example Model"],
+            "family": ["Example"],
+            "quantization_level": ["None"],
+            "score": [0.0],
+            "timestamp": [pd.Timestamp.now()]
+        })
+    def _create_fallback_responses(self) -> pd.DataFrame:
+        """Create a fallback responses dataframe when loading fails."""
+        logger.info("Creating fallback responses data")
+        return pd.DataFrame({
+            "bolum": ["Example"],
+            "soru": ["Example question"],
+            "cevap": ["Example answer"],
+            "Example_Model_cevap": ["Example model response"]
+        })
+    def _create_fallback_section_results(self) -> pd.DataFrame:
+        """Create a fallback section results dataframe when loading fails."""
+        logger.info("Creating fallback section results data")
+        return pd.DataFrame({
+            "section": ["Example Section"],
+            "score": [0.0]
+        })
     def refresh_datasets(self) -> None:
         """Refresh all datasets from source."""
         try:
+            logger.info("Starting dataset refresh...")
             snapshot_download(
                 repo_id="alibayram",
                 repo_type="dataset",
+                local_dir=CONFIG["dataset"].cache_dir,
+                max_retries=5,
+                retry_delay_seconds=2
             )
+            # Clear cached data to force reload
+            self._leaderboard_data = None
+            self._responses_data = None
+            self._section_results_data = None
             logger.info("Datasets refreshed successfully")
         except Exception as e:
             logger.error(f"Error refreshing datasets: {e}")

utils.py CHANGED Viewed

@@ -1,53 +1,97 @@
 from typing import Optional, Dict
 import pandas as pd
 import matplotlib.pyplot as plt
 from data_manager import data_manager
 def filter_leaderboard(
     family: Optional[str] = None,
     quantization_level: Optional[str] = None
 ) -> pd.DataFrame:
     """Filter leaderboard data based on criteria."""
-    df = data_manager.leaderboard_data.copy()
-    if family:
-        df = df[df["family"] == family]
-    if quantization_level:
-        df = df[df["quantization_level"] == quantization_level]
-    return df.sort_values("score", ascending=False)
 def search_responses(query: str, model: str) -> pd.DataFrame:
     """Search model responses based on query."""
-    if not query or not model:
-        return pd.DataFrame()
-    filtered = data_manager.responses_data[
-        data_manager.responses_data["bolum"].str.contains(query, case=False, na=False)
-    ]
-    selected_columns = ["bolum", "soru", "cevap", f"{model}_cevap"]
-    return filtered[selected_columns].dropna()
 def plot_section_results() -> plt.Figure:
     """Generate section results plot."""
-    fig, ax = plt.subplots(figsize=(12, 6))
-    avg_scores = data_manager.section_results_data.mean(numeric_only=True)
-    bars = avg_scores.plot(kind="bar", ax=ax)
-    # Customize plot
-    ax.set_title("Average Section-Wise Performance", pad=20)
-    ax.set_ylabel("Accuracy (%)")
-    ax.set_xlabel("Sections")
-    plt.xticks(rotation=45, ha='right')
-    plt.tight_layout()
-    # Add value labels
-    for i, v in enumerate(avg_scores):
-        ax.text(i, v, f'{v:.1f}%', ha='center', va='bottom')
-    return fig
 def validate_model_submission(
     model_name: str,
@@ -58,10 +102,16 @@ def validate_model_submission(
     model_type: str
 ) -> tuple[bool, str]:
     """Validate model submission parameters."""
-    if not all([model_name, base_model]):
-        return False, "Model name and base model are required."
-    if model_name in data_manager.leaderboard_data["model"].values:
-        return False, "Model name already exists."
-    return True, "Validation successful"

 from typing import Optional, Dict
 import pandas as pd
 import matplotlib.pyplot as plt
+import logging
 from data_manager import data_manager
+logger = logging.getLogger(__name__)
 def filter_leaderboard(
     family: Optional[str] = None,
     quantization_level: Optional[str] = None
 ) -> pd.DataFrame:
     """Filter leaderboard data based on criteria."""
+    try:
+        df = data_manager.leaderboard_data.copy()
+        if df.empty:
+            logger.warning("Leaderboard data is empty, returning empty DataFrame")
+            return pd.DataFrame()
+        if family:
+            df = df[df["family"] == family]
+        if quantization_level:
+            df = df[df["quantization_level"] == quantization_level]
+        return df.sort_values("score", ascending=False)
+    except Exception as e:
+        logger.error(f"Error filtering leaderboard: {e}")
+        return pd.DataFrame()
 def search_responses(query: str, model: str) -> pd.DataFrame:
     """Search model responses based on query."""
+    try:
+        if not query or not model:
+            return pd.DataFrame()
+        df = data_manager.responses_data
+        if df.empty:
+            logger.warning("Responses data is empty, returning empty DataFrame")
+            return pd.DataFrame()
+        # Check if model column exists
+        model_column = f"{model}_cevap"
+        if model_column not in df.columns:
+            logger.warning(f"Model column '{model_column}' not found in responses data")
+            return pd.DataFrame({"error": [f"Model '{model}' responses not found"]})
+        filtered = df[
+            df["bolum"].str.contains(query, case=False, na=False)
+        ]
+        selected_columns = ["bolum", "soru", "cevap", model_column]
+        return filtered[selected_columns].dropna()
+    except Exception as e:
+        logger.error(f"Error searching responses: {e}")
+        return pd.DataFrame({"error": [f"Error: {str(e)}"]})
 def plot_section_results() -> plt.Figure:
     """Generate section results plot."""
+    try:
+        df = data_manager.section_results_data
+        if df.empty:
+            logger.warning("Section results data is empty, returning empty plot")
+            fig, ax = plt.subplots(figsize=(12, 6))
+            ax.text(0.5, 0.5, "No data available", ha='center', va='center', fontsize=14)
+            ax.set_title("Section-Wise Performance", pad=20)
+            plt.tight_layout()
+            return fig
+        fig, ax = plt.subplots(figsize=(12, 6))
+        avg_scores = df.mean(numeric_only=True)
+        bars = avg_scores.plot(kind="bar", ax=ax)
+        # Customize plot
+        ax.set_title("Average Section-Wise Performance", pad=20)
+        ax.set_ylabel("Accuracy (%)")
+        ax.set_xlabel("Sections")
+        plt.xticks(rotation=45, ha='right')
+        plt.tight_layout()
+        # Add value labels
+        for i, v in enumerate(avg_scores):
+            ax.text(i, v, f'{v:.1f}%', ha='center', va='bottom')
+        return fig
+    except Exception as e:
+        logger.error(f"Error plotting section results: {e}")
+        fig, ax = plt.subplots(figsize=(12, 6))
+        ax.text(0.5, 0.5, f"Error generating plot: {str(e)}", ha='center', va='center', fontsize=12)
+        plt.tight_layout()
+        return fig
 def validate_model_submission(
     model_name: str,
     model_type: str
 ) -> tuple[bool, str]:
     """Validate model submission parameters."""
+    try:
+        if not all([model_name, base_model]):
+            return False, "Model name and base model are required."
+        # Check if leaderboard data is available
+        if not data_manager.leaderboard_data.empty:
+            if model_name in data_manager.leaderboard_data["model"].values:
+                return False, "Model name already exists."
+        return True, "Validation successful"
+    except Exception as e:
+        logger.error(f"Error validating model submission: {e}")
+        return False, f"Validation error: {str(e)}"