Spaces:

Omartificial-Intelligence-Space
/

Arabic-MMMLU-Leaderborad

Running

App Files Community

Omartificial-Intelligence-Space committed on Feb 27

Commit

df4e9ef

verified ·

1 Parent(s): 8355c4d

Update app.py

Browse files

Files changed (1) hide show

app.py +120 -222

app.py CHANGED Viewed

@@ -1,263 +1,148 @@
 import gradio as gr
 import pandas as pd
-import os
-import json
-from src.populate import get_leaderboard_df
-from src.display.utils import COLUMNS, COLS, BENCHMARK_COLS, EVAL_COLS
-from src.envs import EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH
-# Print paths for debugging
-print(f"EVAL_RESULTS_PATH: {EVAL_RESULTS_PATH}")
-print(f"EVAL_REQUESTS_PATH: {EVAL_REQUESTS_PATH}")
-# Minimal CSS
-minimal_css = """
-.container {
-    max-width: 1200px;
-    margin: 0 auto;
-}
-.header {
-    text-align: center;
-    margin-bottom: 20px;
-}
-"""
-# Function to load data directly from JSON files
-def load_data_directly():
-    if not os.path.exists(EVAL_RESULTS_PATH):
-        print(f"Path does not exist: {EVAL_RESULTS_PATH}")
-        return pd.DataFrame()
-    result_files = [
-        os.path.join(EVAL_RESULTS_PATH, f)
-        for f in os.listdir(EVAL_RESULTS_PATH)
-        if f.endswith('.json')
-    ]
-    print(f"Found {len(result_files)} JSON files")
-    data_list = []
-    for file in result_files:
-        try:
-            with open(file, 'r') as f:
-                data = json.load(f)
-            flattened_data = {}
-            # Extract both config and results
-            flattened_data.update(data.get('config', {}))
-            flattened_data.update(data.get('results', {}))
-            data_list.append(flattened_data)
-        except Exception as e:
-            print(f"Error loading file {file}: {e}")
-    if not data_list:
-        print("No data loaded from JSON files")
-        return pd.DataFrame()
-    df = pd.DataFrame(data_list)
-    print(f"Successfully loaded DataFrame with shape: {df.shape}")
-    return df
-# Try to load data using both methods
 try:
-    print("Attempting to load data using get_leaderboard_df...")
-    LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
-    print(f"get_leaderboard_df result shape: {LEADERBOARD_DF.shape}")
-    # If that fails or returns empty, try direct loading
-    if LEADERBOARD_DF.empty:
-        print("get_leaderboard_df returned empty DataFrame, trying direct loading...")
-        LEADERBOARD_DF = load_data_directly()
-    # If still empty, create a sample
-    if LEADERBOARD_DF.empty:
-        print("Both methods returned empty DataFrames, creating sample data")
-        LEADERBOARD_DF = pd.DataFrame([{
-            "model_name": "Sample Model",
-            "average": 75.5,
-            "model_type": "Encoder",
-            "precision": "float16"
-        }])
-except Exception as e:
-    print(f"Error in data loading: {e}")
-    # Create a minimal DataFrame
-    LEADERBOARD_DF = pd.DataFrame([{
-        "model_name": "Error Loading Data",
-        "average": 0
-    }])
-# Print final DataFrame info
-print(f"Final DataFrame shape: {LEADERBOARD_DF.shape}")
-print(f"Final DataFrame columns: {LEADERBOARD_DF.columns.tolist()}")
-# Select important columns for display
-display_cols = ["model_name", "average", "model_type", "precision", "weight_type", "license"]
-# Add some subject columns
-subject_cols = [
-    "abstract_algebra", "anatomy", "astronomy", "business_ethics",
-    "college_biology", "college_chemistry", "college_computer_science",
-    "high_school_mathematics", "machine_learning"
-]
-# Add all detected subject columns
-for col in LEADERBOARD_DF.columns:
-    if col not in display_cols and col not in ["submitted_time", "revision", "base_model", "likes", "params"]:
-        subject_cols.append(col)
-# Combine columns, filtering to only those that exist
-all_display_cols = display_cols + subject_cols
-actual_display_cols = [col for col in all_display_cols if col in LEADERBOARD_DF.columns]
-# Ensure we have at least some columns
-if not actual_display_cols and not LEADERBOARD_DF.empty:
-    actual_display_cols = LEADERBOARD_DF.columns.tolist()
-# Filter the DataFrame
-if not LEADERBOARD_DF.empty:
-    display_df = LEADERBOARD_DF[actual_display_cols].copy()
-    # Round numeric columns for display
-    for col in display_df.columns:
-        if pd.api.types.is_numeric_dtype(display_df[col]):
-            display_df[col] = display_df[col].round(2)
-    # Sort by average if it exists
-    if "average" in display_df.columns:
-        display_df = display_df.sort_values(by="average", ascending=False)
-else:
-    display_df = LEADERBOARD_DF
-# Create the app
-with gr.Blocks(css=minimal_css) as demo:
-    gr.HTML("<div class='header'><h1>ILMAAM: Index for Language Models for Arabic Assessment on Multitasks</h1></div>")
-    with gr.Tabs() as tabs:
-        with gr.TabItem("LLM Benchmark"):
-            # Add debug output
-            with gr.Accordion("Debug Info", open=True):
-                gr.Markdown(f"DataFrame Shape: {display_df.shape}")
-                gr.Markdown(f"Column Names: {', '.join(display_df.columns[:10])}" + ("..." if len(display_df.columns) > 10 else ""))
-            # Use standard DataTable
-            datatable = gr.DataFrame(
-                value=display_df,
-                interactive=False,
-                wrap=True
-            )
-            # Add filter functionality using dropdowns
             with gr.Row():
-                if "model_type" in display_df.columns and not display_df.empty:
-                    model_types = ["All"] + sorted(display_df["model_type"].dropna().unique().tolist())
-                    model_type_filter = gr.Dropdown(
-                        choices=model_types,
-                        value="All",
-                        label="Filter by Model Type",
-                        interactive=True
-                    )
-                if "precision" in display_df.columns and not display_df.empty:
-                    precisions = ["All"] + sorted(display_df["precision"].dropna().unique().tolist())
-                    precision_filter = gr.Dropdown(
-                        choices=precisions,
-                        value="All",
-                        label="Filter by Precision",
-                        interactive=True
-                    )
-                search_input = gr.Textbox(
-                    label="Search by Model Name",
-                    placeholder="Enter model name...",
-                    interactive=True
-                )
-            # Filter function
-            def filter_data(model_type, precision, search):
-                filtered_df = display_df.copy()
-                if model_type != "All" and "model_type" in filtered_df.columns:
-                    filtered_df = filtered_df[filtered_df["model_type"] == model_type]
-                if precision != "All" and "precision" in filtered_df.columns:
-                    filtered_df = filtered_df[filtered_df["precision"] == precision]
-                if search and "model_name" in filtered_df.columns:
-                    filtered_df = filtered_df[filtered_df["model_name"].str.contains(search, case=False)]
-                return filtered_df
-            # Connect filters
-            filter_inputs = []
-            if "model_type" in display_df.columns and not display_df.empty:
-                filter_inputs.append(model_type_filter)
-            if "precision" in display_df.columns and not display_df.empty:
-                filter_inputs.append(precision_filter)
-            filter_inputs.append(search_input)
-            # If we have filter inputs, connect them
-            if filter_inputs:
-                for input_component in filter_inputs:
-                    input_component.change(
-                        filter_data,
-                        inputs=filter_inputs,
-                        outputs=datatable
-                    )
-        with gr.TabItem("About"):
-            gr.Markdown("""
-            # About ILMAAM
-            The **Index for Language Models for Arabic Assessment on Multitasks (ILMAAM)** showcases the performance of various Arabic LLMs on the newly released MMMLU OpenAI Benchmark across different subjects.
-            This benchmark evaluates language models specifically for Arabic language capabilities.
-            """)
-        with gr.TabItem("Submit"):
-            gr.Markdown("""
-            # Submit Your Model
-            You can submit your Arabic language model for benchmark evaluation. Fill out the form below:
-            """)
             with gr.Row():
                 with gr.Column():
                     model_name_textbox = gr.Textbox(label="Model name")
                     revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
                     model_type = gr.Dropdown(
-                        choices=["Encoder", "Decoder"],
                         label="Model type",
                         multiselect=False,
-                        interactive=True
                     )
                 with gr.Column():
                     precision = gr.Dropdown(
-                        choices=["float16", "float32", "int8", "int4"],
                         label="Precision",
                         multiselect=False,
                         value="float16",
-                        interactive=True
                     )
                     weight_type = gr.Dropdown(
-                        choices=["Original", "Quantized", "Distilled"],
                         label="Weights type",
                         multiselect=False,
                         value="Original",
-                        interactive=True
                     )
-                    base_model_name_textbox = gr.Textbox(label="Base model (if applicable)")
-            submit_button = gr.Button("Submit for Evaluation")
             submission_result = gr.Markdown()
-            def mock_submission(model_name, base_model, revision, precision, weight_type, model_type):
-                if not model_name:
-                    return "Error: Model name is required."
-                return f"Model '{model_name}' submitted successfully! It will be evaluated soon."
             submit_button.click(
-                mock_submission,
                 [
                     model_name_textbox,
                     base_model_name_textbox,
@@ -269,4 +154,17 @@ with gr.Blocks(css=minimal_css) as demo:
                 submission_result,
             )
-demo.launch(debug=True, share=False)

 import gradio as gr
+from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
 import pandas as pd
+from apscheduler.schedulers.background import BackgroundScheduler
+from huggingface_hub import snapshot_download
+from src.about import (
+    CITATION_BUTTON_LABEL,
+    CITATION_BUTTON_TEXT,
+    EVALUATION_QUEUE_TEXT,
+    INTRODUCTION_TEXT,
+    LLM_BENCHMARKS_TEXT,
+    TITLE,
+)
+from src.display.css_html_js import custom_css
+from src.display.utils import (
+    COLUMNS,
+    COLS,
+    BENCHMARK_COLS,
+    EVAL_COLS,
+    EVAL_TYPES,
+    ModelType,
+    WeightType,
+    Precision
+)
+from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
+from src.populate import get_evaluation_queue_df, get_leaderboard_df
+from src.submission.submit import add_new_eval
+def restart_space():
+    API.restart_space(repo_id=REPO_ID)
+### Space initialization
 try:
+    print(EVAL_REQUESTS_PATH)
+    snapshot_download(
+        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+    )
+except Exception:
+    restart_space()
+try:
+    print(EVAL_RESULTS_PATH)
+    snapshot_download(
+        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+    )
+except Exception:
+    restart_space()
+# Load the leaderboard DataFrame
+LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
+print("LEADERBOARD_DF Shape:", LEADERBOARD_DF.shape)  # Debug
+print("LEADERBOARD_DF Columns:", LEADERBOARD_DF.columns.tolist())  # Debug
+# Load the evaluation queue DataFrames
+finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
+demo = gr.Blocks(css=custom_css)
+with demo:
+    gr.HTML(TITLE)
+    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+    with gr.Tabs(elem_classes="tab-buttons") as tabs:
+        with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
+            if LEADERBOARD_DF.empty:
+                gr.Markdown("No evaluations have been performed yet. The leaderboard is currently empty.")
+            else:
+                default_selection = [col.name for col in COLUMNS if col.displayed_by_default]
+                print("Default Selection before ensuring 'model_name':", default_selection)  # Debug
+                # Ensure "model_name" is included
+                if "model_name" not in default_selection:
+                    default_selection.insert(0, "model_name")
+                    print("Default Selection after ensuring 'model_name':", default_selection)  # Debug
+                leaderboard = Leaderboard(
+                    value=LEADERBOARD_DF,
+                    datatype=[col.type for col in COLUMNS],
+                    select_columns=SelectColumns(
+                        default_selection=default_selection,
+                        cant_deselect=[col.name for col in COLUMNS if col.never_hidden],
+                        label="Select Columns to Display:",
+                    ),
+                    search_columns=[col.name for col in COLUMNS if col.name in ["model_name", "license"]],  # Updated to 'model_name'
+                    hide_columns=[col.name for col in COLUMNS if col.hidden],
+                    filter_columns=[
+                        ColumnFilter("model_type", type="checkboxgroup", label="Model types"),
+                        ColumnFilter("precision", type="checkboxgroup", label="Precision"),
+                        ColumnFilter(
+                            "still_on_hub", type="boolean", label="Deleted/incomplete", default=True
+                        ),
+                    ],
+                    bool_checkboxgroup_label="Hide models",
+                    interactive=False,
+                )
+                # No need to call leaderboard.render() since it's created within the Gradio context
+        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
+            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
+            with gr.Column():
+                with gr.Row():
+                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+                # Since the evaluation queues are empty, display a message
+                with gr.Column():
+                    gr.Markdown("Evaluations are performed immediately upon submission. There are no pending or running evaluations.")
             with gr.Row():
+                gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
             with gr.Row():
                 with gr.Column():
                     model_name_textbox = gr.Textbox(label="Model name")
                     revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
                     model_type = gr.Dropdown(
+                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
                         label="Model type",
                         multiselect=False,
+                        value=None,
+                        interactive=True,
                     )
                 with gr.Column():
                     precision = gr.Dropdown(
+                        choices=[i.value for i in Precision if i != Precision.Unknown],
                         label="Precision",
                         multiselect=False,
                         value="float16",
+                        interactive=True,
                     )
                     weight_type = gr.Dropdown(
+                        choices=[i.value for i in WeightType],
                         label="Weights type",
                         multiselect=False,
                         value="Original",
+                        interactive=True,
                     )
+                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
+            submit_button = gr.Button("Submit Eval")
             submission_result = gr.Markdown()
             submit_button.click(
+                add_new_eval,
                 [
                     model_name_textbox,
                     base_model_name_textbox,
                 submission_result,
             )
+    with gr.Row():
+        with gr.Accordion("📙 Citation", open=False):
+            citation_button = gr.Textbox(
+                value=CITATION_BUTTON_TEXT,
+                label=CITATION_BUTTON_LABEL,
+                lines=20,
+                elem_id="citation-button",
+                show_copy_button=True,
+            )
+scheduler = BackgroundScheduler()
+scheduler.add_job(restart_space, "interval", seconds=1800)
+scheduler.start()
+demo.queue(default_concurrency_limit=40).launch()