update app.py
app.py
CHANGED
@@ -3,11 +3,12 @@ import pandas as pd
 import os
 import json
 from src.populate import get_leaderboard_df
-from src.display.utils import COLUMNS, COLS, BENCHMARK_COLS
+from src.display.utils import COLUMNS, COLS, BENCHMARK_COLS, EVAL_COLS
 from src.envs import EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH

-#
-
+# Print paths for debugging
+print(f"EVAL_RESULTS_PATH: {EVAL_RESULTS_PATH}")
+print(f"EVAL_REQUESTS_PATH: {EVAL_REQUESTS_PATH}")

 # Minimal CSS
 minimal_css = """
@@ -21,14 +22,56 @@ minimal_css = """
 }
 """

+# Function to load data directly from JSON files
+def load_data_directly():
+    if not os.path.exists(EVAL_RESULTS_PATH):
+        print(f"Path does not exist: {EVAL_RESULTS_PATH}")
+        return pd.DataFrame()
+
+    result_files = [
+        os.path.join(EVAL_RESULTS_PATH, f)
+        for f in os.listdir(EVAL_RESULTS_PATH)
+        if f.endswith('.json')
+    ]
+
+    print(f"Found {len(result_files)} JSON files")
+
+    data_list = []
+    for file in result_files:
+        try:
+            with open(file, 'r') as f:
+                data = json.load(f)
+
+            flattened_data = {}
+            # Extract both config and results
+            flattened_data.update(data.get('config', {}))
+            flattened_data.update(data.get('results', {}))
+            data_list.append(flattened_data)
+        except Exception as e:
+            print(f"Error loading file {file}: {e}")
+
+    if not data_list:
+        print("No data loaded from JSON files")
+        return pd.DataFrame()
+
+    df = pd.DataFrame(data_list)
+    print(f"Successfully loaded DataFrame with shape: {df.shape}")
+    return df
+
+# Try to load data using both methods
 try:
-
+    print("Attempting to load data using get_leaderboard_df...")
     LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
-    print("
+    print(f"get_leaderboard_df result shape: {LEADERBOARD_DF.shape}")
+
+    # If that fails or returns empty, try direct loading
+    if LEADERBOARD_DF.empty:
+        print("get_leaderboard_df returned empty DataFrame, trying direct loading...")
+        LEADERBOARD_DF = load_data_directly()

-    # If
+    # If still empty, create a sample
     if LEADERBOARD_DF.empty:
-        print("
+        print("Both methods returned empty DataFrames, creating sample data")
         LEADERBOARD_DF = pd.DataFrame([{
             "model_name": "Sample Model",
             "average": 75.5,
@@ -36,36 +79,56 @@ try:
             "precision": "float16"
         }])
 except Exception as e:
-    print(f"Error
+    print(f"Error in data loading: {e}")
     # Create a minimal DataFrame
     LEADERBOARD_DF = pd.DataFrame([{
         "model_name": "Error Loading Data",
         "average": 0
     }])

-#
-
-
-
-
-
-
-# Add
-
-
-
-
+# Print final DataFrame info
+print(f"Final DataFrame shape: {LEADERBOARD_DF.shape}")
+print(f"Final DataFrame columns: {LEADERBOARD_DF.columns.tolist()}")
+
+# Select important columns for display
+display_cols = ["model_name", "average", "model_type", "precision", "weight_type", "license"]
+
+# Add some subject columns
+subject_cols = [
+    "abstract_algebra", "anatomy", "astronomy", "business_ethics",
+    "college_biology", "college_chemistry", "college_computer_science",
+    "high_school_mathematics", "machine_learning"
+]
+
+# Add all detected subject columns
+for col in LEADERBOARD_DF.columns:
+    if col not in display_cols and col not in ["submitted_time", "revision", "base_model", "likes", "params"]:
+        subject_cols.append(col)

-#
-
-
+# Combine columns, filtering to only those that exist
+all_display_cols = display_cols + subject_cols
+actual_display_cols = [col for col in all_display_cols if col in LEADERBOARD_DF.columns]

-#
-
-
-        display_df[col] = display_df[col].round(2)
+# Ensure we have at least some columns
+if not actual_display_cols and not LEADERBOARD_DF.empty:
+    actual_display_cols = LEADERBOARD_DF.columns.tolist()

-#
+# Filter the DataFrame
+if not LEADERBOARD_DF.empty:
+    display_df = LEADERBOARD_DF[actual_display_cols].copy()
+
+    # Round numeric columns for display
+    for col in display_df.columns:
+        if pd.api.types.is_numeric_dtype(display_df[col]):
+            display_df[col] = display_df[col].round(2)
+
+    # Sort by average if it exists
+    if "average" in display_df.columns:
+        display_df = display_df.sort_values(by="average", ascending=False)
+else:
+    display_df = LEADERBOARD_DF
+
+# Create the app
 with gr.Blocks(css=minimal_css) as demo:
     gr.HTML("<div class='header'><h1>ILMAAM: Index for Language Models for Arabic Assessment on Multitasks</h1></div>")

@@ -74,20 +137,19 @@ with gr.Blocks(css=minimal_css) as demo:
     # Add debug output
     with gr.Accordion("Debug Info", open=True):
         gr.Markdown(f"DataFrame Shape: {display_df.shape}")
-        gr.Markdown(f"Column Names: {', '.join(display_df.columns)}")
+        gr.Markdown(f"Column Names: {', '.join(display_df.columns[:10])}" + ("..." if len(display_df.columns) > 10 else ""))

-    # Use standard DataTable
+    # Use standard DataTable
     datatable = gr.DataFrame(
         value=display_df,
         interactive=False,
-        wrap=True
-        column_widths=[200] + [100] * (len(actual_display_cols) - 1)
+        wrap=True
     )

     # Add filter functionality using dropdowns
     with gr.Row():
-        if "model_type" in display_df.columns:
-            model_types = ["All"] + sorted(display_df["model_type"].unique().tolist())
+        if "model_type" in display_df.columns and not display_df.empty:
+            model_types = ["All"] + sorted(display_df["model_type"].dropna().unique().tolist())
             model_type_filter = gr.Dropdown(
                 choices=model_types,
                 value="All",
@@ -95,8 +157,8 @@ with gr.Blocks(css=minimal_css) as demo:
                 interactive=True
             )

-        if "precision" in display_df.columns:
-            precisions = ["All"] + sorted(display_df["precision"].unique().tolist())
+        if "precision" in display_df.columns and not display_df.empty:
+            precisions = ["All"] + sorted(display_df["precision"].dropna().unique().tolist())
             precision_filter = gr.Dropdown(
                 choices=precisions,
                 value="All",
@@ -127,9 +189,9 @@ with gr.Blocks(css=minimal_css) as demo:

     # Connect filters
     filter_inputs = []
-    if "model_type" in display_df.columns:
+    if "model_type" in display_df.columns and not display_df.empty:
         filter_inputs.append(model_type_filter)
-    if "precision" in display_df.columns:
+    if "precision" in display_df.columns and not display_df.empty:
         filter_inputs.append(precision_filter)
     filter_inputs.append(search_input)

@@ -143,9 +205,68 @@ with gr.Blocks(css=minimal_css) as demo:
     )

     with gr.TabItem("About"):
-        gr.Markdown("
+        gr.Markdown("""
+        # About ILMAAM
+
+        The **Index for Language Models for Arabic Assessment on Multitasks (ILMAAM)** showcases the performance of various Arabic LLMs on the newly released MMMLU OpenAI Benchmark across different subjects.
+
+        This benchmark evaluates language models specifically for Arabic language capabilities.
+        """)

     with gr.TabItem("Submit"):
-        gr.Markdown("
+        gr.Markdown("""
+        # Submit Your Model
+
+        You can submit your Arabic language model for benchmark evaluation. Fill out the form below:
+        """)
+
+        with gr.Row():
+            with gr.Column():
+                model_name_textbox = gr.Textbox(label="Model name")
+                revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
+                model_type = gr.Dropdown(
+                    choices=["Encoder", "Decoder"],
+                    label="Model type",
+                    multiselect=False,
+                    interactive=True
+                )
+
+            with gr.Column():
+                precision = gr.Dropdown(
+                    choices=["float16", "float32", "int8", "int4"],
+                    label="Precision",
+                    multiselect=False,
+                    value="float16",
+                    interactive=True
+                )
+                weight_type = gr.Dropdown(
+                    choices=["Original", "Quantized", "Distilled"],
+                    label="Weights type",
+                    multiselect=False,
+                    value="Original",
+                    interactive=True
+                )
+                base_model_name_textbox = gr.Textbox(label="Base model (if applicable)")
+
+        submit_button = gr.Button("Submit for Evaluation")
+        submission_result = gr.Markdown()
+
+        def mock_submission(model_name, base_model, revision, precision, weight_type, model_type):
+            if not model_name:
+                return "Error: Model name is required."
+            return f"Model '{model_name}' submitted successfully! It will be evaluated soon."
+
+        submit_button.click(
+            mock_submission,
+            [
+                model_name_textbox,
+                base_model_name_textbox,
+                revision_name_textbox,
+                precision,
+                weight_type,
+                model_type,
+            ],
+            submission_result,
+        )

 demo.launch(debug=True, share=False)
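
For reference, a minimal sketch (not part of the commit) of what the new load_data_directly helper does to a single result file: the config and results objects of each JSON payload are merged into one flat row, one row per file. The payload below is hypothetical and only illustrates the shape.

import pandas as pd

# Hypothetical eval-result payload; the real schema lives in the results repo.
sample_result = {
    "config": {"model_name": "my-org/arabic-llm", "precision": "float16", "model_type": "Decoder"},
    "results": {"average": 61.3, "abstract_algebra": 40.0, "anatomy": 55.2},
}

# Same flattening steps as load_data_directly, applied to one file.
flattened = {}
flattened.update(sample_result.get("config", {}))
flattened.update(sample_result.get("results", {}))

df = pd.DataFrame([flattened])
print(df.shape)              # (1, 6): one row, config + results keys become columns
print(df.columns.tolist())   # ['model_name', 'precision', 'model_type', 'average', ...]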
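The .dropna() added to the dropdown choices is plausibly there because sorted() fails as soon as a model_type or precision value is missing; the commit does not state this, so it is an assumption, but the failure mode is easy to reproduce in a standalone sketch.

import pandas as pd

df = pd.DataFrame({"model_type": ["Decoder", None, "Encoder"]})

# Old pattern: a missing value ends up in the list and sorted() raises TypeError.
try:
    sorted(df["model_type"].unique().tolist())
except TypeError as exc:
    print(f"without dropna(): {exc}")

# New pattern from the diff: drop missing values before sorting.
print(["All"] + sorted(df["model_type"].dropna().unique().tolist()))
# -> ['All', 'Decoder', 'Encoder']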