Spaces:

Omartificial-Intelligence-Space
/

Arabic-MMMLU-Leaderborad

Running

File size: 10,513 Bytes

import gradio as gr
import pandas as pd
import os
import json
from src.populate import get_leaderboard_df
from src.display.utils import COLUMNS, COLS, BENCHMARK_COLS, EVAL_COLS
from src.envs import EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH

# Echo the result/request locations imported from src.envs when the module is
# loaded, so a misconfigured path is visible in the startup output.
print(f"EVAL_RESULTS_PATH: {EVAL_RESULTS_PATH}")
print(f"EVAL_REQUESTS_PATH: {EVAL_REQUESTS_PATH}")

# Stylesheet handed to gr.Blocks(css=...) when the app is built.
#   .container - caps the width at 1200px and centers it horizontally
#   .header    - centers its text and adds 20px of space underneath
#                (this class is set on the page-title <div> in the app layout)
minimal_css = """
.container {
    max-width: 1200px;
    margin: 0 auto;
}
.header {
    text-align: center;
    margin-bottom: 20px;
}
"""

def load_data_directly():
    """Build a DataFrame straight from the JSON files in EVAL_RESULTS_PATH.

    Every ``*.json`` file located directly inside the directory (no recursion)
    becomes one row: the file's ``config`` mapping and ``results`` mapping are
    merged into a single flat dict, with ``results`` keys winning on a name
    clash. A section that is missing or ``null`` contributes nothing. Files
    that cannot be read or parsed are reported on stdout and skipped.

    Returns:
        pd.DataFrame: one row per successfully loaded file (in file-name
        order), or an empty DataFrame when the directory does not exist or no
        file could be loaded.
    """
    if not os.path.exists(EVAL_RESULTS_PATH):
        print(f"Path does not exist: {EVAL_RESULTS_PATH}")
        return pd.DataFrame()

    # os.listdir returns entries in arbitrary order; sort so that the row order
    # is the same on every run and every filesystem.
    result_files = [
        os.path.join(EVAL_RESULTS_PATH, name)
        for name in sorted(os.listdir(EVAL_RESULTS_PATH))
        if name.endswith('.json')
    ]

    print(f"Found {len(result_files)} JSON files")

    data_list = []
    for file in result_files:
        # Deliberately broad: one unreadable or malformed file must not stop
        # the remaining files from being loaded.
        try:
            # JSON is UTF-8 encoded; do not depend on the platform's locale
            # default, which can mis-decode non-ASCII (e.g. Arabic) text.
            with open(file, 'r', encoding='utf-8') as f:
                data = json.load(f)

            flattened_data = {}
            # `or {}` also covers a section that is present but null.
            flattened_data.update(data.get('config') or {})
            flattened_data.update(data.get('results') or {})
            data_list.append(flattened_data)
        except Exception as e:
            print(f"Error loading file {file}: {e}")

    if not data_list:
        print("No data loaded from JSON files")
        return pd.DataFrame()

    df = pd.DataFrame(data_list)
    print(f"Successfully loaded DataFrame with shape: {df.shape}")
    return df

# Load the leaderboard data, trying each source in turn:
#   1. get_leaderboard_df  - the project's regular loader
#   2. load_data_directly  - raw JSON files read from EVAL_RESULTS_PATH
#   3. a placeholder row   - so the UI always has a non-empty table to render
# Each loader is guarded on its own so that an exception raised by the first
# one still lets the second one run; `_load_failed` remembers whether any
# loader raised, which decides which placeholder is shown at the end.
_load_failed = False

try:
    print("Attempting to load data using get_leaderboard_df...")
    LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
    print(f"get_leaderboard_df result shape: {LEADERBOARD_DF.shape}")
except Exception as e:
    print(f"Error in data loading: {e}")
    _load_failed = True
    LEADERBOARD_DF = pd.DataFrame()

# If that failed or returned empty, try direct loading
if LEADERBOARD_DF.empty:
    if _load_failed:
        print("get_leaderboard_df failed, trying direct loading...")
    else:
        print("get_leaderboard_df returned empty DataFrame, trying direct loading...")
    try:
        LEADERBOARD_DF = load_data_directly()
    except Exception as e:
        print(f"Error in data loading: {e}")
        _load_failed = True
        LEADERBOARD_DF = pd.DataFrame()

# If still empty, fall back to a placeholder row
if LEADERBOARD_DF.empty:
    if _load_failed:
        # Something raised along the way: make the failure visible in the table
        LEADERBOARD_DF = pd.DataFrame([{
            "model_name": "Error Loading Data",
            "average": 0
        }])
    else:
        print("Both methods returned empty DataFrames, creating sample data")
        LEADERBOARD_DF = pd.DataFrame([{
            "model_name": "Sample Model",
            "average": 75.5,
            "model_type": "Encoder",
            "precision": "float16"
        }])

# Print final DataFrame info
print(f"Final DataFrame shape: {LEADERBOARD_DF.shape}")
print(f"Final DataFrame columns: {LEADERBOARD_DF.columns.tolist()}")

# Metadata columns, shown first and in this order
display_cols = ["model_name", "average", "model_type", "precision", "weight_type", "license"]

# Subject columns pinned directly after the metadata columns
subject_cols = [
    "abstract_algebra", "anatomy", "astronomy", "business_ethics",
    "college_biology", "college_chemistry", "college_computer_science",
    "high_school_mathematics", "machine_learning"
]

# Append every other column found in the data. Bookkeeping columns are left
# out, and so is anything already listed above: without that check a pinned
# subject would be added a second time and the table would end up with
# duplicate column labels.
_hidden_cols = {"submitted_time", "revision", "base_model", "likes", "params"}
_already_listed = set(display_cols) | set(subject_cols)
for col in LEADERBOARD_DF.columns:
    if col not in _already_listed and col not in _hidden_cols:
        subject_cols.append(col)
        _already_listed.add(col)

# Combine columns, filtering to only those that exist
all_display_cols = display_cols + subject_cols
actual_display_cols = [col for col in all_display_cols if col in LEADERBOARD_DF.columns]

# Ensure we have at least some columns
if not actual_display_cols and not LEADERBOARD_DF.empty:
    actual_display_cols = LEADERBOARD_DF.columns.tolist()

# Filter the DataFrame
if not LEADERBOARD_DF.empty:
    display_df = LEADERBOARD_DF[actual_display_cols].copy()

    # Round numeric columns for display
    for col in display_df.columns:
        if pd.api.types.is_numeric_dtype(display_df[col]):
            display_df[col] = display_df[col].round(2)

    # Sort by average if it exists (best score first)
    if "average" in display_df.columns:
        display_df = display_df.sort_values(by="average", ascending=False)
else:
    display_df = LEADERBOARD_DF

# Create the app
def _filter_dropdown(column, label):
    """Create an "All"-plus-values dropdown for one column of ``display_df``.

    The dropdown is always created so that the filter callback receives the
    same three inputs no matter which columns the data has; when the column is
    missing (or the table is empty) it only offers "All" and is hidden.
    Must be called inside the layout context the dropdown should appear in.
    """
    has_column = column in display_df.columns and not display_df.empty
    values = sorted(display_df[column].dropna().unique().tolist()) if has_column else []
    return gr.Dropdown(
        choices=["All"] + values,
        value="All",
        label=label,
        interactive=True,
        visible=has_column
    )


with gr.Blocks(css=minimal_css) as demo:
    gr.HTML("<div class='header'><h1>ILMAAM: Index for Language Models for Arabic Assessment on Multitasks</h1></div>")

    with gr.Tabs() as tabs:
        with gr.TabItem("LLM Benchmark"):
            # Add debug output
            with gr.Accordion("Debug Info", open=True):
                # Column labels are passed through str() so that non-string
                # labels cannot break the join.
                _shown_columns = ', '.join(str(label) for label in display_df.columns[:10])
                _ellipsis = "..." if len(display_df.columns) > 10 else ""
                gr.Markdown(f"DataFrame Shape: {display_df.shape}")
                gr.Markdown(f"Column Names: {_shown_columns}" + _ellipsis)

            # Use standard DataTable
            datatable = gr.DataFrame(
                value=display_df,
                interactive=False,
                wrap=True
            )

            # Add filter functionality using dropdowns
            with gr.Row():
                model_type_filter = _filter_dropdown("model_type", "Filter by Model Type")
                precision_filter = _filter_dropdown("precision", "Filter by Precision")

                search_input = gr.Textbox(
                    label="Search by Model Name",
                    placeholder="Enter model name...",
                    interactive=True
                )

            # Filter function
            def filter_data(model_type, precision, search):
                """Return the rows of ``display_df`` matching the three filters.

                Args:
                    model_type: value to match in the "model_type" column;
                        "All" or an empty/cleared selection disables the filter.
                    precision: value to match in the "precision" column;
                        "All" or an empty/cleared selection disables the filter.
                    search: text looked up case-insensitively, as a literal
                        substring, in the "model_name" column; empty disables
                        the filter.

                Returns:
                    A filtered copy of ``display_df``. A filter whose column is
                    not present in the table is ignored.
                """
                filtered_df = display_df.copy()

                if model_type and model_type != "All" and "model_type" in filtered_df.columns:
                    filtered_df = filtered_df[filtered_df["model_type"] == model_type]

                if precision and precision != "All" and "precision" in filtered_df.columns:
                    filtered_df = filtered_df[filtered_df["precision"] == precision]

                search = (search or "").strip()
                if search and "model_name" in filtered_df.columns:
                    # regex=False: the user's text is a plain substring, so
                    # characters such as "(" or "+" cannot raise or mis-match.
                    # na=False: rows without a name simply do not match.
                    matches = filtered_df["model_name"].str.contains(
                        search, case=False, regex=False, na=False
                    )
                    filtered_df = filtered_df[matches]

                return filtered_df

            # Connect filters: always the same three inputs, in the order of
            # filter_data's parameters, so the values can never be misaligned.
            filter_inputs = [model_type_filter, precision_filter, search_input]
            for input_component in filter_inputs:
                input_component.change(
                    filter_data,
                    inputs=filter_inputs,
                    outputs=datatable
                )

        with gr.TabItem("About"):
            gr.Markdown("""
            # About ILMAAM
            
            The **Index for Language Models for Arabic Assessment on Multitasks (ILMAAM)** showcases the performance of various Arabic LLMs on the newly released MMMLU OpenAI Benchmark across different subjects.
            
            This benchmark evaluates language models specifically for Arabic language capabilities.
            """)

        with gr.TabItem("Submit"):
            gr.Markdown("""
            # Submit Your Model
            
            You can submit your Arabic language model for benchmark evaluation. Fill out the form below:
            """)

            with gr.Row():
                with gr.Column():
                    model_name_textbox = gr.Textbox(label="Model name")
                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
                    model_type = gr.Dropdown(
                        choices=["Encoder", "Decoder"],
                        label="Model type",
                        multiselect=False,
                        interactive=True
                    )

                with gr.Column():
                    precision = gr.Dropdown(
                        choices=["float16", "float32", "int8", "int4"],
                        label="Precision",
                        multiselect=False,
                        value="float16",
                        interactive=True
                    )
                    weight_type = gr.Dropdown(
                        choices=["Original", "Quantized", "Distilled"],
                        label="Weights type",
                        multiselect=False,
                        value="Original",
                        interactive=True
                    )
                    base_model_name_textbox = gr.Textbox(label="Base model (if applicable)")

            submit_button = gr.Button("Submit for Evaluation")
            submission_result = gr.Markdown()

            def mock_submission(model_name, base_model, revision, precision, weight_type, model_type):
                """Acknowledge a submission without storing or evaluating anything.

                Only ``model_name`` is inspected; the other form values are
                accepted and ignored. Returns the message to display: an error
                when the name is empty or blank, a confirmation otherwise.
                """
                if not model_name or not model_name.strip():
                    return "Error: Model name is required."
                return f"Model '{model_name}' submitted successfully! It will be evaluated soon."

            # The input order must match mock_submission's parameter order
            submit_button.click(
                mock_submission,
                [
                    model_name_textbox,
                    base_model_name_textbox,
                    revision_name_textbox,
                    precision,
                    weight_type,
                    model_type,
                ],
                submission_result,
            )

demo.launch(debug=True, share=False)