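"""Gradio app for the ILMAAM leaderboard: loads Arabic LLM evaluation results
from JSON files, builds a display table, and exposes benchmark, about, and
submission tabs."""
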
import gradio as gr
import pandas as pd
import os
import json
from src.populate import get_leaderboard_df
from src.display.utils import COLUMNS, COLS, BENCHMARK_COLS, EVAL_COLS
from src.envs import EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH
# Print paths for debugging
print(f"EVAL_RESULTS_PATH: {EVAL_RESULTS_PATH}")
print(f"EVAL_REQUESTS_PATH: {EVAL_REQUESTS_PATH}")
# Minimal CSS
minimal_css = """
.container {
    max-width: 1200px;
    margin: 0 auto;
}
.header {
    text-align: center;
    margin-bottom: 20px;
}
"""
# Function to load data directly from JSON files
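# Each result file is assumed to contain top-level "config" and "results"
# objects (e.g. {"config": {"model_name": ...}, "results": {"average": ...}});
# both are flattened into a single row per model below.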
def load_data_directly():
    if not os.path.exists(EVAL_RESULTS_PATH):
        print(f"Path does not exist: {EVAL_RESULTS_PATH}")
        return pd.DataFrame()

    result_files = [
        os.path.join(EVAL_RESULTS_PATH, f)
        for f in os.listdir(EVAL_RESULTS_PATH)
        if f.endswith('.json')
    ]
    print(f"Found {len(result_files)} JSON files")

    data_list = []
    for file in result_files:
        try:
            with open(file, 'r') as f:
                data = json.load(f)
            flattened_data = {}
            # Extract both config and results
            flattened_data.update(data.get('config', {}))
            flattened_data.update(data.get('results', {}))
            data_list.append(flattened_data)
        except Exception as e:
            print(f"Error loading file {file}: {e}")

    if not data_list:
        print("No data loaded from JSON files")
        return pd.DataFrame()

    df = pd.DataFrame(data_list)
    print(f"Successfully loaded DataFrame with shape: {df.shape}")
    return df
# Try to load data using both methods
try:
    print("Attempting to load data using get_leaderboard_df...")
    LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
    print(f"get_leaderboard_df result shape: {LEADERBOARD_DF.shape}")

    # If that fails or returns empty, try direct loading
    if LEADERBOARD_DF.empty:
        print("get_leaderboard_df returned empty DataFrame, trying direct loading...")
        LEADERBOARD_DF = load_data_directly()

    # If still empty, create a sample
    if LEADERBOARD_DF.empty:
        print("Both methods returned empty DataFrames, creating sample data")
        LEADERBOARD_DF = pd.DataFrame([{
            "model_name": "Sample Model",
            "average": 75.5,
            "model_type": "Encoder",
            "precision": "float16"
        }])
except Exception as e:
    print(f"Error in data loading: {e}")
    # Create a minimal DataFrame
    LEADERBOARD_DF = pd.DataFrame([{
        "model_name": "Error Loading Data",
        "average": 0
    }])
# Print final DataFrame info
print(f"Final DataFrame shape: {LEADERBOARD_DF.shape}")
print(f"Final DataFrame columns: {LEADERBOARD_DF.columns.tolist()}")
# Select important columns for display
display_cols = ["model_name", "average", "model_type", "precision", "weight_type", "license"]
# Add some subject columns
subject_cols = [
    "abstract_algebra", "anatomy", "astronomy", "business_ethics",
    "college_biology", "college_chemistry", "college_computer_science",
    "high_school_mathematics", "machine_learning"
]
# Add all detected subject columns
for col in LEADERBOARD_DF.columns:
    if col not in display_cols and col not in subject_cols and col not in ["submitted_time", "revision", "base_model", "likes", "params"]:
        subject_cols.append(col)
# Combine columns, filtering to only those that exist
all_display_cols = display_cols + subject_cols
actual_display_cols = [col for col in all_display_cols if col in LEADERBOARD_DF.columns]
# Ensure we have at least some columns
if not actual_display_cols and not LEADERBOARD_DF.empty:
    actual_display_cols = LEADERBOARD_DF.columns.tolist()
# Filter the DataFrame
if not LEADERBOARD_DF.empty:
    display_df = LEADERBOARD_DF[actual_display_cols].copy()

    # Round numeric columns for display
    for col in display_df.columns:
        if pd.api.types.is_numeric_dtype(display_df[col]):
            display_df[col] = display_df[col].round(2)

    # Sort by average if it exists
    if "average" in display_df.columns:
        display_df = display_df.sort_values(by="average", ascending=False)
else:
    display_df = LEADERBOARD_DF

# Create the app
with gr.Blocks(css=minimal_css) as demo:
    gr.HTML("<div class='header'><h1>ILMAAM: Index for Language Models for Arabic Assessment on Multitasks</h1></div>")

    with gr.Tabs() as tabs:
        with gr.TabItem("LLM Benchmark"):
            # Add debug output
            with gr.Accordion("Debug Info", open=True):
                gr.Markdown(f"DataFrame Shape: {display_df.shape}")
                gr.Markdown(f"Column Names: {', '.join(display_df.columns[:10])}" + ("..." if len(display_df.columns) > 10 else ""))

            # Use the standard DataFrame component
            datatable = gr.DataFrame(
                value=display_df,
                interactive=False,
                wrap=True
            )

            # Add filter functionality using dropdowns
            with gr.Row():
                if "model_type" in display_df.columns and not display_df.empty:
                    model_types = ["All"] + sorted(display_df["model_type"].dropna().unique().tolist())
                    model_type_filter = gr.Dropdown(
                        choices=model_types,
                        value="All",
                        label="Filter by Model Type",
                        interactive=True
                    )
                if "precision" in display_df.columns and not display_df.empty:
                    precisions = ["All"] + sorted(display_df["precision"].dropna().unique().tolist())
                    precision_filter = gr.Dropdown(
                        choices=precisions,
                        value="All",
                        label="Filter by Precision",
                        interactive=True
                    )
                search_input = gr.Textbox(
                    label="Search by Model Name",
                    placeholder="Enter model name...",
                    interactive=True
                )

            # Filter function
            def filter_data(model_type, precision, search):
                filtered_df = display_df.copy()
                if model_type != "All" and "model_type" in filtered_df.columns:
                    filtered_df = filtered_df[filtered_df["model_type"] == model_type]
                if precision != "All" and "precision" in filtered_df.columns:
                    filtered_df = filtered_df[filtered_df["precision"] == precision]
                if search and "model_name" in filtered_df.columns:
                    filtered_df = filtered_df[filtered_df["model_name"].str.contains(search, case=False, na=False)]
                return filtered_df
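
            # NOTE: the .change() handlers below pass filter_inputs positionally,
            # so the list order must match filter_data's (model_type, precision,
            # search) parameters; if one of those columns is missing, its dropdown
            # is never created and the call signature no longer lines up.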

            # Connect filters
            filter_inputs = []
            if "model_type" in display_df.columns and not display_df.empty:
                filter_inputs.append(model_type_filter)
            if "precision" in display_df.columns and not display_df.empty:
                filter_inputs.append(precision_filter)
            filter_inputs.append(search_input)

            # If we have filter inputs, connect them
            if filter_inputs:
                for input_component in filter_inputs:
                    input_component.change(
                        filter_data,
                        inputs=filter_inputs,
                        outputs=datatable
                    )
with gr.TabItem("About"):
gr.Markdown("""
# About ILMAAM
The **Index for Language Models for Arabic Assessment on Multitasks (ILMAAM)** showcases the performance of various Arabic LLMs on the newly released MMMLU OpenAI Benchmark across different subjects.
This benchmark evaluates language models specifically for Arabic language capabilities.
""")
with gr.TabItem("Submit"):
gr.Markdown("""
# Submit Your Model
You can submit your Arabic language model for benchmark evaluation. Fill out the form below:
""")
with gr.Row():
with gr.Column():
model_name_textbox = gr.Textbox(label="Model name")
revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
model_type = gr.Dropdown(
choices=["Encoder", "Decoder"],
label="Model type",
multiselect=False,
interactive=True
)
with gr.Column():
precision = gr.Dropdown(
choices=["float16", "float32", "int8", "int4"],
label="Precision",
multiselect=False,
value="float16",
interactive=True
)
weight_type = gr.Dropdown(
choices=["Original", "Quantized", "Distilled"],
label="Weights type",
multiselect=False,
value="Original",
interactive=True
)
base_model_name_textbox = gr.Textbox(label="Base model (if applicable)")
submit_button = gr.Button("Submit for Evaluation")
submission_result = gr.Markdown()
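
            # mock_submission only echoes a confirmation message; it does not
            # persist anything. A real submission flow would presumably write a
            # request file under EVAL_REQUESTS_PATH for the evaluation backend
            # to pick up.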
            def mock_submission(model_name, base_model, revision, precision, weight_type, model_type):
                if not model_name:
                    return "Error: Model name is required."
                return f"Model '{model_name}' submitted successfully! It will be evaluated soon."

            submit_button.click(
                mock_submission,
                [
                    model_name_textbox,
                    base_model_name_textbox,
                    revision_name_textbox,
                    precision,
                    weight_type,
                    model_type,
                ],
                submission_result,
            )

demo.launch(debug=True, share=False)