import gradio as gr
import pandas as pd
import os
import json

from src.populate import get_leaderboard_df
from src.display.utils import COLUMNS, COLS, BENCHMARK_COLS, EVAL_COLS
from src.envs import EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH

print(f"EVAL_RESULTS_PATH: {EVAL_RESULTS_PATH}")
print(f"EVAL_REQUESTS_PATH: {EVAL_REQUESTS_PATH}")

minimal_css = """
.container {
    max-width: 1200px;
    margin: 0 auto;
}
.header {
    text-align: center;
    margin-bottom: 20px;
}
"""
def load_data_directly():
    if not os.path.exists(EVAL_RESULTS_PATH):
        print(f"Path does not exist: {EVAL_RESULTS_PATH}")
        return pd.DataFrame()

    result_files = [
        os.path.join(EVAL_RESULTS_PATH, f)
        for f in os.listdir(EVAL_RESULTS_PATH)
        if f.endswith('.json')
    ]

    print(f"Found {len(result_files)} JSON files")

    data_list = []
    for file in result_files:
        try:
            with open(file, 'r') as f:
                data = json.load(f)

            flattened_data = {}
            flattened_data.update(data.get('config', {}))
            flattened_data.update(data.get('results', {}))
            data_list.append(flattened_data)
        except Exception as e:
            print(f"Error loading file {file}: {e}")

    if not data_list:
        print("No data loaded from JSON files")
        return pd.DataFrame()

    df = pd.DataFrame(data_list)
    print(f"Successfully loaded DataFrame with shape: {df.shape}")
    return df
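
# Load the leaderboard: try the project's get_leaderboard_df first, fall back to
# reading the raw result JSON files, and finally insert a sample row so the UI
# still renders when no results are available.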
try:
    print("Attempting to load data using get_leaderboard_df...")
    LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
    print(f"get_leaderboard_df result shape: {LEADERBOARD_DF.shape}")

    if LEADERBOARD_DF.empty:
        print("get_leaderboard_df returned empty DataFrame, trying direct loading...")
        LEADERBOARD_DF = load_data_directly()

    if LEADERBOARD_DF.empty:
        print("Both methods returned empty DataFrames, creating sample data")
        LEADERBOARD_DF = pd.DataFrame([{
            "model_name": "Sample Model",
            "average": 75.5,
            "model_type": "Encoder",
            "precision": "float16",
        }])
except Exception as e:
    print(f"Error in data loading: {e}")
    LEADERBOARD_DF = pd.DataFrame([{
        "model_name": "Error Loading Data",
        "average": 0,
    }])

print(f"Final DataFrame shape: {LEADERBOARD_DF.shape}")
print(f"Final DataFrame columns: {LEADERBOARD_DF.columns.tolist()}")
display_cols = ["model_name", "average", "model_type", "precision", "weight_type", "license"]

subject_cols = [
    "abstract_algebra", "anatomy", "astronomy", "business_ethics",
    "college_biology", "college_chemistry", "college_computer_science",
    "high_school_mathematics", "machine_learning",
]

for col in LEADERBOARD_DF.columns:
    if col not in display_cols and col not in ["submitted_time", "revision", "base_model", "likes", "params"]:
        subject_cols.append(col)

all_display_cols = display_cols + subject_cols
actual_display_cols = [col for col in all_display_cols if col in LEADERBOARD_DF.columns]

if not actual_display_cols and not LEADERBOARD_DF.empty:
    actual_display_cols = LEADERBOARD_DF.columns.tolist()
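
# Build the table actually shown in the UI: round numeric scores to two decimals
# and sort by the average score when it is available.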
if not LEADERBOARD_DF.empty:
    display_df = LEADERBOARD_DF[actual_display_cols].copy()

    for col in display_df.columns:
        if pd.api.types.is_numeric_dtype(display_df[col]):
            display_df[col] = display_df[col].round(2)

    if "average" in display_df.columns:
        display_df = display_df.sort_values(by="average", ascending=False)
else:
    display_df = LEADERBOARD_DF
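
# Gradio UI: a leaderboard tab with filters, an About tab, and a Submit tab.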
with gr.Blocks(css=minimal_css) as demo:
    gr.HTML("<div class='header'><h1>ILMAAM: Index for Language Models for Arabic Assessment on Multitasks</h1></div>")

    with gr.Tabs() as tabs:
        with gr.TabItem("LLM Benchmark"):
            with gr.Accordion("Debug Info", open=True):
                gr.Markdown(f"DataFrame Shape: {display_df.shape}")
                gr.Markdown(f"Column Names: {', '.join(display_df.columns[:10])}" + ("..." if len(display_df.columns) > 10 else ""))

            datatable = gr.DataFrame(
                value=display_df,
                interactive=False,
                wrap=True,
            )
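
            # Filter controls for the leaderboard table.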
            with gr.Row():
                # Always create all three filter components so the callback below
                # receives a consistent set of inputs; when a column is missing
                # from the data, the corresponding dropdown only offers "All".
                if "model_type" in display_df.columns and not display_df.empty:
                    model_types = ["All"] + sorted(display_df["model_type"].dropna().unique().tolist())
                else:
                    model_types = ["All"]
                model_type_filter = gr.Dropdown(
                    choices=model_types,
                    value="All",
                    label="Filter by Model Type",
                    interactive=True,
                )

                if "precision" in display_df.columns and not display_df.empty:
                    precisions = ["All"] + sorted(display_df["precision"].dropna().unique().tolist())
                else:
                    precisions = ["All"]
                precision_filter = gr.Dropdown(
                    choices=precisions,
                    value="All",
                    label="Filter by Precision",
                    interactive=True,
                )

                search_input = gr.Textbox(
                    label="Search by Model Name",
                    placeholder="Enter model name...",
                    interactive=True,
                )
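
            # Apply the selected filters and search term to the displayed table.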
            def filter_data(model_type, precision, search):
                filtered_df = display_df.copy()

                if model_type != "All" and "model_type" in filtered_df.columns:
                    filtered_df = filtered_df[filtered_df["model_type"] == model_type]

                if precision != "All" and "precision" in filtered_df.columns:
                    filtered_df = filtered_df[filtered_df["precision"] == precision]

                if search and "model_name" in filtered_df.columns:
                    # Plain substring match; regex=False avoids errors on special characters.
                    filtered_df = filtered_df[
                        filtered_df["model_name"].str.contains(search, case=False, regex=False, na=False)
                    ]

                return filtered_df
            # All three filter components always exist, so wire each of them
            # to re-filter the table on change.
            filter_inputs = [model_type_filter, precision_filter, search_input]
            for input_component in filter_inputs:
                input_component.change(
                    filter_data,
                    inputs=filter_inputs,
                    outputs=datatable,
                )

        with gr.TabItem("About"):
            gr.Markdown("""
# About ILMAAM

The **Index for Language Models for Arabic Assessment on Multitasks (ILMAAM)** showcases the performance of various Arabic LLMs on OpenAI's newly released MMMLU benchmark across different subjects.

This benchmark evaluates language models specifically for their Arabic-language capabilities.
""")

        with gr.TabItem("Submit"):
            gr.Markdown("""
# Submit Your Model

You can submit your Arabic language model for benchmark evaluation. Fill out the form below:
""")
            with gr.Row():
                with gr.Column():
                    model_name_textbox = gr.Textbox(label="Model name")
                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
                    model_type = gr.Dropdown(
                        choices=["Encoder", "Decoder"],
                        label="Model type",
                        multiselect=False,
                        interactive=True,
                    )

                with gr.Column():
                    precision = gr.Dropdown(
                        choices=["float16", "float32", "int8", "int4"],
                        label="Precision",
                        multiselect=False,
                        value="float16",
                        interactive=True,
                    )
                    weight_type = gr.Dropdown(
                        choices=["Original", "Quantized", "Distilled"],
                        label="Weights type",
                        multiselect=False,
                        value="Original",
                        interactive=True,
                    )
                    base_model_name_textbox = gr.Textbox(label="Base model (if applicable)")
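
            # Placeholder submission handler: validates the model name only and
            # does not queue a real evaluation request.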
            submit_button = gr.Button("Submit for Evaluation")
            submission_result = gr.Markdown()

            def mock_submission(model_name, base_model, revision, precision, weight_type, model_type):
                if not model_name:
                    return "Error: Model name is required."
                return f"Model '{model_name}' submitted successfully! It will be evaluated soon."

            submit_button.click(
                mock_submission,
                [
                    model_name_textbox,
                    base_model_name_textbox,
                    revision_name_textbox,
                    precision,
                    weight_type,
                    model_type,
                ],
                submission_result,
            )

demo.launch(debug=True, share=False)