import gradio as gr
import pandas as pd
import os
import json

from src.populate import get_leaderboard_df
from src.display.utils import COLUMNS, COLS, BENCHMARK_COLS, EVAL_COLS
from src.envs import EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH

print(f"EVAL_RESULTS_PATH: {EVAL_RESULTS_PATH}")
print(f"EVAL_REQUESTS_PATH: {EVAL_REQUESTS_PATH}")

minimal_css = """
.container {
    max-width: 1200px;
    margin: 0 auto;
}
.header {
    text-align: center;
    margin-bottom: 20px;
}
"""
def load_data_directly():
    if not os.path.exists(EVAL_RESULTS_PATH):
        print(f"Path does not exist: {EVAL_RESULTS_PATH}")
        return pd.DataFrame()

    result_files = [
        os.path.join(EVAL_RESULTS_PATH, f)
        for f in os.listdir(EVAL_RESULTS_PATH)
        if f.endswith('.json')
    ]

    print(f"Found {len(result_files)} JSON files")

    data_list = []
    for file in result_files:
        try:
            with open(file, 'r') as f:
                data = json.load(f)

            flattened_data = {}
            flattened_data.update(data.get('config', {}))
            flattened_data.update(data.get('results', {}))
            data_list.append(flattened_data)
        except Exception as e:
            print(f"Error loading file {file}: {e}")

    if not data_list:
        print("No data loaded from JSON files")
        return pd.DataFrame()

    df = pd.DataFrame(data_list)
    print(f"Successfully loaded DataFrame with shape: {df.shape}")
    return df
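
# Load the leaderboard: try the project's get_leaderboard_df first, fall back to
# reading the raw result JSON files, and finally insert a sample row so the UI
# still renders when no results are available.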
try:
    print("Attempting to load data using get_leaderboard_df...")
    LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
    print(f"get_leaderboard_df result shape: {LEADERBOARD_DF.shape}")

    if LEADERBOARD_DF.empty:
        print("get_leaderboard_df returned empty DataFrame, trying direct loading...")
        LEADERBOARD_DF = load_data_directly()

    if LEADERBOARD_DF.empty:
        print("Both methods returned empty DataFrames, creating sample data")
        LEADERBOARD_DF = pd.DataFrame([{
            "model_name": "Sample Model",
            "average": 75.5,
            "model_type": "Encoder",
            "precision": "float16",
        }])
except Exception as e:
    print(f"Error in data loading: {e}")
    LEADERBOARD_DF = pd.DataFrame([{
        "model_name": "Error Loading Data",
        "average": 0,
    }])

print(f"Final DataFrame shape: {LEADERBOARD_DF.shape}")
print(f"Final DataFrame columns: {LEADERBOARD_DF.columns.tolist()}")
display_cols = ["model_name", "average", "model_type", "precision", "weight_type", "license"]

subject_cols = [
    "abstract_algebra", "anatomy", "astronomy", "business_ethics",
    "college_biology", "college_chemistry", "college_computer_science",
    "high_school_mathematics", "machine_learning",
]

for col in LEADERBOARD_DF.columns:
    if col not in display_cols and col not in ["submitted_time", "revision", "base_model", "likes", "params"]:
        subject_cols.append(col)

all_display_cols = display_cols + subject_cols
actual_display_cols = [col for col in all_display_cols if col in LEADERBOARD_DF.columns]

if not actual_display_cols and not LEADERBOARD_DF.empty:
    actual_display_cols = LEADERBOARD_DF.columns.tolist()
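
# Build the table actually shown in the UI: round numeric scores to two decimals
# and sort by the average score when it is available.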
if not LEADERBOARD_DF.empty:
    display_df = LEADERBOARD_DF[actual_display_cols].copy()

    for col in display_df.columns:
        if pd.api.types.is_numeric_dtype(display_df[col]):
            display_df[col] = display_df[col].round(2)

    if "average" in display_df.columns:
        display_df = display_df.sort_values(by="average", ascending=False)
else:
    display_df = LEADERBOARD_DF
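
# Gradio UI: a leaderboard tab with filters, an About tab, and a Submit tab.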
with gr.Blocks(css=minimal_css) as demo:
    gr.HTML("<div class='header'><h1>ILMAAM: Index for Language Models for Arabic Assessment on Multitasks</h1></div>")

    with gr.Tabs() as tabs:
        with gr.TabItem("LLM Benchmark"):
            with gr.Accordion("Debug Info", open=True):
                gr.Markdown(f"DataFrame Shape: {display_df.shape}")
                gr.Markdown(f"Column Names: {', '.join(display_df.columns[:10])}" + ("..." if len(display_df.columns) > 10 else ""))

            datatable = gr.DataFrame(
                value=display_df,
                interactive=False,
                wrap=True,
            )
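
            # Filter controls for the leaderboard table.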
            with gr.Row():
                # Always create all three filter components so the callback below
                # receives a consistent set of inputs; when a column is missing
                # from the data, the corresponding dropdown only offers "All".
                if "model_type" in display_df.columns and not display_df.empty:
                    model_types = ["All"] + sorted(display_df["model_type"].dropna().unique().tolist())
                else:
                    model_types = ["All"]
                model_type_filter = gr.Dropdown(
                    choices=model_types,
                    value="All",
                    label="Filter by Model Type",
                    interactive=True,
                )

                if "precision" in display_df.columns and not display_df.empty:
                    precisions = ["All"] + sorted(display_df["precision"].dropna().unique().tolist())
                else:
                    precisions = ["All"]
                precision_filter = gr.Dropdown(
                    choices=precisions,
                    value="All",
                    label="Filter by Precision",
                    interactive=True,
                )

                search_input = gr.Textbox(
                    label="Search by Model Name",
                    placeholder="Enter model name...",
                    interactive=True,
                )
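
            # Apply the selected filters and search term to the displayed table.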
            def filter_data(model_type, precision, search):
                filtered_df = display_df.copy()

                if model_type != "All" and "model_type" in filtered_df.columns:
                    filtered_df = filtered_df[filtered_df["model_type"] == model_type]

                if precision != "All" and "precision" in filtered_df.columns:
                    filtered_df = filtered_df[filtered_df["precision"] == precision]

                if search and "model_name" in filtered_df.columns:
                    # Plain substring match; regex=False avoids errors on special characters.
                    filtered_df = filtered_df[
                        filtered_df["model_name"].str.contains(search, case=False, regex=False, na=False)
                    ]

                return filtered_df
            # All three filter components always exist, so wire each of them
            # to re-filter the table on change.
            filter_inputs = [model_type_filter, precision_filter, search_input]
            for input_component in filter_inputs:
                input_component.change(
                    filter_data,
                    inputs=filter_inputs,
                    outputs=datatable,
                )

        with gr.TabItem("About"):
            gr.Markdown("""
# About ILMAAM

The **Index for Language Models for Arabic Assessment on Multitasks (ILMAAM)** showcases the performance of various Arabic LLMs on OpenAI's newly released MMMLU benchmark across different subjects.

This benchmark evaluates language models specifically for their Arabic-language capabilities.
""")

        with gr.TabItem("Submit"):
            gr.Markdown("""
# Submit Your Model

You can submit your Arabic language model for benchmark evaluation. Fill out the form below:
""")
            with gr.Row():
                with gr.Column():
                    model_name_textbox = gr.Textbox(label="Model name")
                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
                    model_type = gr.Dropdown(
                        choices=["Encoder", "Decoder"],
                        label="Model type",
                        multiselect=False,
                        interactive=True,
                    )

                with gr.Column():
                    precision = gr.Dropdown(
                        choices=["float16", "float32", "int8", "int4"],
                        label="Precision",
                        multiselect=False,
                        value="float16",
                        interactive=True,
                    )
                    weight_type = gr.Dropdown(
                        choices=["Original", "Quantized", "Distilled"],
                        label="Weights type",
                        multiselect=False,
                        value="Original",
                        interactive=True,
                    )
                    base_model_name_textbox = gr.Textbox(label="Base model (if applicable)")
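
            # Placeholder submission handler: validates the model name only and
            # does not queue a real evaluation request.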
            submit_button = gr.Button("Submit for Evaluation")
            submission_result = gr.Markdown()

            def mock_submission(model_name, base_model, revision, precision, weight_type, model_type):
                if not model_name:
                    return "Error: Model name is required."
                return f"Model '{model_name}' submitted successfully! It will be evaluated soon."

            submit_button.click(
                mock_submission,
                [
                    model_name_textbox,
                    base_model_name_textbox,
                    revision_name_textbox,
                    precision,
                    weight_type,
                    model_type,
                ],
                submission_result,
            )

demo.launch(debug=True, share=False)