update app.py
app.py
CHANGED
@@ -3,11 +3,12 @@ import pandas as pd
 import os
 import json
 from src.populate import get_leaderboard_df
-from src.display.utils import COLUMNS, COLS, BENCHMARK_COLS
+from src.display.utils import COLUMNS, COLS, BENCHMARK_COLS, EVAL_COLS
 from src.envs import EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH

-#
-
+# Print paths for debugging
+print(f"EVAL_RESULTS_PATH: {EVAL_RESULTS_PATH}")
+print(f"EVAL_REQUESTS_PATH: {EVAL_REQUESTS_PATH}")

 # Minimal CSS
 minimal_css = """
@@ -21,14 +22,56 @@ minimal_css = """
 }
 """

+# Function to load data directly from JSON files
+def load_data_directly():
+    if not os.path.exists(EVAL_RESULTS_PATH):
+        print(f"Path does not exist: {EVAL_RESULTS_PATH}")
+        return pd.DataFrame()
+
+    result_files = [
+        os.path.join(EVAL_RESULTS_PATH, f)
+        for f in os.listdir(EVAL_RESULTS_PATH)
+        if f.endswith('.json')
+    ]
+
+    print(f"Found {len(result_files)} JSON files")
+
+    data_list = []
+    for file in result_files:
+        try:
+            with open(file, 'r') as f:
+                data = json.load(f)
+
+            flattened_data = {}
+            # Extract both config and results
+            flattened_data.update(data.get('config', {}))
+            flattened_data.update(data.get('results', {}))
+            data_list.append(flattened_data)
+        except Exception as e:
+            print(f"Error loading file {file}: {e}")
+
+    if not data_list:
+        print("No data loaded from JSON files")
+        return pd.DataFrame()
+
+    df = pd.DataFrame(data_list)
+    print(f"Successfully loaded DataFrame with shape: {df.shape}")
+    return df
+
+# Try to load data using both methods
 try:
-
+    print("Attempting to load data using get_leaderboard_df...")
     LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
-    print("
+    print(f"get_leaderboard_df result shape: {LEADERBOARD_DF.shape}")
+
+    # If that fails or returns empty, try direct loading
+    if LEADERBOARD_DF.empty:
+        print("get_leaderboard_df returned empty DataFrame, trying direct loading...")
+        LEADERBOARD_DF = load_data_directly()

-    # If
+    # If still empty, create a sample
     if LEADERBOARD_DF.empty:
-        print("
+        print("Both methods returned empty DataFrames, creating sample data")
         LEADERBOARD_DF = pd.DataFrame([{
             "model_name": "Sample Model",
             "average": 75.5,
@@ -36,36 +79,56 @@ try:
             "precision": "float16"
         }])
 except Exception as e:
-    print(f"Error
+    print(f"Error in data loading: {e}")
     # Create a minimal DataFrame
     LEADERBOARD_DF = pd.DataFrame([{
         "model_name": "Error Loading Data",
         "average": 0
     }])

-#
-
-
-
-
-
-
-# Add
-
-
-
-
+# Print final DataFrame info
+print(f"Final DataFrame shape: {LEADERBOARD_DF.shape}")
+print(f"Final DataFrame columns: {LEADERBOARD_DF.columns.tolist()}")
+
+# Select important columns for display
+display_cols = ["model_name", "average", "model_type", "precision", "weight_type", "license"]
+
+# Add some subject columns
+subject_cols = [
+    "abstract_algebra", "anatomy", "astronomy", "business_ethics",
+    "college_biology", "college_chemistry", "college_computer_science",
+    "high_school_mathematics", "machine_learning"
+]
+
+# Add all detected subject columns
+for col in LEADERBOARD_DF.columns:
+    if col not in display_cols and col not in ["submitted_time", "revision", "base_model", "likes", "params"]:
+        subject_cols.append(col)

-#
-
-
+# Combine columns, filtering to only those that exist
+all_display_cols = display_cols + subject_cols
+actual_display_cols = [col for col in all_display_cols if col in LEADERBOARD_DF.columns]

-#
-
-
-        display_df[col] = display_df[col].round(2)
+# Ensure we have at least some columns
+if not actual_display_cols and not LEADERBOARD_DF.empty:
+    actual_display_cols = LEADERBOARD_DF.columns.tolist()

-#
+# Filter the DataFrame
+if not LEADERBOARD_DF.empty:
+    display_df = LEADERBOARD_DF[actual_display_cols].copy()
+
+    # Round numeric columns for display
+    for col in display_df.columns:
+        if pd.api.types.is_numeric_dtype(display_df[col]):
+            display_df[col] = display_df[col].round(2)
+
+    # Sort by average if it exists
+    if "average" in display_df.columns:
+        display_df = display_df.sort_values(by="average", ascending=False)
+else:
+    display_df = LEADERBOARD_DF
+
+# Create the app
 with gr.Blocks(css=minimal_css) as demo:
     gr.HTML("<div class='header'><h1>ILMAAM: Index for Language Models for Arabic Assessment on Multitasks</h1></div>")

@@ -74,20 +137,19 @@ with gr.Blocks(css=minimal_css) as demo:
     # Add debug output
     with gr.Accordion("Debug Info", open=True):
         gr.Markdown(f"DataFrame Shape: {display_df.shape}")
-        gr.Markdown(f"Column Names: {', '.join(display_df.columns)}")
+        gr.Markdown(f"Column Names: {', '.join(display_df.columns[:10])}" + ("..." if len(display_df.columns) > 10 else ""))

-    # Use standard DataTable
+    # Use standard DataTable
     datatable = gr.DataFrame(
         value=display_df,
         interactive=False,
-        wrap=True
-        column_widths=[200] + [100] * (len(actual_display_cols) - 1)
+        wrap=True
     )

     # Add filter functionality using dropdowns
     with gr.Row():
-        if "model_type" in display_df.columns:
-            model_types = ["All"] + sorted(display_df["model_type"].unique().tolist())
+        if "model_type" in display_df.columns and not display_df.empty:
+            model_types = ["All"] + sorted(display_df["model_type"].dropna().unique().tolist())
             model_type_filter = gr.Dropdown(
                 choices=model_types,
                 value="All",
@@ -95,8 +157,8 @@ with gr.Blocks(css=minimal_css) as demo:
                 interactive=True
             )

-        if "precision" in display_df.columns:
-            precisions = ["All"] + sorted(display_df["precision"].unique().tolist())
+        if "precision" in display_df.columns and not display_df.empty:
+            precisions = ["All"] + sorted(display_df["precision"].dropna().unique().tolist())
             precision_filter = gr.Dropdown(
                 choices=precisions,
                 value="All",
@@ -127,9 +189,9 @@ with gr.Blocks(css=minimal_css) as demo:

     # Connect filters
     filter_inputs = []
-    if "model_type" in display_df.columns:
+    if "model_type" in display_df.columns and not display_df.empty:
         filter_inputs.append(model_type_filter)
-    if "precision" in display_df.columns:
+    if "precision" in display_df.columns and not display_df.empty:
         filter_inputs.append(precision_filter)
     filter_inputs.append(search_input)

@@ -143,9 +205,68 @@ with gr.Blocks(css=minimal_css) as demo:
     )

     with gr.TabItem("About"):
-        gr.Markdown("
+        gr.Markdown("""
+        # About ILMAAM
+
+        The **Index for Language Models for Arabic Assessment on Multitasks (ILMAAM)** showcases the performance of various Arabic LLMs on the newly released MMMLU OpenAI Benchmark across different subjects.
+
+        This benchmark evaluates language models specifically for Arabic language capabilities.
+        """)

     with gr.TabItem("Submit"):
-        gr.Markdown("
+        gr.Markdown("""
+        # Submit Your Model
+
+        You can submit your Arabic language model for benchmark evaluation. Fill out the form below:
+        """)
+
+        with gr.Row():
+            with gr.Column():
+                model_name_textbox = gr.Textbox(label="Model name")
+                revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
+                model_type = gr.Dropdown(
+                    choices=["Encoder", "Decoder"],
+                    label="Model type",
+                    multiselect=False,
+                    interactive=True
+                )
+
+            with gr.Column():
+                precision = gr.Dropdown(
+                    choices=["float16", "float32", "int8", "int4"],
+                    label="Precision",
+                    multiselect=False,
+                    value="float16",
+                    interactive=True
+                )
+                weight_type = gr.Dropdown(
+                    choices=["Original", "Quantized", "Distilled"],
+                    label="Weights type",
+                    multiselect=False,
+                    value="Original",
+                    interactive=True
+                )
+                base_model_name_textbox = gr.Textbox(label="Base model (if applicable)")
+
+        submit_button = gr.Button("Submit for Evaluation")
+        submission_result = gr.Markdown()
+
+        def mock_submission(model_name, base_model, revision, precision, weight_type, model_type):
+            if not model_name:
+                return "Error: Model name is required."
+            return f"Model '{model_name}' submitted successfully! It will be evaluated soon."
+
+        submit_button.click(
+            mock_submission,
+            [
+                model_name_textbox,
+                base_model_name_textbox,
+                revision_name_textbox,
+                precision,
+                weight_type,
+                model_type,
+            ],
+            submission_result,
+        )

 demo.launch(debug=True, share=False)
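
For reference, a minimal sketch (not part of the commit) of what the new load_data_directly helper does to a single result file: the config and results objects of each JSON payload are merged into one flat row, one row per file. The payload below is hypothetical and only illustrates the shape.

import pandas as pd

# Hypothetical eval-result payload; the real schema lives in the results repo.
sample_result = {
    "config": {"model_name": "my-org/arabic-llm", "precision": "float16", "model_type": "Decoder"},
    "results": {"average": 61.3, "abstract_algebra": 40.0, "anatomy": 55.2},
}

# Same flattening steps as load_data_directly, applied to one file.
flattened = {}
flattened.update(sample_result.get("config", {}))
flattened.update(sample_result.get("results", {}))

df = pd.DataFrame([flattened])
print(df.shape)              # (1, 6): one row, config + results keys become columns
print(df.columns.tolist())   # ['model_name', 'precision', 'model_type', 'average', ...]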
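The .dropna() added to the dropdown choices is plausibly there because sorted() fails as soon as a model_type or precision value is missing; the commit does not state this, so it is an assumption, but the failure mode is easy to reproduce in a standalone sketch.

import pandas as pd

df = pd.DataFrame({"model_type": ["Decoder", None, "Encoder"]})

# Old pattern: a missing value ends up in the list and sorted() raises TypeError.
try:
    sorted(df["model_type"].unique().tolist())
except TypeError as exc:
    print(f"without dropna(): {exc}")

# New pattern from the diff: drop missing values before sorting.
print(["All"] + sorted(df["model_type"].dropna().unique().tolist()))
# -> ['All', 'Decoder', 'Encoder']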