Commit 8355c4d · verified · 1 parent: e1da145

update app.py

Files changed (1)
  1. app.py +161 -40
app.py CHANGED
@@ -3,11 +3,12 @@ import pandas as pd
 import os
 import json
 from src.populate import get_leaderboard_df
-from src.display.utils import COLUMNS, COLS, BENCHMARK_COLS
+from src.display.utils import COLUMNS, COLS, BENCHMARK_COLS, EVAL_COLS
 from src.envs import EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH
 
-# Ensure directories exist
-os.makedirs(EVAL_RESULTS_PATH, exist_ok=True)
+# Print paths for debugging
+print(f"EVAL_RESULTS_PATH: {EVAL_RESULTS_PATH}")
+print(f"EVAL_REQUESTS_PATH: {EVAL_REQUESTS_PATH}")
 
 # Minimal CSS
 minimal_css = """
@@ -21,14 +22,56 @@ minimal_css = """
 }
 """
 
+# Function to load data directly from JSON files
+def load_data_directly():
+    if not os.path.exists(EVAL_RESULTS_PATH):
+        print(f"Path does not exist: {EVAL_RESULTS_PATH}")
+        return pd.DataFrame()
+
+    result_files = [
+        os.path.join(EVAL_RESULTS_PATH, f)
+        for f in os.listdir(EVAL_RESULTS_PATH)
+        if f.endswith('.json')
+    ]
+
+    print(f"Found {len(result_files)} JSON files")
+
+    data_list = []
+    for file in result_files:
+        try:
+            with open(file, 'r') as f:
+                data = json.load(f)
+
+            flattened_data = {}
+            # Extract both config and results
+            flattened_data.update(data.get('config', {}))
+            flattened_data.update(data.get('results', {}))
+            data_list.append(flattened_data)
+        except Exception as e:
+            print(f"Error loading file {file}: {e}")
+
+    if not data_list:
+        print("No data loaded from JSON files")
+        return pd.DataFrame()
+
+    df = pd.DataFrame(data_list)
+    print(f"Successfully loaded DataFrame with shape: {df.shape}")
+    return df
+
+# Try to load data using both methods
 try:
-    # Load the leaderboard DataFrame
+    print("Attempting to load data using get_leaderboard_df...")
     LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
-    print("LEADERBOARD_DF Shape:", LEADERBOARD_DF.shape)
+    print(f"get_leaderboard_df result shape: {LEADERBOARD_DF.shape}")
+
+    # If that fails or returns empty, try direct loading
+    if LEADERBOARD_DF.empty:
+        print("get_leaderboard_df returned empty DataFrame, trying direct loading...")
+        LEADERBOARD_DF = load_data_directly()
 
-    # If DataFrame is empty, create a sample
+    # If still empty, create a sample
     if LEADERBOARD_DF.empty:
-        print("Creating sample data for testing")
+        print("Both methods returned empty DataFrames, creating sample data")
        LEADERBOARD_DF = pd.DataFrame([{
             "model_name": "Sample Model",
             "average": 75.5,
@@ -36,36 +79,56 @@ try:
             "precision": "float16"
         }])
 except Exception as e:
-    print(f"Error loading leaderboard data: {e}")
+    print(f"Error in data loading: {e}")
     # Create a minimal DataFrame
     LEADERBOARD_DF = pd.DataFrame([{
         "model_name": "Error Loading Data",
         "average": 0
     }])
 
-# Select common columns for display
-display_cols = ["model_name", "average"]
-# Add some subject columns if they exist
-subject_cols = ["abstract_algebra", "anatomy", "astronomy", "business_ethics"]
-for col in subject_cols:
-    if col in LEADERBOARD_DF.columns:
-        display_cols.append(col)
-# Add model metadata if they exist
-meta_cols = ["model_type", "precision", "weight_type", "license"]
-for col in meta_cols:
-    if col in LEADERBOARD_DF.columns:
-        display_cols.append(col)
+# Print final DataFrame info
+print(f"Final DataFrame shape: {LEADERBOARD_DF.shape}")
+print(f"Final DataFrame columns: {LEADERBOARD_DF.columns.tolist()}")
+
+# Select important columns for display
+display_cols = ["model_name", "average", "model_type", "precision", "weight_type", "license"]
+
+# Add some subject columns
+subject_cols = [
+    "abstract_algebra", "anatomy", "astronomy", "business_ethics",
+    "college_biology", "college_chemistry", "college_computer_science",
+    "high_school_mathematics", "machine_learning"
+]
+
+# Add all detected subject columns
+for col in LEADERBOARD_DF.columns:
+    if col not in display_cols and col not in ["submitted_time", "revision", "base_model", "likes", "params"]:
+        subject_cols.append(col)
 
-# Filter the DataFrame to only include display columns that actually exist
-actual_display_cols = [col for col in display_cols if col in LEADERBOARD_DF.columns]
-display_df = LEADERBOARD_DF[actual_display_cols].copy()
+# Combine columns, filtering to only those that exist
+all_display_cols = display_cols + subject_cols
+actual_display_cols = [col for col in all_display_cols if col in LEADERBOARD_DF.columns]
 
-# Round numeric columns for display
-for col in display_df.columns:
-    if pd.api.types.is_numeric_dtype(display_df[col]):
-        display_df[col] = display_df[col].round(2)
+# Ensure we have at least some columns
+if not actual_display_cols and not LEADERBOARD_DF.empty:
+    actual_display_cols = LEADERBOARD_DF.columns.tolist()
 
-# Create a very simple app using standard DataTable instead of Leaderboard
+# Filter the DataFrame
+if not LEADERBOARD_DF.empty:
+    display_df = LEADERBOARD_DF[actual_display_cols].copy()
+
+    # Round numeric columns for display
+    for col in display_df.columns:
+        if pd.api.types.is_numeric_dtype(display_df[col]):
+            display_df[col] = display_df[col].round(2)
+
+    # Sort by average if it exists
+    if "average" in display_df.columns:
+        display_df = display_df.sort_values(by="average", ascending=False)
+else:
+    display_df = LEADERBOARD_DF
+
+# Create the app
 with gr.Blocks(css=minimal_css) as demo:
     gr.HTML("<div class='header'><h1>ILMAAM: Index for Language Models for Arabic Assessment on Multitasks</h1></div>")
 
@@ -74,20 +137,19 @@ with gr.Blocks(css=minimal_css) as demo:
             # Add debug output
             with gr.Accordion("Debug Info", open=True):
                 gr.Markdown(f"DataFrame Shape: {display_df.shape}")
-                gr.Markdown(f"Column Names: {', '.join(display_df.columns)}")
+                gr.Markdown(f"Column Names: {', '.join(display_df.columns[:10])}" + ("..." if len(display_df.columns) > 10 else ""))
 
-            # Use standard DataTable instead of Leaderboard
+            # Use standard DataTable
            datatable = gr.DataFrame(
                 value=display_df,
                 interactive=False,
-                wrap=True,
-                column_widths=[200] + [100] * (len(actual_display_cols) - 1)
+                wrap=True
             )
 
             # Add filter functionality using dropdowns
             with gr.Row():
-                if "model_type" in display_df.columns:
-                    model_types = ["All"] + sorted(display_df["model_type"].unique().tolist())
+                if "model_type" in display_df.columns and not display_df.empty:
+                    model_types = ["All"] + sorted(display_df["model_type"].dropna().unique().tolist())
                     model_type_filter = gr.Dropdown(
                         choices=model_types,
                         value="All",
@@ -95,8 +157,8 @@ with gr.Blocks(css=minimal_css) as demo:
                         interactive=True
                     )
 
-                if "precision" in display_df.columns:
-                    precisions = ["All"] + sorted(display_df["precision"].unique().tolist())
+                if "precision" in display_df.columns and not display_df.empty:
+                    precisions = ["All"] + sorted(display_df["precision"].dropna().unique().tolist())
                     precision_filter = gr.Dropdown(
                         choices=precisions,
                         value="All",
@@ -127,9 +189,9 @@ with gr.Blocks(css=minimal_css) as demo:
 
             # Connect filters
             filter_inputs = []
-            if "model_type" in display_df.columns:
+            if "model_type" in display_df.columns and not display_df.empty:
                 filter_inputs.append(model_type_filter)
-            if "precision" in display_df.columns:
+            if "precision" in display_df.columns and not display_df.empty:
                 filter_inputs.append(precision_filter)
             filter_inputs.append(search_input)
 
@@ -143,9 +205,68 @@ with gr.Blocks(css=minimal_css) as demo:
             )
 
         with gr.TabItem("About"):
-            gr.Markdown("This is a benchmark for Arabic language models.")
+            gr.Markdown("""
+            # About ILMAAM
+
+            The **Index for Language Models for Arabic Assessment on Multitasks (ILMAAM)** showcases the performance of various Arabic LLMs on the newly released MMMLU OpenAI Benchmark across different subjects.
+
+            This benchmark evaluates language models specifically for Arabic language capabilities.
+            """)
 
         with gr.TabItem("Submit"):
-            gr.Markdown("Submission form will be available here.")
+            gr.Markdown("""
+            # Submit Your Model
+
+            You can submit your Arabic language model for benchmark evaluation. Fill out the form below:
+            """)
+
+            with gr.Row():
+                with gr.Column():
+                    model_name_textbox = gr.Textbox(label="Model name")
+                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
+                    model_type = gr.Dropdown(
+                        choices=["Encoder", "Decoder"],
+                        label="Model type",
+                        multiselect=False,
+                        interactive=True
+                    )
+
+                with gr.Column():
+                    precision = gr.Dropdown(
+                        choices=["float16", "float32", "int8", "int4"],
+                        label="Precision",
+                        multiselect=False,
+                        value="float16",
+                        interactive=True
+                    )
+                    weight_type = gr.Dropdown(
+                        choices=["Original", "Quantized", "Distilled"],
+                        label="Weights type",
+                        multiselect=False,
+                        value="Original",
+                        interactive=True
+                    )
+                    base_model_name_textbox = gr.Textbox(label="Base model (if applicable)")
+
+            submit_button = gr.Button("Submit for Evaluation")
+            submission_result = gr.Markdown()
+
+            def mock_submission(model_name, base_model, revision, precision, weight_type, model_type):
+                if not model_name:
+                    return "Error: Model name is required."
+                return f"Model '{model_name}' submitted successfully! It will be evaluated soon."
+
+            submit_button.click(
+                mock_submission,
+                [
+                    model_name_textbox,
+                    base_model_name_textbox,
+                    revision_name_textbox,
+                    precision,
+                    weight_type,
+                    model_type,
+                ],
+                submission_result,
+            )
 
 demo.launch(debug=True, share=False)
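
A note on the new load_data_directly fallback: each result file is read as JSON and its config and results objects are merged into a single flat leaderboard row. The snippet below is a minimal, self-contained sketch of that flattening step; the file contents and field names (model_name, average, the per-subject scores) are assumptions inferred from the columns this commit displays, not a documented schema.

import pandas as pd

# Hypothetical result-file contents; the exact keys are an assumption, not a fixed schema.
sample_result = {
    "config": {"model_name": "my-org/arabic-llm", "model_type": "Decoder", "precision": "float16"},
    "results": {"average": 61.3, "abstract_algebra": 40.0, "anatomy": 55.2},
}

# The same flattening load_data_directly() performs: config and results merge into one flat dict.
row = {}
row.update(sample_result.get("config", {}))
row.update(sample_result.get("results", {}))

print(pd.DataFrame([row]).round(2))

Because results is applied last, a key present in both config and results would keep the results value.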
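
The leaderboard tab collects model_type_filter, precision_filter, and a search_input box into filter_inputs, but the callback they drive lives in unchanged lines that this diff does not display. The standalone sketch below shows one common way to wire such controls to a gr.DataFrame in Gradio; the function name filter_table, the sample rows, and the fixed dropdown choices are illustrative assumptions, not the code in this commit.

import gradio as gr
import pandas as pd

# Toy data standing in for display_df.
display_df = pd.DataFrame([
    {"model_name": "model-a", "average": 61.3, "model_type": "Decoder", "precision": "float16"},
    {"model_name": "model-b", "average": 55.0, "model_type": "Encoder", "precision": "float32"},
])

def filter_table(model_type, precision, query):
    # Narrow the table down one filter at a time; "All" means no restriction.
    filtered = display_df
    if model_type != "All":
        filtered = filtered[filtered["model_type"] == model_type]
    if precision != "All":
        filtered = filtered[filtered["precision"] == precision]
    if query:
        filtered = filtered[filtered["model_name"].str.contains(query, case=False, na=False)]
    return filtered

with gr.Blocks() as sketch:
    model_type_filter = gr.Dropdown(["All", "Decoder", "Encoder"], value="All", label="Model type")
    precision_filter = gr.Dropdown(["All", "float16", "float32"], value="All", label="Precision")
    search_input = gr.Textbox(label="Search by model name")
    datatable = gr.DataFrame(value=display_df, interactive=False)

    # Every control re-runs the callback and replaces the table contents.
    filter_inputs = [model_type_filter, precision_filter, search_input]
    for control in filter_inputs:
        control.change(filter_table, inputs=filter_inputs, outputs=datatable)

sketch.launch()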