Commit df4e9ef · verified · 1 Parent(s): 8355c4d

Update app.py

Files changed (1)
  1. app.py  +120  -222
app.py CHANGED
@@ -1,263 +1,148 @@
  import gradio as gr
  import pandas as pd
- import os
- import json
- from src.populate import get_leaderboard_df
- from src.display.utils import COLUMNS, COLS, BENCHMARK_COLS, EVAL_COLS
- from src.envs import EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH

- # Print paths for debugging
- print(f"EVAL_RESULTS_PATH: {EVAL_RESULTS_PATH}")
- print(f"EVAL_REQUESTS_PATH: {EVAL_REQUESTS_PATH}")

- # Minimal CSS
- minimal_css = """
- .container {
-     max-width: 1200px;
-     margin: 0 auto;
- }
- .header {
-     text-align: center;
-     margin-bottom: 20px;
- }
- """

- # Function to load data directly from JSON files
- def load_data_directly():
-     if not os.path.exists(EVAL_RESULTS_PATH):
-         print(f"Path does not exist: {EVAL_RESULTS_PATH}")
-         return pd.DataFrame()
-
-     result_files = [
-         os.path.join(EVAL_RESULTS_PATH, f)
-         for f in os.listdir(EVAL_RESULTS_PATH)
-         if f.endswith('.json')
-     ]
-
-     print(f"Found {len(result_files)} JSON files")
-
-     data_list = []
-     for file in result_files:
-         try:
-             with open(file, 'r') as f:
-                 data = json.load(f)
-
-             flattened_data = {}
-             # Extract both config and results
-             flattened_data.update(data.get('config', {}))
-             flattened_data.update(data.get('results', {}))
-             data_list.append(flattened_data)
-         except Exception as e:
-             print(f"Error loading file {file}: {e}")
-
-     if not data_list:
-         print("No data loaded from JSON files")
-         return pd.DataFrame()
-
-     df = pd.DataFrame(data_list)
-     print(f"Successfully loaded DataFrame with shape: {df.shape}")
-     return df

- # Try to load data using both methods
  try:
-     print("Attempting to load data using get_leaderboard_df...")
-     LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
-     print(f"get_leaderboard_df result shape: {LEADERBOARD_DF.shape}")
-
-     # If that fails or returns empty, try direct loading
-     if LEADERBOARD_DF.empty:
-         print("get_leaderboard_df returned empty DataFrame, trying direct loading...")
-         LEADERBOARD_DF = load_data_directly()
-
-     # If still empty, create a sample
-     if LEADERBOARD_DF.empty:
-         print("Both methods returned empty DataFrames, creating sample data")
-         LEADERBOARD_DF = pd.DataFrame([{
-             "model_name": "Sample Model",
-             "average": 75.5,
-             "model_type": "Encoder",
-             "precision": "float16"
-         }])
- except Exception as e:
-     print(f"Error in data loading: {e}")
-     # Create a minimal DataFrame
-     LEADERBOARD_DF = pd.DataFrame([{
-         "model_name": "Error Loading Data",
-         "average": 0
-     }])

- # Print final DataFrame info
- print(f"Final DataFrame shape: {LEADERBOARD_DF.shape}")
- print(f"Final DataFrame columns: {LEADERBOARD_DF.columns.tolist()}")

- # Select important columns for display
- display_cols = ["model_name", "average", "model_type", "precision", "weight_type", "license"]
-
- # Add some subject columns
- subject_cols = [
-     "abstract_algebra", "anatomy", "astronomy", "business_ethics",
-     "college_biology", "college_chemistry", "college_computer_science",
-     "high_school_mathematics", "machine_learning"
- ]

- # Add all detected subject columns
- for col in LEADERBOARD_DF.columns:
-     if col not in display_cols and col not in ["submitted_time", "revision", "base_model", "likes", "params"]:
-         subject_cols.append(col)

- # Combine columns, filtering to only those that exist
- all_display_cols = display_cols + subject_cols
- actual_display_cols = [col for col in all_display_cols if col in LEADERBOARD_DF.columns]

- # Ensure we have at least some columns
- if not actual_display_cols and not LEADERBOARD_DF.empty:
-     actual_display_cols = LEADERBOARD_DF.columns.tolist()

- # Filter the DataFrame
- if not LEADERBOARD_DF.empty:
-     display_df = LEADERBOARD_DF[actual_display_cols].copy()
-
-     # Round numeric columns for display
-     for col in display_df.columns:
-         if pd.api.types.is_numeric_dtype(display_df[col]):
-             display_df[col] = display_df[col].round(2)
-
-     # Sort by average if it exists
-     if "average" in display_df.columns:
-         display_df = display_df.sort_values(by="average", ascending=False)
- else:
-     display_df = LEADERBOARD_DF

- # Create the app
- with gr.Blocks(css=minimal_css) as demo:
-     gr.HTML("<div class='header'><h1>ILMAAM: Index for Language Models for Arabic Assessment on Multitasks</h1></div>")
-
-     with gr.Tabs() as tabs:
-         with gr.TabItem("LLM Benchmark"):
-             # Add debug output
-             with gr.Accordion("Debug Info", open=True):
-                 gr.Markdown(f"DataFrame Shape: {display_df.shape}")
-                 gr.Markdown(f"Column Names: {', '.join(display_df.columns[:10])}" + ("..." if len(display_df.columns) > 10 else ""))
-
-             # Use standard DataTable
-             datatable = gr.DataFrame(
-                 value=display_df,
-                 interactive=False,
-                 wrap=True
-             )
-
-             # Add filter functionality using dropdowns
              with gr.Row():
-                 if "model_type" in display_df.columns and not display_df.empty:
-                     model_types = ["All"] + sorted(display_df["model_type"].dropna().unique().tolist())
-                     model_type_filter = gr.Dropdown(
-                         choices=model_types,
-                         value="All",
-                         label="Filter by Model Type",
-                         interactive=True
-                     )

-                 if "precision" in display_df.columns and not display_df.empty:
-                     precisions = ["All"] + sorted(display_df["precision"].dropna().unique().tolist())
-                     precision_filter = gr.Dropdown(
-                         choices=precisions,
-                         value="All",
-                         label="Filter by Precision",
-                         interactive=True
-                     )
-
-                 search_input = gr.Textbox(
-                     label="Search by Model Name",
-                     placeholder="Enter model name...",
-                     interactive=True
-                 )
-
-             # Filter function
-             def filter_data(model_type, precision, search):
-                 filtered_df = display_df.copy()
-
-                 if model_type != "All" and "model_type" in filtered_df.columns:
-                     filtered_df = filtered_df[filtered_df["model_type"] == model_type]
-
-                 if precision != "All" and "precision" in filtered_df.columns:
-                     filtered_df = filtered_df[filtered_df["precision"] == precision]
-
-                 if search and "model_name" in filtered_df.columns:
-                     filtered_df = filtered_df[filtered_df["model_name"].str.contains(search, case=False)]
-
-                 return filtered_df
-
-             # Connect filters
-             filter_inputs = []
-             if "model_type" in display_df.columns and not display_df.empty:
-                 filter_inputs.append(model_type_filter)
-             if "precision" in display_df.columns and not display_df.empty:
-                 filter_inputs.append(precision_filter)
-             filter_inputs.append(search_input)
-
-             # If we have filter inputs, connect them
-             if filter_inputs:
-                 for input_component in filter_inputs:
-                     input_component.change(
-                         filter_data,
-                         inputs=filter_inputs,
-                         outputs=datatable
-                     )
-
-         with gr.TabItem("About"):
-             gr.Markdown("""
-             # About ILMAAM
-
-             The **Index for Language Models for Arabic Assessment on Multitasks (ILMAAM)** showcases the performance of various Arabic LLMs on the newly released MMMLU OpenAI Benchmark across different subjects.
-
-             This benchmark evaluates language models specifically for Arabic language capabilities.
-             """)
-
-         with gr.TabItem("Submit"):
-             gr.Markdown("""
-             # Submit Your Model
-
-             You can submit your Arabic language model for benchmark evaluation. Fill out the form below:
-             """)
-
              with gr.Row():
                  with gr.Column():
                      model_name_textbox = gr.Textbox(label="Model name")
                      revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
                      model_type = gr.Dropdown(
-                         choices=["Encoder", "Decoder"],
                          label="Model type",
                          multiselect=False,
-                         interactive=True
                      )

                  with gr.Column():
                      precision = gr.Dropdown(
-                         choices=["float16", "float32", "int8", "int4"],
                          label="Precision",
                          multiselect=False,
                          value="float16",
-                         interactive=True
                      )
                      weight_type = gr.Dropdown(
-                         choices=["Original", "Quantized", "Distilled"],
                          label="Weights type",
                          multiselect=False,
                          value="Original",
-                         interactive=True
                      )
-                     base_model_name_textbox = gr.Textbox(label="Base model (if applicable)")

-             submit_button = gr.Button("Submit for Evaluation")
              submission_result = gr.Markdown()
-
-             def mock_submission(model_name, base_model, revision, precision, weight_type, model_type):
-                 if not model_name:
-                     return "Error: Model name is required."
-                 return f"Model '{model_name}' submitted successfully! It will be evaluated soon."
-
              submit_button.click(
-                 mock_submission,
                  [
                      model_name_textbox,
                      base_model_name_textbox,
@@ -269,4 +154,17 @@ with gr.Blocks(css=minimal_css) as demo:
              submission_result,
          )

- demo.launch(debug=True, share=False)
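
Both the removed code above and the new version below ultimately rely on get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS) from src.populate; the deleted fallback loader shows the shape it expects from each result file (a top-level "config" block plus a "results" block of per-subject scores). The following is only a rough sketch of such a loader under that same assumption. The helper name load_results_as_df and its parameters are invented for illustration; the real src/populate.py implementation may differ.

import json
import os

import pandas as pd


# Hypothetical sketch modeled on the removed load_data_directly(); not the
# actual src/populate.py code.
def load_results_as_df(results_path: str, wanted_cols: list[str]) -> pd.DataFrame:
    if not os.path.isdir(results_path):
        return pd.DataFrame()
    rows = []
    for name in sorted(os.listdir(results_path)):
        if not name.endswith(".json"):
            continue
        with open(os.path.join(results_path, name)) as fh:
            payload = json.load(fh)
        # Flatten per-model metadata ("config") and per-subject scores ("results")
        # into a single leaderboard row.
        rows.append({**payload.get("config", {}), **payload.get("results", {})})
    df = pd.DataFrame(rows)
    # Keep only the columns the UI knows about, in a stable order.
    keep = [c for c in wanted_cols if c in df.columns]
    return df[keep] if keep else df

The new app.py leans on the src.populate helper exclusively and drops the inline fallback, sample-data, and error-placeholder paths.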
  import gradio as gr
+ from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
  import pandas as pd
+ from apscheduler.schedulers.background import BackgroundScheduler
+ from huggingface_hub import snapshot_download

+ from src.about import (
+     CITATION_BUTTON_LABEL,
+     CITATION_BUTTON_TEXT,
+     EVALUATION_QUEUE_TEXT,
+     INTRODUCTION_TEXT,
+     LLM_BENCHMARKS_TEXT,
+     TITLE,
+ )
+ from src.display.css_html_js import custom_css
+ from src.display.utils import (
+     COLUMNS,
+     COLS,
+     BENCHMARK_COLS,
+     EVAL_COLS,
+     EVAL_TYPES,
+     ModelType,
+     WeightType,
+     Precision
+ )

+ from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
+ from src.populate import get_evaluation_queue_df, get_leaderboard_df
+ from src.submission.submit import add_new_eval

+ def restart_space():
+     API.restart_space(repo_id=REPO_ID)

+ ### Space initialization
  try:
+     print(EVAL_REQUESTS_PATH)
+     snapshot_download(
+         repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+     )
+ except Exception:
+     restart_space()
+ try:
+     print(EVAL_RESULTS_PATH)
+     snapshot_download(
+         repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+     )
+ except Exception:
+     restart_space()
+
+ # Load the leaderboard DataFrame
+ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
+ print("LEADERBOARD_DF Shape:", LEADERBOARD_DF.shape)  # Debug
+ print("LEADERBOARD_DF Columns:", LEADERBOARD_DF.columns.tolist())  # Debug

+ # Load the evaluation queue DataFrames
+ finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)

+ demo = gr.Blocks(css=custom_css)
+ with demo:
+     gr.HTML(TITLE)
+     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

+     with gr.Tabs(elem_classes="tab-buttons") as tabs:
+         with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
+             if LEADERBOARD_DF.empty:
+                 gr.Markdown("No evaluations have been performed yet. The leaderboard is currently empty.")
+             else:
+                 default_selection = [col.name for col in COLUMNS if col.displayed_by_default]
+                 print("Default Selection before ensuring 'model_name':", default_selection)  # Debug

+                 # Ensure "model_name" is included
+                 if "model_name" not in default_selection:
+                     default_selection.insert(0, "model_name")
+                 print("Default Selection after ensuring 'model_name':", default_selection)  # Debug

+                 leaderboard = Leaderboard(
+                     value=LEADERBOARD_DF,
+                     datatype=[col.type for col in COLUMNS],
+                     select_columns=SelectColumns(
+                         default_selection=default_selection,
+                         cant_deselect=[col.name for col in COLUMNS if col.never_hidden],
+                         label="Select Columns to Display:",
+                     ),
+                     search_columns=[col.name for col in COLUMNS if col.name in ["model_name", "license"]],  # Updated to 'model_name'
+                     hide_columns=[col.name for col in COLUMNS if col.hidden],
+                     filter_columns=[
+                         ColumnFilter("model_type", type="checkboxgroup", label="Model types"),
+                         ColumnFilter("precision", type="checkboxgroup", label="Precision"),
+                         ColumnFilter(
+                             "still_on_hub", type="boolean", label="Deleted/incomplete", default=True
+                         ),
+                     ],
+                     bool_checkboxgroup_label="Hide models",
+                     interactive=False,
+                 )
+                 # No need to call leaderboard.render() since it's created within the Gradio context

+         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
+             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+
+         with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
+             with gr.Column():
+                 with gr.Row():
+                     gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+
+                 # Since the evaluation queues are empty, display a message
+                 with gr.Column():
+                     gr.Markdown("Evaluations are performed immediately upon submission. There are no pending or running evaluations.")

              with gr.Row():
+                 gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")

              with gr.Row():
                  with gr.Column():
                      model_name_textbox = gr.Textbox(label="Model name")
                      revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
                      model_type = gr.Dropdown(
+                         choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
                          label="Model type",
                          multiselect=False,
+                         value=None,
+                         interactive=True,
                      )

                  with gr.Column():
                      precision = gr.Dropdown(
+                         choices=[i.value for i in Precision if i != Precision.Unknown],
                          label="Precision",
                          multiselect=False,
                          value="float16",
+                         interactive=True,
                      )
                      weight_type = gr.Dropdown(
+                         choices=[i.value for i in WeightType],
                          label="Weights type",
                          multiselect=False,
                          value="Original",
+                         interactive=True,
                      )
+                     base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")

+             submit_button = gr.Button("Submit Eval")
              submission_result = gr.Markdown()
              submit_button.click(
+                 add_new_eval,
                  [
                      model_name_textbox,
                      base_model_name_textbox,

              submission_result,
          )
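
The form is now wired to add_new_eval from src.submission.submit, keeping the argument order the removed mock_submission helper used (model name, base model, revision, precision, weight type, model type) and a single Markdown output. A signature compatible with that wiring might look like the sketch below; the parameter names and the body are illustrative assumptions, not the actual src/submission/submit.py code.

# Hypothetical sketch of add_new_eval; only the argument order and the
# Markdown return value are implied by the click() wiring above.
def add_new_eval(
    model_name: str,
    base_model: str,
    revision: str,
    precision: str,
    weight_type: str,
    model_type: str,
) -> str:
    if not model_name:
        return "Error: a model name is required."
    # A real implementation might validate the model on the Hub and record a
    # request file under EVAL_REQUESTS_PATH before returning.
    return f"Request for '{model_name}' (revision '{revision or 'main'}') has been submitted."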

+     with gr.Row():
+         with gr.Accordion("📙 Citation", open=False):
+             citation_button = gr.Textbox(
+                 value=CITATION_BUTTON_TEXT,
+                 label=CITATION_BUTTON_LABEL,
+                 lines=20,
+                 elem_id="citation-button",
+                 show_copy_button=True,
+             )
+
+ scheduler = BackgroundScheduler()
+ scheduler.add_job(restart_space, "interval", seconds=1800)
+ scheduler.start()
+ demo.queue(default_concurrency_limit=40).launch()
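
The new leaderboard tab only touches the imported COLUMNS metadata through the attributes name, type, displayed_by_default, never_hidden, and hidden. A plausible shape for those entries in src/display/utils.py is sketched below; the dataclass name, defaults, and sample entries are assumptions, and only the attribute names come from the code above.

from dataclasses import dataclass


# Hypothetical column descriptor; only the attribute names are taken from app.py.
@dataclass(frozen=True)
class ColumnInfo:
    name: str
    type: str = "str"                  # fed to Leaderboard(datatype=...)
    displayed_by_default: bool = True  # drives the default column selection
    never_hidden: bool = False         # fed to SelectColumns(cant_deselect=...)
    hidden: bool = False               # fed to hide_columns=...


# Illustrative entries only, not the real column list.
COLUMNS = [
    ColumnInfo("model_name", type="markdown", never_hidden=True),
    ColumnInfo("average", type="number"),
    ColumnInfo("abstract_algebra", type="number", displayed_by_default=False),
    ColumnInfo("still_on_hub", type="bool", hidden=True),
]

COLS and BENCHMARK_COLS, also imported above, are presumably plain name lists derived from the same metadata.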