SondosMB committed on
Commit 5b92bc4 · verified · 1 Parent(s): 68f17e0

Update app.py

Files changed (1)
  1. app.py +120 -293
app.py CHANGED
@@ -1,191 +1,36 @@
-# import gradio as gr
-# import pandas as pd
-# import os
-# import re
-# from datetime import datetime
-
-# LEADERBOARD_FILE = "leaderboard.csv"  # File to store leaderboard data
-
-# def clean_answer(answer):
-#     if pd.isna(answer):
-#         return None
-#     answer = str(answer)
-#     clean = re.sub(r'[^A-Da-d]', '', answer)
-#     if clean:
-#         first_letter = clean[0].upper()
-#         if first_letter in ['A', 'B', 'C', 'D']:
-#             return first_letter
-#     return None
-
-# def write_evaluation_results(results, output_file):
-#     os.makedirs(os.path.dirname(output_file) if os.path.dirname(output_file) else '.', exist_ok=True)
-#     timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-
-#     output_text = [
-#         f"Evaluation Results for Model: {results['model_name']}",
-#         f"Timestamp: {timestamp}",
-#         "-" * 50,
-#         f"Overall Accuracy (including invalid): {results['overall_accuracy']:.2%}",
-#         f"Accuracy (valid predictions only): {results['valid_accuracy']:.2%}",
-#         f"Total Questions: {results['total_questions']}",
-#         f"Valid Predictions: {results['valid_predictions']}",
-#         f"Invalid/Malformed Predictions: {results['invalid_predictions']}",
-#         f"Correct Predictions: {results['correct_predictions']}",
-#         "\nPerformance by Field:",
-#         "-" * 50
-#     ]
-
-#     for field, metrics in results['field_performance'].items():
-#         field_results = [
-#             f"\nField: {field}",
-#             f"Accuracy (including invalid): {metrics['accuracy']:.2%}",
-#             f"Accuracy (valid only): {metrics['valid_accuracy']:.2%}",
-#             f"Correct: {metrics['correct']}/{metrics['total']}",
-#             f"Invalid predictions: {metrics['invalid']}"
-#         ]
-#         output_text.extend(field_results)
-
-#     with open(output_file, 'w') as f:
-#         f.write('\n'.join(output_text))
-#     print('\n'.join(output_text))
-#     print(f"\nResults have been saved to: {output_file}")
-
-# def update_leaderboard(results):
-#     # Add results to the leaderboard file
-#     new_entry = {
-#         "Model Name": results['model_name'],
-#         "Overall Accuracy": f"{results['overall_accuracy']:.2%}",
-#         "Valid Accuracy": f"{results['valid_accuracy']:.2%}",
-#         "Correct Predictions": results['correct_predictions'],
-#         "Total Questions": results['total_questions'],
-#         "Timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-#     }
-#     leaderboard_df = pd.DataFrame([new_entry])
-#     if os.path.exists(LEADERBOARD_FILE):
-#         existing_df = pd.read_csv(LEADERBOARD_FILE)
-#         leaderboard_df = pd.concat([existing_df, leaderboard_df], ignore_index=True)
-#     leaderboard_df.to_csv(LEADERBOARD_FILE, index=False)
-
-# def display_leaderboard():
-#     if not os.path.exists(LEADERBOARD_FILE):
-#         return "Leaderboard is empty."
-#     leaderboard_df = pd.read_csv(LEADERBOARD_FILE)
-#     return leaderboard_df.to_markdown(index=False)
-
-# def evaluate_predictions(prediction_file):
-#     ground_truth_file = "ground_truth.csv"  # Specify the path to the ground truth file
-#     if not prediction_file:
-#         return "Prediction file not uploaded", None
-
-#     if not os.path.exists(ground_truth_file):
-#         return "Ground truth file not found", None
-
-#     try:
-#         predictions_df = pd.read_csv(prediction_file.name)
-#         ground_truth_df = pd.read_csv(ground_truth_file)
-
-#         # Extract model name
-#         try:
-#             filename = os.path.basename(prediction_file.name)
-#             if "_" in filename and "." in filename:
-#                 model_name = filename.split('_')[1].split('.')[0]
-#             else:
-#                 model_name = "unknown_model"
-#         except IndexError:
-#             model_name = "unknown_model"
-
-#         # Merge dataframes
-#         merged_df = pd.merge(
-#             predictions_df,
-#             ground_truth_df,
-#             on='question_id',
-#             how='inner'
-#         )
-#         merged_df['pred_answer'] = merged_df['predicted_answer'].apply(clean_answer)
-#         invalid_predictions = merged_df['pred_answer'].isna().sum()
-#         valid_predictions = merged_df.dropna(subset=['pred_answer'])
-#         correct_predictions = (valid_predictions['pred_answer'] == valid_predictions['Answer']).sum()
-#         total_predictions = len(merged_df)
-#         total_valid_predictions = len(valid_predictions)
-
-#         overall_accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
-#         valid_accuracy = (
-#             correct_predictions / total_valid_predictions
-#             if total_valid_predictions > 0
-#             else 0
-#         )
-
-#         field_metrics = {}
-#         for field in merged_df['Field'].unique():
-#             field_data = merged_df[merged_df['Field'] == field]
-#             field_valid_data = field_data.dropna(subset=['pred_answer'])
-
-#             field_correct = (field_valid_data['pred_answer'] == field_valid_data['Answer']).sum()
-#             field_total = len(field_data)
-#             field_valid_total = len(field_valid_data)
-#             field_invalid = field_total - field_valid_total
-
-#             field_metrics[field] = {
-#                 'accuracy': field_correct / field_total if field_total > 0 else 0,
-#                 'valid_accuracy': field_correct / field_valid_total if field_valid_total > 0 else 0,
-#                 'correct': field_correct,
-#                 'total': field_total,
-#                 'invalid': field_invalid
-#             }
-
-#         results = {
-#             'model_name': model_name,
-#             'overall_accuracy': overall_accuracy,
-#             'valid_accuracy': valid_accuracy,
-#             'total_questions': total_predictions,
-#             'valid_predictions': total_valid_predictions,
-#             'invalid_predictions': invalid_predictions,
-#             'correct_predictions': correct_predictions,
-#             'field_performance': field_metrics
-#         }
-
-#         update_leaderboard(results)
-#         output_file = "evaluation_results.txt"
-#         write_evaluation_results(results, output_file)
-#         return "Evaluation completed successfully! Leaderboard updated.", output_file
-
-#     except Exception as e:
-#         return f"Error during evaluation: {str(e)}", None
-
-# # Gradio Interface
-# description = "Upload a prediction CSV file to evaluate predictions against the ground truth and update the leaderboard."
-
-# demo = gr.Blocks()
-
-# with demo:
-#     gr.Markdown("# Prediction Evaluation Tool with Leaderboard")
-#     with gr.Tab("Evaluate"):
-#         file_input = gr.File(label="Upload Prediction CSV")
-#         eval_status = gr.Textbox(label="Evaluation Status")
-#         eval_results_file = gr.File(label="Download Evaluation Results")
-#         eval_button = gr.Button("Evaluate")
-#         eval_button.click(
-#             evaluate_predictions, inputs=file_input, outputs=[eval_status, eval_results_file]
-#         )
-#     with gr.Tab("Leaderboard"):
-#         leaderboard_text = gr.Textbox(label="Leaderboard", interactive=False)
-#         refresh_button = gr.Button("Refresh Leaderboard")
-#         refresh_button.click(display_leaderboard, outputs=leaderboard_text)
-
-# if __name__ == "__main__":
-#     demo.launch()
-
+
+# # demo.launch()
 # import gradio as gr
 # import pandas as pd
 # import os
 # import re
 # from datetime import datetime
 
-# LEADERBOARD_FILE = "leaderboard.csv"  # File to store leaderboard data
+# LEADERBOARD_FILE = "leaderboard.csv"  # File to store all submissions persistently
 # LAST_UPDATED = datetime.now().strftime("%B %d, %Y")
 
+# def initialize_leaderboard_file():
+#     """
+#     Ensure the leaderboard file exists and has the correct headers.
+#     """
+#     if not os.path.exists(LEADERBOARD_FILE):
+#         # Create the file with headers
+#         pd.DataFrame(columns=[
+#             "Model Name", "Overall Accuracy", "Valid Accuracy",
+#             "Correct Predictions", "Total Questions", "Timestamp"
+#         ]).to_csv(LEADERBOARD_FILE, index=False)
+#     else:
+#         # Check if the file is empty and write headers if needed
+#         if os.stat(LEADERBOARD_FILE).st_size == 0:
+#             pd.DataFrame(columns=[
+#                 "Model Name", "Overall Accuracy", "Valid Accuracy",
+#                 "Correct Predictions", "Total Questions", "Timestamp"
+#             ]).to_csv(LEADERBOARD_FILE, index=False)
+
 # def clean_answer(answer):
+#     """
+#     Clean and normalize the predicted answers.
+#     """
 #     if pd.isna(answer):
 #         return None
 #     answer = str(answer)
@@ -194,49 +39,9 @@
 #         return clean[0].upper()
 #     return None
 
-
-# def evaluate_predictions(prediction_file):
-#     ground_truth_file = "ground_truth.csv"
-#     if not os.path.exists(ground_truth_file):
-#         return "Ground truth file not found."
-#     if not prediction_file:
-#         return "Prediction file not uploaded."
-
-#     try:
-#         predictions_df = pd.read_csv(prediction_file.name)
-#         ground_truth_df = pd.read_csv(ground_truth_file)
-#         model_name = os.path.basename(prediction_file.name).split('_')[1].split('.')[0]
-
-#         merged_df = pd.merge(predictions_df, ground_truth_df, on='question_id', how='inner')
-#         merged_df['pred_answer'] = merged_df['predicted_answer'].apply(clean_answer)
-
-#         valid_predictions = merged_df.dropna(subset=['pred_answer'])
-#         correct_predictions = (valid_predictions['pred_answer'] == valid_predictions['Answer']).sum()
-#         total_predictions = len(merged_df)
-#         total_valid_predictions = len(valid_predictions)
-
-#         overall_accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
-#         valid_accuracy = correct_predictions / total_valid_predictions if total_valid_predictions > 0 else 0
-
-#         results = {
-#             'model_name': model_name,
-#             'overall_accuracy': overall_accuracy,
-#             'valid_accuracy': valid_accuracy,
-#             'correct_predictions': correct_predictions,
-#             'total_questions': total_predictions,
-#         }
-
-#         update_leaderboard(results)
-#         return "Evaluation completed successfully! Leaderboard updated."
-#     except Exception as e:
-#         return f"Error during evaluation: {str(e)}"
-
-
-# # Build Gradio App
-
 # def update_leaderboard(results):
 #     """
-#     Update the leaderboard file with new results.
+#     Append new submission results to the leaderboard file.
 #     """
 #     new_entry = {
 #         "Model Name": results['model_name'],
@@ -247,23 +52,14 @@
 #         "Timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
 #     }
 
-#     # Convert new entry to DataFrame
 #     new_entry_df = pd.DataFrame([new_entry])
-
-#     # Append to leaderboard file
-#     if not os.path.exists(LEADERBOARD_FILE):
-#         # If file does not exist, create it with headers
-#         new_entry_df.to_csv(LEADERBOARD_FILE, index=False)
-#     else:
-#         # Append without headers
-#         new_entry_df.to_csv(LEADERBOARD_FILE, mode='a', index=False, header=False)
-
+#     new_entry_df.to_csv(LEADERBOARD_FILE, mode='a', index=False, header=False)
 
 # def load_leaderboard():
 #     """
-#     Load the leaderboard from the leaderboard file.
+#     Load all submissions from the leaderboard file.
 #     """
-#     if not os.path.exists(LEADERBOARD_FILE):
+#     if not os.path.exists(LEADERBOARD_FILE) or os.stat(LEADERBOARD_FILE).st_size == 0:
 #         return pd.DataFrame({
 #             "Model Name": [],
 #             "Overall Accuracy": [],
@@ -274,10 +70,9 @@
 #         })
 #     return pd.read_csv(LEADERBOARD_FILE)
 
-
-# def evaluate_predictions_and_update_leaderboard(prediction_file):
+# def evaluate_predictions(prediction_file, model_name, add_to_leaderboard):
 #     """
-#     Evaluate predictions and update the leaderboard.
+#     Evaluate predictions and optionally add results to the leaderboard.
 #     """
 #     ground_truth_file = "ground_truth.csv"
 #     if not os.path.exists(ground_truth_file):
@@ -286,35 +81,45 @@
 #         return "Prediction file not uploaded.", load_leaderboard()
 
 #     try:
+#         # Load predictions and ground truth
 #         predictions_df = pd.read_csv(prediction_file.name)
 #         ground_truth_df = pd.read_csv(ground_truth_file)
-#         model_name = os.path.basename(prediction_file.name).split('_')[1].split('.')[0]
 
+#         # Merge predictions with ground truth
 #         merged_df = pd.merge(predictions_df, ground_truth_df, on='question_id', how='inner')
 #         merged_df['pred_answer'] = merged_df['predicted_answer'].apply(clean_answer)
 
+#         # Evaluate predictions
 #         valid_predictions = merged_df.dropna(subset=['pred_answer'])
 #         correct_predictions = (valid_predictions['pred_answer'] == valid_predictions['Answer']).sum()
 #         total_predictions = len(merged_df)
 #         total_valid_predictions = len(valid_predictions)
 
+#         # Calculate accuracy
 #         overall_accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
 #         valid_accuracy = correct_predictions / total_valid_predictions if total_valid_predictions > 0 else 0
 
 #         results = {
-#             'model_name': model_name,
+#             'model_name': model_name if model_name else "Unknown Model",
 #             'overall_accuracy': overall_accuracy,
 #             'valid_accuracy': valid_accuracy,
 #             'correct_predictions': correct_predictions,
 #             'total_questions': total_predictions,
 #         }
 
-#         update_leaderboard(results)
-#         return "Evaluation completed successfully! Leaderboard updated.", load_leaderboard()
+#         # Update leaderboard only if opted in
+#         if add_to_leaderboard:
+#             update_leaderboard(results)
+#             return "Evaluation completed and added to leaderboard.", load_leaderboard()
+#         else:
+#             return "Evaluation completed but not added to leaderboard.", load_leaderboard()
 #     except Exception as e:
 #         return f"Error during evaluation: {str(e)}", load_leaderboard()
 
-# # Build Gradio App
+# # Initialize leaderboard file
+# initialize_leaderboard_file()
+
+# # Gradio Interface
 # with gr.Blocks() as demo:
 #     gr.Markdown("# Prediction Evaluation Tool with Leaderboard")
 
@@ -322,6 +127,8 @@
 #         # Submission Tab
 #         with gr.TabItem("🏅 Submission"):
 #             file_input = gr.File(label="Upload Prediction CSV")
+#             model_name_input = gr.Textbox(label="Model Name", placeholder="Enter your model name")
+#             add_to_leaderboard_checkbox = gr.Checkbox(label="Add to Leaderboard?", value=True)
 #             eval_status = gr.Textbox(label="Evaluation Status", interactive=False)
 #             leaderboard_table_preview = gr.Dataframe(
 #                 value=load_leaderboard(),
@@ -331,8 +138,8 @@
 #             )
 #             eval_button = gr.Button("Evaluate and Update Leaderboard")
 #             eval_button.click(
-#                 evaluate_predictions_and_update_leaderboard,
-#                 inputs=[file_input],
+#                 evaluate_predictions,
+#                 inputs=[file_input, model_name_input, add_to_leaderboard_checkbox],
 #                 outputs=[eval_status, leaderboard_table_preview],
 #             )
 
@@ -354,49 +161,76 @@
 #     gr.Markdown(f"Last updated on **{LAST_UPDATED}**")
 
 # demo.launch()
+
 import gradio as gr
 import pandas as pd
-import os
 import re
 from datetime import datetime
+from huggingface_hub import hf_hub_download
+from datasets import Dataset
+import os
 
-LEADERBOARD_FILE = "leaderboard.csv"  # File to store all submissions persistently
+HF_TOKEN = os.getenv("HF_TOKEN")  # Hugging Face token stored as an environment variable
+LEADERBOARD_REPO = "username/leaderboard-dataset"  # Replace with your leaderboard dataset name
+GROUND_TRUTH_REPO = "username/ground-truth-dataset"  # Replace with your ground truth dataset name
 LAST_UPDATED = datetime.now().strftime("%B %d, %Y")
 
-def initialize_leaderboard_file():
+def load_ground_truth():
     """
-    Ensure the leaderboard file exists and has the correct headers.
+    Load the ground truth file from a private Hugging Face dataset.
     """
-    if not os.path.exists(LEADERBOARD_FILE):
-        # Create the file with headers
-        pd.DataFrame(columns=[
-            "Model Name", "Overall Accuracy", "Valid Accuracy",
-            "Correct Predictions", "Total Questions", "Timestamp"
-        ]).to_csv(LEADERBOARD_FILE, index=False)
-    else:
-        # Check if the file is empty and write headers if needed
-        if os.stat(LEADERBOARD_FILE).st_size == 0:
-            pd.DataFrame(columns=[
-                "Model Name", "Overall Accuracy", "Valid Accuracy",
-                "Correct Predictions", "Total Questions", "Timestamp"
-            ]).to_csv(LEADERBOARD_FILE, index=False)
+    try:
+        ground_truth_path = hf_hub_download(
+            repo_id=GROUND_TRUTH_REPO,
+            filename="ground_truth.csv",
+            use_auth_token=HF_TOKEN
+        )
+        return pd.read_csv(ground_truth_path)
+    except Exception as e:
+        print(f"Error loading ground truth: {e}")
+        return None
 
-def clean_answer(answer):
+def load_leaderboard():
     """
-    Clean and normalize the predicted answers.
+    Load the leaderboard from a private Hugging Face dataset.
     """
-    if pd.isna(answer):
-        return None
-    answer = str(answer)
-    clean = re.sub(r'[^A-Da-d]', '', answer)
-    if clean:
-        return clean[0].upper()
-    return None
+    try:
+        leaderboard_path = hf_hub_download(
+            repo_id=LEADERBOARD_REPO,
+            filename="leaderboard.csv",
+            use_auth_token=HF_TOKEN
+        )
+        return pd.read_csv(leaderboard_path)
+    except Exception as e:
+        print(f"Error loading leaderboard: {e}")
+        return pd.DataFrame({
+            "Model Name": [],
+            "Overall Accuracy": [],
+            "Valid Accuracy": [],
+            "Correct Predictions": [],
+            "Total Questions": [],
+            "Timestamp": [],
+        })
 
 def update_leaderboard(results):
     """
-    Append new submission results to the leaderboard file.
+    Append new submission results to the private leaderboard dataset.
     """
+    try:
+        # Load existing leaderboard or create a new one
+        leaderboard_path = hf_hub_download(
+            repo_id=LEADERBOARD_REPO,
+            filename="leaderboard.csv",
+            use_auth_token=HF_TOKEN
+        )
+        df = pd.read_csv(leaderboard_path)
+    except:
+        df = pd.DataFrame(columns=[
+            "Model Name", "Overall Accuracy", "Valid Accuracy",
+            "Correct Predictions", "Total Questions", "Timestamp"
+        ])
+
+    # Add new entry
     new_entry = {
         "Model Name": results['model_name'],
         "Overall Accuracy": round(results['overall_accuracy'] * 100, 2),
@@ -405,41 +239,38 @@ def update_leaderboard(results):
         "Total Questions": results['total_questions'],
         "Timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
     }
+    df = pd.concat([df, pd.DataFrame([new_entry])], ignore_index=True)
 
-    new_entry_df = pd.DataFrame([new_entry])
-    new_entry_df.to_csv(LEADERBOARD_FILE, mode='a', index=False, header=False)
+    # Save locally and push updated dataset to Hugging Face
+    df.to_csv("leaderboard.csv", index=False)
+    dataset = Dataset.from_pandas(df)
+    dataset.push_to_hub(LEADERBOARD_REPO, split="train", private=True)
 
-def load_leaderboard():
+def clean_answer(answer):
     """
-    Load all submissions from the leaderboard file.
+    Clean and normalize the predicted answers.
     """
-    if not os.path.exists(LEADERBOARD_FILE) or os.stat(LEADERBOARD_FILE).st_size == 0:
-        return pd.DataFrame({
-            "Model Name": [],
-            "Overall Accuracy": [],
-            "Valid Accuracy": [],
-            "Correct Predictions": [],
-            "Total Questions": [],
-        })
-    return pd.read_csv(LEADERBOARD_FILE)
+    if pd.isna(answer):
+        return None
+    answer = str(answer)
+    clean = re.sub(r'[^A-Da-d]', '', answer)
+    if clean:
+        return clean[0].upper()
+    return None
 
 def evaluate_predictions(prediction_file, model_name, add_to_leaderboard):
     """
     Evaluate predictions and optionally add results to the leaderboard.
    """
-    ground_truth_file = "ground_truth.csv"
-    if not os.path.exists(ground_truth_file):
+    ground_truth_df = load_ground_truth()
+    if ground_truth_df is None:
        return "Ground truth file not found.", load_leaderboard()
    if not prediction_file:
        return "Prediction file not uploaded.", load_leaderboard()
 
    try:
-        # Load predictions and ground truth
+        # Load predictions and merge with ground truth
        predictions_df = pd.read_csv(prediction_file.name)
-        ground_truth_df = pd.read_csv(ground_truth_file)
-
-        # Merge predictions with ground truth
        merged_df = pd.merge(predictions_df, ground_truth_df, on='question_id', how='inner')
        merged_df['pred_answer'] = merged_df['predicted_answer'].apply(clean_answer)
 
@@ -470,12 +301,9 @@ def evaluate_predictions(prediction_file, model_name, add_to_leaderboard):
    except Exception as e:
        return f"Error during evaluation: {str(e)}", load_leaderboard()
 
-# Initialize leaderboard file
-initialize_leaderboard_file()
-
 # Gradio Interface
 with gr.Blocks() as demo:
-    gr.Markdown("# Prediction Evaluation Tool with Leaderboard")
+    gr.Markdown("# Secure Prediction Evaluation Tool with Private Leaderboard")
 
    with gr.Tabs():
        # Submission Tab
@@ -516,4 +344,3 @@ with gr.Blocks() as demo:
 
 demo.launch()
 
-
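
Note for reviewers: the sketch below exercises the Hub-backed storage round trip that the updated app.py relies on, outside of Gradio. It is a minimal sketch, not the committed code verbatim; it assumes HF_TOKEN is set in the environment, that the placeholder repo ID "username/leaderboard-dataset" from the diff is replaced with a real private dataset repo, and it passes repo_type="dataset" to hf_hub_download (hf_hub_download defaults to model repos; the committed code omits this argument).

    # Sketch only: download the private leaderboard, append a row, push it back.
    import os
    import pandas as pd
    from datasets import Dataset
    from huggingface_hub import hf_hub_download

    HF_TOKEN = os.environ["HF_TOKEN"]
    LEADERBOARD_REPO = "username/leaderboard-dataset"  # placeholder, as in the diff

    # Pull the current leaderboard CSV from the private dataset repo.
    path = hf_hub_download(
        repo_id=LEADERBOARD_REPO,
        filename="leaderboard.csv",
        repo_type="dataset",  # required for dataset repos; defaults to "model"
        token=HF_TOKEN,
    )
    df = pd.read_csv(path)

    # Append a test row and push the updated table back, keeping the repo private.
    df = pd.concat([df, pd.DataFrame([{"Model Name": "test-model"}])], ignore_index=True)
    Dataset.from_pandas(df).push_to_hub(LEADERBOARD_REPO, split="train", private=True, token=HF_TOKEN)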