SondosMB committed (verified)
Commit 6bcbc7b · Parent(s): 9f7748a

Update app.py

Files changed (1)
  1. app.py +200 -65
app.py CHANGED
@@ -176,16 +176,205 @@
     # demo.launch()
 
 
+# import gradio as gr
+# import pandas as pd
+# import os
+# import re
+# from datetime import datetime
+
+# LEADERBOARD_FILE = "leaderboard.csv"  # File to store leaderboard data
+# LAST_UPDATED = datetime.now().strftime("%B %d, %Y")
+
+# def clean_answer(answer):
+#     if pd.isna(answer):
+#         return None
+#     answer = str(answer)
+#     clean = re.sub(r'[^A-Da-d]', '', answer)
+#     if clean:
+#         return clean[0].upper()
+#     return None
+
+
+# def evaluate_predictions(prediction_file):
+#     ground_truth_file = "ground_truth.csv"
+#     if not os.path.exists(ground_truth_file):
+#         return "Ground truth file not found."
+#     if not prediction_file:
+#         return "Prediction file not uploaded."
+
+#     try:
+#         predictions_df = pd.read_csv(prediction_file.name)
+#         ground_truth_df = pd.read_csv(ground_truth_file)
+#         model_name = os.path.basename(prediction_file.name).split('_')[1].split('.')[0]
+
+#         merged_df = pd.merge(predictions_df, ground_truth_df, on='question_id', how='inner')
+#         merged_df['pred_answer'] = merged_df['predicted_answer'].apply(clean_answer)
+
+#         valid_predictions = merged_df.dropna(subset=['pred_answer'])
+#         correct_predictions = (valid_predictions['pred_answer'] == valid_predictions['Answer']).sum()
+#         total_predictions = len(merged_df)
+#         total_valid_predictions = len(valid_predictions)
+
+#         overall_accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
+#         valid_accuracy = correct_predictions / total_valid_predictions if total_valid_predictions > 0 else 0
+
+#         results = {
+#             'model_name': model_name,
+#             'overall_accuracy': overall_accuracy,
+#             'valid_accuracy': valid_accuracy,
+#             'correct_predictions': correct_predictions,
+#             'total_questions': total_predictions,
+#         }
+
+#         update_leaderboard(results)
+#         return "Evaluation completed successfully! Leaderboard updated."
+#     except Exception as e:
+#         return f"Error during evaluation: {str(e)}"
+
+
+# # Build Gradio App
+
+# def update_leaderboard(results):
+#     """
+#     Update the leaderboard file with new results.
+#     """
+#     new_entry = {
+#         "Model Name": results['model_name'],
+#         "Overall Accuracy": round(results['overall_accuracy'] * 100, 2),
+#         "Valid Accuracy": round(results['valid_accuracy'] * 100, 2),
+#         "Correct Predictions": results['correct_predictions'],
+#         "Total Questions": results['total_questions'],
+#         "Timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+#     }
+
+#     # Convert new entry to DataFrame
+#     new_entry_df = pd.DataFrame([new_entry])
+
+#     # Append to leaderboard file
+#     if not os.path.exists(LEADERBOARD_FILE):
+#         # If file does not exist, create it with headers
+#         new_entry_df.to_csv(LEADERBOARD_FILE, index=False)
+#     else:
+#         # Append without headers
+#         new_entry_df.to_csv(LEADERBOARD_FILE, mode='a', index=False, header=False)
+
+
+# def load_leaderboard():
+#     """
+#     Load the leaderboard from the leaderboard file.
+#     """
+#     if not os.path.exists(LEADERBOARD_FILE):
+#         return pd.DataFrame({
+#             "Model Name": [],
+#             "Overall Accuracy": [],
+#             "Valid Accuracy": [],
+#             "Correct Predictions": [],
+#             "Total Questions": [],
+#             "Timestamp": [],
+#         })
+#     return pd.read_csv(LEADERBOARD_FILE)
+
+
+# def evaluate_predictions_and_update_leaderboard(prediction_file):
+#     """
+#     Evaluate predictions and update the leaderboard.
+#     """
+#     ground_truth_file = "ground_truth.csv"
+#     if not os.path.exists(ground_truth_file):
+#         return "Ground truth file not found.", load_leaderboard()
+#     if not prediction_file:
+#         return "Prediction file not uploaded.", load_leaderboard()
+
+#     try:
+#         predictions_df = pd.read_csv(prediction_file.name)
+#         ground_truth_df = pd.read_csv(ground_truth_file)
+#         model_name = os.path.basename(prediction_file.name).split('_')[1].split('.')[0]
+
+#         merged_df = pd.merge(predictions_df, ground_truth_df, on='question_id', how='inner')
+#         merged_df['pred_answer'] = merged_df['predicted_answer'].apply(clean_answer)
+
+#         valid_predictions = merged_df.dropna(subset=['pred_answer'])
+#         correct_predictions = (valid_predictions['pred_answer'] == valid_predictions['Answer']).sum()
+#         total_predictions = len(merged_df)
+#         total_valid_predictions = len(valid_predictions)
+
+#         overall_accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
+#         valid_accuracy = correct_predictions / total_valid_predictions if total_valid_predictions > 0 else 0
+
+#         results = {
+#             'model_name': model_name,
+#             'overall_accuracy': overall_accuracy,
+#             'valid_accuracy': valid_accuracy,
+#             'correct_predictions': correct_predictions,
+#             'total_questions': total_predictions,
+#         }
+
+#         update_leaderboard(results)
+#         return "Evaluation completed successfully! Leaderboard updated.", load_leaderboard()
+#     except Exception as e:
+#         return f"Error during evaluation: {str(e)}", load_leaderboard()
+
+# # Build Gradio App
+# with gr.Blocks() as demo:
+#     gr.Markdown("# Prediction Evaluation Tool with Leaderboard")
+
+#     with gr.Tabs():
+#         # Submission Tab
+#         with gr.TabItem("🏅 Submission"):
+#             file_input = gr.File(label="Upload Prediction CSV")
+#             eval_status = gr.Textbox(label="Evaluation Status", interactive=False)
+#             leaderboard_table_preview = gr.Dataframe(
+#                 value=load_leaderboard(),
+#                 label="Leaderboard (Preview)",
+#                 interactive=False,
+#                 wrap=True,
+#             )
+#             eval_button = gr.Button("Evaluate and Update Leaderboard")
+#             eval_button.click(
+#                 evaluate_predictions_and_update_leaderboard,
+#                 inputs=[file_input],
+#                 outputs=[eval_status, leaderboard_table_preview],
+#             )
+
+#         # Leaderboard Tab
+#         with gr.TabItem("🏅 Leaderboard"):
+#             leaderboard_table = gr.Dataframe(
+#                 value=load_leaderboard(),
+#                 label="Leaderboard",
+#                 interactive=False,
+#                 wrap=True,
+#             )
+#             refresh_button = gr.Button("Refresh Leaderboard")
+#             refresh_button.click(
+#                 lambda: load_leaderboard(),
+#                 inputs=[],
+#                 outputs=[leaderboard_table],
+#             )
+
+#     gr.Markdown(f"Last updated on **{LAST_UPDATED}**")
+
+# demo.launch()
+
 import gradio as gr
 import pandas as pd
 import os
 import re
 from datetime import datetime
 
-LEADERBOARD_FILE = "leaderboard.csv"  # File to store leaderboard data
+LEADERBOARD_FILE = "leaderboard.csv"  # File to store all submissions persistently
 LAST_UPDATED = datetime.now().strftime("%B %d, %Y")
 
+# Initialize the leaderboard file if it doesn't exist
+if not os.path.exists(LEADERBOARD_FILE):
+    pd.DataFrame(columns=[
+        "Model Name", "Overall Accuracy", "Valid Accuracy",
+        "Correct Predictions", "Total Questions", "Timestamp"
+    ]).to_csv(LEADERBOARD_FILE, index=False)
+
 def clean_answer(answer):
+    """
+    Clean and normalize the predicted answers.
+    """
     if pd.isna(answer):
         return None
     answer = str(answer)
@@ -194,49 +383,9 @@ def clean_answer(answer):
         return clean[0].upper()
     return None
 
-
-def evaluate_predictions(prediction_file):
-    ground_truth_file = "ground_truth.csv"
-    if not os.path.exists(ground_truth_file):
-        return "Ground truth file not found."
-    if not prediction_file:
-        return "Prediction file not uploaded."
-
-    try:
-        predictions_df = pd.read_csv(prediction_file.name)
-        ground_truth_df = pd.read_csv(ground_truth_file)
-        model_name = os.path.basename(prediction_file.name).split('_')[1].split('.')[0]
-
-        merged_df = pd.merge(predictions_df, ground_truth_df, on='question_id', how='inner')
-        merged_df['pred_answer'] = merged_df['predicted_answer'].apply(clean_answer)
-
-        valid_predictions = merged_df.dropna(subset=['pred_answer'])
-        correct_predictions = (valid_predictions['pred_answer'] == valid_predictions['Answer']).sum()
-        total_predictions = len(merged_df)
-        total_valid_predictions = len(valid_predictions)
-
-        overall_accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
-        valid_accuracy = correct_predictions / total_valid_predictions if total_valid_predictions > 0 else 0
-
-        results = {
-            'model_name': model_name,
-            'overall_accuracy': overall_accuracy,
-            'valid_accuracy': valid_accuracy,
-            'correct_predictions': correct_predictions,
-            'total_questions': total_predictions,
-        }
-
-        update_leaderboard(results)
-        return "Evaluation completed successfully! Leaderboard updated."
-    except Exception as e:
-        return f"Error during evaluation: {str(e)}"
-
-
-# Build Gradio App
-
 def update_leaderboard(results):
     """
-    Update the leaderboard file with new results.
+    Append new submission results to the leaderboard file.
     """
     new_entry = {
         "Model Name": results['model_name'],
@@ -247,37 +396,18 @@ def update_leaderboard(results):
         "Timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
     }
 
-    # Convert new entry to DataFrame
     new_entry_df = pd.DataFrame([new_entry])
-
-    # Append to leaderboard file
-    if not os.path.exists(LEADERBOARD_FILE):
-        # If file does not exist, create it with headers
-        new_entry_df.to_csv(LEADERBOARD_FILE, index=False)
-    else:
-        # Append without headers
-        new_entry_df.to_csv(LEADERBOARD_FILE, mode='a', index=False, header=False)
-
+    new_entry_df.to_csv(LEADERBOARD_FILE, mode='a', index=False, header=False)
 
 def load_leaderboard():
     """
-    Load the leaderboard from the leaderboard file.
+    Load all submissions from the leaderboard file.
     """
-    if not os.path.exists(LEADERBOARD_FILE):
-        return pd.DataFrame({
-            "Model Name": [],
-            "Overall Accuracy": [],
-            "Valid Accuracy": [],
-            "Correct Predictions": [],
-            "Total Questions": [],
-            "Timestamp": [],
-        })
     return pd.read_csv(LEADERBOARD_FILE)
 
-
 def evaluate_predictions_and_update_leaderboard(prediction_file):
     """
-    Evaluate predictions and update the leaderboard.
+    Evaluate predictions and append results to the leaderboard.
     """
     ground_truth_file = "ground_truth.csv"
     if not os.path.exists(ground_truth_file):
@@ -286,18 +416,22 @@ def evaluate_predictions_and_update_leaderboard(prediction_file):
         return "Prediction file not uploaded.", load_leaderboard()
 
     try:
+        # Load predictions and ground truth
         predictions_df = pd.read_csv(prediction_file.name)
         ground_truth_df = pd.read_csv(ground_truth_file)
         model_name = os.path.basename(prediction_file.name).split('_')[1].split('.')[0]
 
+        # Merge predictions with ground truth
         merged_df = pd.merge(predictions_df, ground_truth_df, on='question_id', how='inner')
        merged_df['pred_answer'] = merged_df['predicted_answer'].apply(clean_answer)
 
+        # Evaluate predictions
         valid_predictions = merged_df.dropna(subset=['pred_answer'])
         correct_predictions = (valid_predictions['pred_answer'] == valid_predictions['Answer']).sum()
         total_predictions = len(merged_df)
         total_valid_predictions = len(valid_predictions)
 
+        # Calculate accuracy
         overall_accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
         valid_accuracy = correct_predictions / total_valid_predictions if total_valid_predictions > 0 else 0
 
@@ -309,12 +443,13 @@ def evaluate_predictions_and_update_leaderboard(prediction_file):
             'total_questions': total_predictions,
         }
 
+        # Update leaderboard
         update_leaderboard(results)
         return "Evaluation completed successfully! Leaderboard updated.", load_leaderboard()
     except Exception as e:
         return f"Error during evaluation: {str(e)}", load_leaderboard()
 
-# Build Gradio App
+# Gradio Interface
 with gr.Blocks() as demo:
     gr.Markdown("# Prediction Evaluation Tool with Leaderboard")
 
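
For reference, a minimal smoke test of the persistent-leaderboard flow introduced in this commit. This is a sketch, not part of the commit: it assumes `ground_truth.csv` and a prediction file named like `predictions_mymodel.csv` (with `question_id` and `predicted_answer` columns, matching the `split('_')[1]` convention above) sit next to `app.py`, and that importing `app` does not unconditionally call `demo.launch()`.

# smoke_test.py -- hypothetical helper, not part of app.py
from types import SimpleNamespace

# Importing app also builds the Gradio Blocks UI defined above.
from app import evaluate_predictions_and_update_leaderboard, load_leaderboard

# The handler reads the upload via its .name attribute, so a plain object
# with a .name stands in for Gradio's file wrapper here.
fake_upload = SimpleNamespace(name="predictions_mymodel.csv")

status, leaderboard = evaluate_predictions_and_update_leaderboard(fake_upload)
print(status)                 # evaluation status message
print(leaderboard.tail(1))    # newest row, also appended to leaderboard.csv

# Because rows are persisted to leaderboard.csv, they survive app restarts.
print(load_leaderboard().shape)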