Update app.py
app.py
CHANGED
@@ -274,7 +274,47 @@ def update_leaderboard(results):
     except Exception as e:
         print(f"Error updating leaderboard file: {e}")
 
+def update_leaderboard_pro(results):
+    """
+    Append new submission results to the leaderboard file and push updates to the Hugging Face repository.
+    """
+    new_entry = {
+        "Model Name": results['model_name'],
+        "Overall Accuracy": round(results['overall_accuracy'] * 100, 2),
+        "Correct Predictions": results['correct_predictions'],
+        "Total Questions": results['total_questions'],
+        "Timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+        "Team Name": results['Team_name']
+    }
 
+    try:
+        # Update the local leaderboard file
+        new_entry_df = pd.DataFrame([new_entry])
+        file_exists = os.path.exists(LEADERBOARD_FILE)
+
+        new_entry_df.to_csv(
+            LEADERBOARD_FILE,
+            mode='a',  # Append mode
+            index=False,
+            header=not file_exists  # Write header only if the file is new
+        )
+        print(f"Leaderboard updated successfully at {LEADERBOARD_FILE}")
+
+        # Push the updated file to the Hugging Face repository using HTTP API
+        api = HfApi()
+        token = HfFolder.get_token()
+
+        api.upload_file(
+            path_or_fileobj=LEADERBOARD_FILE,
+            path_in_repo="leaderboardPro.csv",
+            repo_id="SondosMB/Mobile-MMLU",  # Your Space repository
+            repo_type="space",
+            token=token
+        )
+        print("Leaderboard changes pushed to Hugging Face repository.")
+
+    except Exception as e:
+        print(f"Error updating leaderboard file: {e}")
 
 
 # def load_leaderboard():
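For reference, update_leaderboard_pro only reads the keys that evaluate_predictions_pro (added further down in this diff) builds into its results dict. A minimal illustrative call, with hypothetical values, would look like this; overall_accuracy is passed as a fraction and converted to a percentage before the row is appended and leaderboardPro.csv is pushed to the Space:

sample_results = {
    "model_name": "example-model",      # hypothetical value
    "overall_accuracy": 0.731,          # fraction in [0, 1]
    "correct_predictions": 731,
    "total_questions": 1000,
    "Team_name": "example-team",        # hypothetical value
}
update_leaderboard_pro(sample_results)  # appends to LEADERBOARD_FILE, then pushes via HfApi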
@@ -419,6 +459,65 @@ def evaluate_predictions(prediction_file, model_name,Team_name ,add_to_leaderboard):
 initialize_leaderboard_file()
 
 
+
+def evaluate_predictions_pro(prediction_file, model_name,Team_name ,add_to_leaderboard):
+    try:
+        ground_truth_path = hf_hub_download(
+            repo_id="SondosMB/ground-truth-dataset",
+            filename="ground_truth.csv",
+            repo_type="dataset",
+            use_auth_token=True
+        )
+        ground_truth_df = pd.read_csv(ground_truth_path)
+    except FileNotFoundError:
+        return "Ground truth file not found in the dataset repository.", load_leaderboard_pro()
+    except Exception as e:
+        return f"Error loading ground truth: {e}", load_leaderboard_pro()
+
+    if not prediction_file:
+        return "Prediction file not uploaded.", load_leaderboard_pro()
+
+    try:
+        #load prediction file
+        predictions_df = pd.read_csv(prediction_file.name)
+        # Validate required columns in prediction file
+        required_columns = ['question_id', 'predicted_answer']
+        missing_columns = [col for col in required_columns if col not in predictions_df.columns]
+        if missing_columns:
+            return (f"Error: Missing required columns in prediction file: {', '.join(missing_columns)}.",
+                    load_leaderboard())
+
+        # Validate 'Answer' column in ground truth file
+        if 'Answer' not in ground_truth_df.columns:
+            return "Error: 'Answer' column is missing in the ground truth dataset.", load_leaderboard_pro()
+        merged_df = pd.merge(predictions_df, ground_truth_df, on='question_id', how='inner')
+        merged_df['pred_answer'] = merged_df['predicted_answer'].apply(clean_answer)
+
+        valid_predictions = merged_df.dropna(subset=['pred_answer'])
+        correct_predictions = (valid_predictions['pred_answer'] == valid_predictions['Answer']).sum()
+        total_predictions = len(merged_df)
+
+        overall_accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
+
+        results = {
+            'model_name': model_name if model_name else "Unknown Model",
+            'overall_accuracy': overall_accuracy,
+            'correct_predictions': correct_predictions,
+            'total_questions': total_predictions,
+            'Team_name': Team_name if Team_name else "Unknown Team",
+        }
+
+        if add_to_leaderboard:
+            update_leaderboard_pro(results)
+            return "Evaluation completed and added to leaderboard.", load_leaderboard_pro()
+        else:
+            return "Evaluation completed but not added to leaderboard.", load_leaderboard_pro()
+
+    except Exception as e:
+        return f"Error during evaluation: {str(e)}", load_leaderboard_pro()
+initialize_leaderboard_file()
+
+
 # Function to set default mode
 # Function to set default mode
 import gradio as gr
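The scoring step in evaluate_predictions_pro is an inner join on question_id followed by an element-wise comparison. A self-contained toy version of the same computation, omitting the clean_answer normalization used above:

import pandas as pd

predictions_df = pd.DataFrame({"question_id": [1, 2, 3], "predicted_answer": ["A", "C", "B"]})
ground_truth_df = pd.DataFrame({"question_id": [1, 2, 3], "Answer": ["A", "B", "B"]})

merged_df = pd.merge(predictions_df, ground_truth_df, on="question_id", how="inner")
correct = (merged_df["predicted_answer"] == merged_df["Answer"]).sum()
accuracy = correct / len(merged_df) if len(merged_df) > 0 else 0
print(round(accuracy * 100, 2))  # 66.67 for this toy data

Because the join is an inner join, predictions whose question_id has no match in the ground truth are dropped, so total_predictions in the function above counts only matched rows.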
@@ -803,8 +902,8 @@ with gr.Blocks(css=css_tech_theme) as demo:
             overall_accuracy_display = gr.Number(label="π Overall Accuracy (%)", interactive=False,scale=1,min_width=1200)
 
             with gr.Row(elem_id="submission-buttons"):
-
-
+                eval_button_pro = gr.Button("π Evaluate",scale=1,min_width=1200)
+                submit_button_pro = gr.Button("π€ Prove and Submit to Leaderboard", elem_id="evaluation-status", visible=False,scale=1,min_width=1200)
                 eval_status = gr.Textbox(label="π οΈ Evaluation Status", interactive=False,scale=1,min_width=1200)
 
 
@@ -855,12 +954,64 @@ with gr.Blocks(css=css_tech_theme) as demo:
         except Exception as e:
             return f"Error during evaluation: {str(e)}", 0, gr.update(visible=False)
 
+    def handle_evaluation_pro(file, model_name, Team_name):
+        if not file:
+            return "Error: Please upload a prediction file.", 0, gr.update(visible=False)
+        if not model_name or model_name.strip() == "":
+            return "Error: Please enter a model name.", 0, gr.update(visible=False)
+        if not Team_name or Team_name.strip() == "":
+            return "Error: Please enter a Team name.", 0, gr.update(visible=False)
+
+        try:
+            # Load predictions file
+            predictions_df = pd.read_csv(file.name)
+
+            # Validate required columns
+            required_columns = ['question_id', 'predicted_answer']
+            missing_columns = [col for col in required_columns if col not in predictions_df.columns]
+            if missing_columns:
+                return (f"Error: Missing required columns in prediction file: {', '.join(missing_columns)}.",
+                        0, gr.update(visible=False))
+
+            # Load ground truth
+            try:
+                ground_truth_path = hf_hub_download(
+                    repo_id="SondosMB/ground-truth-dataset",
+                    filename="ground_truth.csv",
+                    repo_type="dataset",
+                    use_auth_token=True
+                )
+                ground_truth_df = pd.read_csv(ground_truth_path)
+            except Exception as e:
+                return f"Error loading ground truth: {e}", 0, gr.update(visible=False)
+
+            # Perform evaluation calculations
+            merged_df = pd.merge(predictions_df, ground_truth_df, on='question_id', how='inner')
+            merged_df['pred_answer'] = merged_df['predicted_answer'].apply(clean_answer)
+
+            valid_predictions = merged_df.dropna(subset=['pred_answer'])
+            correct_predictions = (valid_predictions['pred_answer'] == valid_predictions['Answer']).sum()
+            total_predictions = len(merged_df)
+
+            overall_accuracy = (correct_predictions / total_predictions * 100) if total_predictions > 0 else 0
+
+            return "Evaluation completed successfully.", overall_accuracy, gr.update(visible=True)
+
+        except Exception as e:
+            return f"Error during evaluation: {str(e)}", 0, gr.update(visible=False)
+
 
 
     def handle_submission(file, model_name,Team_name):
         # Handle leaderboard submission
         status, _ = evaluate_predictions(file, model_name,Team_name, add_to_leaderboard=True)
         return f"Submission to leaderboard completed: {status}"
+
+    def handle_submission_pro(file, model_name,Team_name):
+        # Handle leaderboard submission
+        status, _ = evaluate_predictions_pro(file, model_name,Team_name, add_to_leaderboard=True)
+        return f"Submission to leaderboard completed: {status}"
+
 
     # Connect button clicks to the functions
     eval_button.click(
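Note that the two new evaluation paths report accuracy at different scales: evaluate_predictions_pro keeps a fraction and relies on update_leaderboard_pro to convert it for the CSV, while handle_evaluation_pro multiplies by 100 itself because its second return value feeds the gr.Number display directly. With a hypothetical run of 731 correct out of 1000:

fraction = 731 / 1000                 # evaluate_predictions_pro -> results['overall_accuracy']
csv_value = round(fraction * 100, 2)  # 73.1, written by update_leaderboard_pro
display_value = 731 / 1000 * 100      # 73.1, returned by handle_evaluation_pro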
@@ -868,6 +1019,18 @@ with gr.Blocks(css=css_tech_theme) as demo:
         inputs=[file_input, model_name_input,Team_name_input],
         outputs=[eval_status, overall_accuracy_display, submit_button],
     )
+
+    eval_button_pro.click(
+        handle_evaluation_pro,
+        inputs=[file_input, model_name_input,Team_name_input],
+        outputs=[eval_status, overall_accuracy_display, submit_button_pro],
+    )
+
+    submit_button_pro.click(
+        handle_submission_pro,
+        inputs=[file_input, model_name_input,Team_name_input],
+        outputs=[eval_status],
+    )
 
     submit_button.click(
         handle_submission,
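The wiring added here follows the usual Gradio pattern of click(fn, inputs, outputs), with the third return value of handle_evaluation_pro being a gr.update that toggles the visibility of submit_button_pro. A stripped-down sketch of the same pattern, using simplified component names and a stand-in handler rather than anything from app.py:

import gradio as gr

def fake_evaluate(file):
    # Mirrors handle_evaluation_pro's return shape: status text, accuracy number, visibility update.
    if file is None:
        return "Please upload a file.", 0, gr.update(visible=False)
    return "Evaluation completed successfully.", 73.1, gr.update(visible=True)

with gr.Blocks() as sketch:
    file_input = gr.File(label="Predictions CSV")
    eval_button = gr.Button("Evaluate")
    status = gr.Textbox(label="Evaluation Status", interactive=False)
    accuracy = gr.Number(label="Overall Accuracy (%)", interactive=False)
    submit = gr.Button("Submit to Leaderboard", visible=False)

    eval_button.click(fake_evaluate, inputs=[file_input], outputs=[status, accuracy, submit])

# sketch.launch()  # run locally to try the flow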
@@ -890,6 +1053,19 @@ with gr.Blocks(css=css_tech_theme) as demo:
                 inputs=[],
                 outputs=[leaderboard_table],
             )
+        with gr.TabItem("π Leaderboard-pro"):
+            leaderboard_table = gr.Dataframe(
+                value=load_leaderboard_pro(),
+                label="Leaderboard",
+                interactive=False,
+                wrap=True,
+            )
+            refresh_button = gr.Button("Refresh Leaderboard")
+            refresh_button.click(
+                lambda: load_leaderboard_pro(),
+                inputs=[],
+                outputs=[leaderboard_table],
+            )
 
     # Post-Tabs Section
     # gr.Markdown("""
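The new Leaderboard-pro tab fills its table through load_leaderboard_pro, which is not part of this diff. Assuming it mirrors the existing load_leaderboard helper, a hypothetical sketch consistent with the columns written by update_leaderboard_pro could look like this (the file name is a guess):

import os
import pandas as pd

def load_leaderboard_pro():
    # Hypothetical sketch; the real helper is defined elsewhere in app.py.
    columns = ["Model Name", "Overall Accuracy", "Correct Predictions",
               "Total Questions", "Timestamp", "Team Name"]
    if not os.path.exists("leaderboardPro.csv"):
        return pd.DataFrame(columns=columns)  # empty board before the first submission
    return pd.read_csv("leaderboardPro.csv")

Since refresh_button.click re-invokes load_leaderboard_pro, pressing Refresh Leaderboard re-reads the underlying CSV without restarting the Space.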