Spaces:

MBZUAI-LLM
/

Mobile-MMLU-Challenge

Running

App Files Community

SondosMB committed on Dec 20, 2024

Commit

5b92bc4

verified ·

1 Parent(s): 68f17e0

Update app.py

Browse files

Files changed (1) hide show

app.py +120 -293

app.py CHANGED Viewed

@@ -1,191 +1,36 @@
-# import gradio as gr
-# import pandas as pd
-# import os
-# import re
-# from datetime import datetime
-# LEADERBOARD_FILE = "leaderboard.csv"  # File to store leaderboard data
-# def clean_answer(answer):
-#     if pd.isna(answer):
-#         return None
-#     answer = str(answer)
-#     clean = re.sub(r'[^A-Da-d]', '', answer)
-#     if clean:
-#         first_letter = clean[0].upper()
-#         if first_letter in ['A', 'B', 'C', 'D']:
-#             return first_letter
-#     return None
-# def write_evaluation_results(results, output_file):
-#     os.makedirs(os.path.dirname(output_file) if os.path.dirname(output_file) else '.', exist_ok=True)
-#     timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-#     output_text = [
-#         f"Evaluation Results for Model: {results['model_name']}",
-#         f"Timestamp: {timestamp}",
-#         "-" * 50,
-#         f"Overall Accuracy (including invalid): {results['overall_accuracy']:.2%}",
-#         f"Accuracy (valid predictions only): {results['valid_accuracy']:.2%}",
-#         f"Total Questions: {results['total_questions']}",
-#         f"Valid Predictions: {results['valid_predictions']}",
-#         f"Invalid/Malformed Predictions: {results['invalid_predictions']}",
-#         f"Correct Predictions: {results['correct_predictions']}",
-#         "\nPerformance by Field:",
-#         "-" * 50
-#     ]
-#     for field, metrics in results['field_performance'].items():
-#         field_results = [
-#             f"\nField: {field}",
-#             f"Accuracy (including invalid): {metrics['accuracy']:.2%}",
-#             f"Accuracy (valid only): {metrics['valid_accuracy']:.2%}",
-#             f"Correct: {metrics['correct']}/{metrics['total']}",
-#             f"Invalid predictions: {metrics['invalid']}"
-#         ]
-#         output_text.extend(field_results)
-#     with open(output_file, 'w') as f:
-#         f.write('\n'.join(output_text))
-#     print('\n'.join(output_text))
-#     print(f"\nResults have been saved to: {output_file}")
-# def update_leaderboard(results):
-#     # Add results to the leaderboard file
-#     new_entry = {
-#         "Model Name": results['model_name'],
-#         "Overall Accuracy": f"{results['overall_accuracy']:.2%}",
-#         "Valid Accuracy": f"{results['valid_accuracy']:.2%}",
-#         "Correct Predictions": results['correct_predictions'],
-#         "Total Questions": results['total_questions'],
-#         "Timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-#     }
-#     leaderboard_df = pd.DataFrame([new_entry])
-#     if os.path.exists(LEADERBOARD_FILE):
-#         existing_df = pd.read_csv(LEADERBOARD_FILE)
-#         leaderboard_df = pd.concat([existing_df, leaderboard_df], ignore_index=True)
-#     leaderboard_df.to_csv(LEADERBOARD_FILE, index=False)
-# def display_leaderboard():
-#     if not os.path.exists(LEADERBOARD_FILE):
-#         return "Leaderboard is empty."
-#     leaderboard_df = pd.read_csv(LEADERBOARD_FILE)
-#     return leaderboard_df.to_markdown(index=False)
-# def evaluate_predictions(prediction_file):
-#     ground_truth_file = "ground_truth.csv"  # Specify the path to the ground truth file
-#     if not prediction_file:
-#         return "Prediction file not uploaded", None
-#     if not os.path.exists(ground_truth_file):
-#         return "Ground truth file not found", None
-#     try:
-#         predictions_df = pd.read_csv(prediction_file.name)
-#         ground_truth_df = pd.read_csv(ground_truth_file)
-#         # Extract model name
-#         try:
-#             filename = os.path.basename(prediction_file.name)
-#             if "_" in filename and "." in filename:
-#                 model_name = filename.split('_')[1].split('.')[0]
-#             else:
-#                 model_name = "unknown_model"
-#         except IndexError:
-#             model_name = "unknown_model"
-#         # Merge dataframes
-#         merged_df = pd.merge(
-#             predictions_df,
-#             ground_truth_df,
-#             on='question_id',
-#             how='inner'
-#         )
-#         merged_df['pred_answer'] = merged_df['predicted_answer'].apply(clean_answer)
-#         invalid_predictions = merged_df['pred_answer'].isna().sum()
-#         valid_predictions = merged_df.dropna(subset=['pred_answer'])
-#         correct_predictions = (valid_predictions['pred_answer'] == valid_predictions['Answer']).sum()
-#         total_predictions = len(merged_df)
-#         total_valid_predictions = len(valid_predictions)
-#         overall_accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
-#         valid_accuracy = (
-#             correct_predictions / total_valid_predictions
-#             if total_valid_predictions > 0
-#             else 0
-#         )
-#         field_metrics = {}
-#         for field in merged_df['Field'].unique():
-#             field_data = merged_df[merged_df['Field'] == field]
-#             field_valid_data = field_data.dropna(subset=['pred_answer'])
-#             field_correct = (field_valid_data['pred_answer'] == field_valid_data['Answer']).sum()
-#             field_total = len(field_data)
-#             field_valid_total = len(field_valid_data)
-#             field_invalid = field_total - field_valid_total
-#             field_metrics[field] = {
-#                 'accuracy': field_correct / field_total if field_total > 0 else 0,
-#                 'valid_accuracy': field_correct / field_valid_total if field_valid_total > 0 else 0,
-#                 'correct': field_correct,
-#                 'total': field_total,
-#                 'invalid': field_invalid
-#             }
-#         results = {
-#             'model_name': model_name,
-#             'overall_accuracy': overall_accuracy,
-#             'valid_accuracy': valid_accuracy,
-#             'total_questions': total_predictions,
-#             'valid_predictions': total_valid_predictions,
-#             'invalid_predictions': invalid_predictions,
-#             'correct_predictions': correct_predictions,
-#             'field_performance': field_metrics
-#         }
-#         update_leaderboard(results)
-#         output_file = "evaluation_results.txt"
-#         write_evaluation_results(results, output_file)
-#         return "Evaluation completed successfully! Leaderboard updated.", output_file
-#     except Exception as e:
-#         return f"Error during evaluation: {str(e)}", None
-# # Gradio Interface
-# description = "Upload a prediction CSV file to evaluate predictions against the ground truth and update the leaderboard."
-# demo = gr.Blocks()
-# with demo:
-#     gr.Markdown("# Prediction Evaluation Tool with Leaderboard")
-#     with gr.Tab("Evaluate"):
-#         file_input = gr.File(label="Upload Prediction CSV")
-#         eval_status = gr.Textbox(label="Evaluation Status")
-#         eval_results_file = gr.File(label="Download Evaluation Results")
-#         eval_button = gr.Button("Evaluate")
-#         eval_button.click(
-#             evaluate_predictions, inputs=file_input, outputs=[eval_status, eval_results_file]
-#         )
-#     with gr.Tab("Leaderboard"):
-#         leaderboard_text = gr.Textbox(label="Leaderboard", interactive=False)
-#         refresh_button = gr.Button("Refresh Leaderboard")
-#         refresh_button.click(display_leaderboard, outputs=leaderboard_text)
-# if __name__ == "__main__":
-#     demo.launch()
 # import gradio as gr
 # import pandas as pd
 # import os
 # import re
 # from datetime import datetime
-# LEADERBOARD_FILE = "leaderboard.csv"  # File to store leaderboard data
 # LAST_UPDATED = datetime.now().strftime("%B %d, %Y")
 # def clean_answer(answer):
 #     if pd.isna(answer):
 #         return None
 #     answer = str(answer)
@@ -194,49 +39,9 @@
 #         return clean[0].upper()
 #     return None
-# def evaluate_predictions(prediction_file):
-#     ground_truth_file = "ground_truth.csv"
-#     if not os.path.exists(ground_truth_file):
-#         return "Ground truth file not found."
-#     if not prediction_file:
-#         return "Prediction file not uploaded."
-#     try:
-#         predictions_df = pd.read_csv(prediction_file.name)
-#         ground_truth_df = pd.read_csv(ground_truth_file)
-#         model_name = os.path.basename(prediction_file.name).split('_')[1].split('.')[0]
-#         merged_df = pd.merge(predictions_df, ground_truth_df, on='question_id', how='inner')
-#         merged_df['pred_answer'] = merged_df['predicted_answer'].apply(clean_answer)
-#         valid_predictions = merged_df.dropna(subset=['pred_answer'])
-#         correct_predictions = (valid_predictions['pred_answer'] == valid_predictions['Answer']).sum()
-#         total_predictions = len(merged_df)
-#         total_valid_predictions = len(valid_predictions)
-#         overall_accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
-#         valid_accuracy = correct_predictions / total_valid_predictions if total_valid_predictions > 0 else 0
-#         results = {
-#             'model_name': model_name,
-#             'overall_accuracy': overall_accuracy,
-#             'valid_accuracy': valid_accuracy,
-#             'correct_predictions': correct_predictions,
-#             'total_questions': total_predictions,
-#         }
-#         update_leaderboard(results)
-#         return "Evaluation completed successfully! Leaderboard updated."
-#     except Exception as e:
-#         return f"Error during evaluation: {str(e)}"
-# # Build Gradio App
 # def update_leaderboard(results):
 #     """
-#     Update the leaderboard file with new results.
 #     """
 #     new_entry = {
 #         "Model Name": results['model_name'],
@@ -247,23 +52,14 @@
 #         "Timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
 #     }
-#     # Convert new entry to DataFrame
 #     new_entry_df = pd.DataFrame([new_entry])
-#     # Append to leaderboard file
-#     if not os.path.exists(LEADERBOARD_FILE):
-#         # If file does not exist, create it with headers
-#         new_entry_df.to_csv(LEADERBOARD_FILE, index=False)
-#     else:
-#         # Append without headers
-#         new_entry_df.to_csv(LEADERBOARD_FILE, mode='a', index=False, header=False)
 # def load_leaderboard():
 #     """
-#     Load the leaderboard from the leaderboard file.
 #     """
-#     if not os.path.exists(LEADERBOARD_FILE):
 #         return pd.DataFrame({
 #             "Model Name": [],
 #             "Overall Accuracy": [],
@@ -274,10 +70,9 @@
 #         })
 #     return pd.read_csv(LEADERBOARD_FILE)
-# def evaluate_predictions_and_update_leaderboard(prediction_file):
 #     """
-#     Evaluate predictions and update the leaderboard.
 #     """
 #     ground_truth_file = "ground_truth.csv"
 #     if not os.path.exists(ground_truth_file):
@@ -286,35 +81,45 @@
 #         return "Prediction file not uploaded.", load_leaderboard()
 #     try:
 #         predictions_df = pd.read_csv(prediction_file.name)
 #         ground_truth_df = pd.read_csv(ground_truth_file)
-#         model_name = os.path.basename(prediction_file.name).split('_')[1].split('.')[0]
 #         merged_df = pd.merge(predictions_df, ground_truth_df, on='question_id', how='inner')
 #         merged_df['pred_answer'] = merged_df['predicted_answer'].apply(clean_answer)
 #         valid_predictions = merged_df.dropna(subset=['pred_answer'])
 #         correct_predictions = (valid_predictions['pred_answer'] == valid_predictions['Answer']).sum()
 #         total_predictions = len(merged_df)
 #         total_valid_predictions = len(valid_predictions)
 #         overall_accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
 #         valid_accuracy = correct_predictions / total_valid_predictions if total_valid_predictions > 0 else 0
 #         results = {
-#             'model_name': model_name,
 #             'overall_accuracy': overall_accuracy,
 #             'valid_accuracy': valid_accuracy,
 #             'correct_predictions': correct_predictions,
 #             'total_questions': total_predictions,
 #         }
-#         update_leaderboard(results)
-#         return "Evaluation completed successfully! Leaderboard updated.", load_leaderboard()
 #     except Exception as e:
 #         return f"Error during evaluation: {str(e)}", load_leaderboard()
-# # Build Gradio App
 # with gr.Blocks() as demo:
 #     gr.Markdown("# Prediction Evaluation Tool with Leaderboard")
@@ -322,6 +127,8 @@
 #         # Submission Tab
 #         with gr.TabItem("🏅 Submission"):
 #             file_input = gr.File(label="Upload Prediction CSV")
 #             eval_status = gr.Textbox(label="Evaluation Status", interactive=False)
 #             leaderboard_table_preview = gr.Dataframe(
 #                 value=load_leaderboard(),
@@ -331,8 +138,8 @@
 #             )
 #             eval_button = gr.Button("Evaluate and Update Leaderboard")
 #             eval_button.click(
-#                 evaluate_predictions_and_update_leaderboard,
-#                 inputs=[file_input],
 #                 outputs=[eval_status, leaderboard_table_preview],
 #             )
@@ -354,49 +161,76 @@
 #     gr.Markdown(f"Last updated on **{LAST_UPDATED}**")
 # demo.launch()
 import gradio as gr
 import pandas as pd
-import os
 import re
 from datetime import datetime
-LEADERBOARD_FILE = "leaderboard.csv"  # File to store all submissions persistently
 LAST_UPDATED = datetime.now().strftime("%B %d, %Y")
-def initialize_leaderboard_file():
     """
-    Ensure the leaderboard file exists and has the correct headers.
     """
-    if not os.path.exists(LEADERBOARD_FILE):
-        # Create the file with headers
-        pd.DataFrame(columns=[
-            "Model Name", "Overall Accuracy", "Valid Accuracy",
-            "Correct Predictions", "Total Questions", "Timestamp"
-        ]).to_csv(LEADERBOARD_FILE, index=False)
-    else:
-        # Check if the file is empty and write headers if needed
-        if os.stat(LEADERBOARD_FILE).st_size == 0:
-            pd.DataFrame(columns=[
-                "Model Name", "Overall Accuracy", "Valid Accuracy",
-                "Correct Predictions", "Total Questions", "Timestamp"
-            ]).to_csv(LEADERBOARD_FILE, index=False)
-def clean_answer(answer):
     """
-    Clean and normalize the predicted answers.
     """
-    if pd.isna(answer):
-        return None
-    answer = str(answer)
-    clean = re.sub(r'[^A-Da-d]', '', answer)
-    if clean:
-        return clean[0].upper()
-    return None
 def update_leaderboard(results):
     """
-    Append new submission results to the leaderboard file.
     """
     new_entry = {
         "Model Name": results['model_name'],
         "Overall Accuracy": round(results['overall_accuracy'] * 100, 2),
@@ -405,41 +239,38 @@ def update_leaderboard(results):
         "Total Questions": results['total_questions'],
         "Timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
     }
-    new_entry_df = pd.DataFrame([new_entry])
-    new_entry_df.to_csv(LEADERBOARD_FILE, mode='a', index=False, header=False)
-def load_leaderboard():
     """
-    Load all submissions from the leaderboard file.
     """
-    if not os.path.exists(LEADERBOARD_FILE) or os.stat(LEADERBOARD_FILE).st_size == 0:
-        return pd.DataFrame({
-            "Model Name": [],
-            "Overall Accuracy": [],
-            "Valid Accuracy": [],
-            "Correct Predictions": [],
-            "Total Questions": [],
-            "Timestamp": [],
-        })
-    return pd.read_csv(LEADERBOARD_FILE)
 def evaluate_predictions(prediction_file, model_name, add_to_leaderboard):
     """
     Evaluate predictions and optionally add results to the leaderboard.
     """
-    ground_truth_file = "ground_truth.csv"
-    if not os.path.exists(ground_truth_file):
         return "Ground truth file not found.", load_leaderboard()
     if not prediction_file:
         return "Prediction file not uploaded.", load_leaderboard()
     try:
-        # Load predictions and ground truth
         predictions_df = pd.read_csv(prediction_file.name)
-        ground_truth_df = pd.read_csv(ground_truth_file)
-        # Merge predictions with ground truth
         merged_df = pd.merge(predictions_df, ground_truth_df, on='question_id', how='inner')
         merged_df['pred_answer'] = merged_df['predicted_answer'].apply(clean_answer)
@@ -470,12 +301,9 @@ def evaluate_predictions(prediction_file, model_name, add_to_leaderboard):
     except Exception as e:
         return f"Error during evaluation: {str(e)}", load_leaderboard()
-# Initialize leaderboard file
-initialize_leaderboard_file()
 # Gradio Interface
 with gr.Blocks() as demo:
-    gr.Markdown("# Prediction Evaluation Tool with Leaderboard")
     with gr.Tabs():
         # Submission Tab
@@ -516,4 +344,3 @@ with gr.Blocks() as demo:
 demo.launch()

+# # demo.launch()
 # import gradio as gr
 # import pandas as pd
 # import os
 # import re
 # from datetime import datetime
+# LEADERBOARD_FILE = "leaderboard.csv"  # File to store all submissions persistently
 # LAST_UPDATED = datetime.now().strftime("%B %d, %Y")
+# def initialize_leaderboard_file():
+#     """
+#     Ensure the leaderboard file exists and has the correct headers.
+#     """
+#     if not os.path.exists(LEADERBOARD_FILE):
+#         # Create the file with headers
+#         pd.DataFrame(columns=[
+#             "Model Name", "Overall Accuracy", "Valid Accuracy",
+#             "Correct Predictions", "Total Questions", "Timestamp"
+#         ]).to_csv(LEADERBOARD_FILE, index=False)
+#     else:
+#         # Check if the file is empty and write headers if needed
+#         if os.stat(LEADERBOARD_FILE).st_size == 0:
+#             pd.DataFrame(columns=[
+#                 "Model Name", "Overall Accuracy", "Valid Accuracy",
+#                 "Correct Predictions", "Total Questions", "Timestamp"
+#             ]).to_csv(LEADERBOARD_FILE, index=False)
 # def clean_answer(answer):
+#     """
+#     Clean and normalize the predicted answers.
+#     """
 #     if pd.isna(answer):
 #         return None
 #     answer = str(answer)
 #         return clean[0].upper()
 #     return None
 # def update_leaderboard(results):
 #     """
+#     Append new submission results to the leaderboard file.
 #     """
 #     new_entry = {
 #         "Model Name": results['model_name'],
 #         "Timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
 #     }
 #     new_entry_df = pd.DataFrame([new_entry])
+#     new_entry_df.to_csv(LEADERBOARD_FILE, mode='a', index=False, header=False)
 # def load_leaderboard():
 #     """
+#     Load all submissions from the leaderboard file.
 #     """
+#     if not os.path.exists(LEADERBOARD_FILE) or os.stat(LEADERBOARD_FILE).st_size == 0:
 #         return pd.DataFrame({
 #             "Model Name": [],
 #             "Overall Accuracy": [],
 #         })
 #     return pd.read_csv(LEADERBOARD_FILE)
+# def evaluate_predictions(prediction_file, model_name, add_to_leaderboard):
 #     """
+#     Evaluate predictions and optionally add results to the leaderboard.
 #     """
 #     ground_truth_file = "ground_truth.csv"
 #     if not os.path.exists(ground_truth_file):
 #         return "Prediction file not uploaded.", load_leaderboard()
 #     try:
+#         # Load predictions and ground truth
 #         predictions_df = pd.read_csv(prediction_file.name)
 #         ground_truth_df = pd.read_csv(ground_truth_file)
+#         # Merge predictions with ground truth
 #         merged_df = pd.merge(predictions_df, ground_truth_df, on='question_id', how='inner')
 #         merged_df['pred_answer'] = merged_df['predicted_answer'].apply(clean_answer)
+#         # Evaluate predictions
 #         valid_predictions = merged_df.dropna(subset=['pred_answer'])
 #         correct_predictions = (valid_predictions['pred_answer'] == valid_predictions['Answer']).sum()
 #         total_predictions = len(merged_df)
 #         total_valid_predictions = len(valid_predictions)
+#         # Calculate accuracy
 #         overall_accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
 #         valid_accuracy = correct_predictions / total_valid_predictions if total_valid_predictions > 0 else 0
 #         results = {
+#             'model_name': model_name if model_name else "Unknown Model",
 #             'overall_accuracy': overall_accuracy,
 #             'valid_accuracy': valid_accuracy,
 #             'correct_predictions': correct_predictions,
 #             'total_questions': total_predictions,
 #         }
+#         # Update leaderboard only if opted in
+#         if add_to_leaderboard:
+#             update_leaderboard(results)
+#             return "Evaluation completed and added to leaderboard.", load_leaderboard()
+#         else:
+#             return "Evaluation completed but not added to leaderboard.", load_leaderboard()
 #     except Exception as e:
 #         return f"Error during evaluation: {str(e)}", load_leaderboard()
+# # Initialize leaderboard file
+# initialize_leaderboard_file()
+# # Gradio Interface
 # with gr.Blocks() as demo:
 #     gr.Markdown("# Prediction Evaluation Tool with Leaderboard")
 #         # Submission Tab
 #         with gr.TabItem("🏅 Submission"):
 #             file_input = gr.File(label="Upload Prediction CSV")
+#             model_name_input = gr.Textbox(label="Model Name", placeholder="Enter your model name")
+#             add_to_leaderboard_checkbox = gr.Checkbox(label="Add to Leaderboard?", value=True)
 #             eval_status = gr.Textbox(label="Evaluation Status", interactive=False)
 #             leaderboard_table_preview = gr.Dataframe(
 #                 value=load_leaderboard(),
 #             )
 #             eval_button = gr.Button("Evaluate and Update Leaderboard")
 #             eval_button.click(
+#                 evaluate_predictions,
+#                 inputs=[file_input, model_name_input, add_to_leaderboard_checkbox],
 #                 outputs=[eval_status, leaderboard_table_preview],
 #             )
 #     gr.Markdown(f"Last updated on **{LAST_UPDATED}**")
 # demo.launch()
 import gradio as gr
 import pandas as pd
 import re
 from datetime import datetime
+from huggingface_hub import hf_hub_download
+from datasets import Dataset
+import os
+HF_TOKEN = os.getenv("HF_TOKEN")  # Hugging Face token stored as an environment variable
+LEADERBOARD_REPO = "username/leaderboard-dataset"  # Replace with your leaderboard dataset name
+GROUND_TRUTH_REPO = "username/ground-truth-dataset"  # Replace with your ground truth dataset name
 LAST_UPDATED = datetime.now().strftime("%B %d, %Y")
+def load_ground_truth():
     """
+    Load the ground truth file from a private Hugging Face dataset.
     """
+    try:
+        ground_truth_path = hf_hub_download(
+            repo_id=GROUND_TRUTH_REPO,
+            filename="ground_truth.csv",
+            use_auth_token=HF_TOKEN
+        )
+        return pd.read_csv(ground_truth_path)
+    except Exception as e:
+        print(f"Error loading ground truth: {e}")
+        return None
+def load_leaderboard():
     """
+    Load the leaderboard from a private Hugging Face dataset.
     """
+    try:
+        leaderboard_path = hf_hub_download(
+            repo_id=LEADERBOARD_REPO,
+            filename="leaderboard.csv",
+            use_auth_token=HF_TOKEN
+        )
+        return pd.read_csv(leaderboard_path)
+    except Exception as e:
+        print(f"Error loading leaderboard: {e}")
+        return pd.DataFrame({
+            "Model Name": [],
+            "Overall Accuracy": [],
+            "Valid Accuracy": [],
+            "Correct Predictions": [],
+            "Total Questions": [],
+            "Timestamp": [],
+        })
 def update_leaderboard(results):
     """
+    Append new submission results to the private leaderboard dataset.
     """
+    try:
+        # Load existing leaderboard or create a new one
+        leaderboard_path = hf_hub_download(
+            repo_id=LEADERBOARD_REPO,
+            filename="leaderboard.csv",
+            use_auth_token=HF_TOKEN
+        )
+        df = pd.read_csv(leaderboard_path)
+    except:
+        df = pd.DataFrame(columns=[
+            "Model Name", "Overall Accuracy", "Valid Accuracy",
+            "Correct Predictions", "Total Questions", "Timestamp"
+        ])
+    # Add new entry
     new_entry = {
         "Model Name": results['model_name'],
         "Overall Accuracy": round(results['overall_accuracy'] * 100, 2),
         "Total Questions": results['total_questions'],
         "Timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
     }
+    df = pd.concat([df, pd.DataFrame([new_entry])], ignore_index=True)
+    # Save locally and push updated dataset to Hugging Face
+    df.to_csv("leaderboard.csv", index=False)
+    dataset = Dataset.from_pandas(df)
+    dataset.push_to_hub(LEADERBOARD_REPO, split="train", private=True)
+def clean_answer(answer):
     """
+    Clean and normalize the predicted answers.
     """
+    if pd.isna(answer):
+        return None
+    answer = str(answer)
+    clean = re.sub(r'[^A-Da-d]', '', answer)
+    if clean:
+        return clean[0].upper()
+    return None
 def evaluate_predictions(prediction_file, model_name, add_to_leaderboard):
     """
     Evaluate predictions and optionally add results to the leaderboard.
     """
+    ground_truth_df = load_ground_truth()
+    if ground_truth_df is None:
         return "Ground truth file not found.", load_leaderboard()
     if not prediction_file:
         return "Prediction file not uploaded.", load_leaderboard()
     try:
+        # Load predictions and merge with ground truth
         predictions_df = pd.read_csv(prediction_file.name)
         merged_df = pd.merge(predictions_df, ground_truth_df, on='question_id', how='inner')
         merged_df['pred_answer'] = merged_df['predicted_answer'].apply(clean_answer)
     except Exception as e:
         return f"Error during evaluation: {str(e)}", load_leaderboard()
 # Gradio Interface
 with gr.Blocks() as demo:
+    gr.Markdown("# Secure Prediction Evaluation Tool with Private Leaderboard")
     with gr.Tabs():
         # Submission Tab
 demo.launch()