File size: 7,015 Bytes
24a059f
 
 
 
 
 
c308901
 
24a059f
 
 
 
 
 
 
 
 
 
 
5d5c6ec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24a059f
c308901
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24a059f
 
 
 
 
 
 
 
 
 
 
e4f66e8
 
 
 
 
 
 
 
 
 
24a059f
e4f66e8
 
 
 
 
 
 
24a059f
e4f66e8
 
 
24a059f
e4f66e8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24a059f
 
 
 
e4f66e8
24a059f
e4f66e8
 
 
 
24a059f
 
c308901
e4f66e8
 
c308901
24a059f
 
e4f66e8
 
5d5c6ec
c308901
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24a059f
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
import gradio as gr
import pandas as pd
import os
import re
from datetime import datetime

LEADERBOARD_FILE = "leaderboard.csv"  # File to store leaderboard data

def clean_answer(answer):
    """Normalize a raw predicted answer to one of ``'A'``-``'D'``.

    A letter A-D (either case) that is not part of a longer word is
    preferred, so ``"Answer: C"`` yields ``'C'`` rather than the ``'A'`` of
    "Answer". When no such stand-alone letter exists, the first A-D character
    anywhere in the text is used, so inputs such as ``"abc"`` still resolve
    the way they did when every non A-D character was simply stripped.

    Args:
        answer: Raw cell value. Anything ``pd.isna`` reports as missing is
            treated as no answer; other values are converted with ``str``.

    Returns:
        The upper-case letter, or ``None`` when the value is missing or
        contains no A-D character at all.
    """
    if pd.isna(answer):
        return None
    text = str(answer)
    # First choice: a letter with no ASCII letter on either side ("b)", "(C)", "is D.").
    standalone = re.search(r'(?<![A-Za-z])[A-Da-d](?![A-Za-z])', text)
    # Fallback: first A-D character found anywhere in the text.
    match = standalone or re.search(r'[A-Da-d]', text)
    return match.group(0).upper() if match else None

def write_evaluation_results(results, output_file):
    """Write a plain-text evaluation report to *output_file* and echo it to stdout.

    Args:
        results: Mapping with the keys ``model_name``, ``overall_accuracy``,
            ``valid_accuracy``, ``total_questions``, ``valid_predictions``,
            ``invalid_predictions``, ``correct_predictions`` and
            ``field_performance``. The last one maps each field name to a
            mapping with ``accuracy``, ``valid_accuracy``, ``correct``,
            ``total`` and ``invalid``.
        output_file: Destination path. Missing parent directories are
            created; an existing file is overwritten.
    """
    # A bare file name has no directory component; fall back to the current directory.
    parent_dir = os.path.dirname(output_file) or '.'
    os.makedirs(parent_dir, exist_ok=True)
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    output_text = [
        f"Evaluation Results for Model: {results['model_name']}",
        f"Timestamp: {timestamp}",
        "-" * 50,
        f"Overall Accuracy (including invalid): {results['overall_accuracy']:.2%}",
        f"Accuracy (valid predictions only): {results['valid_accuracy']:.2%}",
        f"Total Questions: {results['total_questions']}",
        f"Valid Predictions: {results['valid_predictions']}",
        f"Invalid/Malformed Predictions: {results['invalid_predictions']}",
        f"Correct Predictions: {results['correct_predictions']}",
        "\nPerformance by Field:",
        "-" * 50,
    ]

    for field, metrics in results['field_performance'].items():
        output_text.extend([
            f"\nField: {field}",
            f"Accuracy (including invalid): {metrics['accuracy']:.2%}",
            f"Accuracy (valid only): {metrics['valid_accuracy']:.2%}",
            f"Correct: {metrics['correct']}/{metrics['total']}",
            f"Invalid predictions: {metrics['invalid']}",
        ])

    report = '\n'.join(output_text)

    # Explicit UTF-8 so model or field names with non-ASCII characters do not
    # depend on the platform's default text encoding.
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(report)
    print(report)
    print(f"\nResults have been saved to: {output_file}")

def update_leaderboard(results):
    """Append one evaluation run to the leaderboard CSV.

    Args:
        results: Mapping providing ``model_name``, ``overall_accuracy``,
            ``valid_accuracy``, ``correct_predictions`` and
            ``total_questions``. The two accuracies are stored as percentage
            strings with two decimals; the row is stamped with the current
            local time.

    The file named by ``LEADERBOARD_FILE`` is created when absent; otherwise
    it is read, the new row is added after the existing ones, and the whole
    table is written back.
    """
    fresh_row = pd.DataFrame([{
        "Model Name": results['model_name'],
        "Overall Accuracy": format(results['overall_accuracy'], ".2%"),
        "Valid Accuracy": format(results['valid_accuracy'], ".2%"),
        "Correct Predictions": results['correct_predictions'],
        "Total Questions": results['total_questions'],
        "Timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    }])

    # First run: the new row is the entire leaderboard.
    if not os.path.exists(LEADERBOARD_FILE):
        fresh_row.to_csv(LEADERBOARD_FILE, index=False)
        return

    history = pd.read_csv(LEADERBOARD_FILE)
    combined = pd.concat([history, fresh_row], ignore_index=True)
    combined.to_csv(LEADERBOARD_FILE, index=False)

def display_leaderboard():
    """Return the leaderboard as text for display.

    Returns:
        ``"Leaderboard is empty."`` when the file named by
        ``LEADERBOARD_FILE`` does not exist; otherwise the CSV contents
        rendered by ``DataFrame.to_markdown(index=False)``.
    """
    if os.path.exists(LEADERBOARD_FILE):
        table = pd.read_csv(LEADERBOARD_FILE)
        return table.to_markdown(index=False)
    return "Leaderboard is empty."

def _score_rows(frame):
    """Return ``(total, valid, correct)`` row counts for a frame with ``pred_answer`` and ``Answer``."""
    answered = frame.dropna(subset=['pred_answer'])
    correct = (answered['pred_answer'] == answered['Answer']).sum()
    return len(frame), len(answered), correct


def evaluate_predictions(prediction_file):
    """Score an uploaded prediction CSV against ``ground_truth.csv``.

    The prediction CSV must provide ``question_id`` and ``predicted_answer``;
    the ground truth CSV must provide ``question_id``, ``Answer`` and
    ``Field``. Predictions are normalized with ``clean_answer``; rows whose
    prediction cannot be normalized count as invalid. On success the run is
    appended to the leaderboard and a text report is written to
    ``evaluation_results.txt``.

    Args:
        prediction_file: The uploaded file, either an object whose ``.name``
            attribute is the CSV path or a plain path (``str`` /
            ``os.PathLike``). A falsy value means nothing was uploaded.

    Returns:
        A ``(status_message, report_path)`` tuple. ``report_path`` is
        ``None`` whenever the evaluation did not complete; failures are
        reported through the message rather than raised.
    """
    ground_truth_file = "ground_truth.csv"  # Specify the path to the ground truth file
    if not prediction_file:
        return "Prediction file not uploaded", None

    if not os.path.exists(ground_truth_file):
        return "Ground truth file not found", None

    try:
        # Accept a plain path as well as an upload object exposing ``.name``.
        if isinstance(prediction_file, (str, os.PathLike)):
            prediction_path = os.fspath(prediction_file)
        else:
            prediction_path = prediction_file.name

        predictions_df = pd.read_csv(prediction_path)
        ground_truth_df = pd.read_csv(ground_truth_file)

        # Fail with a readable message instead of a bare KeyError later on.
        prediction_columns = ['question_id', 'predicted_answer']
        truth_columns = ['question_id', 'Answer', 'Field']
        for label, frame, columns in (
            ("prediction file", predictions_df, prediction_columns),
            ("ground truth file", ground_truth_df, truth_columns),
        ):
            missing = [column for column in columns if column not in frame.columns]
            if missing:
                return (
                    f"Error during evaluation: {label} is missing column(s): "
                    f"{', '.join(missing)}",
                    None,
                )

        # Model name is the text between the first underscore and the next
        # dot of the file name, e.g. "predictions_gpt4.csv" -> "gpt4".
        filename = os.path.basename(prediction_path)
        if "_" in filename and "." in filename:
            model_name = filename.split('_')[1].split('.')[0] or "unknown_model"
        else:
            model_name = "unknown_model"

        # Only the needed columns take part in the merge, so an extra
        # "Answer"/"Field" column in the upload cannot trigger pandas' _x/_y
        # suffixing and hide the ground truth columns.
        # NOTE(review): the inner join keeps only question_ids present in both
        # files, so questions missing from the upload are not counted in any
        # total -- confirm this is the intended scoring rule.
        merged_df = pd.merge(
            predictions_df[prediction_columns],
            ground_truth_df[truth_columns],
            on='question_id',
            how='inner'
        )
        merged_df['pred_answer'] = merged_df['predicted_answer'].apply(clean_answer)

        total_predictions, total_valid_predictions, correct_predictions = _score_rows(merged_df)
        invalid_predictions = total_predictions - total_valid_predictions

        overall_accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
        valid_accuracy = (
            correct_predictions / total_valid_predictions
            if total_valid_predictions > 0
            else 0
        )

        field_metrics = {}
        for field in merged_df['Field'].unique():
            field_data = merged_df[merged_df['Field'] == field]
            field_total, field_valid_total, field_correct = _score_rows(field_data)

            field_metrics[field] = {
                'accuracy': field_correct / field_total if field_total > 0 else 0,
                'valid_accuracy': field_correct / field_valid_total if field_valid_total > 0 else 0,
                'correct': field_correct,
                'total': field_total,
                'invalid': field_total - field_valid_total
            }

        results = {
            'model_name': model_name,
            'overall_accuracy': overall_accuracy,
            'valid_accuracy': valid_accuracy,
            'total_questions': total_predictions,
            'valid_predictions': total_valid_predictions,
            'invalid_predictions': invalid_predictions,
            'correct_predictions': correct_predictions,
            'field_performance': field_metrics
        }

        update_leaderboard(results)
        output_file = "evaluation_results.txt"
        write_evaluation_results(results, output_file)
        return "Evaluation completed successfully! Leaderboard updated.", output_file

    except Exception as e:
        # UI boundary: report any failure as a status message instead of raising.
        return f"Error during evaluation: {str(e)}", None

# Gradio interface: an "Evaluate" tab that scores an uploaded CSV and a
# "Leaderboard" tab that shows the stored results on demand.
# NOTE(review): `description` is assigned but not used by any component below.
description = "Upload a prediction CSV file to evaluate predictions against the ground truth and update the leaderboard."

with gr.Blocks() as demo:
    gr.Markdown("# Prediction Evaluation Tool with Leaderboard")

    with gr.Tab("Evaluate"):
        file_input = gr.File(label="Upload Prediction CSV")
        eval_status = gr.Textbox(label="Evaluation Status")
        eval_results_file = gr.File(label="Download Evaluation Results")
        eval_button = gr.Button("Evaluate")
        # The handler returns (status message, report path), matching the two outputs.
        eval_button.click(
            fn=evaluate_predictions,
            inputs=file_input,
            outputs=[eval_status, eval_results_file],
        )

    with gr.Tab("Leaderboard"):
        leaderboard_text = gr.Textbox(label="Leaderboard", interactive=False)
        refresh_button = gr.Button("Refresh Leaderboard")
        # The leaderboard is only re-read from disk when this button is pressed.
        refresh_button.click(fn=display_leaderboard, outputs=leaderboard_text)

if __name__ == "__main__":
    demo.launch()