File size: 5,498 Bytes
24a059f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5d5c6ec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24a059f
 
 
 
 
 
 
 
 
 
 
 
e4f66e8
 
 
 
 
 
 
 
 
 
24a059f
e4f66e8
 
 
 
 
 
 
24a059f
e4f66e8
 
 
24a059f
e4f66e8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24a059f
 
 
 
e4f66e8
24a059f
e4f66e8
 
 
 
24a059f
 
e4f66e8
 
 
24a059f
 
e4f66e8
 
5d5c6ec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24a059f
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
import gradio as gr
import pandas as pd
import os
import re
from datetime import datetime

def clean_answer(answer):
    """Normalize a raw predicted answer to one of 'A', 'B', 'C' or 'D'.

    The value is stringified and searched in two passes:

    1. A *standalone* option letter: an A-D character (either case) that is
       not directly adjacent to another ASCII letter. This keeps letters that
       merely occur inside ordinary words from being mistaken for the choice,
       so "Answer: D" yields "D" rather than the "A" of "Answer".
    2. If no standalone letter exists, the first A-D character anywhere in
       the text (so "ABCD" still yields "A").

    Note that a lone English article "a" still counts as a standalone letter.

    Args:
        answer: Raw cell value from the predictions file; may be a string,
            a number, None or NaN.

    Returns:
        The upper-cased option letter, or None when the value is missing or
        contains no A-D character at all.
    """
    if pd.isna(answer):
        return None
    text = str(answer)
    # Pass 1: an option letter with no ASCII letter on either side.
    match = re.search(r'(?<![A-Za-z])[A-Da-d](?![A-Za-z])', text)
    if match is None:
        # Pass 2: fall back to the first A-D character anywhere.
        match = re.search(r'[A-Da-d]', text)
    return match.group(0).upper() if match else None

def write_evaluation_results(results, output_file):
    """Write a plain-text evaluation report to ``output_file`` and echo it.

    The parent directory of ``output_file`` is created when it does not
    exist. The report is written as UTF-8 so that non-ASCII model or field
    names do not depend on the platform's default encoding. The same text is
    printed to stdout, followed by a line naming the output path.

    Args:
        results: Mapping with the keys 'model_name', 'overall_accuracy',
            'valid_accuracy', 'total_questions', 'valid_predictions',
            'invalid_predictions', 'correct_predictions' and
            'field_performance'. The last one maps each field name to a
            mapping with 'accuracy', 'valid_accuracy', 'correct', 'total'
            and 'invalid'.
        output_file: Path of the report file; an existing file is overwritten.

    Raises:
        KeyError: If ``results`` lacks one of the keys listed above.
        OSError: If the directory or the file cannot be created or written.
    """
    # dirname is '' for a bare filename; fall back to the current directory.
    os.makedirs(os.path.dirname(output_file) or '.', exist_ok=True)
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    report_lines = [
        f"Evaluation Results for Model: {results['model_name']}",
        f"Timestamp: {timestamp}",
        "-" * 50,
        f"Overall Accuracy (including invalid): {results['overall_accuracy']:.2%}",
        f"Accuracy (valid predictions only): {results['valid_accuracy']:.2%}",
        f"Total Questions: {results['total_questions']}",
        f"Valid Predictions: {results['valid_predictions']}",
        f"Invalid/Malformed Predictions: {results['invalid_predictions']}",
        f"Correct Predictions: {results['correct_predictions']}",
        "\nPerformance by Field:",
        "-" * 50,
    ]

    for field, metrics in results['field_performance'].items():
        report_lines.extend([
            f"\nField: {field}",
            f"Accuracy (including invalid): {metrics['accuracy']:.2%}",
            f"Accuracy (valid only): {metrics['valid_accuracy']:.2%}",
            f"Correct: {metrics['correct']}/{metrics['total']}",
            f"Invalid predictions: {metrics['invalid']}",
        ])

    # Build the text once so the file and the console show the same report.
    report = '\n'.join(report_lines)
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(report)
    print(report)
    print(f"\nResults have been saved to: {output_file}")

def _resolve_upload_path(prediction_file):
    """Return the filesystem path for an upload given as a path or a wrapper object."""
    if isinstance(prediction_file, (str, os.PathLike)):
        return os.fspath(prediction_file)
    # Otherwise expect an object that exposes its path as ``.name``.
    return prediction_file.name


def _extract_model_name(path):
    """Derive the model name from a file name shaped like '<prefix>_<model>.<ext>'."""
    # NOTE(review): assumes uploads follow '<prefix>_<model>.<ext>' — confirm.
    stem, ext = os.path.splitext(os.path.basename(path))
    if not ext or "_" not in stem:
        return "unknown_model"
    # Keep everything after the first underscore so names that themselves
    # contain '_' or '.' (e.g. 'llama_3.1') are not truncated.
    model_name = stem.split("_", 1)[1]
    return model_name or "unknown_model"


def _ratio(numerator, denominator):
    """Return numerator / denominator, or 0 when the denominator is not positive."""
    return numerator / denominator if denominator > 0 else 0


def evaluate_predictions(prediction_file):
    """Score an uploaded prediction CSV against ``ground_truth.csv``.

    The prediction file must contain the columns 'question_id' and
    'predicted_answer'; the ground truth file must contain 'question_id',
    'Answer' and 'Field'. Predictions are normalized with ``clean_answer``;
    values it cannot normalize count as invalid. Overall and per-field
    accuracies are computed and a text report is written through
    ``write_evaluation_results``.

    Args:
        prediction_file: The upload, given either as a path (``str`` or
            ``os.PathLike``) or as an object whose ``.name`` attribute is the
            path. A falsy value means nothing was uploaded.

    Returns:
        A ``(status_message, report_path)`` tuple. ``report_path`` is
        "evaluation_results.txt" on success and None on any failure. Errors
        are reported through the status message rather than raised.
    """
    ground_truth_file = "ground_truth.csv"  # Path to the ground truth file
    if not prediction_file:
        return "Prediction file not uploaded", None

    if not os.path.exists(ground_truth_file):
        return "Ground truth file not found", None

    try:
        prediction_path = _resolve_upload_path(prediction_file)
        predictions_df = pd.read_csv(prediction_path)
        ground_truth_df = pd.read_csv(ground_truth_file)

        # Fail early with a readable message instead of a bare KeyError text.
        prediction_columns = ['question_id', 'predicted_answer']
        truth_columns = ['question_id', 'Answer', 'Field']
        for label, frame, required in (
            ("Prediction file", predictions_df, prediction_columns),
            ("Ground truth file", ground_truth_df, truth_columns),
        ):
            missing = [column for column in required if column not in frame.columns]
            if missing:
                return (
                    f"Error during evaluation: {label} is missing required "
                    f"column(s): {', '.join(missing)}",
                    None,
                )

        model_name = _extract_model_name(prediction_path)

        # Keep only the needed columns so that an extra 'Answer' or 'Field'
        # column in the upload cannot collide with the ground truth columns
        # (the merge would suffix both and break the lookups below).
        # NOTE(review): the inner join means ground-truth questions absent
        # from the upload are not counted at all (neither wrong nor invalid)
        # — confirm this is the intended scoring.
        merged_df = pd.merge(
            predictions_df[prediction_columns],
            ground_truth_df[truth_columns],
            on='question_id',
            how='inner',
        )
        merged_df['pred_answer'] = merged_df['predicted_answer'].apply(clean_answer)

        valid_predictions = merged_df.dropna(subset=['pred_answer'])
        total_predictions = len(merged_df)
        total_valid_predictions = len(valid_predictions)
        invalid_predictions = total_predictions - total_valid_predictions
        correct_predictions = int(
            (valid_predictions['pred_answer'] == valid_predictions['Answer']).sum()
        )

        field_metrics = {}
        for field in merged_df['Field'].unique():
            field_data = merged_df[merged_df['Field'] == field]
            field_valid_data = field_data.dropna(subset=['pred_answer'])

            field_correct = int(
                (field_valid_data['pred_answer'] == field_valid_data['Answer']).sum()
            )
            field_total = len(field_data)
            field_valid_total = len(field_valid_data)

            field_metrics[field] = {
                'accuracy': _ratio(field_correct, field_total),
                'valid_accuracy': _ratio(field_correct, field_valid_total),
                'correct': field_correct,
                'total': field_total,
                'invalid': field_total - field_valid_total,
            }

        results = {
            'model_name': model_name,
            'overall_accuracy': _ratio(correct_predictions, total_predictions),
            'valid_accuracy': _ratio(correct_predictions, total_valid_predictions),
            'total_questions': total_predictions,
            'valid_predictions': total_valid_predictions,
            'invalid_predictions': invalid_predictions,
            'correct_predictions': correct_predictions,
            'field_performance': field_metrics,
        }

        output_file = "evaluation_results.txt"
        write_evaluation_results(results, output_file)
        return "Evaluation completed successfully!", output_file

    except Exception as e:
        # UI boundary: report any failure as a status message, never raise.
        return f"Error during evaluation: {str(e)}", None

# Gradio UI wiring: one CSV upload goes in; a status line and a downloadable
# report file come out of evaluate_predictions.
description = "Upload a prediction CSV file to evaluate predictions against the ground truth stored in the system."

demo = gr.Interface(
    fn=evaluate_predictions,
    inputs=[gr.File(label="Upload Prediction CSV")],
    outputs=[
        gr.Textbox(label="Evaluation Status"),
        gr.File(label="Download Evaluation Results"),
    ],
    title="Prediction Evaluation Tool",
    description=description,
)

# Start the web app only when this file is run as a script, not on import.
if __name__ == "__main__":
    demo.launch()