import os
import json
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch
from sklearn.metrics import f1_score
import re
from collections import Counter
import string
from huggingface_hub import login
import gradio as gr
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt

# Normalization functions (same as extractor)
def normalize_answer(s):
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)
    def white_space_fix(text):
        return ' '.join(text.split())
    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)
    def lower(text):
        return text.lower()
    return white_space_fix(remove_articles(remove_punc(lower(s))))

def f1_score_qa(prediction, ground_truth):
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    return (2 * precision * recall) / (precision + recall)

def exact_match_score(prediction, ground_truth):
    return normalize_answer(prediction) == normalize_answer(ground_truth)
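
# Quick sanity checks for the metric helpers (illustrative strings, not CUAD data);
# they run once at import time and are essentially free.
assert normalize_answer("The Agreement.") == "agreement"
assert exact_match_score("30 days", "30 days.")
assert f1_score_qa("the Agreement", "Agreement") == 1.0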

# Identical confidence calculation to extractor
def calculate_confidence(model, tokenizer, question, context):
    inputs = tokenizer(
        question, 
        context, 
        return_tensors="pt",
        truncation=True,
        max_length=512,
        stride=128,
        padding=True
    )
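    # Note: without return_overflowing_tokens=True, the stride argument has no
    # effect here; contexts longer than max_length are simply truncated.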
    
    if torch.cuda.is_available():
        inputs = {k: v.cuda() for k, v in inputs.items()}
        model = model.cuda()

    with torch.no_grad():
        outputs = model(**inputs)

    start_probs = torch.softmax(outputs.start_logits, dim=1)
    end_probs = torch.softmax(outputs.end_logits, dim=1)
    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits) + 1
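    # The argmax indices are used as-is (not constrained to start <= end or to the
    # context tokens) so the confidence stays identical to the extractor's calculation.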
    
    start_prob = start_probs[0, answer_start].item()
    end_prob = end_probs[0, answer_end-1].item()
    confidence = np.sqrt(start_prob * end_prob)
    
    answer_tokens = inputs["input_ids"][0][answer_start:answer_end]
    answer = tokenizer.decode(answer_tokens, skip_special_tokens=True).strip()
    
    return answer, float(confidence)
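
# A minimal smoke-test helper (a sketch; the clause below is made up, not CUAD data).
def demo_single_clause(model, tokenizer):
    """Run the confidence calculation on one illustrative clause; returns (answer, confidence)."""
    question = "What is the governing law?"
    context = ("This Agreement shall be governed by and construed in accordance "
               "with the laws of the State of New York.")
    return calculate_confidence(model, tokenizer, question, context)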

def run_evaluation(num_samples=100):
    # Authenticate
    if token := os.getenv("HF_TOKEN"):
        login(token=token)
    
    # Load model same as extractor
    model_name = "AvocadoMuffin/roberta-cuad-qa-v2"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForQuestionAnswering.from_pretrained(model_name)
    
    # Load CUAD dataset
    dataset = load_dataset("theatticusproject/cuad-qa", token=token)
    test_data = dataset["test"].select(range(min(num_samples, len(dataset["test"]))))
    
    results = []
    for example in test_data:
        context = example["context"]
        question = example["question"]
        gt_answer = example["answers"]["text"][0] if example["answers"]["text"] else ""
        
        pred_answer, confidence = calculate_confidence(model, tokenizer, question, context)
        
        results.append({
            "question": question,
            "prediction": pred_answer,
            "ground_truth": gt_answer,
            "confidence": confidence,
            "exact_match": exact_match_score(pred_answer, gt_answer),
            "f1": f1_score_qa(pred_answer, gt_answer)
        })
    
    # Generate report
    df = pd.DataFrame(results)
    avg_metrics = {
        "exact_match": df["exact_match"].mean() * 100,
        "f1": df["f1"].mean() * 100,
        "confidence": df["confidence"].mean() * 100
    }
    
    # Confidence calibration analysis
    high_conf_correct = df[(df["confidence"] > 0.8) & (df["exact_match"] == 1)].shape[0]
    high_conf_total = df[df["confidence"] > 0.8].shape[0]
    
    report = f"""
    CUAD Evaluation Report (n={len(df)})
    ====================================
    Accuracy:
    - Exact Match: {avg_metrics['exact_match']:.2f}%
    - F1 Score: {avg_metrics['f1']:.2f}%
    
    Confidence Analysis:
    - Avg Confidence: {avg_metrics['confidence']:.2f}%
    - High-Confidence (>80%) Accuracy: {high_conf_correct}/{high_conf_total} ({high_conf_correct/max(1,high_conf_total)*100:.1f}%)
    
    Confidence vs Accuracy:
    {df[['confidence', 'exact_match']].corr().iloc[0,1]:.3f} correlation
    """
    
    # Save results
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    results_file = f"cuad_eval_{timestamp}.json"
    with open(results_file, "w") as f:
        json.dump({
            "metrics": avg_metrics,
            "samples": results,
            "config": {
                "model": model_name,
                "confidence_method": "geometric_mean_start_end_probs"
            }
        }, f, indent=2)
    
    return report, df, results_file
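
# Optional: a minimal sketch of a confidence-calibration plot built from the
# DataFrame returned by run_evaluation (e.g. plot_confidence_calibration(df)).
# The bin edges and output filename are illustrative choices, not part of the
# original evaluation flow.
def plot_confidence_calibration(df, out_path="confidence_calibration.png"):
    bins = np.linspace(0.0, 1.0, 11)  # ten equal-width confidence bins
    binned = df.assign(conf_bin=pd.cut(df["confidence"], bins, include_lowest=True))
    calib = binned.groupby("conf_bin", observed=True)["exact_match"].mean()
    fig, ax = plt.subplots()
    calib.plot(kind="bar", ax=ax, xlabel="Confidence bin", ylabel="Exact-match rate")
    fig.tight_layout()
    fig.savefig(out_path)
    plt.close(fig)
    return out_path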

if __name__ == "__main__":
    report, df, _ = run_evaluation()
    print(report)
    print("\nSample predictions:")
    print(df.head())