import json
import os
import re
import string
from collections import Counter
from datetime import datetime

import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

# SQuAD-style answer normalization: lowercase, strip punctuation and articles, collapse whitespace
def normalize_answer(s):
    def remove_articles(text): return re.sub(r'\b(a|an|the)\b', ' ', text)
    def white_space_fix(text): return ' '.join(text.split())
    def remove_punc(text): 
        return ''.join(ch for ch in text if ch not in set(string.punctuation))
    def lower(text): return text.lower()
    return white_space_fix(remove_articles(remove_punc(lower(s))))

# Metrics
def exact_match_score(pred, truth):
    return int(normalize_answer(pred) == normalize_answer(truth))

def f1_score_qa(pred, truth):
    pred_tokens = normalize_answer(pred).split()
    truth_tokens = normalize_answer(truth).split()
    # SQuAD convention: if either side is empty, F1 is 1 only when both are empty.
    if not pred_tokens or not truth_tokens:
        return float(pred_tokens == truth_tokens)
    common = Counter(pred_tokens) & Counter(truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(truth_tokens)
    return (2 * precision * recall) / (precision + recall)
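
# Hand-computed sanity checks for the metrics above (illustrative only):
#   exact_match_score("The Lessee", "lessee")  -> 1    (case, punctuation, and articles are ignored)
#   f1_score_qa("within 30 days", "30 days")   -> 0.8  (precision 2/3, recall 2/2)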

# Identical to the extractor's QA confidence: geometric mean of the
# start/end probabilities of the argmax answer span.
def get_qa_confidence(model, tokenizer, question, context):
    # Without return_overflowing_tokens=True the stride has no effect;
    # contexts longer than max_length are simply truncated.
    inputs = tokenizer(
        question, context,
        return_tensors="pt",
        truncation=True,
        max_length=512,
        stride=128,
        padding=True
    )
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    start_probs = torch.softmax(outputs.start_logits, dim=1)
    end_probs = torch.softmax(outputs.end_logits, dim=1)
    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits) + 1

    # Geometric mean of the start and end token probabilities; if the argmax
    # end precedes the start, the decoded span below is simply empty.
    confidence = np.sqrt(
        start_probs[0, answer_start].item() *
        end_probs[0, answer_end - 1].item()
    )

    answer_tokens = inputs["input_ids"][0][answer_start:answer_end]
    answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)
    return answer.strip(), float(confidence)
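
# Usage sketch (hypothetical clause, for illustration only):
#   answer, conf = get_qa_confidence(
#       model, tokenizer,
#       "What is the governing law?",
#       "This Agreement shall be governed by the laws of the State of New York.",
#   )
#   # `answer` is the decoded argmax span; `conf` is its geometric-mean probability in (0, 1].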

def run_evaluation(num_samples=100):
    # Load CUAD (the dataset script requires trust_remote_code)
    dataset = load_dataset(
        "theatticusproject/cuad-qa",
        trust_remote_code=True,
        token=os.getenv("HF_TOKEN") or None  # fall back to anonymous access when HF_TOKEN is unset
    )
    test_data = dataset["test"].select(range(min(num_samples, len(dataset["test"]))))
    
    # Load model
    model_name = "AvocadoMuffin/roberta-cuad-qa-v2"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForQuestionAnswering.from_pretrained(model_name)
    
    results = []
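    # Each CUAD example exposes "context", "question", and "answers";
    # "answers"["text"] may be an empty list for unanswerable questions.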
    for example in test_data:
        context = example["context"]
        question = example["question"]
        gt_answer = example["answers"]["text"][0] if example["answers"]["text"] else ""
        
        pred, conf = get_qa_confidence(model, tokenizer, question, context)
        
        results.append({
            "question": question[:100] + "..." if len(question) > 100 else question,
            "prediction": pred,
            "confidence": conf,
            "exact_match": exact_match_score(pred, gt_answer),
            "f1": f1_score_qa(pred, gt_answer),
            "ground_truth": gt_answer
        })
    
    # Generate report
    df = pd.DataFrame(results)
    high_conf = df[df["confidence"] > 0.8]
    high_conf_acc = high_conf["exact_match"].mean() if len(high_conf) else float("nan")
    report = (
        f"Evaluation Results (n={len(df)})\n"
        "================================\n"
        f"Exact Match: {df['exact_match'].mean():.1%}\n"
        f"F1 Score: {df['f1'].mean():.1%}\n"
        f"Avg Confidence: {df['confidence'].mean():.1%}\n"
        f"High-Confidence (>0.8) Exact Match: {high_conf_acc:.1%}\n"
    )
    
    # Save
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    results_file = f"eval_results_{timestamp}.json"
    with open(results_file, 'w') as f:
        json.dump({
            "config": {"model": model_name, "dataset": "cuad-qa"},
            "metrics": {
                "exact_match": float(df['exact_match'].mean()),
                "f1": float(df['f1'].mean()),
                "confidence": float(df['confidence'].mean())
            },
            "samples": results
        }, f, indent=2)
    
    return report, df, results_file

if __name__ == "__main__":
    report, df, _ = run_evaluation(num_samples=50)
    print(report)
    print("\nSample predictions:")
    print(df[["question", "confidence", "exact_match"]].head())