import os
import json
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch
import re
from collections import Counter
import string
from huggingface_hub import login
import gradio as gr
import pandas as pd
from datetime import datetime

# Normalization functions (identical to extractor)
def normalize_answer(s):
    def remove_articles(text): return re.sub(r'\b(a|an|the)\b', ' ', text)
    def white_space_fix(text): return ' '.join(text.split())
    def remove_punc(text): 
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)
    def lower(text): return text.lower()
    return white_space_fix(remove_articles(remove_punc(lower(s))))

def f1_score_qa(prediction, ground_truth):
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0: return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    return (2 * precision * recall) / (precision + recall)

def exact_match_score(prediction, ground_truth):
    return normalize_answer(prediction) == normalize_answer(ground_truth)
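
# Illustrative sanity check for the metric helpers above (a sketch, not part of the
# evaluation run; the METRICS_SELFTEST environment variable is a hypothetical opt-in
# gate so these assertions never run by default):
if os.getenv("METRICS_SELFTEST"):
    # Punctuation and articles are stripped, whitespace collapsed, case lowered
    assert normalize_answer("The Supplier's Obligations.") == "suppliers obligations"
    # Exact match is computed on the normalized strings
    assert exact_match_score("Supplier", "the supplier.")
    # One shared token ("supplier"), precision 1/1, recall 1/2 -> F1 = 2/3
    assert abs(f1_score_qa("the supplier", "supplier obligations") - 2 / 3) < 1e-6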

# Identical confidence calculation to extractor
def get_qa_confidence(model, tokenizer, question, context):
    inputs = tokenizer(
        question, context,
        return_tensors="pt",
        truncation=True,
        max_length=512,
        stride=128,
        padding=True
    )
    if torch.cuda.is_available():
        inputs = {k:v.cuda() for k,v in inputs.items()}
        model = model.cuda()
    
    with torch.no_grad():
        outputs = model(**inputs)
    
    start_probs = torch.softmax(outputs.start_logits, dim=1)
    end_probs = torch.softmax(outputs.end_logits, dim=1)
    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits) + 1
    
    # Overall confidence = geometric mean of the start- and end-token probabilities
    confidence = np.sqrt(
        start_probs[0, answer_start].item() *
        end_probs[0, answer_end-1].item()
    )
    
    answer_tokens = inputs["input_ids"][0][answer_start:answer_end]
    answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)
    return answer.strip(), float(confidence)
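
# Example of calling get_qa_confidence() on its own (an illustrative sketch; the model id
# matches the one run_evaluation() loads below, the question/context strings are made up):
#
#   tok = AutoTokenizer.from_pretrained("AvocadoMuffin/roberta-cuad-qa-v2")
#   mdl = AutoModelForQuestionAnswering.from_pretrained("AvocadoMuffin/roberta-cuad-qa-v2")
#   answer, conf = get_qa_confidence(
#       mdl, tok,
#       "What is the governing law of this contract?",
#       "This Agreement shall be governed by the laws of the State of New York.",
#   )
#   print(answer, f"({conf:.1%})")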

def run_evaluation(num_samples, progress=gr.Progress()):
    # The Gradio slider may deliver a float; range() below needs an int
    num_samples = int(num_samples)

    # Authentication
    hf_token = os.getenv("EVAL_TOKEN")
    if hf_token:
        login(token=hf_token)
    
    # Load model same as extractor
    model_name = "AvocadoMuffin/roberta-cuad-qa-v2"
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
    model = AutoModelForQuestionAnswering.from_pretrained(model_name, token=hf_token)
    
    progress(0.1, desc="Loading CUAD dataset...")
    try:
        dataset = load_dataset(
            "theatticusproject/cuad-qa",
            trust_remote_code=True,
            token=hf_token
        )
        test_data = dataset["test"].select(range(min(num_samples, len(dataset["test"]))))
        print(f"βœ“ Loaded {len(test_data)} samples")
    except Exception as e:
        # Return an empty frame with the expected columns so the UI handler's
        # column selection doesn't raise a KeyError on the error path
        empty_df = pd.DataFrame(columns=["Question", "Prediction", "Truth",
                                         "Confidence", "Exact Match", "F1"])
        return f"❌ Dataset load failed: {str(e)}", empty_df, None
    
    results = []
    for i, example in enumerate(test_data):
        progress(0.2 + 0.7*i/num_samples, desc=f"Evaluating {i+1}/{num_samples}")
        
        context = example["context"]
        question = example["question"]
        gt_answer = example["answers"]["text"][0] if example["answers"]["text"] else ""
        
        pred_answer, confidence = get_qa_confidence(model, tokenizer, question, context)
        
        results.append({
            "Question": question[:100] + "..." if len(question) > 100 else question,
            "Prediction": pred_answer,
            "Truth": gt_answer,
            "Confidence": confidence,
            "Exact Match": exact_match_score(pred_answer, gt_answer),
            "F1": f1_score_qa(pred_answer, gt_answer)
        })
    
    # Generate report (left-aligned so gr.Markdown doesn't render it as an indented code block)
    df = pd.DataFrame(results)
    high_conf_acc = df[df['Confidence'] > 0.8]['Exact Match'].mean()
    report = f"""Evaluation Results (n={len(df)})
=================
- Exact Match: {df['Exact Match'].mean():.1%}
- F1 Score: {df['F1'].mean():.1%}
- Avg Confidence: {df['Confidence'].mean():.1%}
- High-Confidence (>80%) Accuracy: {high_conf_acc:.1%}
"""
    
    # Save results
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    results_file = f"eval_results_{timestamp}.json"
    with open(results_file, 'w') as f:
        json.dump({
            "model": model_name,
            "metrics": {
                "exact_match": float(df['Exact Match'].mean()),
                "f1": float(df['F1'].mean()),
                "avg_confidence": float(df['Confidence'].mean())
            },
            "samples": results
        }, f, indent=2)
    
    return report, df, results_file
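
# run_evaluation() can also be driven without the UI, e.g. from a notebook (illustrative;
# how gr.Progress() behaves outside a Gradio event depends on the installed Gradio version):
#
#   report_text, results_df, json_path = run_evaluation(50)
#   print(report_text)
#   results_df.head()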

def create_gradio_interface():
    with gr.Blocks(title="CUAD Evaluator") as demo:
        gr.Markdown("## πŸ›οΈ CUAD QA Model Evaluation")
        
        with gr.Row():
            num_samples = gr.Slider(10, 500, value=100, step=10, 
                                   label="Number of Samples")
            eval_btn = gr.Button("πŸš€ Run Evaluation", variant="primary")
        
        with gr.Row():
            report = gr.Markdown("Results will appear here...")
            results_table = gr.Dataframe(headers=["Question", "Prediction", "Confidence", "Exact Match"])
        
        download = gr.File(label="Download Results", visible=False)
        
        def run_and_display(num_samples):
            report_text, df, file = run_evaluation(num_samples)
            return (
                report_text,
                df[["Question", "Prediction", "Confidence", "Exact Match"]],
                gr.File(visible=True, value=file)
            )
        
        eval_btn.click(
            fn=run_and_display,
            inputs=num_samples,
            outputs=[report, results_table, download]
        )
    
    return demo

if __name__ == "__main__":
    # Verify CUDA
    if torch.cuda.is_available():
        print(f"βœ“ CUDA available: {torch.cuda.get_device_name(0)}")
    else:
        print("! Using CPU")
    
    # Launch Gradio
    demo = create_gradio_interface()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True
    )