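"""Evaluation script for the AvocadoMuffin/roberta-cuad-qa-v2 CUAD QA model.

Runs the model over a sample of the theatticusproject/cuad-qa test split and
reports exact match, token-level F1, and confidence-calibration statistics.
Answer normalization and confidence scoring mirror the extractor app so the
numbers stay comparable. Uses HF_TOKEN from the environment, if set, to
authenticate with the Hugging Face Hub.
"""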
import os
import json
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch
from sklearn.metrics import f1_score
import re
from collections import Counter
import string
from huggingface_hub import login
import gradio as gr
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt

# Normalization functions (same as extractor)
def normalize_answer(s):
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def f1_score_qa(prediction, ground_truth):
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    return (2 * precision * recall) / (precision + recall)


def exact_match_score(prediction, ground_truth):
    return normalize_answer(prediction) == normalize_answer(ground_truth)
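
# Illustrative example: normalization strips articles, punctuation, and case, so
#   exact_match_score("the Purchase Agreement", "Purchase Agreement")  -> True
#   f1_score_qa("the Purchase Agreement", "Purchase Agreement")        -> 1.0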


# Identical confidence calculation to extractor
def calculate_confidence(model, tokenizer, question, context):
    inputs = tokenizer(
        question,
        context,
        return_tensors="pt",
        truncation=True,
        max_length=512,
        stride=128,
        padding=True,
    )
    if torch.cuda.is_available():
        inputs = {k: v.cuda() for k, v in inputs.items()}
        model = model.cuda()

    with torch.no_grad():
        outputs = model(**inputs)

    # Pick the most probable start/end token positions for the answer span
    start_probs = torch.softmax(outputs.start_logits, dim=1)
    end_probs = torch.softmax(outputs.end_logits, dim=1)
    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits) + 1

    # Confidence = geometric mean of the start and end probabilities
    start_prob = start_probs[0, answer_start].item()
    end_prob = end_probs[0, answer_end - 1].item()
    confidence = np.sqrt(start_prob * end_prob)

    answer_tokens = inputs["input_ids"][0][answer_start:answer_end]
    answer = tokenizer.decode(answer_tokens, skip_special_tokens=True).strip()
    return answer, float(confidence)
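
# Illustrative standalone usage (the question and `contract_text` below are
# hypothetical and not part of the evaluation flow):
#   tokenizer = AutoTokenizer.from_pretrained("AvocadoMuffin/roberta-cuad-qa-v2")
#   model = AutoModelForQuestionAnswering.from_pretrained("AvocadoMuffin/roberta-cuad-qa-v2")
#   answer, conf = calculate_confidence(model, tokenizer,
#                                       "What is the governing law?", contract_text)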


def run_evaluation(num_samples=100):
    # Authenticate
    if token := os.getenv("HF_TOKEN"):
        login(token=token)

    # Load model same as extractor
    model_name = "AvocadoMuffin/roberta-cuad-qa-v2"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForQuestionAnswering.from_pretrained(model_name)

    # Load CUAD dataset
    dataset = load_dataset("theatticusproject/cuad-qa", token=token)
    test_data = dataset["test"].select(range(min(num_samples, len(dataset["test"]))))

    results = []
    for example in test_data:
        context = example["context"]
        question = example["question"]
        gt_answer = example["answers"]["text"][0] if example["answers"]["text"] else ""
        pred_answer, confidence = calculate_confidence(model, tokenizer, question, context)
        results.append({
            "question": question,
            "prediction": pred_answer,
            "ground_truth": gt_answer,
            "confidence": confidence,
            "exact_match": exact_match_score(pred_answer, gt_answer),
            "f1": f1_score_qa(pred_answer, gt_answer)
        })

    # Generate report
    df = pd.DataFrame(results)
    avg_metrics = {
        "exact_match": df["exact_match"].mean() * 100,
        "f1": df["f1"].mean() * 100,
        "confidence": df["confidence"].mean() * 100
    }
    # Confidence calibration analysis
    high_conf_correct = df[(df["confidence"] > 0.8) & (df["exact_match"] == 1)].shape[0]
    high_conf_total = df[df["confidence"] > 0.8].shape[0]

    report = f"""
CUAD Evaluation Report (n={len(df)})
====================================
Accuracy:
- Exact Match: {avg_metrics['exact_match']:.2f}%
- F1 Score: {avg_metrics['f1']:.2f}%
Confidence Analysis:
- Avg Confidence: {avg_metrics['confidence']:.2f}%
- High-Confidence (>80%) Accuracy: {high_conf_correct}/{high_conf_total} ({high_conf_correct / max(1, high_conf_total) * 100:.1f}%)
Confidence vs Accuracy:
- Correlation: {df[['confidence', 'exact_match']].corr().iloc[0, 1]:.3f}
"""
    # Save results
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    results_file = f"cuad_eval_{timestamp}.json"
    with open(results_file, "w") as f:
        json.dump({
            "metrics": avg_metrics,
            "samples": results,
            "config": {
                "model": model_name,
                "confidence_method": "geometric_mean_start_end_probs"
            }
        }, f, indent=2)

    return report, df, results_file


if __name__ == "__main__":
    report, df, _ = run_evaluation()
    print(report)
    print("\nSample predictions:")
    print(df.head())