import os
import re
import json
import string
from collections import Counter
from datetime import datetime

import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

# Normalization functions
def normalize_answer(s):
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)
    def white_space_fix(text):
        return ' '.join(text.split())
    def remove_punc(text):
        return ''.join(ch for ch in text if ch not in set(string.punctuation))
    def lower(text):
        return text.lower()
    return white_space_fix(remove_articles(remove_punc(lower(s))))
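
# For reference, an assumed (illustrative) input/output pair for the normalizer:
#   normalize_answer("The 'Effective Date' is January 1, 2020.")
#   -> "effective date is january 1 2020"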

# Metrics
def exact_match_score(pred, truth):
    return int(normalize_answer(pred) == normalize_answer(truth))

def f1_score_qa(pred, truth):
    pred_tokens = normalize_answer(pred).split()
    truth_tokens = normalize_answer(truth).split()
    common = Counter(pred_tokens) & Counter(truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(truth_tokens)
    return (2 * precision * recall) / (precision + recall)
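
# Illustrative sanity check for the metrics above (assumed strings, not CUAD data):
#   exact_match_score("The Term is 5 years.", "term is 5 years") -> 1
#     (normalization strips articles, punctuation, and case before comparing)
#   f1_score_qa("5 years", "term of 5 years") -> precision 2/2, recall 2/4, F1 = 2/3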

# Identical to the extractor's QA confidence computation
def get_qa_confidence(model, tokenizer, question, context):
    inputs = tokenizer(
        question, context,
        return_tensors="pt",
        truncation=True,
        max_length=512,
        stride=128,
        padding=True
    )
    if torch.cuda.is_available():
        inputs = {k: v.cuda() for k, v in inputs.items()}
        model = model.cuda()
    with torch.no_grad():
        outputs = model(**inputs)
    start_probs = torch.softmax(outputs.start_logits, dim=1)
    end_probs = torch.softmax(outputs.end_logits, dim=1)
    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits) + 1
    # Confidence = geometric mean of the chosen start/end token probabilities
    confidence = np.sqrt(
        start_probs[0, answer_start].item() *
        end_probs[0, answer_end - 1].item()
    )
    answer_tokens = inputs["input_ids"][0][answer_start:answer_end]
    answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)
    return answer.strip(), float(confidence)
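
# Illustrative call (the question/context strings here are assumptions, not CUAD fields):
#   answer, conf = get_qa_confidence(model, tokenizer,
#                                    "What is the governing law of this contract?",
#                                    contract_text)
# `conf` stays in [0, 1] and matches the confidence the extractor reports.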

def run_evaluation(num_samples=100):
    # Load CUAD QA (its loading script requires trust_remote_code)
    dataset = load_dataset(
        "theatticusproject/cuad-qa",
        trust_remote_code=True,
        token=os.getenv("HF_TOKEN", True)  # True falls back to the locally cached token
    )
    test_data = dataset["test"].select(range(min(num_samples, len(dataset["test"]))))

    # Load model
    model_name = "AvocadoMuffin/roberta-cuad-qa-v2"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForQuestionAnswering.from_pretrained(model_name)

    results = []
    for example in test_data:
        context = example["context"]
        question = example["question"]
        gt_answer = example["answers"]["text"][0] if example["answers"]["text"] else ""
        pred, conf = get_qa_confidence(model, tokenizer, question, context)
        results.append({
            "question": question[:100] + "..." if len(question) > 100 else question,
            "prediction": pred,
            "confidence": conf,
            "exact_match": exact_match_score(pred, gt_answer),
            "f1": f1_score_qa(pred, gt_answer),
            "ground_truth": gt_answer
        })

    # Generate report
    df = pd.DataFrame(results)
    report = f"""
Evaluation Results (n={len(df)})
=================
Exact Match: {df['exact_match'].mean():.1%}
F1 Score: {df['f1'].mean():.1%}
Avg Confidence: {df['confidence'].mean():.1%}
High-Confidence Accuracy: {df[df['confidence'] > 0.8]['exact_match'].mean():.1%}
"""

    # Save
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    results_file = f"eval_results_{timestamp}.json"
    with open(results_file, 'w') as f:
        json.dump({
            "config": {"model": model_name, "dataset": "cuad-qa"},
            "metrics": {
                "exact_match": float(df['exact_match'].mean()),
                "f1": float(df['f1'].mean()),
                "confidence": float(df['confidence'].mean())
            },
            "samples": results
        }, f, indent=2)

    return report, df, results_file


if __name__ == "__main__":
    report, df, _ = run_evaluation(num_samples=50)
    print(report)
    print("\nSample predictions:")
    print(df[["question", "confidence", "exact_match"]].head())
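
# A larger run can be launched the same way; an assumed example (num_samples is
# capped at the size of the CUAD test split inside run_evaluation):
#   report, df, results_file = run_evaluation(num_samples=200)
#   print(report)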