# eval_model/app.py
import os
import re
import json
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch
from collections import Counter
import string
import pandas as pd
from datetime import datetime

# Normalization functions (SQuAD-style answer normalization)
def normalize_answer(s):
    def remove_articles(text): return re.sub(r'\b(a|an|the)\b', ' ', text)
    def white_space_fix(text): return ' '.join(text.split())
    def remove_punc(text):
        return ''.join(ch for ch in text if ch not in set(string.punctuation))
    def lower(text): return text.lower()
    return white_space_fix(remove_articles(remove_punc(lower(s))))

# Metrics
def exact_match_score(pred, truth):
    return int(normalize_answer(pred) == normalize_answer(truth))

def f1_score_qa(pred, truth):
    pred_tokens = normalize_answer(pred).split()
    truth_tokens = normalize_answer(truth).split()
    common = Counter(pred_tokens) & Counter(truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(truth_tokens)
    return (2 * precision * recall) / (precision + recall)
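
# Quick sanity check for the metrics above (illustrative values, not drawn from CUAD):
#   exact_match_score("The Tenant", "tenant")                 -> 1    (articles, case, punctuation stripped)
#   f1_score_qa("governing law of Delaware", "Delaware law")  -> 0.67 (2 shared tokens, precision 0.5, recall 1.0)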

# Identical to the extractor's QA confidence computation
def get_qa_confidence(model, tokenizer, question, context):
    inputs = tokenizer(
        question, context,
        return_tensors="pt",
        truncation=True,
        max_length=512,
        stride=128,
        padding=True
    )
    if torch.cuda.is_available():
        inputs = {k: v.cuda() for k, v in inputs.items()}
        model = model.cuda()
    with torch.no_grad():
        outputs = model(**inputs)
    start_probs = torch.softmax(outputs.start_logits, dim=1)
    end_probs = torch.softmax(outputs.end_logits, dim=1)
    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits) + 1
    # Confidence = geometric mean of the start- and end-token probabilities
    confidence = np.sqrt(
        start_probs[0, answer_start].item() *
        end_probs[0, answer_end - 1].item()
    )
    answer_tokens = inputs["input_ids"][0][answer_start:answer_end]
    answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)
    return answer.strip(), float(confidence)
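
# Illustrative call (contract_text is a hypothetical variable; model and tokenizer are
# loaded the same way as in run_evaluation below):
#   answer, conf = get_qa_confidence(model, tokenizer,
#                                    "What is the governing law?", contract_text)
#   print(f"{answer} ({conf:.1%} confident)")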

def run_evaluation(num_samples=100):
    # Load CUAD-QA (the dataset script requires trust_remote_code)
    dataset = load_dataset(
        "theatticusproject/cuad-qa",
        trust_remote_code=True,
        token=os.getenv("HF_TOKEN") or None  # fall back to cached credentials / anonymous access
    )
    test_data = dataset["test"].select(range(min(num_samples, len(dataset["test"]))))

    # Load model
    model_name = "AvocadoMuffin/roberta-cuad-qa-v2"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForQuestionAnswering.from_pretrained(model_name)

    results = []
    for example in test_data:
        context = example["context"]
        question = example["question"]
        gt_answer = example["answers"]["text"][0] if example["answers"]["text"] else ""
        pred, conf = get_qa_confidence(model, tokenizer, question, context)
        results.append({
            "question": question[:100] + "..." if len(question) > 100 else question,
            "prediction": pred,
            "confidence": conf,
            "exact_match": exact_match_score(pred, gt_answer),
            "f1": f1_score_qa(pred, gt_answer),
            "ground_truth": gt_answer
        })

    # Generate report
    df = pd.DataFrame(results)
    high_conf = df[df["confidence"] > 0.8]
    report = f"""
Evaluation Results (n={len(df)})
=================
Exact Match:              {df['exact_match'].mean():.1%}
F1 Score:                 {df['f1'].mean():.1%}
Avg Confidence:           {df['confidence'].mean():.1%}
High-Confidence Accuracy: {high_conf['exact_match'].mean():.1%} (n={len(high_conf)})
"""

    # Save
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    results_file = f"eval_results_{timestamp}.json"
    with open(results_file, 'w') as f:
        json.dump({
            "config": {"model": model_name, "dataset": "cuad-qa"},
            "metrics": {
                "exact_match": float(df['exact_match'].mean()),
                "f1": float(df['f1'].mean()),
                "confidence": float(df['confidence'].mean())
            },
            "samples": results
        }, f, indent=2)
    return report, df, results_file

if __name__ == "__main__":
    report, df, _ = run_evaluation(num_samples=50)
    print(report)
    print("\nSample predictions:")
    print(df[["question", "confidence", "exact_match"]].head())