Spaces:

AvocadoMuffin
/

eval_model

Running

App Files Files Community

eval_model / app.py

AvocadoMuffin

Update app.py

e010197 verified about 1 month ago

raw

history blame

4.49 kB

	import os
	import json
	import numpy as np
	from datasets import load_dataset
	from transformers import AutoTokenizer, AutoModelForQuestionAnswering
	import torch
	from collections import Counter
	import string
	import pandas as pd
	from datetime import datetime

	# Normalization functions
	def normalize_answer(s):
	    """Normalize an answer string for SQuAD-style comparison.

	    The text is lower-cased, stripped of ASCII punctuation, stripped of the
	    English articles "a", "an" and "the", and has its whitespace collapsed
	    to single spaces, in that order.

	    Args:
	        s: Raw answer text.

	    Returns:
	        The normalized text; an empty string if nothing is left.
	    """
	    # Function-scope import: the file's import block does not bring in `re`.
	    import re

	    lowered = s.lower()
	    # One C-level pass instead of rebuilding a punctuation set per call.
	    without_punct = lowered.translate(str.maketrans("", "", string.punctuation))
	    # Plain alternation; escaped pipes would only match a literal "a|an|the".
	    without_articles = re.sub(r"\b(a|an|the)\b", " ", without_punct)
	    return " ".join(without_articles.split())

	# Metrics
	def exact_match_score(pred, truth):
	    """Return 1 if prediction and truth are equal after normalization, else 0."""
	    normalized_pred = normalize_answer(pred)
	    normalized_truth = normalize_answer(truth)
	    return 1 if normalized_pred == normalized_truth else 0

	def f1_score_qa(pred, truth):
	    """Compute the token-level F1 between a predicted and a reference answer.

	    Both strings are normalized with ``normalize_answer`` and split on
	    whitespace; overlap is counted as a multiset intersection of tokens.

	    Args:
	        pred: Predicted answer text.
	        truth: Reference answer text ("" when the question has no answer).

	    Returns:
	        F1 as a float in [0.0, 1.0].
	    """
	    pred_tokens = normalize_answer(pred).split()
	    truth_tokens = normalize_answer(truth).split()

	    # No-answer handling (SQuAD v2 convention): if either side is empty the
	    # score is 1.0 only when both are empty. This keeps F1 consistent with
	    # exact_match_score, which already scores ""/"" as a match.
	    if not pred_tokens or not truth_tokens:
	        return float(pred_tokens == truth_tokens)

	    overlap = Counter(pred_tokens) & Counter(truth_tokens)
	    num_same = sum(overlap.values())
	    if num_same == 0:
	        return 0.0

	    precision = num_same / len(pred_tokens)
	    recall = num_same / len(truth_tokens)
	    return (2 * precision * recall) / (precision + recall)

	# Identical to extractor's QA confidence
	def get_qa_confidence(model, tokenizer, question, context):
	    """Extract an answer span and a confidence score for one question.

	    The question/context pair is encoded as a single sequence truncated to
	    512 tokens, the model is run without gradients, and the start and end
	    positions are chosen independently as the arg-max of their logits.

	    Args:
	        model: Callable returning an object with ``start_logits`` and
	            ``end_logits``; moved to CUDA when a GPU is available.
	        tokenizer: Callable producing ``input_ids`` tensors, with a
	            ``decode`` method.
	        question: Question text.
	        context: Passage to search for the answer.

	    Returns:
	        ``(answer, confidence)``: the decoded, stripped span text and the
	        geometric mean of the chosen start and end probabilities. The text
	        is empty when the chosen end position precedes the start position.
	    """
	    # NOTE(review): `stride` presumably has no effect here because overflowing
	    # windows are not requested -- confirm against the tokenizer docs.
	    encode_options = {
	        "return_tensors": "pt",
	        "truncation": True,
	        "max_length": 512,
	        "stride": 128,
	        "padding": True,
	    }
	    encoded = tokenizer(question, context, **encode_options)

	    if torch.cuda.is_available():
	        model = model.cuda()
	        encoded = {name: tensor.cuda() for name, tensor in encoded.items()}

	    with torch.no_grad():
	        outputs = model(**encoded)

	    start_logits = outputs.start_logits
	    end_logits = outputs.end_logits

	    # Independent arg-max for each boundary (no joint span search).
	    start_idx = torch.argmax(start_logits)
	    end_idx = torch.argmax(end_logits)

	    start_prob = torch.softmax(start_logits, dim=1)[0, start_idx].item()
	    end_prob = torch.softmax(end_logits, dim=1)[0, end_idx].item()
	    confidence = np.sqrt(start_prob * end_prob)

	    # The end index is inclusive, hence the +1 on the slice bound.
	    span_ids = encoded["input_ids"][0][start_idx:end_idx + 1]
	    text = tokenizer.decode(span_ids, skip_special_tokens=True)
	    return text.strip(), float(confidence)

	def run_evaluation(num_samples=100):
	    """Evaluate the CUAD QA model on the first ``num_samples`` test examples.

	    Loads the dataset and model, scores every example with exact match and
	    token F1 against the first reference answer ("" when there is none),
	    builds a text report, and writes a timestamped JSON results file to the
	    current working directory.

	    Args:
	        num_samples: Maximum number of test examples to score; capped at the
	            size of the test split.

	    Returns:
	        ``(report, df, results_file)``: the formatted report string, a
	        DataFrame with one row per example, and the JSON file name.

	    Raises:
	        ValueError: If ``num_samples`` is smaller than 1.
	    """
	    if num_samples < 1:
	        # An empty selection would otherwise fail later with an opaque
	        # KeyError on the empty DataFrame.
	        raise ValueError("num_samples must be at least 1")

	    # Load CUAD with remote code trust
	    dataset = load_dataset(
	        "theatticusproject/cuad-qa",
	        trust_remote_code=True,
	        # Pass None when HF_TOKEN is unset. `token=True` asks the hub client
	        # for a locally stored token, so it is not anonymous access.
	        token=os.getenv("HF_TOKEN"),
	    )
	    test_split = dataset["test"]
	    test_data = test_split.select(range(min(num_samples, len(test_split))))

	    # Load model
	    model_name = "AvocadoMuffin/roberta-cuad-qa-v2"
	    tokenizer = AutoTokenizer.from_pretrained(model_name)
	    model = AutoModelForQuestionAnswering.from_pretrained(model_name)

	    results = []
	    for example in test_data:
	        context = example["context"]
	        question = example["question"]
	        answer_texts = example["answers"]["text"]
	        gt_answer = answer_texts[0] if answer_texts else ""

	        pred, conf = get_qa_confidence(model, tokenizer, question, context)

	        results.append({
	            "question": (question[:100] + "...") if len(question) > 100 else question,
	            "prediction": pred,
	            "confidence": conf,
	            "exact_match": exact_match_score(pred, gt_answer),
	            "f1": f1_score_qa(pred, gt_answer),
	            "ground_truth": gt_answer,
	        })

	    # Generate report
	    df = pd.DataFrame(results)
	    # Compute each aggregate once and reuse it for the report and the JSON.
	    exact_match = float(df["exact_match"].mean())
	    f1 = float(df["f1"].mean())
	    avg_confidence = float(df["confidence"].mean())

	    # The mean of an empty selection is NaN; report that case explicitly.
	    high_conf = df.loc[df["confidence"] > 0.8, "exact_match"]
	    high_conf_text = "n/a" if high_conf.empty else f"{high_conf.mean():.1%}"

	    report = f"""
	Evaluation Results (n={len(df)})
	=================
	Exact Match: {exact_match:.1%}
	F1 Score: {f1:.1%}
	Avg Confidence: {avg_confidence:.1%}
	High-Confidence Accuracy: {high_conf_text}
	"""

	    # Save
	    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
	    results_file = f"eval_results_{timestamp}.json"
	    payload = {
	        "config": {"model": model_name, "dataset": "cuad-qa"},
	        "metrics": {
	            "exact_match": exact_match,
	            "f1": f1,
	            "confidence": avg_confidence,
	        },
	        "samples": results,
	    }
	    with open(results_file, "w", encoding="utf-8") as f:
	        json.dump(payload, f, indent=2)

	    return report, df, results_file

	if __name__ == "__main__":
	    # Script entry point: score 50 examples, then print the summary report
	    # followed by a short preview of the per-example results.
	    summary, frame, _results_path = run_evaluation(num_samples=50)
	    print(summary)
	    print("\nSample predictions:")
	    preview_columns = ["question", "confidence", "exact_match"]
	    print(frame[preview_columns].head())