| """ Official evaluation script for CUAD dataset. """ | |
| import argparse | |
| import json | |
| import re | |
| import string | |
| import sys | |
| import numpy as np | |
| IOU_THRESH = 0.5 | |


def get_jaccard(prediction, ground_truth):
    remove_tokens = [".", ",", ";", ":"]
    for token in remove_tokens:
        ground_truth = ground_truth.replace(token, "")
        prediction = prediction.replace(token, "")
    ground_truth, prediction = ground_truth.lower(), prediction.lower()
    ground_truth, prediction = ground_truth.replace("/", " "), prediction.replace("/", " ")
    ground_truth, prediction = set(ground_truth.split(" ")), set(prediction.split(" "))
    intersection = ground_truth.intersection(prediction)
    union = ground_truth.union(prediction)
    jaccard = len(intersection) / len(union)
    return jaccard
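# Worked example of the token-level Jaccard above:
# get_jaccard("Company A; Company B", "Company A") compares {"company", "a", "b"} with
# {"company", "a"}, giving 2 / 3, which clears IOU_THRESH.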


def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""

    def remove_articles(text):
        return re.sub(r"\b(a|an|the)\b", " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))
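# Worked example of the normalization above:
# normalize_answer("The Supplier, Inc.") -> "supplier inc"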


def compute_precision_recall(predictions, ground_truths, qa_id):
    tp, fp, fn = 0, 0, 0

    substr_ok = "Parties" in qa_id

    # first check if ground truth is empty
    if len(ground_truths) == 0:
        if len(predictions) > 0:
            fp += len(predictions)  # false positive for each one
    else:
        for ground_truth in ground_truths:
            assert len(ground_truth) > 0
            # check if there is a match
            match_found = False
            for pred in predictions:
                if substr_ok:
                    is_match = get_jaccard(pred, ground_truth) >= IOU_THRESH or ground_truth in pred
                else:
                    is_match = get_jaccard(pred, ground_truth) >= IOU_THRESH
                if is_match:
                    match_found = True

            if match_found:
                tp += 1
            else:
                fn += 1

        # now also get any fps by looping through preds
        for pred in predictions:
            # Check if there's a match. If so, don't count it (don't want to double count based on the above),
            # but if there's no match, then this is a false positive.
            # (Note: we get the true positives in the above loop instead of this loop so that we don't double count
            # multiple predictions that are matched with the same answer.)
            match_found = False
            for ground_truth in ground_truths:
                assert len(ground_truth) > 0
                if substr_ok:
                    is_match = get_jaccard(pred, ground_truth) >= IOU_THRESH or ground_truth in pred
                else:
                    is_match = get_jaccard(pred, ground_truth) >= IOU_THRESH
                if is_match:
                    match_found = True

            if not match_found:
                fp += 1

    precision = tp / (tp + fp) if tp + fp > 0 else np.nan
    recall = tp / (tp + fn) if tp + fn > 0 else np.nan

    return precision, recall
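# Example of the counting above: one ground truth and two predictions, exactly one of which
# matches, gives tp=1, fn=0, fp=1, i.e. precision 0.5 and recall 1.0.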


def process_precisions(precisions):
    """
    Processes precisions to ensure that precision and recall don't both get worse.
    Assumes the list of precisions is sorted in order of increasing recall.
    """
    precision_best = precisions[::-1]
    for i in range(1, len(precision_best)):
        precision_best[i] = max(precision_best[i - 1], precision_best[i])
    precisions = precision_best[::-1]
    return precisions
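# This is the usual interpolated-precision envelope: a running maximum taken from the
# high-recall end. For example, [0.9, 0.4, 0.6, 0.5] (ordered by increasing recall)
# becomes [0.9, 0.6, 0.6, 0.5].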


def get_aupr(precisions, recalls):
    processed_precisions = process_precisions(precisions)
    aupr = np.trapz(processed_precisions, recalls)
    if np.isnan(aupr):
        return 0
    return aupr
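# np.trapz integrates the interpolated precision over recall with the trapezoidal rule,
# i.e. the area under the precision-recall curve (AUPR).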


def get_prec_at_recall(precisions, recalls, recall_thresh):
    """Assumes recalls are sorted in increasing order"""
    processed_precisions = process_precisions(precisions)
    prec_at_recall = 0
    for prec, recall in zip(processed_precisions, recalls):
        if recall >= recall_thresh:
            prec_at_recall = prec
            break
    return prec_at_recall
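# Returns the interpolated precision at the lowest recall value that reaches recall_thresh,
# or 0 if no question reaches that recall.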


def exact_match_score(prediction, ground_truth):
    return normalize_answer(prediction) == normalize_answer(ground_truth)


def metric_max_over_ground_truths(metric_fn, predictions, ground_truths):
    score = 0
    for pred in predictions:
        for ground_truth in ground_truths:
            score = metric_fn(pred, ground_truth)
            if score == 1:  # break the loop when one prediction matches the ground truth
                break
        if score == 1:
            break
    return score


def compute_score(dataset, predictions):
    f1 = exact_match = total = 0
    precisions = []
    recalls = []
    for article in dataset:
        for paragraph in article["paragraphs"]:
            for qa in paragraph["qas"]:
                total += 1
                if qa["id"] not in predictions:
                    message = "Unanswered question " + qa["id"] + " will receive score 0."
                    print(message, file=sys.stderr)
                    continue

                ground_truths = list(map(lambda x: x["text"], qa["answers"]))
                prediction = predictions[qa["id"]]

                precision, recall = compute_precision_recall(prediction, ground_truths, qa["id"])
                precisions.append(precision)
                recalls.append(recall)

                if precision == 0 and recall == 0:
                    f1 += 0
                else:
                    f1 += 2 * (precision * recall) / (precision + recall)

                exact_match += metric_max_over_ground_truths(exact_match_score, prediction, ground_truths)

    precisions = [x for _, x in sorted(zip(recalls, precisions))]
    recalls.sort()

    f1 = 100.0 * f1 / total
    exact_match = 100.0 * exact_match / total
    aupr = get_aupr(precisions, recalls)

    prec_at_90_recall = get_prec_at_recall(precisions, recalls, recall_thresh=0.9)
    prec_at_80_recall = get_prec_at_recall(precisions, recalls, recall_thresh=0.8)

    return {
        "exact_match": exact_match,
        "f1": f1,
        "aupr": aupr,
        "prec_at_80_recall": prec_at_80_recall,
        "prec_at_90_recall": prec_at_90_recall,
    }
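# `dataset` is the "data" list of a SQuAD-style CUAD JSON file; `predictions` maps each
# qa["id"] to a list of predicted clause strings for that question (an empty list means
# no clause was extracted).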


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Evaluation for CUAD")
    parser.add_argument("dataset_file", help="Dataset file")
    parser.add_argument("prediction_file", help="Prediction File")
    args = parser.parse_args()

    with open(args.dataset_file) as dataset_file:
        dataset_json = json.load(dataset_file)
        dataset = dataset_json["data"]

    with open(args.prediction_file) as prediction_file:
        predictions = json.load(prediction_file)

    print(json.dumps(compute_score(dataset, predictions)))
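# Example invocation (a sketch; the script and file names below are placeholders):
#   python evaluate.py cuad_test.json predictions.json
# where cuad_test.json holds the CUAD data under a top-level "data" key and predictions.json
# looks like {"<qa_id>": ["predicted clause text", "..."], ...}.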