from typing import List

import evaluate

from surf_spot_finder.evaluation.evaluators.schemas import EvaluationResult


class QuestionAnsweringSquadEvaluator:
    """Directly compares answers without using LLM-as-judge."""

    def __init__(self):
        self.metric = evaluate.load("squad")

    def evaluate(
        self, hypothesis_answer: str, ground_truth_answer: list
    ) -> List[EvaluationResult]:
        """Directly compare answers using simple matching."""
        # The SQuAD metric expects predictions as dicts with "id" and
        # "prediction_text" keys, and references as dicts with "id" and
        # "answers" keys.
        predictions = [{"id": "1", "prediction_text": hypothesis_answer}]
        references = [
            {
                "id": "1",
                "answers": {
                    "answer_start": [0],
                    "text": [ground_truth_answer[0]["value"]],
                },
            }
        ]

        # Use the SQuAD metric to compare answers
        result = self.metric.compute(
            predictions=predictions, references=references
        )
        match = EvaluationResult(
            # exact_match is reported as a percentage (0-100), so a single
            # example scores either 100 (exact match) or 0.
            passed=result["exact_match"] == 100,
            reason=f"Partial Match (F1) score is {round(result['f1'], 2)}",
            criteria="Is the answer a direct match?",
            points=1,
        )
        return [match]
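

if __name__ == "__main__":
    # Minimal usage sketch (illustrative, not part of the original module).
    # The answer text "Ocean Beach" is a made-up example, and the ground-truth
    # format is inferred from how evaluate() reads ground_truth_answer[0]["value"].
    evaluator = QuestionAnsweringSquadEvaluator()
    results = evaluator.evaluate(
        hypothesis_answer="Ocean Beach",
        ground_truth_answer=[{"value": "Ocean Beach"}],
    )
    print(results[0].passed, results[0].reason)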