surf-spot-finder/src/surf_spot_finder/evaluation/evaluators/QuestionAnsweringSquadEvaluator.py
from typing import List
import evaluate
from surf_spot_finder.evaluation.evaluators.schemas import EvaluationResult


class QuestionAnsweringSquadEvaluator:
    """Directly compares answers to the ground truth without using an LLM-as-judge."""

    def __init__(self):
        self.metric = evaluate.load("squad")

    def evaluate(
        self, hypothesis_answer: str, ground_truth_answer: list
    ) -> List[EvaluationResult]:
        """Directly compare answers using simple string matching."""
        # The SQuAD metric expects the prediction as a dict with "id" and
        # "prediction_text" keys, and the reference as a dict with "id" and
        # "answers" keys.
        hypothesis_answer = [{"id": "1", "prediction_text": hypothesis_answer}]
        ground_truth_answer = [
            {
                "id": "1",
                "answers": {
                    "answer_start": [0],
                    "text": [ground_truth_answer[0]["value"]],
                },
            }
        ]
        # Use the SQuAD metric to compare answers
        result = self.metric.compute(
            predictions=hypothesis_answer, references=ground_truth_answer
        )
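        # Illustrative shape of the SQuAD metric output (assumed from the
        # `evaluate` library's documented behavior): scores are on a 0-100
        # scale, e.g. {"exact_match": 100.0, "f1": 100.0} for a perfect match.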
        match = EvaluationResult(
            # exact_match is reported on a 0-100 scale, so a perfect match is 100.
            passed=result["exact_match"] == 100,
            reason=f"Partial Match (F1) score is {round(result['f1'], 2)}",
            criteria="Is the answer a direct match?",
            points=1,
        )
        return [match]
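

# Illustrative usage sketch (not part of the original file). It assumes the
# ground-truth entries are dicts carrying the expected text under a "value"
# key, since evaluate() above indexes ground_truth_answer[0]["value"].
#
#   evaluator = QuestionAnsweringSquadEvaluator()
#   results = evaluator.evaluate(
#       hypothesis_answer="Praia do Norte",
#       ground_truth_answer=[{"value": "Praia do Norte"}],
#   )
#   print(results[0].passed, results[0].reason)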