from typing import List
import evaluate
from surf_spot_finder.evaluation.evaluators.schemas import EvaluationResult


class QuestionAnsweringSquadEvaluator:
    """Directly compares answers without using LLM-as-judge"""

    def __init__(self):
        self.metric = evaluate.load("squad")

    def evaluate(
        self, hypothesis_answer: str, ground_truth_answer: List[dict]
    ) -> List[EvaluationResult]:
        """Directly compare answers using the SQuAD exact-match and F1 metrics."""

        # The SQuAD metric expects each prediction as a dict with "id" and
        # "prediction_text" keys, and each reference as a dict with "id" and
        # "answers" (holding "answer_start" and "text" lists) keys.
        predictions = [{"id": "1", "prediction_text": hypothesis_answer}]
        references = [
            {
                "id": "1",
                "answers": {
                    "answer_start": [0],
                    "text": [ground_truth_answer[0]["value"]],
                },
            }
        ]
        # Use the SQuAD metric to compare the prediction with the reference
        result = self.metric.compute(predictions=predictions, references=references)

        # The SQuAD metric reports exact_match and f1 on a 0-100 scale
        match = EvaluationResult(
            passed=int(result["exact_match"]) == 100,
            reason=f"Partial Match (F1) score is {round(result['f1'], 2)}",
            criteria="Is the answer a direct match?",
            points=1,
        )
        return [match]
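

# Example usage sketch: assumes the HuggingFace "evaluate" package is installed
# and that EvaluationResult exposes the passed/reason/criteria/points fields
# used above; the ground-truth format ([{"value": ...}]) is inferred from the
# indexing in evaluate().
if __name__ == "__main__":
    evaluator = QuestionAnsweringSquadEvaluator()
    results = evaluator.evaluate(
        hypothesis_answer="Ocean Beach",
        ground_truth_answer=[{"value": "Ocean Beach"}],
    )
    for res in results:
        print(res.passed, res.reason)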