File size: 1,065 Bytes
7758a19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
from typing import Dict, List, Any
from surf_spot_finder.evaluation.evaluators.LLMEvaluator import LLMEvaluator
from surf_spot_finder.evaluation.evaluators.schemas import EvaluationResult
from surf_spot_finder.evaluation.test_case import CheckpointCriteria


class HypothesisEvaluator(LLMEvaluator):
    """Checks a hypothesis final answer against ground-truth checkpoints."""

    def evaluate(
        self,
        hypothesis_final_answer: str,
        ground_truth_answer_dict: Dict[str, Any],
        ground_truth_checkpoints: List[CheckpointCriteria],
    ) -> List[EvaluationResult]:
        """Run one criterion-based evaluation per ground-truth checkpoint.

        Args:
            hypothesis_final_answer: The final answer text being judged.
            ground_truth_answer_dict: The reference answer, forwarded
                unchanged to every per-checkpoint evaluation as
                ``ground_truth_output``.
            ground_truth_checkpoints: The checkpoints to judge against; each
                one supplies a ``criteria`` and a ``points`` attribute.

        Returns:
            The values returned by ``self.llm_evaluate_with_criterion``, one
            per checkpoint and in the same order as
            ``ground_truth_checkpoints``. An empty checkpoint list yields an
            empty list.
        """
        # These two arguments do not vary between checkpoints.
        # NOTE(review): llm_evaluate_with_criterion is not defined in this
        # class; presumably inherited from LLMEvaluator -- confirm.
        shared_inputs = {
            "ground_truth_output": ground_truth_answer_dict,
            "hypothesis_final_answer": hypothesis_final_answer,
        }

        return [
            self.llm_evaluate_with_criterion(
                criteria=checkpoint.criteria,
                points=checkpoint.points,
                **shared_inputs,
            )
            for checkpoint in ground_truth_checkpoints
        ]