Spaces:
Running
Running
File size: 1,065 Bytes
7758a19 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 |
from typing import Dict, List, Any
from surf_spot_finder.evaluation.evaluators.LLMEvaluator import LLMEvaluator
from surf_spot_finder.evaluation.evaluators.schemas import EvaluationResult
from surf_spot_finder.evaluation.test_case import CheckpointCriteria
class HypothesisEvaluator(LLMEvaluator):
    """Check a hypothesis final answer against ground-truth checkpoints.

    The per-checkpoint judgement is delegated to
    ``self.llm_evaluate_with_criterion``, which is not defined in this class
    (presumably inherited from ``LLMEvaluator`` -- confirm against the base).
    """

    def evaluate(
        self,
        hypothesis_final_answer: str,
        ground_truth_answer_dict: Dict[str, Any],
        ground_truth_checkpoints: List[CheckpointCriteria],
    ) -> List[EvaluationResult]:
        """Evaluate the final answer once for every checkpoint.

        Args:
            hypothesis_final_answer: The answer under evaluation; forwarded
                unchanged to the evaluator.
            ground_truth_answer_dict: Reference answer; forwarded unchanged
                as ``ground_truth_output``.
            ground_truth_checkpoints: Checkpoints whose ``criteria`` and
                ``points`` attributes are passed to the evaluator.

        Returns:
            One evaluation per checkpoint, in the same order as
            ``ground_truth_checkpoints``. The list is empty when no
            checkpoints are supplied.
        """
        return [
            self.llm_evaluate_with_criterion(
                criteria=checkpoint.criteria,
                points=checkpoint.points,
                ground_truth_output=ground_truth_answer_dict,
                hypothesis_final_answer=hypothesis_final_answer,
            )
            for checkpoint in ground_truth_checkpoints
        ]
|