Nathan Brake, commit 7758a19: Add SquadQA metric, split out the files and support for ollama LLM-as-judge (#35)
from typing import Any, Dict, List

from surf_spot_finder.evaluation.evaluators.LLMEvaluator import LLMEvaluator
from surf_spot_finder.evaluation.evaluators.schemas import EvaluationResult
from surf_spot_finder.evaluation.test_case import CheckpointCriteria


class HypothesisEvaluator(LLMEvaluator):
    """Evaluates the final answer against ground truth."""

    def evaluate(
        self,
        hypothesis_final_answer: str,
        ground_truth_answer_dict: Dict[str, Any],
        ground_truth_checkpoints: List[CheckpointCriteria],
    ) -> List[EvaluationResult]:
        """Verify whether the final answer meets all specified criteria."""
        results: List[EvaluationResult] = []
        for criterion in ground_truth_checkpoints:
            # Score the hypothesis against this single checkpoint, using the
            # judge LLM inherited from LLMEvaluator.
            evaluation = self.llm_evaluate_with_criterion(
                criteria=criterion.criteria,
                points=criterion.points,
                ground_truth_output=ground_truth_answer_dict,
                hypothesis_final_answer=hypothesis_final_answer,
            )
            results.append(evaluation)
        return results
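
A minimal usage sketch follows. It assumes the `LLMEvaluator` base class can be constructed with a model identifier (its constructor is not shown in this file) and relies only on the `criteria` and `points` fields of `CheckpointCriteria` that the method above already uses; the model name and all example values are illustrative.

# Hypothetical usage sketch; the constructor argument below is an assumption,
# not confirmed by this file.
checkpoints = [
    CheckpointCriteria(
        criteria="The answer names a specific surf spot near the requested location.",
        points=2,
    ),
    CheckpointCriteria(
        criteria="The answer mentions the expected wave height.",
        points=1,
    ),
]

evaluator = HypothesisEvaluator(model="ollama/llama3.2")  # assumed signature
results = evaluator.evaluate(
    hypothesis_final_answer="Try Praia do Norte; waves around 2 m are expected.",
    ground_truth_answer_dict={"spot": "Praia do Norte", "wave_height_m": 2},
    ground_truth_checkpoints=checkpoints,
)

for result in results:
    print(result)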