Nathan Brake committed on
Commit 7758a19 · unverified · 1 Parent(s): 94a64b0

Add SquadQA metric, split out the files and support for ollama LLM-as-judge (#35)

* Split out the files and support for ollama LLM-as-judge
* lint
* no test case
* rename to better answer
pyproject.toml CHANGED
@@ -53,6 +53,7 @@ tests = [
     "pytest>=8,<9",
     "pytest-sugar>=0.9.6",
     "debugpy>=1.8.13",
+    "evaluate>=0.4.3",
 ]

 # TODO maybe we don't want to keep this, or we want to swap this to Lumigator SDK
src/surf_spot_finder/evaluation/evaluate.py CHANGED
@@ -11,9 +11,10 @@ from surf_spot_finder.config import (
     Config,
 )
 from surf_spot_finder.evaluation.telemetry import TelemetryProcessor
-from surf_spot_finder.evaluation.utils import (
-    verify_checkpoints,
-    verify_hypothesis_answer,
+from surf_spot_finder.evaluation.evaluators import (
+    CheckpointEvaluator,
+    QuestionAnsweringSquadEvaluator,
+    HypothesisEvaluator,
 )
 from surf_spot_finder.evaluation.test_case import TestCase

@@ -55,28 +56,39 @@ def evaluate_telemetry(test_case: TestCase, telemetry_path: str) -> bool:
     # Extract the final answer from the telemetry
     processor = TelemetryProcessor.create(agent_type)
     hypothesis_answer = processor.extract_hypothesis_answer(trace=telemetry)
-    # Verify agent behavior against checkpoints using llm-as-a-judge
-    llm_judge = "openai/gpt-4o"
-    checkpoint_results = verify_checkpoints(
+
+    # Checkpoint evaluation
+    checkpoint_evaluator = CheckpointEvaluator(model=test_case.llm_judge)
+    checkpoint_results = checkpoint_evaluator.evaluate(
         telemetry=telemetry,
         checkpoints=test_case.checkpoints,
-        model=llm_judge,
         processor=processor,
     )

-    hypothesis_answer_results = verify_hypothesis_answer(
+    # Hypothesis answer evaluation
+    hypothesis_evaluator = HypothesisEvaluator(model=test_case.llm_judge)
+    hypothesis_answer_results = hypothesis_evaluator.evaluate(
         hypothesis_final_answer=hypothesis_answer,
         ground_truth_answer_dict=test_case.ground_truth,
         ground_truth_checkpoints=test_case.final_answer_criteria,
-        model=llm_judge,
+    )
+
+    # Direct answer evaluation (new)
+    direct_evaluator = QuestionAnsweringSquadEvaluator()
+    direct_results = direct_evaluator.evaluate(
+        hypothesis_answer=hypothesis_answer,
+        ground_truth_answer=test_case.ground_truth,
+    )
+
+    # Combine all results
+    verification_results = (
+        checkpoint_results + hypothesis_answer_results + direct_results
     )
     # Summarize results
     output_message = ""
     output_message += (
         f"""<yellow>Hypothesis Final answer extracted: {hypothesis_answer}</yellow>\n"""
     )
-
-    verification_results = checkpoint_results + hypothesis_answer_results
     failed_checks = [r for r in verification_results if not r.passed]
     passed_checks = [r for r in verification_results if r.passed]
     missed_points = sum([r.points for r in failed_checks])
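
For reference, the combined `verification_results` is a flat list of `EvaluationResult` objects (schema added below), so the summary logic in evaluate.py reduces to filtering and point sums. A small self-contained sketch of that tallying, with made-up results standing in for the three evaluators' outputs; `earned_points` is an illustrative name, not from the diff.

```python
from surf_spot_finder.evaluation.evaluators.schemas import EvaluationResult

# Hypothetical results for illustration only.
verification_results = [
    EvaluationResult(passed=True, reason="tool call present in trace",
                     criteria="Check if the agent used the get_area_lat_lon tool", points=1),
    EvaluationResult(passed=False, reason="no weather description found",
                     criteria="Check if the final answer describes the weather", points=1),
]

failed_checks = [r for r in verification_results if not r.passed]
passed_checks = [r for r in verification_results if r.passed]
missed_points = sum(r.points for r in failed_checks)
earned_points = sum(r.points for r in passed_checks)
print(f"{len(passed_checks)}/{len(verification_results)} checks passed, "
      f"{earned_points} points earned, {missed_points} missed")
```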
src/surf_spot_finder/evaluation/evaluators/CheckpointEvaluator.py ADDED
@@ -0,0 +1,40 @@
+from typing import Dict, List, Any
+
+from surf_spot_finder.evaluation.evaluators.LLMEvaluator import LLMEvaluator
+from surf_spot_finder.evaluation.evaluators.schemas import EvaluationResult
+from surf_spot_finder.evaluation.telemetry import TelemetryProcessor
+from surf_spot_finder.evaluation.test_case import CheckpointCriteria
+
+
+class CheckpointEvaluator(LLMEvaluator):
+    """Evaluates checkpoints against telemetry"""
+
+    def evaluate(
+        self,
+        telemetry: List[Dict[str, Any]],
+        checkpoints: List[CheckpointCriteria],
+        processor: TelemetryProcessor,
+    ) -> List[EvaluationResult]:
+        """
+        Verify each checkpoint against the telemetry data using LLM
+
+        Args:
+            telemetry: The telemetry data to evaluate
+            checkpoints: List of checkpoint criteria to verify
+            processor: Telemetry processor to extract evidence
+
+        Returns:
+            List of evaluation results
+        """
+        evidence = processor.extract_evidence(telemetry)
+        results = []
+
+        for checkpoint in checkpoints:
+            evaluation = self.llm_evaluate_with_criterion(
+                criteria=checkpoint.criteria,
+                points=checkpoint.points,
+                evidence=evidence,
+            )
+            results.append(evaluation)
+
+        return results
src/surf_spot_finder/evaluation/evaluators/HypothesisEvaluator.py ADDED
@@ -0,0 +1,29 @@
+from typing import Dict, List, Any
+from surf_spot_finder.evaluation.evaluators.LLMEvaluator import LLMEvaluator
+from surf_spot_finder.evaluation.evaluators.schemas import EvaluationResult
+from surf_spot_finder.evaluation.test_case import CheckpointCriteria
+
+
+class HypothesisEvaluator(LLMEvaluator):
+    """Evaluates the final answer against ground truth"""
+
+    def evaluate(
+        self,
+        hypothesis_final_answer: str,
+        ground_truth_answer_dict: Dict[str, Any],
+        ground_truth_checkpoints: List[CheckpointCriteria],
+    ) -> List[EvaluationResult]:
+        """Verify if the final answer meets all specified criteria"""
+        results = []
+
+        for criterion in ground_truth_checkpoints:
+            evaluation = self.llm_evaluate_with_criterion(
+                criteria=criterion.criteria,
+                points=criterion.points,
+                ground_truth_output=ground_truth_answer_dict,
+                hypothesis_final_answer=hypothesis_final_answer,
+            )
+
+            results.append(evaluation)
+
+        return results
src/surf_spot_finder/evaluation/evaluators/LLMEvaluator.py ADDED
@@ -0,0 +1,95 @@
+from abc import ABC
+import json
+import re
+from typing import Dict, List, Any, Optional, Union
+from textwrap import dedent
+
+from litellm import completion
+from surf_spot_finder.evaluation.evaluators.schemas import EvaluationResult
+from surf_spot_finder.evaluation.test_case import CheckpointCriteria
+
+
+class LLMEvaluator(ABC):
+    """Base class for evaluators that use LLM-as-judge"""
+
+    def __init__(self, model: str):
+        self.model = model
+
+    def llm_evaluate_with_criterion(
+        self,
+        criteria: str,
+        points: int,
+        ground_truth_output: Optional[
+            Union[List[CheckpointCriteria], Dict[str, Any]]
+        ] = None,
+        hypothesis_final_answer: Optional[str] = None,
+        evidence: Optional[str] = None,
+    ) -> EvaluationResult:
+        """Evaluate a single criterion using LLM"""
+
+        prompt = dedent(f"""
+        Evaluate if the following criterion was met {"based on the provided evidence" if evidence else "in the agent's answer"}.
+
+        Criterion: {criteria}
+        """)
+
+        if ground_truth_output:
+            prompt += dedent(f"""
+            Expected output: {json.dumps(ground_truth_output)}
+            """)
+        if hypothesis_final_answer:
+            prompt += dedent(f"""
+            Agent's answer: {hypothesis_final_answer}
+            """)
+
+        if evidence:
+            prompt += dedent(f"""
+            Telemetry evidence:
+            {evidence}
+            """)
+
+        prompt += f"""
+
+        Based on the {"evidence" if evidence else "comparison between the expected output and the actual final answer"},
+        was this criterion satisfied? Answer with:
+        1. "passed": true or false
+        2. "reason": Brief explanation for your decision
+        """
+        prompt += """
+        Output valid JSON with these three fields only, in the format:
+        ```json
+        {
+        "passed": true,
+        "reason": "I have them"
+        }
+        ```
+        """
+
+        response = completion(
+            model=self.model, messages=[{"role": "user", "content": prompt}]
+        )
+        content = response.choices[0].message.content
+
+        try:
+            # Extract JSON from the response - looks for patterns like ```json {...} ``` or just {...}
+            json_match = re.search(
+                r"```(?:json)?\s*(\{.*?\})\s*```|(\{.*?\})", content, re.DOTALL
+            )
+
+            if json_match:
+                # Use the first matching group that captured content
+                json_str = next(group for group in json_match.groups() if group)
+                evaluation = json.loads(json_str)
+            else:
+                # Fallback: try parsing the whole content as JSON
+                evaluation = json.loads(content)
+
+            evaluation["criteria"] = criteria
+        except (json.JSONDecodeError, AttributeError, StopIteration) as e:
+            evaluation = {
+                "passed": False,
+                "reason": f"Failed to evaluate due to parsing: {str(e)} \n Response: {content}",
+                "criteria": criteria,
+            }
+        evaluation["points"] = points
+        return EvaluationResult.model_validate(evaluation)
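
The base class keeps all prompt construction and JSON parsing in `llm_evaluate_with_criterion`, so a new evaluator only needs to decide what to feed it. A hypothetical subclass as a sketch of that extension point: `ToolUsageEvaluator`, its criterion string, and the sample tool log are invented for illustration and are not part of this commit.

```python
from typing import List

from surf_spot_finder.evaluation.evaluators.LLMEvaluator import LLMEvaluator
from surf_spot_finder.evaluation.evaluators.schemas import EvaluationResult


class ToolUsageEvaluator(LLMEvaluator):
    """Hypothetical evaluator: judges a raw tool-call log against one fixed criterion."""

    def evaluate(self, tool_log: str) -> List[EvaluationResult]:
        result = self.llm_evaluate_with_criterion(
            criteria="Check that every tool call returned without an error",
            points=1,
            evidence=tool_log,  # reuses the same evidence-based prompt branch
        )
        return [result]


# Usage mirrors the other evaluators: the judge model comes from the test case.
# evaluator = ToolUsageEvaluator(model="ollama/gemma3:4b-it-fp16")
# results = evaluator.evaluate(tool_log="get_area_lat_lon -> 43.37, -8.44 (ok)")
```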
src/surf_spot_finder/evaluation/evaluators/QuestionAnsweringSquadEvaluator.py ADDED
@@ -0,0 +1,41 @@
+from typing import List
+import evaluate
+from surf_spot_finder.evaluation.evaluators.schemas import EvaluationResult
+
+
+class QuestionAnsweringSquadEvaluator:
+    """Directly compares answers without using LLM-as-judge"""
+
+    def __init__(self):
+        self.metric = evaluate.load("squad")
+
+    def evaluate(
+        self, hypothesis_answer: str, ground_truth_answer: list
+    ) -> List[EvaluationResult]:
+        """Directly compare answers using simple matching"""
+
+        # format the answers so that they're dicts with 'id' and 'prediction' keys for hypo
+        # and the ref has id and answers keys
+        hypothesis_answer = [{"id": "1", "prediction_text": hypothesis_answer}]
+        ground_truth_answer = [
+            {
+                "id": "1",
+                "answers": {
+                    "answer_start": [0],
+                    "text": [ground_truth_answer[0]["value"]],
+                },
+            }
+        ]
+        # Use the SQuAD metric to compare answers
+        result = self.metric.compute(
+            predictions=hypothesis_answer, references=ground_truth_answer
+        )
+        print(result)
+
+        match = EvaluationResult(
+            passed=True if int(result["exact_match"]) == 1 else False,
+            reason=f"Partial Match (F1) score is {round(result['f1'], 2)}",
+            criteria="Is the answer a direct match?",
+            points=1,
+        )
+        return [match]
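
For context on the SQuAD metric used here: `evaluate.load("squad")` expects the prediction/reference dict shapes built above and reports `exact_match` and `f1` on a 0-100 scale. A standalone sketch of the inputs and output shape (requires the `evaluate` dependency added to pyproject.toml above; the answer strings are made up):

```python
import evaluate

squad = evaluate.load("squad")
predictions = [{"id": "1", "prediction_text": "Praia de Pantin"}]
references = [{"id": "1", "answers": {"answer_start": [0], "text": ["Praia de Pantin"]}}]

# Both scores are percentages, e.g. {'exact_match': 100.0, 'f1': 100.0}
result = squad.compute(predictions=predictions, references=references)
print(result)
```

Note that because the scores are percentages, an exact match on a single example comes back as 100.0 rather than 1.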
src/surf_spot_finder/evaluation/evaluators/__init__.py ADDED
@@ -0,0 +1,9 @@
+from .CheckpointEvaluator import CheckpointEvaluator
+from .QuestionAnsweringSquadEvaluator import QuestionAnsweringSquadEvaluator
+from .HypothesisEvaluator import HypothesisEvaluator
+
+__all__ = [
+    "CheckpointEvaluator",
+    "QuestionAnsweringSquadEvaluator",
+    "HypothesisEvaluator",
+]
src/surf_spot_finder/evaluation/evaluators/schemas.py ADDED
@@ -0,0 +1,11 @@
+from pydantic import BaseModel, ConfigDict
+
+
+class EvaluationResult(BaseModel):
+    """Represents the result of evaluating a criterion"""
+
+    model_config = ConfigDict(extra="forbid")
+    passed: bool
+    reason: str
+    criteria: str
+    points: int
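
A quick illustration of how this schema behaves with `extra="forbid"`: unknown keys raise a `ValidationError` rather than being silently dropped, so anything validated through `EvaluationResult.model_validate` must carry exactly these four fields. The field values below are made up.

```python
from pydantic import ValidationError

from surf_spot_finder.evaluation.evaluators.schemas import EvaluationResult

result = EvaluationResult.model_validate(
    {"passed": True, "reason": "criterion met", "criteria": "used the weather tool", "points": 1}
)

try:
    EvaluationResult.model_validate({**result.model_dump(), "score": 0.9})
except ValidationError as err:
    print(err)  # "score" is rejected because extra fields are forbidden
```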
src/surf_spot_finder/evaluation/test_case.py CHANGED
@@ -1,6 +1,7 @@
 from typing import Dict, List, Any
 from pydantic import BaseModel, Field, ConfigDict
 import yaml
+from litellm import validate_environment


 class InputModel(BaseModel):
@@ -26,6 +27,7 @@ class TestCase(BaseModel):
     input: InputModel
     ground_truth: List[Dict[str, Any]] = Field(default_factory=list)
     checkpoints: List[CheckpointCriteria] = Field(default_factory=list)
+    llm_judge: str
     final_answer_criteria: List[CheckpointCriteria] = Field(default_factory=list)
     test_case_path: str
     output_path: str = "output/results.json"
@@ -59,5 +61,6 @@
         ]

         test_case_dict["test_case_path"] = test_case_path
-
+        # verify that the llm_judge is a valid litellm model
+        validate_environment(test_case_dict["llm_judge"])
         return cls.model_validate(test_case_dict)
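
For reference, `litellm.validate_environment` takes a model string and reports which provider API keys are required and whether they are present in the environment; it does not call the model itself. A hedged sketch of what the check added above sees for the judge configured in alpha.yaml (the printed dict is an example of the expected shape, not captured output):

```python
from litellm import validate_environment

# Ollama models are served locally, so no provider API key should be reported missing.
check = validate_environment("ollama/gemma3:4b-it-fp16")
print(check)  # e.g. {'keys_in_environment': True, 'missing_keys': []}

# For a hosted judge such as "openai/gpt-4o", missing_keys would list
# OPENAI_API_KEY when it is not set in the environment.
```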
src/surf_spot_finder/evaluation/test_cases/alpha.yaml CHANGED
@@ -30,3 +30,6 @@ checkpoints:
     criteria: "Check if the agent used the get_area_lat_lon tool and it succeeded"
   - points: 1
     criteria: "Check if the final answer contains any description about the weather at the chosen location"
+
+
+llm_judge: "ollama/gemma3:4b-it-fp16"
src/surf_spot_finder/evaluation/utils.py DELETED
@@ -1,151 +0,0 @@
-import json
-from typing import Dict, List, Any, Optional
-import re
-
-from litellm import completion
-from textwrap import dedent
-
-from pydantic import BaseModel, ConfigDict
-from surf_spot_finder.evaluation.telemetry import TelemetryProcessor
-from surf_spot_finder.evaluation.test_case import CheckpointCriteria
-
-
-class EvaluationResult(BaseModel):
-    """Represents the result of evaluating a criterion"""
-
-    model_config = ConfigDict(extra="forbid")
-    passed: bool
-    reason: str
-    criteria: str
-    points: int
-
-
-def evaluate_criterion(
-    criteria: str,
-    model: str,
-    points: int,
-    ground_truth_output: Optional[List[CheckpointCriteria] | Dict[str, Any]] = None,
-    hypothesis_final_answer: Optional[str] = None,
-    evidence: Optional[str] = None,
-) -> EvaluationResult:
-    """Evaluate a single criterion using LLM"""
-
-    prompt = dedent(f"""
-    Evaluate if the following criterion was met {"based on the provided evidence" if evidence else "in the agent's answer"}.
-
-    Criterion: {criteria}
-    """)
-
-    if ground_truth_output:
-        prompt += dedent(f"""
-        Expected output: {json.dumps(ground_truth_output)}
-        """)
-    if hypothesis_final_answer:
-        prompt += dedent(f"""
-        Agent's answer: {hypothesis_final_answer}
-        """)
-
-    if evidence:
-        prompt += dedent(f"""
-        Telemetry evidence:
-        {evidence}
-        """)
-
-    prompt += f"""
-
-    Based on the {"evidence" if evidence else "comparison between the expected output and the actual final answer"},
-    was this criterion satisfied? Answer with:
-    1. "passed": true or false
-    2. "reason": Brief explanation for your decision
-    """
-    prompt += """
-    Output valid JSON with these three fields only, in the format:
-    ```json
-    {
-    "passed": true,
-    "reason": "I have them"
-    }
-    ```
-    """
-
-    response = completion(model=model, messages=[{"role": "user", "content": prompt}])
-
-    content = response.choices[0].message.content
-    try:
-        # Extract JSON from the response - looks for patterns like ```json {...} ``` or just {...}
-        # Claude helped me with this one, regex is hard
-        json_match = re.search(
-            r"```(?:json)?\s*(\{.*?\})\s*```|(\{.*?\})", content, re.DOTALL
-        )
-
-        if json_match:
-            # Use the first matching group that captured content
-            json_str = next(group for group in json_match.groups() if group)
-            evaluation = json.loads(json_str)
-        else:
-            # Fallback: try parsing the whole content as JSON
-            evaluation = json.loads(content)
-
-        evaluation["criteria"] = criteria
-    except (json.JSONDecodeError, AttributeError, StopIteration) as e:
-        evaluation = {
-            "passed": False,
-            "reason": f"Failed to evaluate due to parsing: {str(e)} \n Response: {content}",
-            "criteria": criteria,
-        }
-    evaluation["points"] = points
-    return EvaluationResult.model_validate(evaluation)
-
-
-def verify_checkpoints(
-    telemetry: List[Dict[str, Any]],
-    checkpoints: List[CheckpointCriteria],
-    model: str,
-    processor: TelemetryProcessor,
-) -> List[EvaluationResult]:
-    """Verify each checkpoint against the telemetry data using LLM
-    These checkpoints do not take the ground truth or hyupothesis
-    answers into account. They are only concerned with the trace and
-    the specific criteria mentioned.
-    """
-    results = []
-    evidence = processor.extract_evidence(telemetry)
-    print(evidence)
-    for checkpoint in checkpoints:
-        criteria = checkpoint.criteria
-
-        evaluation = evaluate_criterion(
-            criteria=criteria,
-            points=checkpoint.points,
-            model=model,
-            evidence=evidence,
-        )
-
-        results.append(evaluation)
-
-    return results
-
-
-def verify_hypothesis_answer(
-    hypothesis_final_answer: str,
-    ground_truth_answer_dict: Dict[str, Any],
-    ground_truth_checkpoints: List[CheckpointCriteria],
-    model: str,
-) -> List[EvaluationResult]:
-    """
-    Verify if the final answer meets all specified criteria
-    """
-    results = []
-
-    for criterion in ground_truth_checkpoints:
-        evaluation = evaluate_criterion(
-            criteria=criterion.criteria,
-            points=criterion.points,
-            ground_truth_output=ground_truth_answer_dict,
-            hypothesis_final_answer=hypothesis_final_answer,
-            model=model,
-        )
-
-        results.append(evaluation)
-
-    return results