Nathan Brake committed
Commit eef7dd3 · unverified · 1 Parent(s): 5d76917

Improvement of evaluation functionality (#13)

src/surf_spot_finder/evaluation/evaluate.py CHANGED
@@ -1,4 +1,5 @@
 import json
+import sys
 from textwrap import dedent
 from typing import Any, Dict, List, Optional
 from loguru import logger
@@ -16,6 +17,10 @@ from surf_spot_finder.evaluation.utils import (
 )
 from surf_spot_finder.evaluation.test_case import TestCase
 
+logger.remove()
+logger = logger.opt(ansi=True)
+logger.add(sys.stdout, colorize=True, format="{message}")
+
 
 def run_agent(test_case: TestCase) -> str:
     input_data = test_case.input
@@ -62,63 +67,58 @@ def evaluate_telemetry(test_case: TestCase, telemetry_path: str) -> bool:
     # Extract the final answer from the telemetry
     hypothesis_answer = extract_hypothesis_answer(telemetry)
     logger.info(
-        dedent(f"""
-        Hypothesis Final answer extracted:
-        - {hypothesis_answer}
-        """)
+        f"""<yellow>Hypothesis Final answer extracted: {hypothesis_answer}</yellow>"""
     )
     # Verify agent behavior against checkpoints using llm-as-a-judge
     llm_judge = "openai/gpt-4o"
     checkpoint_results = verify_checkpoints(
-        telemetry,
-        hypothesis_answer,
-        test_case.checkpoints,
-        test_case.ground_truth,
-        llm_judge,
+        telemetry=telemetry,
+        checkpoints=test_case.checkpoints,
+        model=llm_judge,
     )
 
     hypothesis_answer_results = verify_hypothesis_answer(
-        hypothesis_answer,
-        test_case.ground_truth,
-        test_case.final_answer_criteria,
-        llm_judge,
+        hypothesis_final_answer=hypothesis_answer,
+        ground_truth_answer_dict=test_case.ground_truth,
+        ground_truth_checkpoints=test_case.final_answer_criteria,
+        model=llm_judge,
     )
     # Summarize results
 
     verification_results = checkpoint_results + hypothesis_answer_results
-    all_passed = all(result["passed"] for result in verification_results)
-    failed_checks = [r for r in verification_results if not r["passed"]]
-    passed_checks = [r for r in verification_results if r["passed"]]
+    failed_checks = [r for r in verification_results if not r.passed]
+    passed_checks = [r for r in verification_results if r.passed]
+    missed_points = sum([r.points for r in failed_checks])
+    won_points = sum([r.points for r in passed_checks])
     if passed_checks:
-        logger.info(
-            f"Passed checkpoints: {len(passed_checks)}/{len(verification_results)}"
-        )
         for check in passed_checks:
             message = dedent(
                 f"""
-                Passed:
-                - {check["criteria"]}
-                - {check["reason"]}
-                """
+                <green>Passed:
+                - {check.criteria}
+                - {check.reason}</green>"""
             )
             logger.info(message)
     if failed_checks:
-        logger.error(
-            f"Failed checkpoints: {len(failed_checks)}/{len(verification_results)}"
-        )
         for check in failed_checks:
             message = dedent(
                 f"""
-                Failed:
-                - {check["criteria"]}
-                - {check["reason"]}
-                """
+                <red>Failed:
+                - {check.criteria}
+                - {check.reason}</red>"""
            )
             logger.error(message)
-    else:
-        logger.info("All checkpoints passed!")
-
-    return all_passed
+    else:
+        logger.info("<green>All checkpoints passed!</green>")
+    logger.info(
+        f"<green>Passed checkpoints: {len(passed_checks)}/{len(verification_results)}</green>"
+    )
+    logger.info(
+        f"<red>Failed checkpoints: {len(failed_checks)}/{len(verification_results)}</red>"
+    )
+    logger.info("<green>=====================================</green>")
+    logger.info(f"<green>Score: {won_points}/{won_points + missed_points}</green>")
+    logger.info("<green>=====================================</green>")
 
 
 def evaluate(test_case_path: str, telemetry_path: Optional[str] = None) -> None:
@@ -139,7 +139,7 @@ def evaluate(test_case_path: str, telemetry_path: Optional[str] = None) -> None:
     else:
         logger.info(f"Using provided telemetry file: {telemetry_path}")
         logger.info(
-            "For this to work, the telemetry file must align with the test case."
+            "For this to work, the telemetry file must align with the test case.",
        )
 
     evaluate_telemetry(test_case, telemetry_path)
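
The snippet below is a minimal standalone sketch (not part of this commit) of the colorized loguru setup and points-based summary the diff introduces. `DummyResult` is a hypothetical stand-in for the `EvaluationResult` objects returned by the judge functions, and `colors=True` is used in place of the deprecated `ansi=True` alias that the diff itself passes to `logger.opt`.

```python
import sys
from loguru import logger

# Reconfigure loguru the same way the diff does: drop the default sink and
# re-add stdout with color markup enabled.
logger.remove()
logger = logger.opt(colors=True)
logger.add(sys.stdout, colorize=True, format="{message}")


class DummyResult:
    """Hypothetical stand-in for EvaluationResult (passed/points only)."""

    def __init__(self, passed: bool, points: int):
        self.passed = passed
        self.points = points


results = [DummyResult(True, 5), DummyResult(False, 1), DummyResult(True, 1)]
passed = [r for r in results if r.passed]
failed = [r for r in results if not r.passed]
won_points = sum(r.points for r in passed)
missed_points = sum(r.points for r in failed)

logger.info(f"<green>Passed checkpoints: {len(passed)}/{len(results)}</green>")
logger.info(f"<red>Failed checkpoints: {len(failed)}/{len(results)}</red>")
logger.info(f"<green>Score: {won_points}/{won_points + missed_points}</green>")
```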
src/surf_spot_finder/evaluation/test_case.py CHANGED
@@ -18,18 +18,17 @@ class InputModel(BaseModel):
 
 
 class CheckpointCriteria(BaseModel):
-    """Represents a checkpoint criteria with a value and description"""
+    """Represents a checkpoint criteria with a description"""
 
     model_config = ConfigDict(extra="forbid")
-    value: int
     criteria: str
+    points: int
 
 
 class TestCase(BaseModel):
     model_config = ConfigDict(extra="forbid")
-
     input: InputModel
-    ground_truth: Dict[str, Any]
+    ground_truth: List[Dict[str, Any]] = Field(default_factory=list)
     checkpoints: List[CheckpointCriteria] = Field(default_factory=list)
     final_answer_criteria: List[CheckpointCriteria] = Field(default_factory=list)
 
@@ -38,26 +37,27 @@ class TestCase(BaseModel):
         """Load a test case from a YAML file and process it"""
         with open(case_path, "r") as f:
             test_case_dict = yaml.safe_load(f)
-
-        # Generate final_answer_criteria if not explicitly provided
-        if "final_answer_criteria" not in test_case_dict:
-            final_answer_criteria = []
-
-            def add_gt_final_answer_criteria(ground_truth_dict, prefix=""):
-                """Recursively add checkpoints for each value in the ground_truth dictionary"""
-                for key, value in ground_truth_dict.items():
-                    path = f"{prefix}: {key}" if prefix else key
-                    if isinstance(value, dict):
-                        add_gt_final_answer_criteria(value, path)
-                    else:
-                        final_answer_criteria.append(
-                            {
-                                "value": 1,
-                                "criteria": f"Check if {path} is approximately '{value}'.",
-                            }
-                        )
-
-            add_gt_final_answer_criteria(test_case_dict["ground_truth"])
-            test_case_dict["final_answer_criteria"] = final_answer_criteria
+        final_answer_criteria = []
+
+        def add_gt_final_answer_criteria(ground_truth_list):
+            """Add checkpoints for each item in the ground_truth list"""
+            for item in ground_truth_list:
+                if isinstance(item, dict) and "name" in item and "value" in item:
+                    points = item.get(
+                        "points", 1
+                    )  # Default to 1 if points not specified
+                    final_answer_criteria.append(
+                        {
+                            "points": points,
+                            "criteria": f"Check if {item['name']} is approximately '{item['value']}'.",
+                        }
+                    )
+
+        add_gt_final_answer_criteria(test_case_dict["ground_truth"])
+        test_case_dict["final_answer_criteria"] = final_answer_criteria
+        # remove the points from the ground_truth list but keep the name and value
+        test_case_dict["ground_truth"] = [
+            item for item in test_case_dict["ground_truth"] if isinstance(item, dict)
+        ]
 
         return cls.model_validate(test_case_dict)
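
As a rough illustration (not part of the commit), the sketch below runs the same ground_truth-to-criteria conversion shown in `from_yaml` against an inline dictionary instead of a YAML file; the example entries are invented.

```python
# Sketch of the ground_truth -> final_answer_criteria conversion introduced above,
# applied to an inline dict instead of a loaded YAML file.
test_case_dict = {
    "ground_truth": [
        {"name": "Surf location", "points": 5, "value": "Playa de Patos"},
        {"name": "Wave height", "value": "about 1 meter"},  # no points -> defaults to 1
    ]
}

final_answer_criteria = []
for item in test_case_dict["ground_truth"]:
    if isinstance(item, dict) and "name" in item and "value" in item:
        final_answer_criteria.append(
            {
                "points": item.get("points", 1),
                "criteria": f"Check if {item['name']} is approximately '{item['value']}'.",
            }
        )

print(final_answer_criteria)
# [{'points': 5, 'criteria': "Check if Surf location is approximately 'Playa de Patos'."},
#  {'points': 1, 'criteria': "Check if Wave height is approximately 'about 1 meter'."}]
```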
src/surf_spot_finder/evaluation/test_cases/alpha.yaml CHANGED
@@ -3,20 +3,28 @@ input:
   location: "Vigo"
   date: "2025-03-15 22:00"
   max_driving_hours: 3
-  model_id: "openai/gpt-4o"
+  model_id: "openai/o3-mini"
   api_key_var: "OPENAI_API_KEY"
   json_tracer: true
   api_base: null
   agent_type: "smolagents"
 
 ground_truth:
-  "Surf location": "Playa de Patos"
-  "Water temperature": "about 14°C +-5°C"
-  "Wave height": "about 1 meter"
+  - name: "Surf location"
+    points: 5
+    value: "Playa de Patos"
+  - name: "Water temperature"
+    points: 1
+    value: "about 14°C +-5°C"
+  - name: "Wave height"
+    points: 1
+    value: "about 1 meter"
 
 # Base checkpoints for agent behavior
+# The evaluators for these checkpoints
+# will not consider the hypothesis answer or final answer in their decision making
 checkpoints:
-  - value: 1
+  - points: 1
     criteria: "Check if the agent consulted DuckDuckGoSearchTool for locations near Vigo."
-  - value: 1
+  - points: 1
     criteria: "Check if the agent fetched a website for forecasting, not relying on text from a DuckDuckGo search."
src/surf_spot_finder/evaluation/utils.py CHANGED
@@ -3,10 +3,22 @@ from typing import Dict, List, Any, Optional
 import re
 
 from litellm import completion
+from textwrap import dedent
 
+from pydantic import BaseModel, ConfigDict
 from surf_spot_finder.evaluation.test_case import CheckpointCriteria
 
 
+class EvaluationResult(BaseModel):
+    """Represents the result of evaluating a criterion"""
+
+    model_config = ConfigDict(extra="forbid")
+    passed: bool
+    reason: str
+    criteria: str
+    points: int
+
+
 def extract_hypothesis_answer(telemetry: List[Dict[str, Any]]) -> str | None:
     """Extract the hypothesis agent final answer from the telemetry data"""
     for span in reversed(telemetry):
@@ -18,47 +30,48 @@ def extract_hypothesis_answer(telemetry: List[Dict[str, Any]]) -> str | None:
 
 def evaluate_criterion(
     criteria: str,
-    value: int,
-    ground_truth_output: List[CheckpointCriteria] | Dict[str, Any],
-    hypothesis_final_answer: str,
     model: str,
+    points: int,
+    ground_truth_output: Optional[List[CheckpointCriteria] | Dict[str, Any]] = None,
+    hypothesis_final_answer: Optional[str] = None,
     evidence: Optional[str] = None,
-) -> Dict[str, Any]:
+) -> EvaluationResult:
     """Evaluate a single criterion using LLM"""
 
-    prompt = f"""
-    Evaluate if the following {"checkpoint" if evidence else "criterion"} was met {"based on the provided evidence" if evidence else "in the agent's answer"}.
+    prompt = dedent(f"""
+    Evaluate if the following criterion was met {"based on the provided evidence" if evidence else "in the agent's answer"}.
 
-    {"Checkpoint" if evidence else "Criterion"}: {criteria}
-    Value: {value}
+    Criterion: {criteria}
+    """)
 
-    Expected output: {json.dumps(ground_truth_output)}
-
-    Agent's answer: {hypothesis_final_answer}
-    """
+    if ground_truth_output:
+        prompt += dedent(f"""
+        Expected output: {json.dumps(ground_truth_output)}
+        """)
+    if hypothesis_final_answer:
+        prompt += dedent(f"""
+        Agent's answer: {hypothesis_final_answer}
+        """)
 
     if evidence:
-        prompt += f"""
-
+        prompt += dedent(f"""
         Telemetry evidence:
         {evidence}
-        """
+        """)
 
     prompt += f"""
 
     Based on the {"evidence" if evidence else "comparison between the expected output and the actual final answer"},
-    was this {"checkpoint" if evidence else "criterion"} satisfied? Answer with:
+    was this criterion satisfied? Answer with:
     1. "passed": true or false
     2. "reason": Brief explanation for your decision
-    3. "score": A score from 0 to {value} indicating how well the {"checkpoint" if evidence else "criterion"} was met
     """
     prompt += """
     Output valid JSON with these three fields only, in the format:
     ```json
     {
     "passed": true,
-    "reason": "I have them",
-    "score": 1
+    "reason": "I have them"
     }
     ```
     """
@@ -82,38 +95,35 @@ def evaluate_criterion(
         evaluation = json.loads(content)
 
         evaluation["criteria"] = criteria
-        evaluation["value"] = value
-        return evaluation
     except (json.JSONDecodeError, AttributeError, StopIteration) as e:
-        return {
+        evaluation = {
             "passed": False,
             "reason": f"Failed to evaluate due to parsing: {str(e)} \n Response: {content}",
-            "score": 0,
             "criteria": criteria,
-            "value": value,
         }
+    evaluation["points"] = points
+    return EvaluationResult.model_validate(evaluation)
 
 
 def verify_checkpoints(
     telemetry: List[Dict[str, Any]],
-    hypothesis_final_answer: str,
     checkpoints: List[CheckpointCriteria],
-    ground_truth_checkpoints: List[CheckpointCriteria],
     model: str,
-) -> List[Dict[str, Any]]:
-    """Verify each checkpoint against the telemetry data using LLM"""
+) -> List[EvaluationResult]:
+    """Verify each checkpoint against the telemetry data using LLM
+    These checkpoints do not take the ground truth or hypothesis
+    answers into account. They are only concerned with the trace and
+    the specific criteria mentioned.
+    """
     results = []
 
     for checkpoint in checkpoints:
         criteria = checkpoint.criteria
-        value = checkpoint.value
         evidence = extract_relevant_evidence(telemetry, criteria)
 
         evaluation = evaluate_criterion(
             criteria=criteria,
+            points=checkpoint.points,
             model=model,
             evidence=evidence,
         )
@@ -128,19 +138,16 @@ def verify_hypothesis_answer(
     ground_truth_answer_dict: Dict[str, Any],
     ground_truth_checkpoints: List[CheckpointCriteria],
     model: str,
-) -> List[Dict[str, Any]]:
+) -> List[EvaluationResult]:
     """
     Verify if the final answer meets all specified criteria
     """
     results = []
 
     for criterion in ground_truth_checkpoints:
-        criteria = criterion.criteria
-        value = criterion.value
-
         evaluation = evaluate_criterion(
-            criteria=criteria,
-            value=value,
+            criteria=criterion.criteria,
+            points=criterion.points,
             ground_truth_output=ground_truth_answer_dict,
             hypothesis_final_answer=hypothesis_final_answer,
             model=model,
@@ -155,7 +162,8 @@ def extract_relevant_evidence(telemetry: List[Dict[str, Any]], criteria: str) ->
     """Extract relevant telemetry evidence based on the checkpoint criteria
     TODO this is not a very robust implementation, since it requires knowledge about which tools have been
     implemented. We should abstract this so that it can dynamically figure out what tools may have been used
-    and check for them appropriately."""
+    and check for them appropriately. This tool should probably have a better way of abstracting
+    relevant information from the opentelemetry spans."""
     evidence = ""
 
     # Look for evidence of tool usage
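
For a sense of how the new `EvaluationResult` model gets populated, here is a standalone sketch (not part of the commit). The `content` string is an invented LLM reply, and the regex used to pull JSON out of the fenced block is a hypothetical stand-in for however `evaluate_criterion` actually parses the judge's response.

```python
import json
import re

from pydantic import BaseModel, ConfigDict


class EvaluationResult(BaseModel):
    """Mirror of the model added in utils.py above."""

    model_config = ConfigDict(extra="forbid")
    passed: bool
    reason: str
    criteria: str
    points: int


# Invented judge reply, wrapped in a json code fence as the prompt requests.
content = '```json\n{"passed": true, "reason": "DuckDuckGo search span found"}\n```'

# Hypothetical extraction step: grab the JSON object out of the fenced block.
match = re.search(r"\{.*\}", content, re.DOTALL)
evaluation = json.loads(match.group(0))
evaluation["criteria"] = "Check if the agent consulted DuckDuckGoSearchTool."
evaluation["points"] = 1

result = EvaluationResult.model_validate(evaluation)
print(result.passed, result.points)  # True 1
```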