Brake/any agent eval code (#52)
Nathan Brake committed
* remove eval code
* Only do eval with existing trace
- examples/openai_single_agent_vertical.yaml +4 -5
- notebooks/experiment/test_cases/alpha.yaml +26 -33
- src/surf_spot_finder/evaluation/evaluators/CheckpointEvaluator.py +0 -44
- src/surf_spot_finder/evaluation/evaluators/HypothesisEvaluator.py +0 -29
- src/surf_spot_finder/evaluation/evaluators/LLMEvaluator.py +0 -95
- src/surf_spot_finder/evaluation/evaluators/QuestionAnsweringSquadEvaluator.py +0 -40
- src/surf_spot_finder/evaluation/evaluators/__init__.py +0 -9
- src/surf_spot_finder/evaluation/evaluators/schemas.py +0 -11
- src/surf_spot_finder/evaluation/main.py +18 -154
- src/surf_spot_finder/evaluation/results_saver.py +0 -64
- src/surf_spot_finder/evaluation/test_case.py +0 -86
- src/surf_spot_finder/utils/logging.py +0 -14
examples/openai_single_agent_vertical.yaml
CHANGED
@@ -1,11 +1,11 @@
-location:
-date: 2025-04-
-max_driving_hours:
+location: Lisbon
+date: 2025-04-08 19:00
+max_driving_hours: 1
 
 framework: openai
 
 main_agent:
-  model_id:
+  model_id: gpt-4o
   tools:
     - "surf_spot_finder.tools.driving_hours_to_meters"
    - "surf_spot_finder.tools.get_area_lat_lon"
@@ -13,5 +13,4 @@ main_agent:
     - "surf_spot_finder.tools.get_wave_forecast"
     - "surf_spot_finder.tools.get_wind_forecast"
     - "any_agent.tools.search_web"
-    - "any_agent.tools.show_plan"
     - "any_agent.tools.visit_webpage"
notebooks/experiment/test_cases/alpha.yaml
CHANGED
@@ -1,44 +1,37 @@
-output_path: "results.json"
-
-# You only need this input data if you want to run the test case, if you pass in a path to a telemetry file this
-# is ignored
-input:
-  location: "Huntington Beach, California"
-  date: "2025-04-10 14:00"
-  max_driving_hours: 1
-  input_prompt_template: |
-    Please help find the best place to surf around {LOCATION},
-    in a {MAX_DRIVING_HOURS} hour driving radius, at {DATE}?
-    Find a few options and then message David de la Iglesia Castro to discuss it with him.
-    Make sure he agrees with the choice. Your final answer should be a detailed description of the surf location, wind, wave, and weather conditions.
-    In addition to making it your final answer, also include description of the weather at the chosen location by writing to the file /projects/final_answer.txt
-
-# Optionally I can check whether it picked the right response
-ground_truth:
-  - name: "Surf location"
-    points: 5
-    value: "T Street, San Clemente, California"
-
 # Base checkpoints for agent behavior
-#
-
+# The evaluators for these checkpoints will not consider the hypothesis answer or final answer in their decision making
+llm_judge: openai/gpt-4o
 checkpoints:
   - points: 1
-    criteria:
+    criteria: |
+      Check if the agent used the get_surfing_spots tool and it succeeded,
+      and that the tool was used before the get_wave_forecast and get_wind_forecast tools
   - points: 1
-    criteria:
+    criteria: Check if the agent used the get_wave_forecast tool and it succeeded
   - points: 1
-    criteria:
+    criteria: Check if the agent used the get_wind_forecast tool and it succeeded
   - points: 1
-    criteria:
+    criteria: Check if the agent used the get_area_lat_lon tool and it succeeded
   - points: 1
-    criteria:
+    criteria: |
+      Check if the agent used the driving_hours_to_meters tool to convert
+      the driving hours to meters and it succeeded
   - points: 1
-    criteria:
+    criteria: Check if the final answer contains any description about the weather at the chosen location
   - points: 1
-    criteria:
+    criteria: Check if the final answer contains one of the surf spots found by a call of the get_surfing_spots tool
-
-
+
+# Optionally, you can check whether the final answer is what was expected. Checking this value does not use an LLM
+ground_truth:
+  - name: Surf location
+    points: 5
+    value: Fonte da Telha
 
+output_path: results.json
 
-
+# You only need this input data if you want to run the test case, if you pass in a path to a telemetry file this
+# is ignored
+input:
+  location: "Lisbon"
+  date: "2025-04-08 19:00"
+  max_driving_hours: 1
src/surf_spot_finder/evaluation/evaluators/CheckpointEvaluator.py
DELETED
@@ -1,44 +0,0 @@
-from typing import Dict, List, Any
-
-from surf_spot_finder.evaluation.evaluators.LLMEvaluator import LLMEvaluator
-from surf_spot_finder.evaluation.evaluators.schemas import EvaluationResult
-from any_agent.telemetry import TelemetryProcessor
-from surf_spot_finder.evaluation.test_case import CheckpointCriteria
-from surf_spot_finder.utils.logging import get_logger
-
-logger = get_logger()
-
-
-class CheckpointEvaluator(LLMEvaluator):
-    """Evaluates checkpoints against telemetry"""
-
-    def evaluate(
-        self,
-        telemetry: List[Dict[str, Any]],
-        checkpoints: List[CheckpointCriteria],
-        processor: TelemetryProcessor,
-    ) -> List[EvaluationResult]:
-        """
-        Verify each checkpoint against the telemetry data using LLM
-
-        Args:
-            telemetry: The telemetry data to evaluate
-            checkpoints: List of checkpoint criteria to verify
-            processor: Telemetry processor to extract evidence
-
-        Returns:
-            List of evaluation results
-        """
-        evidence = processor.extract_evidence(telemetry)
-        logger.info(f"""<yellow>Evidence\n{evidence}</yellow>\n""")
-        results = []
-
-        for checkpoint in checkpoints:
-            evaluation = self.llm_evaluate_with_criterion(
-                criteria=checkpoint.criteria,
-                points=checkpoint.points,
-                evidence=evidence,
-            )
-            results.append(evaluation)
-
-        return results
src/surf_spot_finder/evaluation/evaluators/HypothesisEvaluator.py
DELETED
@@ -1,29 +0,0 @@
-from typing import Dict, List, Any
-from surf_spot_finder.evaluation.evaluators.LLMEvaluator import LLMEvaluator
-from surf_spot_finder.evaluation.evaluators.schemas import EvaluationResult
-from surf_spot_finder.evaluation.test_case import CheckpointCriteria
-
-
-class HypothesisEvaluator(LLMEvaluator):
-    """Evaluates the final answer against ground truth"""
-
-    def evaluate(
-        self,
-        hypothesis_final_answer: str,
-        ground_truth_answer_dict: Dict[str, Any],
-        ground_truth_checkpoints: List[CheckpointCriteria],
-    ) -> List[EvaluationResult]:
-        """Verify if the final answer meets all specified criteria"""
-        results = []
-
-        for criterion in ground_truth_checkpoints:
-            evaluation = self.llm_evaluate_with_criterion(
-                criteria=criterion.criteria,
-                points=criterion.points,
-                ground_truth_output=ground_truth_answer_dict,
-                hypothesis_final_answer=hypothesis_final_answer,
-            )
-
-            results.append(evaluation)
-
-        return results
src/surf_spot_finder/evaluation/evaluators/LLMEvaluator.py
DELETED
@@ -1,95 +0,0 @@
-from abc import ABC
-import json
-import re
-from typing import Dict, List, Any, Optional, Union
-from textwrap import dedent
-
-from litellm import completion
-from surf_spot_finder.evaluation.evaluators.schemas import EvaluationResult
-from surf_spot_finder.evaluation.test_case import CheckpointCriteria
-
-
-class LLMEvaluator(ABC):
-    """Base class for evaluators that use LLM-as-judge"""
-
-    def __init__(self, model: str):
-        self.model = model
-
-    def llm_evaluate_with_criterion(
-        self,
-        criteria: str,
-        points: int,
-        ground_truth_output: Optional[
-            Union[List[CheckpointCriteria], Dict[str, Any]]
-        ] = None,
-        hypothesis_final_answer: Optional[str] = None,
-        evidence: Optional[str] = None,
-    ) -> EvaluationResult:
-        """Evaluate a single criterion using LLM"""
-
-        prompt = dedent(f"""
-        Evaluate if the following criterion was met {"based on the provided evidence" if evidence else "in the agent's answer"}.
-
-        Criterion: {criteria}
-        """)
-
-        if ground_truth_output:
-            prompt += dedent(f"""
-            Expected output: {json.dumps(ground_truth_output)}
-            """)
-        if hypothesis_final_answer:
-            prompt += dedent(f"""
-            Agent's answer: {hypothesis_final_answer}
-            """)
-
-        if evidence:
-            prompt += dedent(f"""
-            Telemetry evidence:
-            {evidence}
-            """)
-
-        prompt += f"""
-
-        Based on the {"evidence" if evidence else "comparison between the expected output and the actual final answer"},
-        was this criterion satisfied? Answer with:
-        1. "passed": true or false
-        2. "reason": Brief explanation for your decision
-        """
-        prompt += """
-        Output valid JSON with these three fields only, in the format:
-        ```json
-        {
-            "passed": true,
-            "reason": "I have them"
-        }
-        ```
-        """
-
-        response = completion(
-            model=self.model, messages=[{"role": "user", "content": prompt}]
-        )
-        content = response.choices[0].message.content
-
-        try:
-            # Extract JSON from the response - looks for patterns like ```json {...} ``` or just {...}
-            json_match = re.search(
-                r"```(?:json)?\s*(\{.*?\})\s*```|(\{.*?\})", content, re.DOTALL
-            )
-
-            if json_match:
-                # Use the first matching group that captured content
-                json_str = next(group for group in json_match.groups() if group)
-                evaluation = json.loads(json_str)
-            else:
-                # Fallback: try parsing the whole content as JSON
-                evaluation = json.loads(content)
-
-            evaluation["criteria"] = criteria
-        except (json.JSONDecodeError, AttributeError, StopIteration) as e:
-            evaluation = {
-                "passed": False,
-                "reason": f"Failed to evaluate due to parsing: {str(e)} \n Response: {content}",
-                "criteria": criteria,
-            }
-        evaluation["points"] = points
-        return EvaluationResult.model_validate(evaluation)
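The removed LLMEvaluator parsed the judge's reply with the regex shown above. A minimal standalone exercise of that same pattern on a fenced JSON response, using an illustrative reply string:

import json
import re

content = '```json\n{"passed": true, "reason": "tool call found in telemetry"}\n```'

# Same pattern as the deleted code: fenced ```json {...}``` block, or a bare {...}
match = re.search(r"```(?:json)?\s*(\{.*?\})\s*```|(\{.*?\})", content, re.DOTALL)
json_str = next(group for group in match.groups() if group)
print(json.loads(json_str))  # {'passed': True, 'reason': 'tool call found in telemetry'}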
src/surf_spot_finder/evaluation/evaluators/QuestionAnsweringSquadEvaluator.py
DELETED
@@ -1,40 +0,0 @@
-from typing import List
-import evaluate
-from surf_spot_finder.evaluation.evaluators.schemas import EvaluationResult
-
-
-class QuestionAnsweringSquadEvaluator:
-    """Directly compares answers without using LLM-as-judge"""
-
-    def __init__(self):
-        self.metric = evaluate.load("squad")
-
-    def evaluate(
-        self, hypothesis_answer: str, ground_truth_answer: list
-    ) -> List[EvaluationResult]:
-        """Directly compare answers using simple matching"""
-
-        # format the answers so that they're dicts with 'id' and 'prediction' keys for hypo
-        # and the ref has id and answers keys
-        hypothesis_answer = [{"id": "1", "prediction_text": hypothesis_answer}]
-        ground_truth_answer = [
-            {
-                "id": "1",
-                "answers": {
-                    "answer_start": [0],
-                    "text": [ground_truth_answer[0]["value"]],
-                },
-            }
-        ]
-        # Use the SQuAD metric to compare answers
-        result = self.metric.compute(
-            predictions=hypothesis_answer, references=ground_truth_answer
-        )
-
-        match = EvaluationResult(
-            passed=True if int(result["exact_match"]) == 1 else False,
-            reason=f"Partial Match (F1) score is {round(result['f1'], 2)}",
-            criteria="Is the answer a direct match?",
-            points=1,
-        )
-        return [match]
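The deleted QuestionAnsweringSquadEvaluator wrapped the Hugging Face "squad" metric. The same comparison can be reproduced directly, assuming the evaluate package is installed; the answer strings below are taken from the ground truth in this diff and are only illustrative:

import evaluate

metric = evaluate.load("squad")
predictions = [{"id": "1", "prediction_text": "Fonte da Telha"}]
references = [{"id": "1", "answers": {"answer_start": [0], "text": ["Fonte da Telha"]}}]

# SQuAD reports exact_match and f1 on a 0-100 scale
result = metric.compute(predictions=predictions, references=references)
print(result)  # e.g. {'exact_match': 100.0, 'f1': 100.0}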
src/surf_spot_finder/evaluation/evaluators/__init__.py
DELETED
@@ -1,9 +0,0 @@
-from .CheckpointEvaluator import CheckpointEvaluator
-from .QuestionAnsweringSquadEvaluator import QuestionAnsweringSquadEvaluator
-from .HypothesisEvaluator import HypothesisEvaluator
-
-__all__ = [
-    "CheckpointEvaluator",
-    "QuestionAnsweringSquadEvaluator",
-    "HypothesisEvaluator",
-]
src/surf_spot_finder/evaluation/evaluators/schemas.py
DELETED
@@ -1,11 +0,0 @@
-from pydantic import BaseModel, ConfigDict
-
-
-class EvaluationResult(BaseModel):
-    """Represents the result of evaluating a criterion"""
-
-    model_config = ConfigDict(extra="forbid")
-    passed: bool
-    reason: str
-    criteria: str
-    points: int
src/surf_spot_finder/evaluation/main.py
CHANGED
@@ -1,153 +1,28 @@
-import
-from textwrap import dedent
-from typing import Any, Dict, List, Optional
-
-from any_agent import AnyAgent
-from any_agent.telemetry import TelemetryProcessor
-from any_agent.tracing import setup_tracing
+from typing import Optional
 from fire import Fire
+from pydantic import BaseModel, ConfigDict
 
-from
-
-
-from surf_spot_finder.evaluation.evaluators import (
-    CheckpointEvaluator,
-    HypothesisEvaluator,
-    QuestionAnsweringSquadEvaluator,
-)
-from surf_spot_finder.evaluation.test_case import TestCase
-from surf_spot_finder.evaluation.results_saver import save_evaluation_results
-from surf_spot_finder.utils.logging import get_logger
+from any_agent.evaluation.test_case import TestCase
+from any_agent.evaluation.logging import get_logger
+from any_agent.evaluation.evaluate import evaluate_telemetry
 
 # Replace the existing logger setup with the shared logger
 logger = get_logger()
 
 
-
-
-    tracing_path = setup_tracing(agent_config.framework, "output")
-
-    logger.info(f"Loading {agent_config.framework} agent")
-    logger.info(f"{agent_config.managed_agents}")
-    agent = AnyAgent.create(
-        agent_framework=agent_config.framework,
-        agent_config=agent_config.main_agent,
-        managed_agents=agent_config.managed_agents,
-    )
-
-    query = agent_config.input_prompt_template.format(
-        LOCATION=agent_config.location,
-        MAX_DRIVING_HOURS=agent_config.max_driving_hours,
-        DATE=agent_config.date,
-    )
-    logger.info(f"Running agent with query:\n{query}")
-    agent.run(query)
-
-    logger.success("Done!")
-
-    return tracing_path
-
-
-def evaluate_telemetry(test_case: TestCase, telemetry_path: str) -> bool:
-    # load the json file
-    with open(telemetry_path, "r") as f:
-        telemetry: List[Dict[str, Any]] = json.loads(f.read())
-    logger.info(f"Telemetry loaded from {telemetry_path}")
-
-
-
-
-
-
-
-    # Checkpoint evaluation
-    checkpoint_evaluator = CheckpointEvaluator(model=test_case.llm_judge)
-    checkpoint_results = checkpoint_evaluator.evaluate(
-        telemetry=telemetry,
-        checkpoints=test_case.checkpoints,
-        processor=processor,
-    )
-
-    # Hypothesis answer evaluation
-    hypothesis_evaluator = HypothesisEvaluator(model=test_case.llm_judge)
-    hypothesis_answer_results = hypothesis_evaluator.evaluate(
-        hypothesis_final_answer=hypothesis_answer,
-        ground_truth_answer_dict=test_case.ground_truth,
-        ground_truth_checkpoints=test_case.final_answer_criteria,
-    )
-
-    # Direct answer evaluation (new)
-    if test_case.ground_truth:
-        direct_evaluator = QuestionAnsweringSquadEvaluator()
-        direct_results = direct_evaluator.evaluate(
-            hypothesis_answer=hypothesis_answer,
-            ground_truth_answer=test_case.ground_truth,
-        )
-    else:
-        direct_results = []
-    # Combine all results
-    verification_results = (
-        checkpoint_results + hypothesis_answer_results + direct_results
-    )
-    # Summarize results
-    output_message = ""
-    output_message += (
-        f"""<yellow>Hypothesis Final answer extracted: {hypothesis_answer}</yellow>\n"""
-    )
-    failed_checks = [r for r in verification_results if not r.passed]
-    passed_checks = [r for r in verification_results if r.passed]
-    missed_points = sum([r.points for r in failed_checks])
-    won_points = sum([r.points for r in passed_checks])
-    if passed_checks:
-        for check in passed_checks:
-            message = dedent(
-                f"""
-                <green>Passed:
-                - {check.criteria}
-                - {check.reason}</green>"""
-            )
-            output_message += message + "\n"
-    if failed_checks:
-        for check in failed_checks:
-            message = dedent(
-                f"""
-                <red>Failed:
-                - {check.criteria}
-                - {check.reason}</red>"""
-            )
-            output_message += message + "\n"
-    else:
-        output_message += "<green>All checkpoints passed!</green>\n"
-    output_message += f"<green>Passed checkpoints: {len(passed_checks)}</green>\n"
-    output_message += f"<red>Failed checkpoints: {len(failed_checks)}</red>\n"
-    output_message += "<green>=====================================</green>\n"
-    output_message += (
-        f"<green>Score: {won_points}/{won_points + missed_points}</green>\n"
-    )
-    output_message += "<green>=====================================</green>\n"
-    logger.info(output_message)
-
-    if won_points + missed_points == 0:
-        raise ValueError("No points were defined in the test case")
-    score = won_points / (won_points + missed_points) * 100
-
-    # Save the evaluation results
-    save_evaluation_results(
-        test_case=test_case,
-        output_path=test_case.output_path,
-        output_message=output_message,
-        telemetry_path=telemetry_path,
-        hypothesis_answer=hypothesis_answer,
-        passed_checks=len(passed_checks),
-        failed_checks=len(failed_checks),
-        score=score,
-    )
+class InputModel(BaseModel):
+    """Input configuration for an evaluation test case"""
+
+    model_config = ConfigDict(extra="forbid")
+    location: str
+    date: str
+    max_driving_hours: int
+    input_prompt_template: str | None = None
 
 
 def evaluate(
     test_case_path: str,
-
-    telemetry_path: Optional[str] = None,
+    telemetry_path: Optional[str],
 ) -> None:
     """
     Evaluate agent performance using either a provided telemetry file or by running the agent.
@@ -156,23 +31,12 @@ def evaluate(
         telemetry_path: Optional path to an existing telemetry file. If not provided,
             the agent will be run to generate one.
     """
-    test_case = TestCase.from_yaml(
-        test_case_path=test_case_path, agent_config_path=agent_config_path
-    )
+    test_case = TestCase.from_yaml(test_case_path)
 
-
-
-
-
-        assert (
-            agent_config_path is not None
-        ), "Agent config path must be provided if running agent"
-        telemetry_path = run(test_case.agent_config)
-    else:
-        logger.info(f"Using provided telemetry file: {telemetry_path}")
-        logger.info(
-            "For this to work, the telemetry file must align with the test case.",
-        )
+    logger.info(f"Using provided telemetry file: {telemetry_path}")
+    logger.info(
+        "For this to work, the telemetry file must align with the test case.",
+    )
 
     evaluate_telemetry(test_case, telemetry_path)
 
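A minimal sketch of driving the trimmed-down entry point with an existing trace. The Fire wiring is not visible in this hunk, so evaluate() is called directly; the telemetry path below is a placeholder for a previously generated trace file:

from surf_spot_finder.evaluation.main import evaluate

evaluate(
    test_case_path="notebooks/experiment/test_cases/alpha.yaml",
    telemetry_path="output/telemetry.json",  # placeholder: an existing any-agent trace
)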
src/surf_spot_finder/evaluation/results_saver.py
DELETED
@@ -1,64 +0,0 @@
-import os
-import pandas as pd
-
-from surf_spot_finder.evaluation.test_case import TestCase
-from surf_spot_finder.utils.logging import get_logger
-
-# Use the shared logger
-logger = get_logger()
-
-
-def save_evaluation_results(
-    test_case: TestCase,
-    output_path: str,
-    output_message: str,
-    telemetry_path: str,
-    hypothesis_answer: str,
-    passed_checks: int,
-    failed_checks: int,
-    score: float,
-) -> None:
-    """
-    Save evaluation results to the specified output path.
-
-    Args:
-        test_case: Path to the test case file
-        agent_config: Path to the agent configuration file
-        output_path: Path to save the results
-        output_message: Formatted output message with evaluation details
-        telemetry_path: Path to the telemetry file used
-        hypothesis_answer: The extracted hypothesis answer
-        passed_checks: Number of passed checkpoints
-        failed_checks: Number of failed checkpoints
-        score: Evaluation score as a percentage
-    """
-    # See if the output_path file exists
-    if os.path.exists(output_path):
-        logger.info(f"Reading existing output from {output_path}")
-        df = pd.read_json(output_path, orient="records", lines=True)
-    else:
-        logger.info(f"Creating new output file at {output_path}")
-        df = pd.DataFrame()
-
-    df = pd.concat(
-        [
-            df,
-            pd.DataFrame(
-                [
-                    {
-                        "config": test_case.model_dump(),
-                        "agent_config_path": test_case.agent_config_path,
-                        "test_case_path": test_case.test_case_path,
-                        "output_message": output_message,
-                        "telemetry_path": telemetry_path,
-                        "hypothesis_answer": hypothesis_answer,
-                        "passed_checks": passed_checks,
-                        "failed_checks": failed_checks,
-                        "score": round(score, 2),
-                    }
-                ]
-            ),
-        ]
-    )
-    logger.info(f"Writing output to {output_path}")
-    df.to_json(output_path, orient="records", lines=True)
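The removed saver appended one JSON-lines record per run. A results file it previously produced can still be inspected like this; the path is the default from the deleted TestCase model and the column selection is illustrative:

import pandas as pd

df = pd.read_json("output/results.json", orient="records", lines=True)
print(df[["passed_checks", "failed_checks", "score"]])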
src/surf_spot_finder/evaluation/test_case.py
DELETED
@@ -1,86 +0,0 @@
-from typing import Dict, List, Any
-from pydantic import BaseModel, Field, ConfigDict
-import yaml
-from litellm import validate_environment
-
-from surf_spot_finder.config import Config
-
-
-class InputModel(BaseModel):
-    """Input configuration for the surf spot finder test case"""
-
-    model_config = ConfigDict(extra="forbid")
-    location: str
-    date: str
-    max_driving_hours: int
-    input_prompt_template: str
-
-
-class CheckpointCriteria(BaseModel):
-    """Represents a checkpoint criteria with a description"""
-
-    model_config = ConfigDict(extra="forbid")
-    criteria: str
-    points: int
-
-
-class TestCase(BaseModel):
-    model_config = ConfigDict(extra="forbid")
-    input: InputModel
-    ground_truth: List[Dict[str, Any]] = Field(default_factory=list)
-    checkpoints: List[CheckpointCriteria] = Field(default_factory=list)
-    llm_judge: str
-    final_answer_criteria: List[CheckpointCriteria] = Field(default_factory=list)
-    test_case_path: str
-    agent_config_path: str
-    agent_config: Config
-    output_path: str = "output/results.json"
-
-    @classmethod
-    def from_yaml(cls, test_case_path: str, agent_config_path: str) -> "TestCase":
-        """Load a test case from a YAML file and process it"""
-        with open(test_case_path, "r") as f:
-            test_case_dict = yaml.safe_load(f)
-        final_answer_criteria = []
-
-        def add_gt_final_answer_criteria(ground_truth_list):
-            """Add checkpoints for each item in the ground_truth list"""
-            for item in ground_truth_list:
-                if isinstance(item, dict) and "name" in item and "value" in item:
-                    points = item.get(
-                        "points", 1
-                    )  # Default to 1 if points not specified
-                    final_answer_criteria.append(
-                        {
-                            "points": points,
-                            "criteria": f"Check if {item['name']} is approximately '{item['value']}'.",
-                        }
-                    )
-
-        if "ground_truth" in test_case_dict:
-            add_gt_final_answer_criteria(test_case_dict["ground_truth"])
-            test_case_dict["final_answer_criteria"] = final_answer_criteria
-            # remove the points from the ground_truth list but keep the name and value
-            test_case_dict["ground_truth"] = [
-                item
-                for item in test_case_dict["ground_truth"]
-                if isinstance(item, dict)
-            ]
-
-        test_case_dict["test_case_path"] = test_case_path
-        test_case_dict["agent_config_path"] = agent_config_path
-        with open(agent_config_path, "r") as f:
-            agent_config_dict = yaml.safe_load(f)
-        agent_config_dict["location"] = test_case_dict["input"]["location"]
-        agent_config_dict["date"] = test_case_dict["input"]["date"]
-        agent_config_dict["max_driving_hours"] = test_case_dict["input"][
-            "max_driving_hours"
-        ]
-        agent_config_dict["input_prompt_template"] = test_case_dict["input"][
-            "input_prompt_template"
-        ]
-        agent_config = Config.model_validate(agent_config_dict)
-        test_case_dict["agent_config"] = agent_config
-        # verify that the llm_judge is a valid litellm model
-        validate_environment(test_case_dict["llm_judge"])
-        return cls.model_validate(test_case_dict)
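A worked example of the final-answer criteria the removed from_yaml derived from ground_truth entries, using the values from the updated alpha.yaml above; purely illustrative:

ground_truth = [{"name": "Surf location", "points": 5, "value": "Fonte da Telha"}]

# Same transformation as the deleted add_gt_final_answer_criteria helper
final_answer_criteria = [
    {
        "points": item.get("points", 1),
        "criteria": f"Check if {item['name']} is approximately '{item['value']}'.",
    }
    for item in ground_truth
    if isinstance(item, dict) and "name" in item and "value" in item
]
print(final_answer_criteria)
# [{'points': 5, 'criteria': "Check if Surf location is approximately 'Fonte da Telha'."}]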
src/surf_spot_finder/utils/logging.py
DELETED
@@ -1,14 +0,0 @@
-import sys
-from loguru import logger
-
-# Remove default logger
-logger.remove()
-
-# Add custom colored logger
-logger = logger.opt(ansi=True)
-logger.add(sys.stdout, colorize=True, format="{message}")
-
-
-# Export configured logger
-def get_logger():
-    return logger