Spaces:
Running
Running
Nathan Brake
committed on
Optionally Evaluate Cases after generating trace (#57)
Browse files- README.md +1 -1
- examples/single_agent_with_tools.yaml +21 -0
- pyproject.toml +1 -1
- src/surf_spot_finder/cli.py +39 -5
- src/surf_spot_finder/config.py +5 -1
README.md
CHANGED
@@ -49,7 +49,7 @@ pip install -e . # Install root project dependencies
|
|
49 |
### 3️⃣ Run
|
50 |
|
51 |
```bash
|
52 |
-
surf-spot-finder
|
53 |
```
|
54 |
|
55 |
## How it Works
|
|
|
49 |
### 3️⃣ Run
|
50 |
|
51 |
```bash
|
52 |
+
surf-spot-finder examples/single_agent_with_tools.yaml
|
53 |
```
|
54 |
|
55 |
## How it Works
|
examples/single_agent_with_tools.yaml
CHANGED
@@ -8,3 +8,24 @@ main_agent:
|
|
8 |
- "surf_spot_finder.tools.get_wind_forecast"
|
9 |
- "any_agent.tools.search_web"
|
10 |
- "any_agent.tools.visit_webpage"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
- "surf_spot_finder.tools.get_wind_forecast"
|
9 |
- "any_agent.tools.search_web"
|
10 |
- "any_agent.tools.visit_webpage"
|
11 |
+
|
12 |
+
|
13 |
+
evaluation_cases:
|
14 |
+
- llm_judge: openai/gpt-4.1-mini
|
15 |
+
checkpoints:
|
16 |
+
- criteria: "Check if the agent used the get_surfing_spots tool and it succeeded, and that the tool was used before the get_wave_forecast and get_wind_forecast tools"
|
17 |
+
points: 1
|
18 |
+
- criteria: "Check if the agent used the get_wave_forecast tool and it succeeded"
|
19 |
+
points: 1
|
20 |
+
- criteria: "Check if the agent used the get_wind_forecast tool and it succeeded"
|
21 |
+
points: 1
|
22 |
+
- criteria: "Check if the agent used the get_area_lat_lon tool and it succeeded"
|
23 |
+
points: 1
|
24 |
+
- criteria: "Check if the agent used the driving_hours_to_meters tool to convert the driving hours to meters and it succeeded"
|
25 |
+
points: 1
|
26 |
+
- criteria: "Check if the final answer contains any description about the weather at the chosen location"
|
27 |
+
points: 1
|
28 |
+
- criteria: "Check if the final answer contains one of the surf spots found by a call of the get_surfing_spots tool"
|
29 |
+
points: 1
|
30 |
+
- criteria: "Check that the agent completed in fewer than 10 steps"
|
31 |
+
points: 1
|
pyproject.toml
CHANGED
@@ -9,7 +9,7 @@ license = {text = "Apache-2.0"}
|
|
9 |
requires-python = ">=3.11"
|
10 |
dynamic = ["version"]
|
11 |
dependencies = [
|
12 |
-
"any-agent[all]",
|
13 |
"fire",
|
14 |
"pydantic",
|
15 |
"pyyaml",
|
|
|
9 |
requires-python = ">=3.11"
|
10 |
dynamic = ["version"]
|
11 |
dependencies = [
|
12 |
+
"any-agent[all]>=0.12.2",
|
13 |
"fire",
|
14 |
"pydantic",
|
15 |
"pyyaml",
|
src/surf_spot_finder/cli.py
CHANGED
@@ -3,8 +3,10 @@ import os
|
|
3 |
from pathlib import Path
|
4 |
|
5 |
from any_agent import AgentFramework, AnyAgent, TracingConfig
|
|
|
6 |
from fire import Fire
|
7 |
from any_agent.logging import logger
|
|
|
8 |
|
9 |
from surf_spot_finder.config import (
|
10 |
Config,
|
@@ -27,7 +29,7 @@ async def find_surf_spot(
|
|
27 |
if config_file is None:
|
28 |
config = Config.from_dict({})
|
29 |
else:
|
30 |
-
logger.info(
|
31 |
config = Config.from_yaml(config_file)
|
32 |
|
33 |
if not config.main_agent.instructions:
|
@@ -36,8 +38,8 @@ async def find_surf_spot(
|
|
36 |
elif config.framework == AgentFramework.OPENAI:
|
37 |
config.main_agent.instructions = SINGLE_AGENT_SYSTEM_PROMPT
|
38 |
|
39 |
-
logger.info(
|
40 |
-
logger.info(
|
41 |
agent = await AnyAgent.create_async(
|
42 |
agent_framework=config.framework,
|
43 |
agent_config=config.main_agent,
|
@@ -50,10 +52,10 @@ async def find_surf_spot(
|
|
50 |
MAX_DRIVING_HOURS=config.max_driving_hours,
|
51 |
DATE=config.date,
|
52 |
)
|
53 |
-
logger.info(
|
54 |
agent_trace = await agent.run_async(query)
|
55 |
|
56 |
-
logger.info(
|
57 |
|
58 |
# dump the trace in the "output" directory
|
59 |
output_dir = "output"
|
@@ -63,6 +65,38 @@ async def find_surf_spot(
|
|
63 |
with open(file_path, "w") as f:
|
64 |
f.write(agent_trace.model_dump_json(indent=2))
|
65 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
66 |
|
67 |
def main():
|
68 |
Fire(find_surf_spot)
|
|
|
3 |
from pathlib import Path
|
4 |
|
5 |
from any_agent import AgentFramework, AnyAgent, TracingConfig
|
6 |
+
from any_agent.evaluation.schemas import TraceEvaluationResult
|
7 |
from fire import Fire
|
8 |
from any_agent.logging import logger
|
9 |
+
from any_agent.evaluation import evaluate
|
10 |
|
11 |
from surf_spot_finder.config import (
|
12 |
Config,
|
|
|
29 |
if config_file is None:
|
30 |
config = Config.from_dict({})
|
31 |
else:
|
32 |
+
logger.info("Loading %s", config_file)
|
33 |
config = Config.from_yaml(config_file)
|
34 |
|
35 |
if not config.main_agent.instructions:
|
|
|
38 |
elif config.framework == AgentFramework.OPENAI:
|
39 |
config.main_agent.instructions = SINGLE_AGENT_SYSTEM_PROMPT
|
40 |
|
41 |
+
logger.info("Loading %s agent", config.framework)
|
42 |
+
logger.info("Managed agents: %s", config.managed_agents)
|
43 |
agent = await AnyAgent.create_async(
|
44 |
agent_framework=config.framework,
|
45 |
agent_config=config.main_agent,
|
|
|
52 |
MAX_DRIVING_HOURS=config.max_driving_hours,
|
53 |
DATE=config.date,
|
54 |
)
|
55 |
+
logger.info("Running agent with query:\n%s", query)
|
56 |
agent_trace = await agent.run_async(query)
|
57 |
|
58 |
+
logger.info("Final output from agent:\n%s", agent_trace.final_output)
|
59 |
|
60 |
# dump the trace in the "output" directory
|
61 |
output_dir = "output"
|
|
|
65 |
with open(file_path, "w") as f:
|
66 |
f.write(agent_trace.model_dump_json(indent=2))
|
67 |
|
68 |
+
if config.evaluation_cases is not None:
|
69 |
+
results = []
|
70 |
+
logger.info("Found evaluation cases, running trace evaluation")
|
71 |
+
for i, case in enumerate(config.evaluation_cases):
|
72 |
+
logger.info("Evaluating case: %s", case)
|
73 |
+
result: TraceEvaluationResult = evaluate(
|
74 |
+
evaluation_case=case,
|
75 |
+
trace=agent_trace,
|
76 |
+
agent_framework=config.framework,
|
77 |
+
)
|
78 |
+
for list_of_checkpoints in [
|
79 |
+
result.checkpoint_results,
|
80 |
+
result.direct_results,
|
81 |
+
result.hypothesis_answer_results,
|
82 |
+
]:
|
83 |
+
for checkpoint in list_of_checkpoints:
|
84 |
+
msg = (
|
85 |
+
f"Checkpoint: {checkpoint.criteria}\n"
|
86 |
+
f"\tPassed: {checkpoint.passed}\n"
|
87 |
+
f"\tReason: {checkpoint.reason}\n"
|
88 |
+
f"\tScore: {'%d/%d' % (checkpoint.points, checkpoint.points) if checkpoint.passed else '0/%d' % checkpoint.points}"
|
89 |
+
)
|
90 |
+
logger.info(msg)
|
91 |
+
logger.info("==========================")
|
92 |
+
logger.info("Overall Score: %d%%", 100 * result.score)
|
93 |
+
logger.info("==========================")
|
94 |
+
results.append(result)
|
95 |
+
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
|
96 |
+
file_path = Path(output_dir) / f"{timestamp}_eval_case_{i}.json"
|
97 |
+
with open(file_path, "w") as f:
|
98 |
+
f.write(result.model_dump_json(indent=2))
|
99 |
+
|
100 |
|
101 |
def main():
|
102 |
Fire(find_surf_spot)
|
src/surf_spot_finder/config.py
CHANGED
@@ -8,6 +8,7 @@ from pydantic import AfterValidator, BaseModel, ConfigDict, FutureDatetime, Posi
|
|
8 |
import yaml
|
9 |
from rich.prompt import Prompt
|
10 |
from any_agent.logging import logger
|
|
|
11 |
import geocoder
|
12 |
from litellm.litellm_core_utils.get_llm_provider_logic import (
|
13 |
get_llm_provider,
|
@@ -36,7 +37,7 @@ def ask_framework() -> AgentFramework:
|
|
36 |
[f"{i}: {framework}" for i, framework in enumerate(frameworks)]
|
37 |
)
|
38 |
prompt = f"Select the agent framework to use:\n{frameworks_str}\n"
|
39 |
-
choice = Prompt.ask(prompt, default="
|
40 |
try:
|
41 |
choice = int(choice)
|
42 |
if choice < 0 or choice >= len(frameworks):
|
@@ -148,6 +149,8 @@ class Config(BaseModel):
|
|
148 |
main_agent: AgentConfig
|
149 |
managed_agents: list[AgentConfig] | None = None
|
150 |
|
|
|
|
|
151 |
@classmethod
|
152 |
def from_dict(cls, data: dict) -> "Config":
|
153 |
"""
|
@@ -212,6 +215,7 @@ class Config(BaseModel):
|
|
212 |
data["date"] = date_picker()
|
213 |
else:
|
214 |
logger.info(f"Using date {data['date']}")
|
|
|
215 |
return cls(**data)
|
216 |
|
217 |
@classmethod
|
|
|
8 |
import yaml
|
9 |
from rich.prompt import Prompt
|
10 |
from any_agent.logging import logger
|
11 |
+
from any_agent.evaluation import EvaluationCase
|
12 |
import geocoder
|
13 |
from litellm.litellm_core_utils.get_llm_provider_logic import (
|
14 |
get_llm_provider,
|
|
|
37 |
[f"{i}: {framework}" for i, framework in enumerate(frameworks)]
|
38 |
)
|
39 |
prompt = f"Select the agent framework to use:\n{frameworks_str}\n"
|
40 |
+
choice = Prompt.ask(prompt, default="0")
|
41 |
try:
|
42 |
choice = int(choice)
|
43 |
if choice < 0 or choice >= len(frameworks):
|
|
|
149 |
main_agent: AgentConfig
|
150 |
managed_agents: list[AgentConfig] | None = None
|
151 |
|
152 |
+
evaluation_cases: list[EvaluationCase] | None = None
|
153 |
+
|
154 |
@classmethod
|
155 |
def from_dict(cls, data: dict) -> "Config":
|
156 |
"""
|
|
|
215 |
data["date"] = date_picker()
|
216 |
else:
|
217 |
logger.info(f"Using date {data['date']}")
|
218 |
+
|
219 |
return cls(**data)
|
220 |
|
221 |
@classmethod
|