Nathan Brake Copilot committed on
Commit
9ba790f
·
unverified ·
1 Parent(s): 4dc38cc

Test Case can set input template (#50)

Browse files

* use input prompt template

* Remove unnecessary agent deletion from run function

* Update src/surf_spot_finder/evaluation/test_cases/alpha.yaml

Co-authored-by: Copilot <[email protected]>

---------

Co-authored-by: Copilot <[email protected]>

pyproject.toml CHANGED
@@ -55,4 +55,4 @@ namespaces = false
55
  [project.scripts]
56
  surf-spot-finder = "surf_spot_finder.cli:main"
57
  surf-spot-finder-no-framework = "surf_spot_finder.no_framework:main"
58
- surf-spot-finder-evaluate = "surf_spot_finder.evaluation.evaluate:main"
 
55
  [project.scripts]
56
  surf-spot-finder = "surf_spot_finder.cli:main"
57
  surf-spot-finder-no-framework = "surf_spot_finder.no_framework:main"
58
+ surf-spot-finder-evaluate = "surf_spot_finder.evaluation.main:main"
src/surf_spot_finder/evaluation/{evaluate.py → main.py} RENAMED
@@ -28,9 +28,14 @@ def run(test_case: TestCase, agent_config_path: str) -> str:
28
 
29
  logger.info("Loading config")
30
  config = Config.from_yaml(agent_config_path)
 
 
 
 
31
  config.location = input_data.location
32
  config.date = input_data.date
33
  config.max_driving_hours = input_data.max_driving_hours
 
34
  logger.info("Setting up tracing")
35
  tracer_provider, tracing_path = get_tracer_provider(
36
  project_name="surf-spot-finder", agent_framework=config.framework
 
28
 
29
  logger.info("Loading config")
30
  config = Config.from_yaml(agent_config_path)
31
+ # pretty print
32
+ logger.info(
33
+ f"Overriding config with test case input:\n{json.dumps(input_data.model_dump(), indent=2)}"
34
+ )
35
  config.location = input_data.location
36
  config.date = input_data.date
37
  config.max_driving_hours = input_data.max_driving_hours
38
+ config.input_prompt_template = input_data.input_prompt_template
39
  logger.info("Setting up tracing")
40
  tracer_provider, tracing_path = get_tracer_provider(
41
  project_name="surf-spot-finder", agent_framework=config.framework
src/surf_spot_finder/evaluation/test_case.py CHANGED
@@ -11,6 +11,7 @@ class InputModel(BaseModel):
11
  location: str
12
  date: str
13
  max_driving_hours: int
 
14
 
15
 
16
  class CheckpointCriteria(BaseModel):
 
11
  location: str
12
  date: str
13
  max_driving_hours: int
14
+ input_prompt_template: str
15
 
16
 
17
  class CheckpointCriteria(BaseModel):
src/surf_spot_finder/evaluation/test_cases/alpha.yaml CHANGED
@@ -3,15 +3,21 @@
3
  # You only need this input data if you want to run the test case, if you pass in a path to a telemetry file this
4
  # is ignored
5
  input:
6
- location: "Vigo"
7
- date: "2025-03-27 22:00"
8
- max_driving_hours: 3
 
 
 
 
 
 
9
 
10
-
11
- # ground_truth:
12
- # - name: "Surf location"
13
- # points: 5
14
- # value: "Playa de Samil"
15
 
16
  # Base checkpoints for agent behavior
17
  # These evaluators for these checkpoints
@@ -25,10 +31,14 @@ checkpoints:
25
  criteria: "Check if the agent used the get_wind_forecast tool and it succeeded"
26
  - points: 1
27
  criteria: "Check if the agent used the get_area_lat_lon tool and it succeeded"
 
 
28
  - points: 1
29
  criteria: "Check if the agent confirmed the selection with David de la Iglesia Castro"
30
  - points: 1
31
  criteria: "Check if the final answer contains any description about the weather at the chosen location"
 
 
32
 
33
 
34
  llm_judge: "openai/gpt-4o"
 
3
  # You only need this input data if you want to run the test case, if you pass in a path to a telemetry file this
4
  # is ignored
5
  input:
6
+ location: "Huntington Beach, California"
7
+ date: "2025-03-29 22:00"
8
+ max_driving_hours: 1
9
+ input_prompt_template: |
10
+ Please help find the best place to surf around {LOCATION},
11
+ in a {MAX_DRIVING_HOURS} hour driving radius, at {DATE}?
12
+ Find a few options and then message David de la Iglesia Castro to discuss it with him.
13
+ Make sure he agrees with the choice. Your final answer should be a detailed description of the surf location, wind, wave, and weather conditions.
14
+ In addition to making it your final answer, also include description of the weather at the chosen location by writing to the file /projects/final_answer.txt
15
 
16
+ # Optionally I can check whether it picked the right response
17
+ ground_truth:
18
+ - name: "Surf location"
19
+ points: 5
20
+ value: "Bolsa Chica State Beach"
21
 
22
  # Base checkpoints for agent behavior
23
  # These evaluators for these checkpoints
 
31
  criteria: "Check if the agent used the get_wind_forecast tool and it succeeded"
32
  - points: 1
33
  criteria: "Check if the agent used the get_area_lat_lon tool and it succeeded"
34
+ - points: 1
35
+ criteria: "Check if the agent used the driving_hours_to_meters tool to convert the driving hours to meters and it succeeded"
36
  - points: 1
37
  criteria: "Check if the agent confirmed the selection with David de la Iglesia Castro"
38
  - points: 1
39
  criteria: "Check if the final answer contains any description about the weather at the chosen location"
40
+ - points: 1
41
+ criteria: "Check if the final answer contains one of the surf spots found by a call of the get_surfing_spots tool"
42
 
43
 
44
  llm_judge: "openai/gpt-4o"