Spaces:
Running
Running
Nathan Brake
Copilot
committed on
Test Case can set input template (#50)
Browse files
* use input prompt template
* Remove unnecessary agent deletion from run function
* Update src/surf_spot_finder/evaluation/test_cases/alpha.yaml
Co-authored-by: Copilot <[email protected]>
---------
Co-authored-by: Copilot <[email protected]>
pyproject.toml
CHANGED
@@ -55,4 +55,4 @@ namespaces = false
|
|
55 |
[project.scripts]
|
56 |
surf-spot-finder = "surf_spot_finder.cli:main"
|
57 |
surf-spot-finder-no-framework = "surf_spot_finder.no_framework:main"
|
58 |
-
surf-spot-finder-evaluate = "surf_spot_finder.evaluation.
|
|
|
55 |
[project.scripts]
|
56 |
surf-spot-finder = "surf_spot_finder.cli:main"
|
57 |
surf-spot-finder-no-framework = "surf_spot_finder.no_framework:main"
|
58 |
+
surf-spot-finder-evaluate = "surf_spot_finder.evaluation.main:main"
|
src/surf_spot_finder/evaluation/{evaluate.py → main.py}
RENAMED
@@ -28,9 +28,14 @@ def run(test_case: TestCase, agent_config_path: str) -> str:
|
|
28 |
|
29 |
logger.info("Loading config")
|
30 |
config = Config.from_yaml(agent_config_path)
|
|
|
|
|
|
|
|
|
31 |
config.location = input_data.location
|
32 |
config.date = input_data.date
|
33 |
config.max_driving_hours = input_data.max_driving_hours
|
|
|
34 |
logger.info("Setting up tracing")
|
35 |
tracer_provider, tracing_path = get_tracer_provider(
|
36 |
project_name="surf-spot-finder", agent_framework=config.framework
|
|
|
28 |
|
29 |
logger.info("Loading config")
|
30 |
config = Config.from_yaml(agent_config_path)
|
31 |
+
# pretty print
|
32 |
+
logger.info(
|
33 |
+
f"Overriding config with test case input:\n{json.dumps(input_data.model_dump(), indent=2)}"
|
34 |
+
)
|
35 |
config.location = input_data.location
|
36 |
config.date = input_data.date
|
37 |
config.max_driving_hours = input_data.max_driving_hours
|
38 |
+
config.input_prompt_template = input_data.input_prompt_template
|
39 |
logger.info("Setting up tracing")
|
40 |
tracer_provider, tracing_path = get_tracer_provider(
|
41 |
project_name="surf-spot-finder", agent_framework=config.framework
|
src/surf_spot_finder/evaluation/test_case.py
CHANGED
@@ -11,6 +11,7 @@ class InputModel(BaseModel):
|
|
11 |
location: str
|
12 |
date: str
|
13 |
max_driving_hours: int
|
|
|
14 |
|
15 |
|
16 |
class CheckpointCriteria(BaseModel):
|
|
|
11 |
location: str
|
12 |
date: str
|
13 |
max_driving_hours: int
|
14 |
+
input_prompt_template: str
|
15 |
|
16 |
|
17 |
class CheckpointCriteria(BaseModel):
|
src/surf_spot_finder/evaluation/test_cases/alpha.yaml
CHANGED
@@ -3,15 +3,21 @@
|
|
3 |
# You only need this input data if you want to run the test case, if you pass in a path to a telemetry file this
|
4 |
# is ignored
|
5 |
input:
|
6 |
-
location: "
|
7 |
-
date: "2025-03-
|
8 |
-
max_driving_hours:
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
|
16 |
# Base checkpoints for agent behavior
|
17 |
# These evaluators for these checkpoints
|
@@ -25,10 +31,14 @@ checkpoints:
|
|
25 |
criteria: "Check if the agent used the get_wind_forecast tool and it succeeded"
|
26 |
- points: 1
|
27 |
criteria: "Check if the agent used the get_area_lat_lon tool and it succeeded"
|
|
|
|
|
28 |
- points: 1
|
29 |
criteria: "Check if the agent confirmed the selection with David de la Iglesia Castro"
|
30 |
- points: 1
|
31 |
criteria: "Check if the final answer contains any description about the weather at the chosen location"
|
|
|
|
|
32 |
|
33 |
|
34 |
llm_judge: "openai/gpt-4o"
|
|
|
3 |
# You only need this input data if you want to run the test case, if you pass in a path to a telemetry file this
|
4 |
# is ignored
|
5 |
input:
|
6 |
+
location: "Huntington Beach, California"
|
7 |
+
date: "2025-03-29 22:00"
|
8 |
+
max_driving_hours: 1
|
9 |
+
input_prompt_template: |
|
10 |
+
Please help find the best place to surf around {LOCATION},
|
11 |
+
in a {MAX_DRIVING_HOURS} hour driving radius, at {DATE}?
|
12 |
+
Find a few options and then message David de la Iglesia Castro to discuss it with him.
|
13 |
+
Make sure he agrees with the choice. Your final answer should be a detailed description of the surf location, wind, wave, and weather conditions.
|
14 |
+
In addition to making it your final answer, also include description of the weather at the chosen location by writing to the file /projects/final_answer.txt
|
15 |
|
16 |
+
# Optionally I can check whether it picked the right response
|
17 |
+
ground_truth:
|
18 |
+
- name: "Surf location"
|
19 |
+
points: 5
|
20 |
+
value: "Bolsa Chica State Beach"
|
21 |
|
22 |
# Base checkpoints for agent behavior
|
23 |
# These evaluators for these checkpoints
|
|
|
31 |
criteria: "Check if the agent used the get_wind_forecast tool and it succeeded"
|
32 |
- points: 1
|
33 |
criteria: "Check if the agent used the get_area_lat_lon tool and it succeeded"
|
34 |
+
- points: 1
|
35 |
+
criteria: "Check if the agent used the driving_hours_to_meters tool to convert the driving hours to meters and it succeeded"
|
36 |
- points: 1
|
37 |
criteria: "Check if the agent confirmed the selection with David de la Iglesia Castro"
|
38 |
- points: 1
|
39 |
criteria: "Check if the final answer contains any description about the weather at the chosen location"
|
40 |
+
- points: 1
|
41 |
+
criteria: "Check if the final answer contains one of the surf spots found by a call of the get_surfing_spots tool"
|
42 |
|
43 |
|
44 |
llm_judge: "openai/gpt-4o"
|