Nathan Brake committed on
Commit 9546c66 · unverified · 1 Parent(s): cdd4ebc

Brake/any agent eval code (#52)


* remove eval code

* Only do eval with existing trace

examples/openai_single_agent_vertical.yaml CHANGED
@@ -1,11 +1,11 @@
-location: Pontevedra
-date: 2025-04-10 12:00
-max_driving_hours: 2
+location: Lisbon
+date: 2025-04-08 19:00
+max_driving_hours: 1
 
 framework: openai
 
 main_agent:
-  model_id: o3-mini
+  model_id: gpt-4o
   tools:
     - "surf_spot_finder.tools.driving_hours_to_meters"
     - "surf_spot_finder.tools.get_area_lat_lon"
@@ -13,5 +13,4 @@ main_agent:
     - "surf_spot_finder.tools.get_wave_forecast"
     - "surf_spot_finder.tools.get_wind_forecast"
     - "any_agent.tools.search_web"
-    - "any_agent.tools.show_plan"
     - "any_agent.tools.visit_webpage"
notebooks/experiment/test_cases/alpha.yaml CHANGED
@@ -1,44 +1,37 @@
-output_path: "results.json"
-
-# You only need this input data if you want to run the test case, if you pass in a path to a telemetry file this
-# is ignored
-input:
-  location: "Huntington Beach, California"
-  date: "2025-04-10 14:00"
-  max_driving_hours: 1
-  input_prompt_template: |
-    Please help find the best place to surf around {LOCATION},
-    in a {MAX_DRIVING_HOURS} hour driving radius, at {DATE}?
-    Find a few options and then message David de la Iglesia Castro to discuss it with him.
-    Make sure he agrees with the choice. Your final answer should be a detailed description of the surf location, wind, wave, and weather conditions.
-    In addition to making it your final answer, also include description of the weather at the chosen location by writing to the file /projects/final_answer.txt
-
-# Optionally I can check whether it picked the right response
-ground_truth:
-  - name: "Surf location"
-    points: 5
-    value: "T Street, San Clemente, California"
-
 # Base checkpoints for agent behavior
-# These evaluators for these checkpoints
-# will not consider the hypothesis answer or final answer in their decision making
+# The evaluators for these checkpoints will not consider the hypothesis answer or final answer in their decision making
+llm_judge: openai/gpt-4o
 checkpoints:
   - points: 1
-    criteria: "Check if the agent used the get_surfing_spots tool and it succeeded, and that the tool was used before the get_wave_forecast and get_wind_forecast tools"
+    criteria: |
+      Check if the agent used the get_surfing_spots tool and it succeeded,
+      and that the tool was used before the get_wave_forecast and get_wind_forecast tools
   - points: 1
-    criteria: "Check if the agent used the get_wave_forecast tool and it succeeded"
+    criteria: Check if the agent used the get_wave_forecast tool and it succeeded
   - points: 1
-    criteria: "Check if the agent used the get_wind_forecast tool and it succeeded"
+    criteria: Check if the agent used the get_wind_forecast tool and it succeeded
   - points: 1
-    criteria: "Check if the agent used the get_area_lat_lon tool and it succeeded"
+    criteria: Check if the agent used the get_area_lat_lon tool and it succeeded
   - points: 1
-    criteria: "Check if the agent used the driving_hours_to_meters tool to convert the driving hours to meters and it succeeded"
+    criteria: |
+      Check if the agent used the driving_hours_to_meters tool to convert
+      the driving hours to meters and it succeeded
   - points: 1
-    criteria: "Check if the agent confirmed the selection with David de la Iglesia Castro"
-  - points: 1
-    criteria: "Check if the final answer contains any description about the weather at the chosen location"
-  - points: 1
-    criteria: "Check if the final answer contains one of the surf spots found by a call of the get_surfing_spots tool"
-
-
-llm_judge: "openai/gpt-4o"
+    criteria: Check if the final answer contains any description about the weather at the chosen location
+  - points: 1
+    criteria: Check if the final answer contains one of the surf spots found by a call of the get_surfing_spots tool
+
+# Optionally, you can check whether the final answer is what was expected. Checking this value does not use an LLM
+ground_truth:
+  - name: Surf location
+    points: 5
+    value: Fonte da Telha
+
+output_path: results.json
+
+# You only need this input data if you want to run the test case, if you pass in a path to a telemetry file this
+# is ignored
+input:
+  location: "Lisbon"
+  date: "2025-04-08 19:00"
+  max_driving_hours: 1
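Each checkpoint and ground_truth entry carries a points weight. The LLM judge marks each criterion passed or failed, and the final score is the passed points over the total points, matching the arithmetic in the evaluation code this commit touches. A minimal sketch with made-up results:

# Sketch of the points-to-score roll-up; the passed/failed values are illustrative.
results = [
    {"criteria": "used get_wave_forecast", "points": 1, "passed": True},
    {"criteria": "used get_wind_forecast", "points": 1, "passed": False},
    {"criteria": "Surf location is approximately 'Fonte da Telha'", "points": 5, "passed": True},
]

won_points = sum(r["points"] for r in results if r["passed"])
missed_points = sum(r["points"] for r in results if not r["passed"])
score = won_points / (won_points + missed_points) * 100  # 6/7 ~= 85.7
print(f"Score: {won_points}/{won_points + missed_points} ({score:.1f}%)")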
src/surf_spot_finder/evaluation/evaluators/CheckpointEvaluator.py DELETED
@@ -1,44 +0,0 @@
-from typing import Dict, List, Any
-
-from surf_spot_finder.evaluation.evaluators.LLMEvaluator import LLMEvaluator
-from surf_spot_finder.evaluation.evaluators.schemas import EvaluationResult
-from any_agent.telemetry import TelemetryProcessor
-from surf_spot_finder.evaluation.test_case import CheckpointCriteria
-from surf_spot_finder.utils.logging import get_logger
-
-logger = get_logger()
-
-
-class CheckpointEvaluator(LLMEvaluator):
-    """Evaluates checkpoints against telemetry"""
-
-    def evaluate(
-        self,
-        telemetry: List[Dict[str, Any]],
-        checkpoints: List[CheckpointCriteria],
-        processor: TelemetryProcessor,
-    ) -> List[EvaluationResult]:
-        """
-        Verify each checkpoint against the telemetry data using LLM
-
-        Args:
-            telemetry: The telemetry data to evaluate
-            checkpoints: List of checkpoint criteria to verify
-            processor: Telemetry processor to extract evidence
-
-        Returns:
-            List of evaluation results
-        """
-        evidence = processor.extract_evidence(telemetry)
-        logger.info(f"""<yellow>Evidence\n{evidence}</yellow>\n""")
-        results = []
-
-        for checkpoint in checkpoints:
-            evaluation = self.llm_evaluate_with_criterion(
-                criteria=checkpoint.criteria,
-                points=checkpoint.points,
-                evidence=evidence,
-            )
-            results.append(evaluation)
-
-        return results
src/surf_spot_finder/evaluation/evaluators/HypothesisEvaluator.py DELETED
@@ -1,29 +0,0 @@
-from typing import Dict, List, Any
-from surf_spot_finder.evaluation.evaluators.LLMEvaluator import LLMEvaluator
-from surf_spot_finder.evaluation.evaluators.schemas import EvaluationResult
-from surf_spot_finder.evaluation.test_case import CheckpointCriteria
-
-
-class HypothesisEvaluator(LLMEvaluator):
-    """Evaluates the final answer against ground truth"""
-
-    def evaluate(
-        self,
-        hypothesis_final_answer: str,
-        ground_truth_answer_dict: Dict[str, Any],
-        ground_truth_checkpoints: List[CheckpointCriteria],
-    ) -> List[EvaluationResult]:
-        """Verify if the final answer meets all specified criteria"""
-        results = []
-
-        for criterion in ground_truth_checkpoints:
-            evaluation = self.llm_evaluate_with_criterion(
-                criteria=criterion.criteria,
-                points=criterion.points,
-                ground_truth_output=ground_truth_answer_dict,
-                hypothesis_final_answer=hypothesis_final_answer,
-            )
-
-            results.append(evaluation)
-
-        return results
src/surf_spot_finder/evaluation/evaluators/LLMEvaluator.py DELETED
@@ -1,95 +0,0 @@
-from abc import ABC
-import json
-import re
-from typing import Dict, List, Any, Optional, Union
-from textwrap import dedent
-
-from litellm import completion
-from surf_spot_finder.evaluation.evaluators.schemas import EvaluationResult
-from surf_spot_finder.evaluation.test_case import CheckpointCriteria
-
-
-class LLMEvaluator(ABC):
-    """Base class for evaluators that use LLM-as-judge"""
-
-    def __init__(self, model: str):
-        self.model = model
-
-    def llm_evaluate_with_criterion(
-        self,
-        criteria: str,
-        points: int,
-        ground_truth_output: Optional[
-            Union[List[CheckpointCriteria], Dict[str, Any]]
-        ] = None,
-        hypothesis_final_answer: Optional[str] = None,
-        evidence: Optional[str] = None,
-    ) -> EvaluationResult:
-        """Evaluate a single criterion using LLM"""
-
-        prompt = dedent(f"""
-        Evaluate if the following criterion was met {"based on the provided evidence" if evidence else "in the agent's answer"}.
-
-        Criterion: {criteria}
-        """)
-
-        if ground_truth_output:
-            prompt += dedent(f"""
-            Expected output: {json.dumps(ground_truth_output)}
-            """)
-        if hypothesis_final_answer:
-            prompt += dedent(f"""
-            Agent's answer: {hypothesis_final_answer}
-            """)
-
-        if evidence:
-            prompt += dedent(f"""
-            Telemetry evidence:
-            {evidence}
-            """)
-
-        prompt += f"""
-
-        Based on the {"evidence" if evidence else "comparison between the expected output and the actual final answer"},
-        was this criterion satisfied? Answer with:
-        1. "passed": true or false
-        2. "reason": Brief explanation for your decision
-        """
-        prompt += """
-        Output valid JSON with these three fields only, in the format:
-        ```json
-        {
-            "passed": true,
-            "reason": "I have them"
-        }
-        ```
-        """
-
-        response = completion(
-            model=self.model, messages=[{"role": "user", "content": prompt}]
-        )
-        content = response.choices[0].message.content
-
-        try:
-            # Extract JSON from the response - looks for patterns like ```json {...} ``` or just {...}
-            json_match = re.search(
-                r"```(?:json)?\s*(\{.*?\})\s*```|(\{.*?\})", content, re.DOTALL
-            )
-
-            if json_match:
-                # Use the first matching group that captured content
-                json_str = next(group for group in json_match.groups() if group)
-                evaluation = json.loads(json_str)
-            else:
-                # Fallback: try parsing the whole content as JSON
-                evaluation = json.loads(content)
-
-            evaluation["criteria"] = criteria
-        except (json.JSONDecodeError, AttributeError, StopIteration) as e:
-            evaluation = {
-                "passed": False,
-                "reason": f"Failed to evaluate due to parsing: {str(e)} \n Response: {content}",
-                "criteria": criteria,
-            }
-        evaluation["points"] = points
-        return EvaluationResult.model_validate(evaluation)
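The core trick in the removed base class is prompting the judge model for a small JSON verdict and then pulling that JSON out of a possibly fenced reply. A worked example of that extraction step, run on a made-up judge reply:

# Worked example of the JSON-extraction pattern from the removed LLMEvaluator.
# The reply text is made up; the regex is the one used above.
import json
import re

content = 'Here is my verdict:\n```json\n{"passed": true, "reason": "The tool was called first."}\n```'

json_match = re.search(r"```(?:json)?\s*(\{.*?\})\s*```|(\{.*?\})", content, re.DOTALL)
json_str = next(group for group in json_match.groups() if group)  # first non-empty capture
evaluation = json.loads(json_str)
print(evaluation["passed"], evaluation["reason"])  # True The tool was called first.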
src/surf_spot_finder/evaluation/evaluators/QuestionAnsweringSquadEvaluator.py DELETED
@@ -1,40 +0,0 @@
-from typing import List
-import evaluate
-from surf_spot_finder.evaluation.evaluators.schemas import EvaluationResult
-
-
-class QuestionAnsweringSquadEvaluator:
-    """Directly compares answers without using LLM-as-judge"""
-
-    def __init__(self):
-        self.metric = evaluate.load("squad")
-
-    def evaluate(
-        self, hypothesis_answer: str, ground_truth_answer: list
-    ) -> List[EvaluationResult]:
-        """Directly compare answers using simple matching"""
-
-        # format the answers so that they're dicts with 'id' and 'prediction' keys for hypo
-        # and the ref has id and answers keys
-        hypothesis_answer = [{"id": "1", "prediction_text": hypothesis_answer}]
-        ground_truth_answer = [
-            {
-                "id": "1",
-                "answers": {
-                    "answer_start": [0],
-                    "text": [ground_truth_answer[0]["value"]],
-                },
-            }
-        ]
-        # Use the SQuAD metric to compare answers
-        result = self.metric.compute(
-            predictions=hypothesis_answer, references=ground_truth_answer
-        )
-
-        match = EvaluationResult(
-            passed=True if int(result["exact_match"]) == 1 else False,
-            reason=f"Partial Match (F1) score is {round(result['f1'], 2)}",
-            criteria="Is the answer a direct match?",
-            points=1,
-        )
-        return [match]
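The removed direct-match check wraps the Hugging Face evaluate library's SQuAD metric, which reports exact_match and F1 on a 0-100 scale. A minimal sketch of that comparison outside the class, with illustrative answers:

# Sketch of the SQuAD-based direct match used by the removed evaluator.
# Requires the `evaluate` package; the answers below are illustrative.
import evaluate

metric = evaluate.load("squad")
predictions = [{"id": "1", "prediction_text": "Fonte da Telha"}]
references = [{"id": "1", "answers": {"answer_start": [0], "text": ["Fonte da Telha"]}}]

result = metric.compute(predictions=predictions, references=references)
print(result)  # e.g. {'exact_match': 100.0, 'f1': 100.0}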
src/surf_spot_finder/evaluation/evaluators/__init__.py DELETED
@@ -1,9 +0,0 @@
-from .CheckpointEvaluator import CheckpointEvaluator
-from .QuestionAnsweringSquadEvaluator import QuestionAnsweringSquadEvaluator
-from .HypothesisEvaluator import HypothesisEvaluator
-
-__all__ = [
-    "CheckpointEvaluator",
-    "QuestionAnsweringSquadEvaluator",
-    "HypothesisEvaluator",
-]
src/surf_spot_finder/evaluation/evaluators/schemas.py DELETED
@@ -1,11 +0,0 @@
-from pydantic import BaseModel, ConfigDict
-
-
-class EvaluationResult(BaseModel):
-    """Represents the result of evaluating a criterion"""
-
-    model_config = ConfigDict(extra="forbid")
-    passed: bool
-    reason: str
-    criteria: str
-    points: int
src/surf_spot_finder/evaluation/main.py CHANGED
@@ -1,153 +1,28 @@
-import json
-from textwrap import dedent
-from typing import Any, Dict, List, Optional
-
-from any_agent import AnyAgent
-from any_agent.telemetry import TelemetryProcessor
-from any_agent.tracing import setup_tracing
+from typing import Optional
 from fire import Fire
+from pydantic import BaseModel, ConfigDict
 
-from surf_spot_finder.config import (
-    Config,
-)
-from surf_spot_finder.evaluation.evaluators import (
-    CheckpointEvaluator,
-    HypothesisEvaluator,
-    QuestionAnsweringSquadEvaluator,
-)
-from surf_spot_finder.evaluation.test_case import TestCase
-from surf_spot_finder.evaluation.results_saver import save_evaluation_results
-from surf_spot_finder.utils.logging import get_logger
+from any_agent.evaluation.test_case import TestCase
+from any_agent.evaluation.logging import get_logger
+from any_agent.evaluation.evaluate import evaluate_telemetry
 
 # Replace the existing logger setup with the shared logger
 logger = get_logger()
 
 
-def run(agent_config: Config) -> str:
-    logger.info("Setting up tracing")
-    tracing_path = setup_tracing(agent_config.framework, "output")
-
-    logger.info(f"Loading {agent_config.framework} agent")
-    logger.info(f"{agent_config.managed_agents}")
-    agent = AnyAgent.create(
-        agent_framework=agent_config.framework,
-        agent_config=agent_config.main_agent,
-        managed_agents=agent_config.managed_agents,
-    )
-
-    query = agent_config.input_prompt_template.format(
-        LOCATION=agent_config.location,
-        MAX_DRIVING_HOURS=agent_config.max_driving_hours,
-        DATE=agent_config.date,
-    )
-    logger.info(f"Running agent with query:\n{query}")
-    agent.run(query)
-
-    logger.success("Done!")
-
-    return tracing_path
-
-
-def evaluate_telemetry(test_case: TestCase, telemetry_path: str) -> bool:
-    # load the json file
-    with open(telemetry_path, "r") as f:
-        telemetry: List[Dict[str, Any]] = json.loads(f.read())
-    logger.info(f"Telemetry loaded from {telemetry_path}")
-
-    agent_framework = TelemetryProcessor.determine_agent_framework(telemetry)
-
-    # Extract the final answer from the telemetry
-    processor = TelemetryProcessor.create(agent_framework)
-    hypothesis_answer = processor.extract_hypothesis_answer(trace=telemetry)
-
-    # Checkpoint evaluation
-    checkpoint_evaluator = CheckpointEvaluator(model=test_case.llm_judge)
-    checkpoint_results = checkpoint_evaluator.evaluate(
-        telemetry=telemetry,
-        checkpoints=test_case.checkpoints,
-        processor=processor,
-    )
-
-    # Hypothesis answer evaluation
-    hypothesis_evaluator = HypothesisEvaluator(model=test_case.llm_judge)
-    hypothesis_answer_results = hypothesis_evaluator.evaluate(
-        hypothesis_final_answer=hypothesis_answer,
-        ground_truth_answer_dict=test_case.ground_truth,
-        ground_truth_checkpoints=test_case.final_answer_criteria,
-    )
-
-    # Direct answer evaluation (new)
-    if test_case.ground_truth:
-        direct_evaluator = QuestionAnsweringSquadEvaluator()
-        direct_results = direct_evaluator.evaluate(
-            hypothesis_answer=hypothesis_answer,
-            ground_truth_answer=test_case.ground_truth,
-        )
-    else:
-        direct_results = []
-    # Combine all results
-    verification_results = (
-        checkpoint_results + hypothesis_answer_results + direct_results
-    )
-    # Summarize results
-    output_message = ""
-    output_message += (
-        f"""<yellow>Hypothesis Final answer extracted: {hypothesis_answer}</yellow>\n"""
-    )
-    failed_checks = [r for r in verification_results if not r.passed]
-    passed_checks = [r for r in verification_results if r.passed]
-    missed_points = sum([r.points for r in failed_checks])
-    won_points = sum([r.points for r in passed_checks])
-    if passed_checks:
-        for check in passed_checks:
-            message = dedent(
-                f"""
-                <green>Passed:
-                - {check.criteria}
-                - {check.reason}</green>"""
-            )
-            output_message += message + "\n"
-    if failed_checks:
-        for check in failed_checks:
-            message = dedent(
-                f"""
-                <red>Failed:
-                - {check.criteria}
-                - {check.reason}</red>"""
-            )
-            output_message += message + "\n"
-    else:
-        output_message += "<green>All checkpoints passed!</green>\n"
-    output_message += f"<green>Passed checkpoints: {len(passed_checks)}</green>\n"
-    output_message += f"<red>Failed checkpoints: {len(failed_checks)}</red>\n"
-    output_message += "<green>=====================================</green>\n"
-    output_message += (
-        f"<green>Score: {won_points}/{won_points + missed_points}</green>\n"
-    )
-    output_message += "<green>=====================================</green>\n"
-    logger.info(output_message)
-
-    if won_points + missed_points == 0:
-        raise ValueError("No points were defined in the test case")
-    score = won_points / (won_points + missed_points) * 100
-
-    # Save the evaluation results
-    save_evaluation_results(
-        test_case=test_case,
-        output_path=test_case.output_path,
-        output_message=output_message,
-        telemetry_path=telemetry_path,
-        hypothesis_answer=hypothesis_answer,
-        passed_checks=len(passed_checks),
-        failed_checks=len(failed_checks),
-        score=score,
-    )
+class InputModel(BaseModel):
+    """Input configuration for an evaluation test case"""
+
+    model_config = ConfigDict(extra="forbid")
+    location: str
+    date: str
+    max_driving_hours: int
+    input_prompt_template: str | None = None
 
 
 def evaluate(
     test_case_path: str,
-    agent_config_path: str = None,
-    telemetry_path: Optional[str] = None,
+    telemetry_path: Optional[str],
 ) -> None:
     """
     Evaluate agent performance using either a provided telemetry file or by running the agent.
@@ -156,23 +31,12 @@ def evaluate(
         telemetry_path: Optional path to an existing telemetry file. If not provided,
                         the agent will be run to generate one.
     """
-    test_case = TestCase.from_yaml(
-        test_case_path=test_case_path, agent_config_path=agent_config_path
-    )
+    test_case = TestCase.from_yaml(test_case_path)
 
-    if telemetry_path is None:
-        logger.info(
-            "No telemetry path provided. Running agent to generate telemetry..."
-        )
-        assert (
-            agent_config_path is not None
-        ), "Agent config path must be provided if running agent"
-        telemetry_path = run(test_case.agent_config)
-    else:
-        logger.info(f"Using provided telemetry file: {telemetry_path}")
-        logger.info(
-            "For this to work, the telemetry file must align with the test case.",
-        )
+    logger.info(f"Using provided telemetry file: {telemetry_path}")
+    logger.info(
+        "For this to work, the telemetry file must align with the test case.",
+    )
 
     evaluate_telemetry(test_case, telemetry_path)
 
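With run() and the local evaluators removed, evaluation is driven entirely by an existing telemetry trace. A minimal usage sketch that mirrors the body of the new evaluate() entry point; the telemetry path is a placeholder:

# Sketch of the trace-only evaluation flow after this commit.
# Imports match the new main.py; "output/telemetry.json" is a placeholder path.
from any_agent.evaluation.test_case import TestCase
from any_agent.evaluation.evaluate import evaluate_telemetry

test_case = TestCase.from_yaml("notebooks/experiment/test_cases/alpha.yaml")
# The telemetry file must already exist and align with the test case.
evaluate_telemetry(test_case, "output/telemetry.json")

The module still imports Fire, so the same call is presumably exposed on the command line as well, although the Fire entry point is not visible in this hunk.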
 
src/surf_spot_finder/evaluation/results_saver.py DELETED
@@ -1,64 +0,0 @@
-import os
-import pandas as pd
-
-from surf_spot_finder.evaluation.test_case import TestCase
-from surf_spot_finder.utils.logging import get_logger
-
-# Use the shared logger
-logger = get_logger()
-
-
-def save_evaluation_results(
-    test_case: TestCase,
-    output_path: str,
-    output_message: str,
-    telemetry_path: str,
-    hypothesis_answer: str,
-    passed_checks: int,
-    failed_checks: int,
-    score: float,
-) -> None:
-    """
-    Save evaluation results to the specified output path.
-
-    Args:
-        test_case: Path to the test case file
-        agent_config: Path to the agent configuration file
-        output_path: Path to save the results
-        output_message: Formatted output message with evaluation details
-        telemetry_path: Path to the telemetry file used
-        hypothesis_answer: The extracted hypothesis answer
-        passed_checks: Number of passed checkpoints
-        failed_checks: Number of failed checkpoints
-        score: Evaluation score as a percentage
-    """
-    # See if the output_path file exists
-    if os.path.exists(output_path):
-        logger.info(f"Reading existing output from {output_path}")
-        df = pd.read_json(output_path, orient="records", lines=True)
-    else:
-        logger.info(f"Creating new output file at {output_path}")
-        df = pd.DataFrame()
-
-    df = pd.concat(
-        [
-            df,
-            pd.DataFrame(
-                [
-                    {
-                        "config": test_case.model_dump(),
-                        "agent_config_path": test_case.agent_config_path,
-                        "test_case_path": test_case.test_case_path,
-                        "output_message": output_message,
-                        "telemetry_path": telemetry_path,
-                        "hypothesis_answer": hypothesis_answer,
-                        "passed_checks": passed_checks,
-                        "failed_checks": failed_checks,
-                        "score": round(score, 2),
-                    }
-                ]
-            ),
-        ]
-    )
-    logger.info(f"Writing output to {output_path}")
-    df.to_json(output_path, orient="records", lines=True)
src/surf_spot_finder/evaluation/test_case.py DELETED
@@ -1,86 +0,0 @@
-from typing import Dict, List, Any
-from pydantic import BaseModel, Field, ConfigDict
-import yaml
-from litellm import validate_environment
-
-from surf_spot_finder.config import Config
-
-
-class InputModel(BaseModel):
-    """Input configuration for the surf spot finder test case"""
-
-    model_config = ConfigDict(extra="forbid")
-    location: str
-    date: str
-    max_driving_hours: int
-    input_prompt_template: str
-
-
-class CheckpointCriteria(BaseModel):
-    """Represents a checkpoint criteria with a description"""
-
-    model_config = ConfigDict(extra="forbid")
-    criteria: str
-    points: int
-
-
-class TestCase(BaseModel):
-    model_config = ConfigDict(extra="forbid")
-    input: InputModel
-    ground_truth: List[Dict[str, Any]] = Field(default_factory=list)
-    checkpoints: List[CheckpointCriteria] = Field(default_factory=list)
-    llm_judge: str
-    final_answer_criteria: List[CheckpointCriteria] = Field(default_factory=list)
-    test_case_path: str
-    agent_config_path: str
-    agent_config: Config
-    output_path: str = "output/results.json"
-
-    @classmethod
-    def from_yaml(cls, test_case_path: str, agent_config_path: str) -> "TestCase":
-        """Load a test case from a YAML file and process it"""
-        with open(test_case_path, "r") as f:
-            test_case_dict = yaml.safe_load(f)
-        final_answer_criteria = []
-
-        def add_gt_final_answer_criteria(ground_truth_list):
-            """Add checkpoints for each item in the ground_truth list"""
-            for item in ground_truth_list:
-                if isinstance(item, dict) and "name" in item and "value" in item:
-                    points = item.get(
-                        "points", 1
-                    )  # Default to 1 if points not specified
-                    final_answer_criteria.append(
-                        {
-                            "points": points,
-                            "criteria": f"Check if {item['name']} is approximately '{item['value']}'.",
-                        }
-                    )
-
-        if "ground_truth" in test_case_dict:
-            add_gt_final_answer_criteria(test_case_dict["ground_truth"])
-            test_case_dict["final_answer_criteria"] = final_answer_criteria
-            # remove the points from the ground_truth list but keep the name and value
-            test_case_dict["ground_truth"] = [
-                item
-                for item in test_case_dict["ground_truth"]
-                if isinstance(item, dict)
-            ]
-
-        test_case_dict["test_case_path"] = test_case_path
-        test_case_dict["agent_config_path"] = agent_config_path
-        with open(agent_config_path, "r") as f:
-            agent_config_dict = yaml.safe_load(f)
-        agent_config_dict["location"] = test_case_dict["input"]["location"]
-        agent_config_dict["date"] = test_case_dict["input"]["date"]
-        agent_config_dict["max_driving_hours"] = test_case_dict["input"][
-            "max_driving_hours"
-        ]
-        agent_config_dict["input_prompt_template"] = test_case_dict["input"][
-            "input_prompt_template"
-        ]
-        agent_config = Config.model_validate(agent_config_dict)
-        test_case_dict["agent_config"] = agent_config
-        # verify that the llm_judge is a valid litellm model
-        validate_environment(test_case_dict["llm_judge"])
-        return cls.model_validate(test_case_dict)
src/surf_spot_finder/utils/logging.py DELETED
@@ -1,14 +0,0 @@
-import sys
-from loguru import logger
-
-# Remove default logger
-logger.remove()
-
-# Add custom colored logger
-logger = logger.opt(ansi=True)
-logger.add(sys.stdout, colorize=True, format="{message}")
-
-
-# Export configured logger
-def get_logger():
-    return logger