Brake/any agent eval code (#52)
Nathan Brake committed
* remove eval code
* Only do eval with existing trace
- examples/openai_single_agent_vertical.yaml +4 -5
- notebooks/experiment/test_cases/alpha.yaml +26 -33
- src/surf_spot_finder/evaluation/evaluators/CheckpointEvaluator.py +0 -44
- src/surf_spot_finder/evaluation/evaluators/HypothesisEvaluator.py +0 -29
- src/surf_spot_finder/evaluation/evaluators/LLMEvaluator.py +0 -95
- src/surf_spot_finder/evaluation/evaluators/QuestionAnsweringSquadEvaluator.py +0 -40
- src/surf_spot_finder/evaluation/evaluators/__init__.py +0 -9
- src/surf_spot_finder/evaluation/evaluators/schemas.py +0 -11
- src/surf_spot_finder/evaluation/main.py +18 -154
- src/surf_spot_finder/evaluation/results_saver.py +0 -64
- src/surf_spot_finder/evaluation/test_case.py +0 -86
- src/surf_spot_finder/utils/logging.py +0 -14
examples/openai_single_agent_vertical.yaml
CHANGED
@@ -1,11 +1,11 @@
-location:
-date: 2025-04-
-max_driving_hours:
+location: Lisbon
+date: 2025-04-08 19:00
+max_driving_hours: 1
 
 framework: openai
 
 main_agent:
-  model_id:
+  model_id: gpt-4o
   tools:
     - "surf_spot_finder.tools.driving_hours_to_meters"
    - "surf_spot_finder.tools.get_area_lat_lon"
@@ -13,5 +13,4 @@ main_agent:
     - "surf_spot_finder.tools.get_wave_forecast"
     - "surf_spot_finder.tools.get_wind_forecast"
     - "any_agent.tools.search_web"
-    - "any_agent.tools.show_plan"
     - "any_agent.tools.visit_webpage"
notebooks/experiment/test_cases/alpha.yaml
CHANGED
@@ -1,44 +1,37 @@
-output_path: "results.json"
-
-# You only need this input data if you want to run the test case, if you pass in a path to a telemetry file this
-# is ignored
-input:
-  location: "Huntington Beach, California"
-  date: "2025-04-10 14:00"
-  max_driving_hours: 1
-  input_prompt_template: |
-    Please help find the best place to surf around {LOCATION},
-    in a {MAX_DRIVING_HOURS} hour driving radius, at {DATE}?
-    Find a few options and then message David de la Iglesia Castro to discuss it with him.
-    Make sure he agrees with the choice. Your final answer should be a detailed description of the surf location, wind, wave, and weather conditions.
-    In addition to making it your final answer, also include description of the weather at the chosen location by writing to the file /projects/final_answer.txt
-
-# Optionally I can check whether it picked the right response
-ground_truth:
-  - name: "Surf location"
-    points: 5
-    value: "T Street, San Clemente, California"
-
 # Base checkpoints for agent behavior
-#
-
+# The evaluators for these checkpoints will not consider the hypothesis answer or final answer in their decision making
+llm_judge: openai/gpt-4o
 checkpoints:
   - points: 1
-    criteria:
+    criteria: |
+      Check if the agent used the get_surfing_spots tool and it succeeded,
+      and that the tool was used before the get_wave_forecast and get_wind_forecast tools
   - points: 1
-    criteria:
+    criteria: Check if the agent used the get_wave_forecast tool and it succeeded
   - points: 1
-    criteria:
+    criteria: Check if the agent used the get_wind_forecast tool and it succeeded
   - points: 1
-    criteria:
+    criteria: Check if the agent used the get_area_lat_lon tool and it succeeded
   - points: 1
-    criteria:
+    criteria: |
+      Check if the agent used the driving_hours_to_meters tool to convert
+      the driving hours to meters and it succeeded
   - points: 1
-    criteria:
+    criteria: Check if the final answer contains any description about the weather at the chosen location
   - points: 1
-    criteria:
+    criteria: Check if the final answer contains one of the surf spots found by a call of the get_surfing_spots tool
-
-
+
+# Optionally, you can check whether the final answer is what was expected. Checking this value does not use an LLM
+ground_truth:
+  - name: Surf location
+    points: 5
+    value: Fonte da Telha
 
+output_path: results.json
 
-
+# You only need this input data if you want to run the test case, if you pass in a path to a telemetry file this
+# is ignored
+input:
+  location: "Lisbon"
+  date: "2025-04-08 19:00"
+  max_driving_hours: 1
src/surf_spot_finder/evaluation/evaluators/CheckpointEvaluator.py
DELETED
@@ -1,44 +0,0 @@
-from typing import Dict, List, Any
-
-from surf_spot_finder.evaluation.evaluators.LLMEvaluator import LLMEvaluator
-from surf_spot_finder.evaluation.evaluators.schemas import EvaluationResult
-from any_agent.telemetry import TelemetryProcessor
-from surf_spot_finder.evaluation.test_case import CheckpointCriteria
-from surf_spot_finder.utils.logging import get_logger
-
-logger = get_logger()
-
-
-class CheckpointEvaluator(LLMEvaluator):
-    """Evaluates checkpoints against telemetry"""
-
-    def evaluate(
-        self,
-        telemetry: List[Dict[str, Any]],
-        checkpoints: List[CheckpointCriteria],
-        processor: TelemetryProcessor,
-    ) -> List[EvaluationResult]:
-        """
-        Verify each checkpoint against the telemetry data using LLM
-
-        Args:
-            telemetry: The telemetry data to evaluate
-            checkpoints: List of checkpoint criteria to verify
-            processor: Telemetry processor to extract evidence
-
-        Returns:
-            List of evaluation results
-        """
-        evidence = processor.extract_evidence(telemetry)
-        logger.info(f"""<yellow>Evidence\n{evidence}</yellow>\n""")
-        results = []
-
-        for checkpoint in checkpoints:
-            evaluation = self.llm_evaluate_with_criterion(
-                criteria=checkpoint.criteria,
-                points=checkpoint.points,
-                evidence=evidence,
-            )
-            results.append(evaluation)
-
-        return results
src/surf_spot_finder/evaluation/evaluators/HypothesisEvaluator.py
DELETED
@@ -1,29 +0,0 @@
-from typing import Dict, List, Any
-from surf_spot_finder.evaluation.evaluators.LLMEvaluator import LLMEvaluator
-from surf_spot_finder.evaluation.evaluators.schemas import EvaluationResult
-from surf_spot_finder.evaluation.test_case import CheckpointCriteria
-
-
-class HypothesisEvaluator(LLMEvaluator):
-    """Evaluates the final answer against ground truth"""
-
-    def evaluate(
-        self,
-        hypothesis_final_answer: str,
-        ground_truth_answer_dict: Dict[str, Any],
-        ground_truth_checkpoints: List[CheckpointCriteria],
-    ) -> List[EvaluationResult]:
-        """Verify if the final answer meets all specified criteria"""
-        results = []
-
-        for criterion in ground_truth_checkpoints:
-            evaluation = self.llm_evaluate_with_criterion(
-                criteria=criterion.criteria,
-                points=criterion.points,
-                ground_truth_output=ground_truth_answer_dict,
-                hypothesis_final_answer=hypothesis_final_answer,
-            )
-
-            results.append(evaluation)
-
-        return results
src/surf_spot_finder/evaluation/evaluators/LLMEvaluator.py
DELETED
@@ -1,95 +0,0 @@
-from abc import ABC
-import json
-import re
-from typing import Dict, List, Any, Optional, Union
-from textwrap import dedent
-
-from litellm import completion
-from surf_spot_finder.evaluation.evaluators.schemas import EvaluationResult
-from surf_spot_finder.evaluation.test_case import CheckpointCriteria
-
-
-class LLMEvaluator(ABC):
-    """Base class for evaluators that use LLM-as-judge"""
-
-    def __init__(self, model: str):
-        self.model = model
-
-    def llm_evaluate_with_criterion(
-        self,
-        criteria: str,
-        points: int,
-        ground_truth_output: Optional[
-            Union[List[CheckpointCriteria], Dict[str, Any]]
-        ] = None,
-        hypothesis_final_answer: Optional[str] = None,
-        evidence: Optional[str] = None,
-    ) -> EvaluationResult:
-        """Evaluate a single criterion using LLM"""
-
-        prompt = dedent(f"""
-        Evaluate if the following criterion was met {"based on the provided evidence" if evidence else "in the agent's answer"}.
-
-        Criterion: {criteria}
-        """)
-
-        if ground_truth_output:
-            prompt += dedent(f"""
-            Expected output: {json.dumps(ground_truth_output)}
-            """)
-        if hypothesis_final_answer:
-            prompt += dedent(f"""
-            Agent's answer: {hypothesis_final_answer}
-            """)
-
-        if evidence:
-            prompt += dedent(f"""
-            Telemetry evidence:
-            {evidence}
-            """)
-
-        prompt += f"""
-
-        Based on the {"evidence" if evidence else "comparison between the expected output and the actual final answer"},
-        was this criterion satisfied? Answer with:
-        1. "passed": true or false
-        2. "reason": Brief explanation for your decision
-        """
-        prompt += """
-        Output valid JSON with these three fields only, in the format:
-        ```json
-        {
-            "passed": true,
-            "reason": "I have them"
-        }
-        ```
-        """
-
-        response = completion(
-            model=self.model, messages=[{"role": "user", "content": prompt}]
-        )
-        content = response.choices[0].message.content
-
-        try:
-            # Extract JSON from the response - looks for patterns like ```json {...} ``` or just {...}
-            json_match = re.search(
-                r"```(?:json)?\s*(\{.*?\})\s*```|(\{.*?\})", content, re.DOTALL
-            )
-
-            if json_match:
-                # Use the first matching group that captured content
-                json_str = next(group for group in json_match.groups() if group)
-                evaluation = json.loads(json_str)
-            else:
-                # Fallback: try parsing the whole content as JSON
-                evaluation = json.loads(content)
-
-            evaluation["criteria"] = criteria
-        except (json.JSONDecodeError, AttributeError, StopIteration) as e:
-            evaluation = {
-                "passed": False,
-                "reason": f"Failed to evaluate due to parsing: {str(e)} \n Response: {content}",
-                "criteria": criteria,
-            }
-        evaluation["points"] = points
-        return EvaluationResult.model_validate(evaluation)
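The removed LLMEvaluator parsed the judge's reply with the regex shown above. A minimal standalone exercise of that same pattern on a fenced JSON response, using an illustrative reply string:

import json
import re

content = '```json\n{"passed": true, "reason": "tool call found in telemetry"}\n```'

# Same pattern as the deleted code: fenced ```json {...}``` block, or a bare {...}
match = re.search(r"```(?:json)?\s*(\{.*?\})\s*```|(\{.*?\})", content, re.DOTALL)
json_str = next(group for group in match.groups() if group)
print(json.loads(json_str))  # {'passed': True, 'reason': 'tool call found in telemetry'}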
src/surf_spot_finder/evaluation/evaluators/QuestionAnsweringSquadEvaluator.py
DELETED
@@ -1,40 +0,0 @@
-from typing import List
-import evaluate
-from surf_spot_finder.evaluation.evaluators.schemas import EvaluationResult
-
-
-class QuestionAnsweringSquadEvaluator:
-    """Directly compares answers without using LLM-as-judge"""
-
-    def __init__(self):
-        self.metric = evaluate.load("squad")
-
-    def evaluate(
-        self, hypothesis_answer: str, ground_truth_answer: list
-    ) -> List[EvaluationResult]:
-        """Directly compare answers using simple matching"""
-
-        # format the answers so that they're dicts with 'id' and 'prediction' keys for hypo
-        # and the ref has id and answers keys
-        hypothesis_answer = [{"id": "1", "prediction_text": hypothesis_answer}]
-        ground_truth_answer = [
-            {
-                "id": "1",
-                "answers": {
-                    "answer_start": [0],
-                    "text": [ground_truth_answer[0]["value"]],
-                },
-            }
-        ]
-        # Use the SQuAD metric to compare answers
-        result = self.metric.compute(
-            predictions=hypothesis_answer, references=ground_truth_answer
-        )
-
-        match = EvaluationResult(
-            passed=True if int(result["exact_match"]) == 1 else False,
-            reason=f"Partial Match (F1) score is {round(result['f1'], 2)}",
-            criteria="Is the answer a direct match?",
-            points=1,
-        )
-        return [match]
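The deleted QuestionAnsweringSquadEvaluator wrapped the Hugging Face "squad" metric. The same comparison can be reproduced directly, assuming the evaluate package is installed; the answer strings below are taken from the ground truth in this diff and are only illustrative:

import evaluate

metric = evaluate.load("squad")
predictions = [{"id": "1", "prediction_text": "Fonte da Telha"}]
references = [{"id": "1", "answers": {"answer_start": [0], "text": ["Fonte da Telha"]}}]

# SQuAD reports exact_match and f1 on a 0-100 scale
result = metric.compute(predictions=predictions, references=references)
print(result)  # e.g. {'exact_match': 100.0, 'f1': 100.0}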
src/surf_spot_finder/evaluation/evaluators/__init__.py
DELETED
@@ -1,9 +0,0 @@
-from .CheckpointEvaluator import CheckpointEvaluator
-from .QuestionAnsweringSquadEvaluator import QuestionAnsweringSquadEvaluator
-from .HypothesisEvaluator import HypothesisEvaluator
-
-__all__ = [
-    "CheckpointEvaluator",
-    "QuestionAnsweringSquadEvaluator",
-    "HypothesisEvaluator",
-]
src/surf_spot_finder/evaluation/evaluators/schemas.py
DELETED
@@ -1,11 +0,0 @@
-from pydantic import BaseModel, ConfigDict
-
-
-class EvaluationResult(BaseModel):
-    """Represents the result of evaluating a criterion"""
-
-    model_config = ConfigDict(extra="forbid")
-    passed: bool
-    reason: str
-    criteria: str
-    points: int
src/surf_spot_finder/evaluation/main.py
CHANGED
@@ -1,153 +1,28 @@
-import
-from textwrap import dedent
-from typing import Any, Dict, List, Optional
-
-from any_agent import AnyAgent
-from any_agent.telemetry import TelemetryProcessor
-from any_agent.tracing import setup_tracing
+from typing import Optional
 from fire import Fire
+from pydantic import BaseModel, ConfigDict
 
-from
-
-
-from surf_spot_finder.evaluation.evaluators import (
-    CheckpointEvaluator,
-    HypothesisEvaluator,
-    QuestionAnsweringSquadEvaluator,
-)
-from surf_spot_finder.evaluation.test_case import TestCase
-from surf_spot_finder.evaluation.results_saver import save_evaluation_results
-from surf_spot_finder.utils.logging import get_logger
+from any_agent.evaluation.test_case import TestCase
+from any_agent.evaluation.logging import get_logger
+from any_agent.evaluation.evaluate import evaluate_telemetry
 
 # Replace the existing logger setup with the shared logger
 logger = get_logger()
 
 
-
-
-    tracing_path = setup_tracing(agent_config.framework, "output")
-
-    logger.info(f"Loading {agent_config.framework} agent")
-    logger.info(f"{agent_config.managed_agents}")
-    agent = AnyAgent.create(
-        agent_framework=agent_config.framework,
-        agent_config=agent_config.main_agent,
-        managed_agents=agent_config.managed_agents,
-    )
-
-    query = agent_config.input_prompt_template.format(
-        LOCATION=agent_config.location,
-        MAX_DRIVING_HOURS=agent_config.max_driving_hours,
-        DATE=agent_config.date,
-    )
-    logger.info(f"Running agent with query:\n{query}")
-    agent.run(query)
-
-    logger.success("Done!")
-
-    return tracing_path
-
-
-def evaluate_telemetry(test_case: TestCase, telemetry_path: str) -> bool:
-    # load the json file
-    with open(telemetry_path, "r") as f:
-        telemetry: List[Dict[str, Any]] = json.loads(f.read())
-    logger.info(f"Telemetry loaded from {telemetry_path}")
-
-
-
-
-
-
-
-    # Checkpoint evaluation
-    checkpoint_evaluator = CheckpointEvaluator(model=test_case.llm_judge)
-    checkpoint_results = checkpoint_evaluator.evaluate(
-        telemetry=telemetry,
-        checkpoints=test_case.checkpoints,
-        processor=processor,
-    )
-
-    # Hypothesis answer evaluation
-    hypothesis_evaluator = HypothesisEvaluator(model=test_case.llm_judge)
-    hypothesis_answer_results = hypothesis_evaluator.evaluate(
-        hypothesis_final_answer=hypothesis_answer,
-        ground_truth_answer_dict=test_case.ground_truth,
-        ground_truth_checkpoints=test_case.final_answer_criteria,
-    )
-
-    # Direct answer evaluation (new)
-    if test_case.ground_truth:
-        direct_evaluator = QuestionAnsweringSquadEvaluator()
-        direct_results = direct_evaluator.evaluate(
-            hypothesis_answer=hypothesis_answer,
-            ground_truth_answer=test_case.ground_truth,
-        )
-    else:
-        direct_results = []
-    # Combine all results
-    verification_results = (
-        checkpoint_results + hypothesis_answer_results + direct_results
-    )
-    # Summarize results
-    output_message = ""
-    output_message += (
-        f"""<yellow>Hypothesis Final answer extracted: {hypothesis_answer}</yellow>\n"""
-    )
-    failed_checks = [r for r in verification_results if not r.passed]
-    passed_checks = [r for r in verification_results if r.passed]
-    missed_points = sum([r.points for r in failed_checks])
-    won_points = sum([r.points for r in passed_checks])
-    if passed_checks:
-        for check in passed_checks:
-            message = dedent(
-                f"""
-                <green>Passed:
-                - {check.criteria}
-                - {check.reason}</green>"""
-            )
-            output_message += message + "\n"
-    if failed_checks:
-        for check in failed_checks:
-            message = dedent(
-                f"""
-                <red>Failed:
-                - {check.criteria}
-                - {check.reason}</red>"""
-            )
-            output_message += message + "\n"
-    else:
-        output_message += "<green>All checkpoints passed!</green>\n"
-    output_message += f"<green>Passed checkpoints: {len(passed_checks)}</green>\n"
-    output_message += f"<red>Failed checkpoints: {len(failed_checks)}</red>\n"
-    output_message += "<green>=====================================</green>\n"
-    output_message += (
-        f"<green>Score: {won_points}/{won_points + missed_points}</green>\n"
-    )
-    output_message += "<green>=====================================</green>\n"
-    logger.info(output_message)
-
-    if won_points + missed_points == 0:
-        raise ValueError("No points were defined in the test case")
-    score = won_points / (won_points + missed_points) * 100
-
-    # Save the evaluation results
-    save_evaluation_results(
-        test_case=test_case,
-        output_path=test_case.output_path,
-        output_message=output_message,
-        telemetry_path=telemetry_path,
-        hypothesis_answer=hypothesis_answer,
-        passed_checks=len(passed_checks),
-        failed_checks=len(failed_checks),
-        score=score,
-    )
+class InputModel(BaseModel):
+    """Input configuration for an evaluation test case"""
+
+    model_config = ConfigDict(extra="forbid")
+    location: str
+    date: str
+    max_driving_hours: int
+    input_prompt_template: str | None = None
 
 
 def evaluate(
     test_case_path: str,
-
-    telemetry_path: Optional[str] = None,
+    telemetry_path: Optional[str],
 ) -> None:
     """
     Evaluate agent performance using either a provided telemetry file or by running the agent.
@@ -156,23 +31,12 @@ def evaluate(
         telemetry_path: Optional path to an existing telemetry file. If not provided,
             the agent will be run to generate one.
     """
-    test_case = TestCase.from_yaml(
-        test_case_path=test_case_path, agent_config_path=agent_config_path
-    )
+    test_case = TestCase.from_yaml(test_case_path)
 
-
-
-
-
-        assert (
-            agent_config_path is not None
-        ), "Agent config path must be provided if running agent"
-        telemetry_path = run(test_case.agent_config)
-    else:
-        logger.info(f"Using provided telemetry file: {telemetry_path}")
-        logger.info(
-            "For this to work, the telemetry file must align with the test case.",
-        )
+    logger.info(f"Using provided telemetry file: {telemetry_path}")
+    logger.info(
+        "For this to work, the telemetry file must align with the test case.",
+    )
 
     evaluate_telemetry(test_case, telemetry_path)
 
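A minimal sketch of driving the trimmed-down entry point with an existing trace. The Fire wiring is not visible in this hunk, so evaluate() is called directly; the telemetry path below is a placeholder for a previously generated trace file:

from surf_spot_finder.evaluation.main import evaluate

evaluate(
    test_case_path="notebooks/experiment/test_cases/alpha.yaml",
    telemetry_path="output/telemetry.json",  # placeholder: an existing any-agent trace
)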
src/surf_spot_finder/evaluation/results_saver.py
DELETED
@@ -1,64 +0,0 @@
-import os
-import pandas as pd
-
-from surf_spot_finder.evaluation.test_case import TestCase
-from surf_spot_finder.utils.logging import get_logger
-
-# Use the shared logger
-logger = get_logger()
-
-
-def save_evaluation_results(
-    test_case: TestCase,
-    output_path: str,
-    output_message: str,
-    telemetry_path: str,
-    hypothesis_answer: str,
-    passed_checks: int,
-    failed_checks: int,
-    score: float,
-) -> None:
-    """
-    Save evaluation results to the specified output path.
-
-    Args:
-        test_case: Path to the test case file
-        agent_config: Path to the agent configuration file
-        output_path: Path to save the results
-        output_message: Formatted output message with evaluation details
-        telemetry_path: Path to the telemetry file used
-        hypothesis_answer: The extracted hypothesis answer
-        passed_checks: Number of passed checkpoints
-        failed_checks: Number of failed checkpoints
-        score: Evaluation score as a percentage
-    """
-    # See if the output_path file exists
-    if os.path.exists(output_path):
-        logger.info(f"Reading existing output from {output_path}")
-        df = pd.read_json(output_path, orient="records", lines=True)
-    else:
-        logger.info(f"Creating new output file at {output_path}")
-        df = pd.DataFrame()
-
-    df = pd.concat(
-        [
-            df,
-            pd.DataFrame(
-                [
-                    {
-                        "config": test_case.model_dump(),
-                        "agent_config_path": test_case.agent_config_path,
-                        "test_case_path": test_case.test_case_path,
-                        "output_message": output_message,
-                        "telemetry_path": telemetry_path,
-                        "hypothesis_answer": hypothesis_answer,
-                        "passed_checks": passed_checks,
-                        "failed_checks": failed_checks,
-                        "score": round(score, 2),
-                    }
-                ]
-            ),
-        ]
-    )
-    logger.info(f"Writing output to {output_path}")
-    df.to_json(output_path, orient="records", lines=True)
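The removed saver appended one JSON-lines record per run. A results file it previously produced can still be inspected like this; the path is the default from the deleted TestCase model and the column selection is illustrative:

import pandas as pd

df = pd.read_json("output/results.json", orient="records", lines=True)
print(df[["passed_checks", "failed_checks", "score"]])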
src/surf_spot_finder/evaluation/test_case.py
DELETED
@@ -1,86 +0,0 @@
-from typing import Dict, List, Any
-from pydantic import BaseModel, Field, ConfigDict
-import yaml
-from litellm import validate_environment
-
-from surf_spot_finder.config import Config
-
-
-class InputModel(BaseModel):
-    """Input configuration for the surf spot finder test case"""
-
-    model_config = ConfigDict(extra="forbid")
-    location: str
-    date: str
-    max_driving_hours: int
-    input_prompt_template: str
-
-
-class CheckpointCriteria(BaseModel):
-    """Represents a checkpoint criteria with a description"""
-
-    model_config = ConfigDict(extra="forbid")
-    criteria: str
-    points: int
-
-
-class TestCase(BaseModel):
-    model_config = ConfigDict(extra="forbid")
-    input: InputModel
-    ground_truth: List[Dict[str, Any]] = Field(default_factory=list)
-    checkpoints: List[CheckpointCriteria] = Field(default_factory=list)
-    llm_judge: str
-    final_answer_criteria: List[CheckpointCriteria] = Field(default_factory=list)
-    test_case_path: str
-    agent_config_path: str
-    agent_config: Config
-    output_path: str = "output/results.json"
-
-    @classmethod
-    def from_yaml(cls, test_case_path: str, agent_config_path: str) -> "TestCase":
-        """Load a test case from a YAML file and process it"""
-        with open(test_case_path, "r") as f:
-            test_case_dict = yaml.safe_load(f)
-        final_answer_criteria = []
-
-        def add_gt_final_answer_criteria(ground_truth_list):
-            """Add checkpoints for each item in the ground_truth list"""
-            for item in ground_truth_list:
-                if isinstance(item, dict) and "name" in item and "value" in item:
-                    points = item.get(
-                        "points", 1
-                    )  # Default to 1 if points not specified
-                    final_answer_criteria.append(
-                        {
-                            "points": points,
-                            "criteria": f"Check if {item['name']} is approximately '{item['value']}'.",
-                        }
-                    )
-
-        if "ground_truth" in test_case_dict:
-            add_gt_final_answer_criteria(test_case_dict["ground_truth"])
-            test_case_dict["final_answer_criteria"] = final_answer_criteria
-            # remove the points from the ground_truth list but keep the name and value
-            test_case_dict["ground_truth"] = [
-                item
-                for item in test_case_dict["ground_truth"]
-                if isinstance(item, dict)
-            ]
-
-        test_case_dict["test_case_path"] = test_case_path
-        test_case_dict["agent_config_path"] = agent_config_path
-        with open(agent_config_path, "r") as f:
-            agent_config_dict = yaml.safe_load(f)
-        agent_config_dict["location"] = test_case_dict["input"]["location"]
-        agent_config_dict["date"] = test_case_dict["input"]["date"]
-        agent_config_dict["max_driving_hours"] = test_case_dict["input"][
-            "max_driving_hours"
-        ]
-        agent_config_dict["input_prompt_template"] = test_case_dict["input"][
-            "input_prompt_template"
-        ]
-        agent_config = Config.model_validate(agent_config_dict)
-        test_case_dict["agent_config"] = agent_config
-        # verify that the llm_judge is a valid litellm model
-        validate_environment(test_case_dict["llm_judge"])
-        return cls.model_validate(test_case_dict)
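A worked example of the final-answer criteria the removed from_yaml derived from ground_truth entries, using the values from the updated alpha.yaml above; purely illustrative:

ground_truth = [{"name": "Surf location", "points": 5, "value": "Fonte da Telha"}]

# Same transformation as the deleted add_gt_final_answer_criteria helper
final_answer_criteria = [
    {
        "points": item.get("points", 1),
        "criteria": f"Check if {item['name']} is approximately '{item['value']}'.",
    }
    for item in ground_truth
    if isinstance(item, dict) and "name" in item and "value" in item
]
print(final_answer_criteria)
# [{'points': 5, 'criteria': "Check if Surf location is approximately 'Fonte da Telha'."}]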
src/surf_spot_finder/utils/logging.py
DELETED
@@ -1,14 +0,0 @@
-import sys
-from loguru import logger
-
-# Remove default logger
-logger.remove()
-
-# Add custom colored logger
-logger = logger.opt(ansi=True)
-logger.add(sys.stdout, colorize=True, format="{message}")
-
-
-# Export configured logger
-def get_logger():
-    return logger