Nathan Brake
committed on
Refactor evaluation logging and save results to a new module (#49)
* Refactor evaluation logging and save results to a new module
* add div by zero check
* docs
pyproject.toml
CHANGED
@@ -9,11 +9,12 @@ license = {text = "Apache-2.0"}
 requires-python = ">=3.11"
 dynamic = ["version"]
 dependencies = [
-    "any-agent @ git+ssh://[email protected]/mozilla-ai/any-agent",
+    "any-agent[smolagents,mcp,openai,langchain,llama_index] @ git+ssh://[email protected]/mozilla-ai/any-agent",
     "fire",
     "loguru",
     "pydantic",
     "pyyaml",
+    "litellm",
 ]
 
 [project.optional-dependencies]
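The bracketed extras on any-agent pull in the optional agent-framework backends (smolagents, MCP, OpenAI, LangChain, LlamaIndex), and litellm becomes a direct dependency. A rough post-install sanity check, assuming the extras expose top-level modules with these names (the distribution-to-import-name mapping is an assumption, not taken from the repo):

import importlib.util

# Illustrative only: confirm the packages expected from the new extras are importable.
for module in ("smolagents", "openai", "langchain", "llama_index", "litellm"):
    found = importlib.util.find_spec(module) is not None
    print(f"{module}: {'installed' if found else 'missing'}")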
src/surf_spot_finder/evaluation/evaluate.py
CHANGED
@@ -1,15 +1,11 @@
 import json
-import os
-import sys
 from textwrap import dedent
 from typing import Any, Dict, List, Optional
 
-import pandas as pd
 from any_agent import AnyAgent
 from any_agent.telemetry import TelemetryProcessor
 from any_agent.tracing import get_tracer_provider, setup_tracing
 from fire import Fire
-from loguru import logger
 
 from surf_spot_finder.config import (
     Config,
@@ -20,10 +16,11 @@ from surf_spot_finder.evaluation.evaluators import (
     QuestionAnsweringSquadEvaluator,
 )
 from surf_spot_finder.evaluation.test_case import TestCase
+from surf_spot_finder.evaluation.results_saver import save_evaluation_results
+from surf_spot_finder.utils.logging import get_logger
 
-logger.remove()
-logger = logger.opt(ansi=True)
-logger.add(sys.stdout, colorize=True, format="{message}")
+# Replace the existing logger setup with the shared logger
+logger = get_logger()
 
 
 def run(test_case: TestCase, agent_config_path: str) -> str:
@@ -139,32 +136,22 @@ def evaluate_telemetry(test_case: TestCase, telemetry_path: str) -> bool:
     )
     output_message += "<green>=====================================</green>\n"
     logger.info(output_message)
-
-    if os.path.exists(test_case.output_path):
-        df = pd.read_json(test_case.output_path, orient="records", lines=True)
-    else:
-        df = pd.DataFrame()
-    df = pd.concat(
-        [
-            df,
-            pd.DataFrame(
-                [
-                    {
-                        "test_case_path": test_case.test_case_path,
-                        "output_message": output_message,
-                        "telemetry_path": telemetry_path,
-                        "hypothesis_answer": hypothesis_answer,
-                        "passed_checks": len(passed_checks),
-                        "failed_checks": len(failed_checks),
-                        "score": round(
-                            won_points / (won_points + missed_points) * 100, 2
-                        ),
-                    }
-                ]
-            ),
-        ]
+
+    if won_points + missed_points == 0:
+        raise ValueError("No points were defined in the test case")
+    score = won_points / (won_points + missed_points) * 100
+
+    # Save the evaluation results
+    save_evaluation_results(
+        test_case_path=test_case.test_case_path,
+        output_path=test_case.output_path,
+        output_message=output_message,
+        telemetry_path=telemetry_path,
+        hypothesis_answer=hypothesis_answer,
+        passed_checks=len(passed_checks),
+        failed_checks=len(failed_checks),
+        score=score,
     )
-    df.to_json(test_case.output_path, orient="records", lines=True)
 
 
 def evaluate(
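The new guard makes the failure explicit when a test case defines no checkpoints or ground-truth points, instead of letting the score computation raise a bare ZeroDivisionError. A standalone sketch of the same arithmetic (illustrative helper, not code from the repo):

def compute_score(won_points: int, missed_points: int) -> float:
    # Mirror the div-by-zero check added in evaluate_telemetry.
    if won_points + missed_points == 0:
        raise ValueError("No points were defined in the test case")
    return won_points / (won_points + missed_points) * 100

assert compute_score(3, 1) == 75.0  # 3 of 4 points -> 75%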
src/surf_spot_finder/evaluation/evaluators/CheckpointEvaluator.py
CHANGED
@@ -2,8 +2,11 @@ from typing import Dict, List, Any
 
 from surf_spot_finder.evaluation.evaluators.LLMEvaluator import LLMEvaluator
 from surf_spot_finder.evaluation.evaluators.schemas import EvaluationResult
-from ...
+from any_agent.telemetry import TelemetryProcessor
 from surf_spot_finder.evaluation.test_case import CheckpointCriteria
+from surf_spot_finder.utils.logging import get_logger
+
+logger = get_logger()
 
 
 class CheckpointEvaluator(LLMEvaluator):
@@ -27,6 +30,7 @@ class CheckpointEvaluator(LLMEvaluator):
         List of evaluation results
         """
         evidence = processor.extract_evidence(telemetry)
+        logger.info(f"""<yellow>Evidence\n{evidence}</yellow>\n""")
         results = []
 
         for checkpoint in checkpoints:
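The new evidence log relies on loguru's color markup: the shared sink is added with colorize=True and the shared logger is opted into tag parsing, so a message wrapped in <yellow>...</yellow> is rendered as terminal color rather than literal text. A minimal self-contained illustration of that pattern (it uses colors=True, the current name of the option the repo sets via ansi=True):

import sys
from loguru import logger

logger.remove()
logger.add(sys.stdout, colorize=True, format="{message}")

evidence = "step 1: searched for surf forecasts"  # stand-in value
# Tags are parsed only when the call (or logger) is opted into color markup.
logger.opt(colors=True).info(f"<yellow>Evidence\n{evidence}</yellow>")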
src/surf_spot_finder/evaluation/results_saver.py
ADDED
@@ -0,0 +1,60 @@
+import os
+import pandas as pd
+
+from surf_spot_finder.utils.logging import get_logger
+
+# Use the shared logger
+logger = get_logger()
+
+
+def save_evaluation_results(
+    test_case_path: str,
+    output_path: str,
+    output_message: str,
+    telemetry_path: str,
+    hypothesis_answer: str,
+    passed_checks: int,
+    failed_checks: int,
+    score: float,
+) -> None:
+    """
+    Save evaluation results to the specified output path.
+
+    Args:
+        test_case_path: Path to the test case file
+        output_path: Path to save the results
+        output_message: Formatted output message with evaluation details
+        telemetry_path: Path to the telemetry file used
+        hypothesis_answer: The extracted hypothesis answer
+        passed_checks: Number of passed checkpoints
+        failed_checks: Number of failed checkpoints
+        score: Evaluation score as a percentage
+    """
+    # See if the output_path file exists
+    if os.path.exists(output_path):
+        logger.info(f"Reading existing output from {output_path}")
+        df = pd.read_json(output_path, orient="records", lines=True)
+    else:
+        logger.info(f"Creating new output file at {output_path}")
+        df = pd.DataFrame()
+
+    df = pd.concat(
+        [
+            df,
+            pd.DataFrame(
+                [
+                    {
+                        "test_case_path": test_case_path,
+                        "output_message": output_message,
+                        "telemetry_path": telemetry_path,
+                        "hypothesis_answer": hypothesis_answer,
+                        "passed_checks": passed_checks,
+                        "failed_checks": failed_checks,
+                        "score": round(score, 2),
+                    }
+                ]
+            ),
+        ]
+    )
+    logger.info(f"Writing output to {output_path}")
+    df.to_json(output_path, orient="records", lines=True)
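For context, a minimal sketch of calling the new helper and reading back the appended record; the paths and values here are made up for illustration:

import pandas as pd

from surf_spot_finder.evaluation.results_saver import save_evaluation_results

save_evaluation_results(
    test_case_path="tests/test_case.yaml",      # hypothetical paths and values
    output_path="results.jsonl",
    output_message="3/4 checkpoints passed",
    telemetry_path="telemetry/run_001.json",
    hypothesis_answer="Pleasure Point, Santa Cruz",
    passed_checks=3,
    failed_checks=1,
    score=75.0,
)

# Each call appends one record to the JSON-lines file.
print(pd.read_json("results.jsonl", orient="records", lines=True)[["passed_checks", "failed_checks", "score"]])

Because the existing file is read back and re-concatenated on every call, repeated runs accumulate rows rather than overwriting earlier results.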
src/surf_spot_finder/utils/logging.py
ADDED
@@ -0,0 +1,14 @@
+import sys
+from loguru import logger
+
+# Remove default logger
+logger.remove()
+
+# Add custom colored logger
+logger = logger.opt(ansi=True)
+logger.add(sys.stdout, colorize=True, format="{message}")
+
+
+# Export configured logger
+def get_logger():
+    return logger
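Usage is then a two-liner in any module; loguru's logger is process-global, so every caller of get_logger() shares the single colorized stdout sink configured above (the message below is a made-up example):

from surf_spot_finder.utils.logging import get_logger

logger = get_logger()
logger.info("<green>Evaluation finished</green>")  # color tags render via the colorized sink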