Nathan Brake committed · Commit 9a0610b · unverified · 1 Parent(s): dcd814d

Refactor evaluation logging and save results to a new module (#49)


* Refactor evaluation logging and save results to a new module

* add div by zero check

* docs
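
At a glance, the change splits responsibilities: logger configuration moves to src/surf_spot_finder/utils/logging.py, JSONL result writing moves to src/surf_spot_finder/evaluation/results_saver.py, and evaluate.py now guards against a zero denominator before computing the score and delegating the write. A minimal sketch of the new call flow, using only names introduced in the diffs below (all argument values here are placeholders, not from the repository):

from surf_spot_finder.utils.logging import get_logger
from surf_spot_finder.evaluation.results_saver import save_evaluation_results

logger = get_logger()  # shared, colorized loguru logger

# Placeholder values; in evaluate.py these come from the telemetry evaluation.
save_evaluation_results(
    test_case_path="tests/test_case.yaml",
    output_path="results.jsonl",
    output_message="<green>...</green>",
    telemetry_path="telemetry.json",
    hypothesis_answer="...",
    passed_checks=3,
    failed_checks=1,
    score=75.0,
)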

pyproject.toml CHANGED
@@ -9,11 +9,12 @@ license = {text = "Apache-2.0"}
 requires-python = ">=3.11"
 dynamic = ["version"]
 dependencies = [
-    "any-agent @ git+ssh://[email protected]/mozilla-ai/any-agent",
+    "any-agent[smolagents,mcp,openai,langchain,llama_index] @ git+ssh://[email protected]/mozilla-ai/any-agent",
     "fire",
     "loguru",
     "pydantic",
     "pyyaml",
+    "litellm",
 ]

 [project.optional-dependencies]
src/surf_spot_finder/evaluation/evaluate.py CHANGED
@@ -1,15 +1,11 @@
 import json
-import os
-import sys
 from textwrap import dedent
 from typing import Any, Dict, List, Optional

-import pandas as pd
 from any_agent import AnyAgent
 from any_agent.telemetry import TelemetryProcessor
 from any_agent.tracing import get_tracer_provider, setup_tracing
 from fire import Fire
-from loguru import logger

 from surf_spot_finder.config import (
     Config,
@@ -20,10 +16,11 @@ from surf_spot_finder.evaluation.evaluators import (
     QuestionAnsweringSquadEvaluator,
 )
 from surf_spot_finder.evaluation.test_case import TestCase
+from surf_spot_finder.evaluation.results_saver import save_evaluation_results
+from surf_spot_finder.utils.logging import get_logger

-logger.remove()
-logger = logger.opt(ansi=True)
-logger.add(sys.stdout, colorize=True, format="{message}")
+# Replace the existing logger setup with the shared logger
+logger = get_logger()


 def run(test_case: TestCase, agent_config_path: str) -> str:
@@ -139,32 +136,22 @@ def evaluate_telemetry(test_case: TestCase, telemetry_path: str) -> bool:
     )
     output_message += "<green>=====================================</green>\n"
     logger.info(output_message)
-    # See if the test_case.output_path file exists.
-    if os.path.exists(test_case.output_path):
-        df = pd.read_json(test_case.output_path, orient="records", lines=True)
-    else:
-        df = pd.DataFrame()
-    df = pd.concat(
-        [
-            df,
-            pd.DataFrame(
-                [
-                    {
-                        "test_case_path": test_case.test_case_path,
-                        "output_message": output_message,
-                        "telemetry_path": telemetry_path,
-                        "hypothesis_answer": hypothesis_answer,
-                        "passed_checks": len(passed_checks),
-                        "failed_checks": len(failed_checks),
-                        "score": round(
-                            won_points / (won_points + missed_points) * 100, 2
-                        ),
-                    }
-                ]
-            ),
-        ]
+
+    if won_points + missed_points == 0:
+        raise ValueError("No points were defined in the test case")
+    score = won_points / (won_points + missed_points) * 100
+
+    # Save the evaluation results
+    save_evaluation_results(
+        test_case_path=test_case.test_case_path,
+        output_path=test_case.output_path,
+        output_message=output_message,
+        telemetry_path=telemetry_path,
+        hypothesis_answer=hypothesis_answer,
+        passed_checks=len(passed_checks),
+        failed_checks=len(failed_checks),
+        score=score,
     )
-    df.to_json(test_case.output_path, orient="records", lines=True)


 def evaluate(
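
The new guard prevents a ZeroDivisionError when a test case defines no scoreable checkpoints, which is what the "add div by zero check" commit bullet refers to. A worked example of the formula (won_points and missed_points are tallied earlier in evaluate_telemetry, outside this hunk; the values below are hypothetical):

won_points, missed_points = 3, 1  # hypothetical point tallies from the checkpoint results

if won_points + missed_points == 0:
    raise ValueError("No points were defined in the test case")
score = won_points / (won_points + missed_points) * 100  # 3 / 4 * 100 = 75.0
# results_saver rounds before writing: round(score, 2) == 75.0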
src/surf_spot_finder/evaluation/evaluators/CheckpointEvaluator.py CHANGED
@@ -2,8 +2,11 @@ from typing import Dict, List, Any

 from surf_spot_finder.evaluation.evaluators.LLMEvaluator import LLMEvaluator
 from surf_spot_finder.evaluation.evaluators.schemas import EvaluationResult
-from surf_spot_finder.evaluation.telemetry import TelemetryProcessor
+from any_agent.telemetry import TelemetryProcessor
 from surf_spot_finder.evaluation.test_case import CheckpointCriteria
+from surf_spot_finder.utils.logging import get_logger
+
+logger = get_logger()


 class CheckpointEvaluator(LLMEvaluator):
@@ -27,6 +30,7 @@ class CheckpointEvaluator(LLMEvaluator):
             List of evaluation results
         """
         evidence = processor.extract_evidence(telemetry)
+        logger.info(f"""<yellow>Evidence\n{evidence}</yellow>\n""")
         results = []

         for checkpoint in checkpoints:
src/surf_spot_finder/evaluation/results_saver.py ADDED
@@ -0,0 +1,60 @@
+import os
+import pandas as pd
+
+from surf_spot_finder.utils.logging import get_logger
+
+# Use the shared logger
+logger = get_logger()
+
+
+def save_evaluation_results(
+    test_case_path: str,
+    output_path: str,
+    output_message: str,
+    telemetry_path: str,
+    hypothesis_answer: str,
+    passed_checks: int,
+    failed_checks: int,
+    score: float,
+) -> None:
+    """
+    Save evaluation results to the specified output path.
+
+    Args:
+        test_case_path: Path to the test case file
+        output_path: Path to save the results
+        output_message: Formatted output message with evaluation details
+        telemetry_path: Path to the telemetry file used
+        hypothesis_answer: The extracted hypothesis answer
+        passed_checks: Number of passed checkpoints
+        failed_checks: Number of failed checkpoints
+        score: Evaluation score as a percentage
+    """
+    # See if the output_path file exists
+    if os.path.exists(output_path):
+        logger.info(f"Reading existing output from {output_path}")
+        df = pd.read_json(output_path, orient="records", lines=True)
+    else:
+        logger.info(f"Creating new output file at {output_path}")
+        df = pd.DataFrame()
+
+    df = pd.concat(
+        [
+            df,
+            pd.DataFrame(
+                [
+                    {
+                        "test_case_path": test_case_path,
+                        "output_message": output_message,
+                        "telemetry_path": telemetry_path,
+                        "hypothesis_answer": hypothesis_answer,
+                        "passed_checks": passed_checks,
+                        "failed_checks": failed_checks,
+                        "score": round(score, 2),
+                    }
+                ]
+            ),
+        ]
+    )
+    logger.info(f"Writing output to {output_path}")
+    df.to_json(output_path, orient="records", lines=True)
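
For reference, a hypothetical round trip with the new module: each call to save_evaluation_results appends one record as a JSON-lines row, which can be read back with pandas. The paths and values below are placeholders, not part of the repository:

import pandas as pd

from surf_spot_finder.evaluation.results_saver import save_evaluation_results

save_evaluation_results(
    test_case_path="tests/example_test_case.yaml",  # placeholder paths and values
    output_path="eval_results.jsonl",
    output_message="<green>1/1 checks passed</green>",
    telemetry_path="telemetry/example.json",
    hypothesis_answer="Punta de Lobos",
    passed_checks=1,
    failed_checks=0,
    score=100.0,
)

df = pd.read_json("eval_results.jsonl", orient="records", lines=True)
print(df[["test_case_path", "passed_checks", "failed_checks", "score"]])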
src/surf_spot_finder/utils/logging.py ADDED
@@ -0,0 +1,14 @@
+import sys
+from loguru import logger
+
+# Remove default logger
+logger.remove()
+
+# Add custom colored logger
+logger = logger.opt(ansi=True)
+logger.add(sys.stdout, colorize=True, format="{message}")
+
+
+# Export configured logger
+def get_logger():
+    return logger
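
Because the sink is added with colorize=True and the logger is wrapped with opt(ansi=True), loguru parses the color markup used throughout the evaluation output (the <green>/<yellow> tags in evaluate.py and CheckpointEvaluator.py). A minimal usage sketch, with a placeholder message:

from surf_spot_finder.utils.logging import get_logger

logger = get_logger()

evidence = "step 1: searched for surf spots near the target location"  # placeholder text
logger.info(f"<yellow>Evidence\n{evidence}</yellow>")  # rendered in yellow on stdout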