Nathan Brake committed
Commit 94a64b0 · unverified · 1 Parent(s): 98df0d0

Default output to 'output' instead of 'telemetry_output'. Save eval output to DF (#32)

.gitignore CHANGED
@@ -167,4 +167,4 @@ cython_debug/
 .idea/
 .vscode/
 
-telemetry_output
+output
src/surf_spot_finder/evaluation/evaluate.py CHANGED
@@ -1,9 +1,11 @@
 import json
+import os
 import sys
 from textwrap import dedent
 from typing import Any, Dict, List, Optional
 from loguru import logger
 from fire import Fire
+import pandas as pd
 from surf_spot_finder.cli import find_surf_spot
 from surf_spot_finder.config import (
     Config,
@@ -53,9 +55,6 @@ def evaluate_telemetry(test_case: TestCase, telemetry_path: str) -> bool:
     # Extract the final answer from the telemetry
     processor = TelemetryProcessor.create(agent_type)
     hypothesis_answer = processor.extract_hypothesis_answer(trace=telemetry)
-    logger.info(
-        f"""<yellow>Hypothesis Final answer extracted: {hypothesis_answer}</yellow>"""
-    )
     # Verify agent behavior against checkpoints using llm-as-a-judge
     llm_judge = "openai/gpt-4o"
     checkpoint_results = verify_checkpoints(
@@ -72,6 +71,10 @@ def evaluate_telemetry(test_case: TestCase, telemetry_path: str) -> bool:
         model=llm_judge,
     )
     # Summarize results
+    output_message = ""
+    output_message += (
+        f"""<yellow>Hypothesis Final answer extracted: {hypothesis_answer}</yellow>\n"""
+    )
 
     verification_results = checkpoint_results + hypothesis_answer_results
     failed_checks = [r for r in verification_results if not r.passed]
@@ -86,7 +89,7 @@ def evaluate_telemetry(test_case: TestCase, telemetry_path: str) -> bool:
                 - {check.criteria}
                 - {check.reason}</green>"""
             )
-            logger.info(message)
+            output_message += message + "\n"
     if failed_checks:
         for check in failed_checks:
             message = dedent(
@@ -95,14 +98,43 @@ def evaluate_telemetry(test_case: TestCase, telemetry_path: str) -> bool:
                 - {check.criteria}
                 - {check.reason}</red>"""
            )
-            logger.error(message)
+            output_message += message + "\n"
     else:
-        logger.info("<green>All checkpoints passed!</green>")
-    logger.info(f"<green>Passed checkpoints: {len(passed_checks)}</green>")
-    logger.info(f"<red>Failed checkpoints: {len(failed_checks)}</red>")
-    logger.info("<green>=====================================</green>")
-    logger.info(f"<green>Score: {won_points}/{won_points + missed_points}</green>")
-    logger.info("<green>=====================================</green>")
+        output_message += "<green>All checkpoints passed!</green>\n"
+    output_message += f"<green>Passed checkpoints: {len(passed_checks)}</green>\n"
+    output_message += f"<red>Failed checkpoints: {len(failed_checks)}</red>\n"
+    output_message += "<green>=====================================</green>\n"
+    output_message += (
+        f"<green>Score: {won_points}/{won_points + missed_points}</green>\n"
+    )
+    output_message += "<green>=====================================</green>\n"
+    logger.info(output_message)
+    # See if the test_case.output_path file exists.
+    if os.path.exists(test_case.output_path):
+        df = pd.read_json(test_case.output_path, orient="records", lines=True)
+    else:
+        df = pd.DataFrame()
+    df = pd.concat(
+        [
+            df,
+            pd.DataFrame(
+                [
+                    {
+                        "test_case_path": test_case.test_case_path,
+                        "output_message": output_message,
+                        "telemetry_path": telemetry_path,
+                        "hypothesis_answer": hypothesis_answer,
+                        "passed_checks": len(passed_checks),
+                        "failed_checks": len(failed_checks),
+                        "score": round(
+                            won_points / (won_points + missed_points) * 100, 2
+                        ),
+                    }
+                ]
+            ),
+        ]
+    )
+    df.to_json(test_case.output_path, orient="records", lines=True)
 
 
 def evaluate(
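
Since evaluate_telemetry now persists results, the read-append-write JSON Lines pattern in this diff is worth isolating. A minimal standalone sketch of the same idea, where the path and record fields are illustrative rather than part of the diff:

import os

import pandas as pd


def append_record(path: str, record: dict) -> None:
    # Load prior runs if the results file already exists.
    if os.path.exists(path):
        df = pd.read_json(path, orient="records", lines=True)
    else:
        df = pd.DataFrame()
    # Append one row and rewrite the file as JSON Lines.
    df = pd.concat([df, pd.DataFrame([record])])
    df.to_json(path, orient="records", lines=True)


# Illustrative usage mirroring the fields the diff saves:
append_record("output/results.json", {"passed_checks": 4, "failed_checks": 1, "score": 80.0})

Rewriting the whole file each run keeps every historical result in one place at the cost of re-reading the file per evaluation; for long histories, appending one serialized JSON line would be cheaper.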
src/surf_spot_finder/evaluation/telemetry/langchain_telemetry.py CHANGED
@@ -20,8 +20,15 @@ class LangchainTelemetryProcessor(TelemetryProcessor):
             message = json.loads(content)["messages"][0]
             message = self.parse_generic_key_value_string(message)
             base_message = BaseMessage(content=message["content"], type="AGENT")
-            print(base_message.text())
-            return base_message.text()
+            # Use the interpreted string for printing
+            final_text = base_message.text()
+            # Either decode escape sequences if they're present
+            try:
+                final_text = final_text.encode().decode("unicode_escape")
+            except UnicodeDecodeError:
+                # If that fails, the escape sequences might already be interpreted
+                pass
+            return final_text
 
         raise ValueError("No agent final answer found in trace")
 
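The unicode_escape round-trip added here turns literal escape sequences left in the serialized trace (a backslash followed by "n", "u00b0", and so on) into real characters, and treats a decode failure as a sign the string was already interpreted. A minimal sketch of both paths, with hypothetical strings:

# Literal escapes from a serialized trace become real characters:
raw = "line one\\nline two"  # contains a literal backslash + n
print(raw.encode().decode("unicode_escape"))  # prints two real lines

# A truncated escape raises UnicodeDecodeError, which the patched
# code swallows, returning the text unchanged:
try:
    "dangling \\".encode().decode("unicode_escape")
except UnicodeDecodeError:
    print("left as-is")

One caveat: encode() produces UTF-8 bytes while unicode_escape reads bytes as Latin-1, so non-ASCII text can be mangled by this round-trip; the trick is safest on traces known to be ASCII-escaped.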
src/surf_spot_finder/evaluation/telemetry/openai_telemetry.py CHANGED
@@ -70,7 +70,8 @@ class OpenAITelemetryProcessor(TelemetryProcessor):
                 "tool_name": tool_name,
                 "input": attributes.get("input.value", ""),
                 "output": tool_output,
-                "status": span.get("status", {}).get("status_code"),
+                # Can't add status yet because it isn't being set by openinference
+                # "status": span.get("status", {}).get("status_code"),
             }
             span_info["input"] = json.loads(span_info["input"])
 
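For reference, the commented-out lookup relies on chained dict.get calls so a span without a status block yields None instead of raising, which is why it can be re-enabled as-is once openinference sets the field. A tiny sketch with a hypothetical span:

span = {"name": "tool_call"}  # no "status" key set by the exporter yet
status_code = span.get("status", {}).get("status_code")
print(status_code)  # None, not a KeyError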
src/surf_spot_finder/evaluation/test_case.py CHANGED
@@ -27,6 +27,8 @@ class TestCase(BaseModel):
     ground_truth: List[Dict[str, Any]] = Field(default_factory=list)
     checkpoints: List[CheckpointCriteria] = Field(default_factory=list)
     final_answer_criteria: List[CheckpointCriteria] = Field(default_factory=list)
+    test_case_path: str
+    output_path: str = "output/results.json"
 
     @classmethod
     def from_yaml(cls, test_case_path: str) -> "TestCase":
@@ -56,4 +58,6 @@ class TestCase(BaseModel):
             item for item in test_case_dict["ground_truth"] if isinstance(item, dict)
         ]
 
+        test_case_dict["test_case_path"] = test_case_path
+
         return cls.model_validate(test_case_dict)
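
The two new TestCase fields follow a common pydantic pattern: a required field the loader injects (test_case_path) alongside a defaulted one the YAML may override (output_path). A stripped-down sketch, with MiniCase as a hypothetical stand-in for TestCase:

from pydantic import BaseModel


class MiniCase(BaseModel):  # hypothetical stand-in for TestCase
    test_case_path: str  # required; injected by the loader, not the YAML
    output_path: str = "output/results.json"  # optional; YAML may override


data = {"output_path": "output/alt.json"}  # pretend this came from yaml.safe_load
data["test_case_path"] = "test_cases/foo.yaml"  # injected, as from_yaml now does
case = MiniCase.model_validate(data)
print(case.test_case_path, case.output_path)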
src/surf_spot_finder/tracing.py CHANGED
@@ -46,7 +46,7 @@ def get_tracer_provider(
     project_name: str,
     json_tracer: bool,
     agent_type: AgentType,
-    output_dir: str = "telemetry_output",
+    output_dir: str = "output",
 ) -> tuple[TracerProvider, str | None]:
     """
     Create a tracer_provider based on the selected mode.
@@ -57,7 +57,7 @@ def get_tracer_provider(
         agent_type: The type of agent being used.
         output_dir: The directory where the telemetry output will be stored.
             Only used if `json_tracer=True`.
-            Defaults to "telemetry_output".
+            Defaults to "output".
 
     Returns:
         tracer_provider: The configured tracer provider
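
A hypothetical call site showing the effect of the new default; AgentType.OPENAI is assumed here and does not appear in this diff:

# With json_tracer=True and output_dir left unset, telemetry JSON now
# lands under "output/" rather than "telemetry_output/".
tracer_provider, telemetry_path = get_tracer_provider(
    project_name="surf-spot-finder",
    json_tracer=True,
    agent_type=AgentType.OPENAI,  # assumed enum member, not shown in the diff
)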