Nathan Brake committed
Default output to 'output' instead of 'telemetry_output'. Save eval output to DF (#32)
.gitignore CHANGED

@@ -167,4 +167,4 @@ cython_debug/
 .idea/
 .vscode/
 
-
+output
src/surf_spot_finder/evaluation/evaluate.py CHANGED

@@ -1,9 +1,11 @@
 import json
+import os
 import sys
 from textwrap import dedent
 from typing import Any, Dict, List, Optional
 from loguru import logger
 from fire import Fire
+import pandas as pd
 from surf_spot_finder.cli import find_surf_spot
 from surf_spot_finder.config import (
     Config,
@@ -53,9 +55,6 @@ def evaluate_telemetry(test_case: TestCase, telemetry_path: str) -> bool:
     # Extract the final answer from the telemetry
     processor = TelemetryProcessor.create(agent_type)
     hypothesis_answer = processor.extract_hypothesis_answer(trace=telemetry)
-    logger.info(
-        f"""<yellow>Hypothesis Final answer extracted: {hypothesis_answer}</yellow>"""
-    )
     # Verify agent behavior against checkpoints using llm-as-a-judge
     llm_judge = "openai/gpt-4o"
     checkpoint_results = verify_checkpoints(
@@ -72,6 +71,10 @@ def evaluate_telemetry(test_case: TestCase, telemetry_path: str) -> bool:
         model=llm_judge,
     )
     # Summarize results
+    output_message = ""
+    output_message += (
+        f"""<yellow>Hypothesis Final answer extracted: {hypothesis_answer}</yellow>\n"""
+    )
 
     verification_results = checkpoint_results + hypothesis_answer_results
     failed_checks = [r for r in verification_results if not r.passed]
@@ -86,7 +89,7 @@ def evaluate_telemetry(test_case: TestCase, telemetry_path: str) -> bool:
             - {check.criteria}
             - {check.reason}</green>"""
         )
-        logger.info(message)
+        output_message += message + "\n"
     if failed_checks:
         for check in failed_checks:
             message = dedent(
@@ -95,14 +98,43 @@ def evaluate_telemetry(test_case: TestCase, telemetry_path: str) -> bool:
                 - {check.criteria}
                 - {check.reason}</red>"""
             )
-            logger.info(message)
+            output_message += message + "\n"
     else:
-        logger.info("<green>All checkpoints passed!</green>")
-    logger.info(f"<green>Passed checkpoints: {len(passed_checks)}</green>")
-    logger.info(f"<red>Failed checkpoints: {len(failed_checks)}</red>")
-    logger.info("<green>=====================================</green>")
-    logger.info(f"<green>Score: {won_points}/{won_points + missed_points}</green>")
-    logger.info("<green>=====================================</green>")
+        output_message += "<green>All checkpoints passed!</green>\n"
+    output_message += f"<green>Passed checkpoints: {len(passed_checks)}</green>\n"
+    output_message += f"<red>Failed checkpoints: {len(failed_checks)}</red>\n"
+    output_message += "<green>=====================================</green>\n"
+    output_message += (
+        f"<green>Score: {won_points}/{won_points + missed_points}</green>\n"
+    )
+    output_message += "<green>=====================================</green>\n"
+    logger.info(output_message)
+    # See if the test_case.output_path file exists.
+    if os.path.exists(test_case.output_path):
+        df = pd.read_json(test_case.output_path, orient="records", lines=True)
+    else:
+        df = pd.DataFrame()
+    df = pd.concat(
+        [
+            df,
+            pd.DataFrame(
+                [
+                    {
+                        "test_case_path": test_case.test_case_path,
+                        "output_message": output_message,
+                        "telemetry_path": telemetry_path,
+                        "hypothesis_answer": hypothesis_answer,
+                        "passed_checks": len(passed_checks),
+                        "failed_checks": len(failed_checks),
+                        "score": round(
+                            won_points / (won_points + missed_points) * 100, 2
+                        ),
+                    }
+                ]
+            ),
+        ]
+    )
+    df.to_json(test_case.output_path, orient="records", lines=True)
 
 
 def evaluate(
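Each evaluate_telemetry run now appends one record to a JSON-lines file instead of only logging, so results accumulate across runs. A minimal sketch of reading them back with pandas, assuming the default output/results.json path (the column selection and mean shown here are illustrative, not part of this change):

import pandas as pd

# Read the accumulated evaluation records written by evaluate_telemetry
results = pd.read_json("output/results.json", orient="records", lines=True)

# One row per run: test_case_path, output_message, telemetry_path,
# hypothesis_answer, passed_checks, failed_checks, score
print(results[["test_case_path", "passed_checks", "failed_checks", "score"]])
print(f"Mean score across runs: {results['score'].mean():.2f}")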
src/surf_spot_finder/evaluation/telemetry/langchain_telemetry.py CHANGED

@@ -20,8 +20,15 @@ class LangchainTelemetryProcessor(TelemetryProcessor):
                 message = json.loads(content)["messages"][0]
                 message = self.parse_generic_key_value_string(message)
                 base_message = BaseMessage(content=message["content"], type="AGENT")
-
-
+                # Use the interpreted string for printing
+                final_text = base_message.text()
+                # Either decode escape sequences if they're present
+                try:
+                    final_text = final_text.encode().decode("unicode_escape")
+                except UnicodeDecodeError:
+                    # If that fails, the escape sequences might already be interpreted
+                    pass
+                return final_text
 
         raise ValueError("No agent final answer found in trace")
 
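The added try/except handles traces where escape sequences arrive as literal text. A standalone sketch of the same round-trip (the example strings are made up):

# Literal "\n" (backslash plus n), as it can appear in a JSON-dumped trace
raw = "Surf spot:\\nPlaya Grande"
print(raw.encode().decode("unicode_escape"))  # prints two lines

# Already-interpreted text passes through unchanged
clean = "Surf spot:\nPlaya Grande"
print(clean.encode().decode("unicode_escape"))

One caveat on the design: str.encode() defaults to UTF-8, so non-ASCII content can come back mangled from this round-trip even when no UnicodeDecodeError is raised; the except branch only covers decodes that fail outright.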
src/surf_spot_finder/evaluation/telemetry/openai_telemetry.py CHANGED

@@ -70,7 +70,8 @@ class OpenAITelemetryProcessor(TelemetryProcessor):
                 "tool_name": tool_name,
                 "input": attributes.get("input.value", ""),
                 "output": tool_output,
-                "status": span.get("status", {}).get("status_code"),
+                # Can't add status yet because it isn't being set by openinference
+                # "status": span.get("status", {}).get("status_code"),
             }
             span_info["input"] = json.loads(span_info["input"])
 
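For context, the commented-out lookup anticipates reading the OTel status off the raw span dict once openinference starts setting it. A sketch with a hypothetical span payload (the dict shape is an assumption mirroring what the commented-out line expects):

# Hypothetical span dict; openinference does not populate "status" yet,
# which is why the lookup stays commented out in the diff above
span = {
    "name": "tool_call",
    "status": {"status_code": "OK"},
}
status = span.get("status", {}).get("status_code")  # -> "OK", or None if absent
print(status)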
src/surf_spot_finder/evaluation/test_case.py CHANGED

@@ -27,6 +27,8 @@ class TestCase(BaseModel):
     ground_truth: List[Dict[str, Any]] = Field(default_factory=list)
     checkpoints: List[CheckpointCriteria] = Field(default_factory=list)
     final_answer_criteria: List[CheckpointCriteria] = Field(default_factory=list)
+    test_case_path: str
+    output_path: str = "output/results.json"
 
     @classmethod
     def from_yaml(cls, test_case_path: str) -> "TestCase":
@@ -56,4 +58,6 @@ class TestCase(BaseModel):
             item for item in test_case_dict["ground_truth"] if isinstance(item, dict)
         ]
 
+        test_case_dict["test_case_path"] = test_case_path
+
         return cls.model_validate(test_case_dict)
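Since from_yaml now injects test_case_path itself, the loaded model records where it came from, and output_path controls where evaluate_telemetry appends its results. A minimal usage sketch (the YAML file path is hypothetical):

from surf_spot_finder.evaluation.test_case import TestCase

test_case = TestCase.from_yaml("tests/cases/playa_grande.yaml")  # hypothetical path
print(test_case.test_case_path)  # the YAML path injected by from_yaml
print(test_case.output_path)     # "output/results.json" unless the YAML overrides it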
src/surf_spot_finder/tracing.py CHANGED

@@ -46,7 +46,7 @@ def get_tracer_provider(
     project_name: str,
     json_tracer: bool,
     agent_type: AgentType,
-    output_dir: str = "telemetry_output",
+    output_dir: str = "output",
 ) -> tuple[TracerProvider, str | None]:
     """
     Create a tracer_provider based on the selected mode.
@@ -57,7 +57,7 @@ def get_tracer_provider(
         agent_type: The type of agent being used.
         output_dir: The directory where the telemetry output will be stored.
             Only used if `json_tracer=True`.
-            Defaults to "telemetry_output".
+            Defaults to "output".
 
     Returns:
         tracer_provider: The configured tracer provider
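Callers that relied on the old telemetry_output default now write under output/ unless they pass output_dir explicitly. A minimal call sketch (the AgentType import location and member name are assumptions):

from surf_spot_finder.config import AgentType  # assumed import location
from surf_spot_finder.tracing import get_tracer_provider

tracer_provider, telemetry_path = get_tracer_provider(
    project_name="surf-spot-finder",  # hypothetical project name
    json_tracer=True,
    agent_type=AgentType.OPENAI,  # assumed enum member
)
print(telemetry_path)  # points under "output/" when json_tracer=True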