Nathan Brake committed on
Commit 5e4d59b · unverified · 1 Parent(s): 5e252e7

telemetry (#48)

pyproject.toml CHANGED
@@ -35,6 +35,10 @@ tests = [
     "evaluate>=0.4.3",
 ]
 
+dev = [
+    "pre-commit>=4.1.0",
+]
+
 [project.urls]
 Documentation = "https://mozilla-ai.github.io/surf-spot-finder/"
 Issues = "https://github.com/mozilla-ai/surf-spot-finder/issues"
@@ -47,11 +51,6 @@ namespaces = false
 
 [tool.setuptools_scm]
 
-[dependency-groups]
-dev = [
-    "pre-commit>=4.1.0",
-]
-
 [project.scripts]
 surf-spot-finder = "surf_spot_finder.cli:main"
 surf-spot-finder-no-framework = "surf_spot_finder.no_framework:main"
src/surf_spot_finder/evaluation/evaluate.py CHANGED
@@ -3,21 +3,23 @@ import os
 import sys
 from textwrap import dedent
 from typing import Any, Dict, List, Optional
-from loguru import logger
-from fire import Fire
+
 import pandas as pd
+from any_agent import AnyAgent
+from any_agent.telemetry import TelemetryProcessor
+from any_agent.tracing import get_tracer_provider, setup_tracing
+from fire import Fire
+from loguru import logger
+
 from surf_spot_finder.config import (
     Config,
 )
-from surf_spot_finder.evaluation.telemetry import TelemetryProcessor
 from surf_spot_finder.evaluation.evaluators import (
     CheckpointEvaluator,
-    QuestionAnsweringSquadEvaluator,
     HypothesisEvaluator,
+    QuestionAnsweringSquadEvaluator,
 )
 from surf_spot_finder.evaluation.test_case import TestCase
-from any_agent import AnyAgent
-from any_agent.tracing import get_tracer_provider, setup_tracing
 
 logger.remove()
 logger = logger.opt(ansi=True)
src/surf_spot_finder/evaluation/telemetry/__init__.py DELETED
@@ -1,3 +0,0 @@
-from .telemetry import TelemetryProcessor
-
-__all__ = ["TelemetryProcessor"]
src/surf_spot_finder/evaluation/telemetry/langchain_telemetry.py DELETED
@@ -1,88 +0,0 @@
-from typing import Any, Dict, List
-import json
-from any_agent import AgentFramework
-from langchain_core.messages import BaseMessage
-
-
-from surf_spot_finder.evaluation.telemetry import TelemetryProcessor
-
-
-class LangchainTelemetryProcessor(TelemetryProcessor):
-    """Processor for Langchain agent telemetry data."""
-
-    def _get_agent_framework(self) -> AgentFramework:
-        return AgentFramework.LANGCHAIN
-
-    def extract_hypothesis_answer(self, trace: List[Dict[str, Any]]) -> str:
-        for span in reversed(trace):
-            if span["attributes"]["openinference.span.kind"] == "AGENT":
-                content = span["attributes"]["output.value"]
-                # Extract content from serialized langchain message
-                message = json.loads(content)["messages"][0]
-                message = self.parse_generic_key_value_string(message)
-                base_message = BaseMessage(content=message["content"], type="AGENT")
-                # Use the interpreted string for printing
-                final_text = base_message.text()
-                # Either decode escape sequences if they're present
-                try:
-                    final_text = final_text.encode().decode("unicode_escape")
-                except UnicodeDecodeError:
-                    # If that fails, the escape sequences might already be interpreted
-                    pass
-                return final_text
-
-        raise ValueError("No agent final answer found in trace")
-
-    def _extract_telemetry_data(self, telemetry: List[Dict[str, Any]]) -> List[Dict]:
-        """Extract LLM calls and tool calls from LangChain telemetry."""
-        calls = []
-
-        for span in telemetry:
-            if "attributes" not in span:
-                continue
-
-            attributes = span.get("attributes", {})
-            span_kind = attributes.get("openinference.span.kind", "")
-
-            # Collect LLM calls
-            if (
-                span_kind == "LLM"
-                and "llm.output_messages.0.message.content" in attributes
-            ):
-                llm_info = {
-                    "model": attributes.get("llm.model_name", "Unknown model"),
-                    "input": attributes.get("llm.input_messages.0.message.content", ""),
-                    "output": attributes.get(
-                        "llm.output_messages.0.message.content", ""
-                    ),
-                    "type": "reasoning",
-                }
-                calls.append(llm_info)
-
-            # Try to find tool calls
-            if "tool.name" in attributes or span.get("name", "").endswith("Tool"):
-                tool_info = {
-                    "tool_name": attributes.get(
-                        "tool.name", span.get("name", "Unknown tool")
-                    ),
-                    "status": "success"
-                    if span.get("status", {}).get("status_code") == "OK"
-                    else "error",
-                    "error": span.get("status", {}).get("description", None),
-                }
-
-                if "input.value" in attributes:
-                    try:
-                        input_value = json.loads(attributes["input.value"])
-                        tool_info["input"] = input_value
-                    except Exception:
-                        tool_info["input"] = attributes["input.value"]
-
-                if "output.value" in attributes:
-                    tool_info["output"] = self.parse_generic_key_value_string(
-                        json.loads(attributes["output.value"])["output"]
-                    )["content"]
-
-                calls.append(tool_info)
-
-        return calls
src/surf_spot_finder/evaluation/telemetry/openai_telemetry.py DELETED
@@ -1,106 +0,0 @@
-from typing import Any, Dict, List
-import json
-
-from any_agent import AgentFramework
-from loguru import logger
-from surf_spot_finder.evaluation.telemetry import TelemetryProcessor
-
-
-class OpenAITelemetryProcessor(TelemetryProcessor):
-    """Processor for OpenAI agent telemetry data."""
-
-    def _get_agent_framework(self) -> AgentFramework:
-        return AgentFramework.OPENAI
-
-    def extract_hypothesis_answer(self, trace: List[Dict[str, Any]]) -> str:
-        for span in reversed(trace):
-            # Looking for the final response that has the summary answer
-            if (
-                "attributes" in span
-                and span.get("attributes", {}).get("openinference.span.kind") == "LLM"
-            ):
-                output_key = (
-                    "llm.output_messages.0.message.contents.0.message_content.text"
-                )
-                if output_key in span["attributes"]:
-                    return span["attributes"][output_key]
-        logger.warning("No agent final answer found in trace")
-        return "NO FINAL ANSWER FOUND"
-
-    def _extract_telemetry_data(self, telemetry: List[Dict[str, Any]]) -> list:
-        """Extract LLM calls and tool calls from OpenAI telemetry."""
-        calls = []
-
-        for span in telemetry:
-            if "attributes" not in span:
-                continue
-
-            attributes = span.get("attributes", {})
-            span_kind = attributes.get("openinference.span.kind", "")
-
-            # Collect LLM interactions - look for direct message content first
-            if span_kind == "LLM":
-                # Initialize the LLM info dictionary
-                span_info = {}
-
-                # Try to get input message
-                input_key = "llm.input_messages.1.message.content"  # User message is usually at index 1
-                if input_key in attributes:
-                    span_info["input"] = attributes[input_key]
-
-                # Try to get output message directly
-                output_content = None
-                # Try in multiple possible locations
-                for key in [
-                    "llm.output_messages.0.message.content",
-                    "llm.output_messages.0.message.contents.0.message_content.text",
-                ]:
-                    if key in attributes:
-                        output_content = attributes[key]
-                        break
-
-                # If we found direct output content, use it
-                if output_content:
-                    span_info["output"] = output_content
-                    calls.append(span_info)
-            elif span_kind == "TOOL":
-                tool_name = attributes.get("tool.name", "Unknown tool")
-                tool_output = attributes.get("output.value", "")
-
-                span_info = {
-                    "tool_name": tool_name,
-                    "input": attributes.get("input.value", ""),
-                    "output": tool_output,
-                    # Can't add status yet because it isn't being set by openinference
-                    # "status": span.get("status", {}).get("status_code"),
-                }
-                span_info["input"] = json.loads(span_info["input"])
-
-                calls.append(span_info)
-
-        return calls
-
-
-# Backward compatibility functions that use the new class structure
-def extract_hypothesis_answer(
-    trace: List[Dict[str, Any]], agent_framework: AgentFramework
-) -> str:
-    """Extract the hypothesis agent final answer from the trace"""
-    processor = TelemetryProcessor.create(agent_framework)
-    return processor.extract_hypothesis_answer(trace)
-
-
-def parse_generic_key_value_string(text: str) -> Dict[str, str]:
-    """
-    Parse a string that has items of a dict with key-value pairs separated by '='.
-    Only splits on '=' signs, handling quoted strings properly.
-    """
-    return TelemetryProcessor.parse_generic_key_value_string(text)
-
-
-def extract_evidence(
-    telemetry: List[Dict[str, Any]], agent_framework: AgentFramework
-) -> str:
-    """Extract relevant telemetry evidence based on the agent type."""
-    processor = TelemetryProcessor.create(agent_framework)
-    return processor.extract_evidence(telemetry)
src/surf_spot_finder/evaluation/telemetry/smolagents_telemetry.py DELETED
@@ -1,104 +0,0 @@
-from typing import Any, Dict, List
-import json
-
-from any_agent import AgentFramework
-
-from surf_spot_finder.evaluation.telemetry import TelemetryProcessor
-
-
-class SmolagentsTelemetryProcessor(TelemetryProcessor):
-    """Processor for SmoL Agents telemetry data."""
-
-    def _get_agent_framework(self) -> AgentFramework:
-        return AgentFramework.SMOLAGENTS
-
-    def extract_hypothesis_answer(self, trace: List[Dict[str, Any]]) -> str:
-        for span in reversed(trace):
-            if span["attributes"]["openinference.span.kind"] == "AGENT":
-                content = span["attributes"]["output.value"]
-                return content
-
-        raise ValueError("No agent final answer found in trace")
-
-    def _extract_telemetry_data(self, telemetry: List[Dict[str, Any]]) -> List[Dict]:
-        """Extract LLM calls and tool calls from SmoL Agents telemetry."""
-        calls = []
-
-        for span in telemetry:
-            # Skip spans without attributes
-            if "attributes" not in span:
-                continue
-
-            attributes = span["attributes"]
-
-            # Extract tool information
-            if "tool.name" in attributes or span.get("name", "").startswith(
-                "SimpleTool"
-            ):
-                tool_info = {
-                    "tool_name": attributes.get(
-                        "tool.name", span.get("name", "Unknown tool")
-                    ),
-                    "status": "success"
-                    if span.get("status", {}).get("status_code") == "OK"
-                    else "error",
-                    "error": span.get("status", {}).get("description", None),
-                }
-
-                # Extract input if available
-                if "input.value" in attributes:
-                    try:
-                        input_value = json.loads(attributes["input.value"])
-                        if "kwargs" in input_value:
-                            # For SmoLAgents, the actual input is often in the kwargs field
-                            tool_info["input"] = input_value["kwargs"]
-                        else:
-                            tool_info["input"] = input_value
-                    except (json.JSONDecodeError, TypeError):
-                        tool_info["input"] = attributes["input.value"]
-
-                # Extract output if available
-                if "output.value" in attributes:
-                    try:
-                        # Try to parse JSON output
-                        output_value = (
-                            json.loads(attributes["output.value"])
-                            if isinstance(attributes["output.value"], str)
-                            else attributes["output.value"]
-                        )
-                        tool_info["output"] = output_value
-                    except (json.JSONDecodeError, TypeError):
-                        tool_info["output"] = attributes["output.value"]
-                else:
-                    tool_info["output"] = "No output found"
-
-                calls.append(tool_info)
-
-            # Extract LLM calls to see reasoning
-            elif "LiteLLMModel.__call__" in span.get("name", ""):
-                # The LLM output may be in different places depending on the implementation
-                output_content = None
-
-                # Try to get the output from the llm.output_messages.0.message.content attribute
-                if "llm.output_messages.0.message.content" in attributes:
-                    output_content = attributes["llm.output_messages.0.message.content"]
-
-                # Or try to parse it from the output.value as JSON
-                elif "output.value" in attributes:
-                    try:
-                        output_value = json.loads(attributes["output.value"])
-                        if "content" in output_value:
-                            output_content = output_value["content"]
-                    except (json.JSONDecodeError, TypeError):
-                        pass
-
-                if output_content:
-                    calls.append(
-                        {
-                            "model": attributes.get("llm.model_name", "Unknown model"),
-                            "output": output_content,
-                            "type": "reasoning",
-                        }
-                    )
-
-        return calls
src/surf_spot_finder/evaluation/telemetry/telemetry.py DELETED
@@ -1,125 +0,0 @@
-from typing import Any, Dict, List, ClassVar
-import json
-import re
-from abc import ABC, abstractmethod
-from any_agent import AgentFramework
-from loguru import logger
-
-
-class TelemetryProcessor(ABC):
-    """Base class for processing telemetry data from different agent types."""
-
-    MAX_EVIDENCE_LENGTH: ClassVar[int] = 400
-
-    @classmethod
-    def create(cls, agent_framework: AgentFramework) -> "TelemetryProcessor":
-        """Factory method to create the appropriate telemetry processor."""
-        if agent_framework == AgentFramework.LANGCHAIN:
-            from surf_spot_finder.evaluation.telemetry.langchain_telemetry import (
-                LangchainTelemetryProcessor,
-            )
-
-            return LangchainTelemetryProcessor()
-        elif agent_framework == AgentFramework.SMOLAGENTS:
-            from surf_spot_finder.evaluation.telemetry.smolagents_telemetry import (
-                SmolagentsTelemetryProcessor,
-            )
-
-            return SmolagentsTelemetryProcessor()
-        elif agent_framework == AgentFramework.OPENAI:
-            from surf_spot_finder.evaluation.telemetry.openai_telemetry import (
-                OpenAITelemetryProcessor,
-            )
-
-            return OpenAITelemetryProcessor()
-        else:
-            raise ValueError(f"Unsupported agent type {agent_framework}")
-
-    @staticmethod
-    def determine_agent_framework(trace: List[Dict[str, Any]]) -> AgentFramework:
-        """Determine the agent type based on the trace.
-        These are not really stable ways to find it, because we're waiting on some
-        reliable method for determining the agent type. This is a temporary solution.
-        """
-        for span in trace:
-            if "langchain" in span.get("attributes", {}).get("input.value", ""):
-                logger.info("Agent type is LANGCHAIN")
-                return AgentFramework.LANGCHAIN
-            if span.get("attributes", {}).get("smolagents.max_steps"):
-                logger.info("Agent type is SMOLAGENTS")
-                return AgentFramework.SMOLAGENTS
-            # This is extremely fragile but there currently isn't
-            # any specific key to indicate the agent type
-            if span.get("name") == "response":
-                logger.info("Agent type is OPENAI")
-                return AgentFramework.OPENAI
-        raise ValueError(
-            "Could not determine agent type from trace, or agent type not supported"
-        )
-
-    @abstractmethod
-    def extract_hypothesis_answer(self, trace: List[Dict[str, Any]]) -> str:
-        """Extract the hypothesis agent final answer from the trace."""
-        pass
-
-    @abstractmethod
-    def _extract_telemetry_data(self, telemetry: List[Dict[str, Any]]) -> List[Dict]:
-        """Extract the agent-specific data from telemetry."""
-        pass
-
-    def extract_evidence(self, telemetry: List[Dict[str, Any]]) -> str:
-        """Extract relevant telemetry evidence."""
-        calls = self._extract_telemetry_data(telemetry)
-        return self._format_evidence(calls)
-
-    def _format_evidence(self, calls: List[Dict]) -> str:
-        """Format extracted data into a standardized output format."""
-        evidence = f"## {self._get_agent_framework().name} Agent Execution\n\n"
-
-        for idx, call in enumerate(calls, start=1):
-            evidence += f"### Call {idx}\n"
-
-            # Truncate any values that are too long
-            call = {
-                k: (
-                    v[: self.MAX_EVIDENCE_LENGTH] + "..."
-                    if isinstance(v, str) and len(v) > self.MAX_EVIDENCE_LENGTH
-                    else v
-                )
-                for k, v in call.items()
-            }
-
-            # Use ensure_ascii=False to prevent escaping Unicode characters
-            evidence += json.dumps(call, indent=2, ensure_ascii=False) + "\n\n"
-
-        return evidence
-
-    @abstractmethod
-    def _get_agent_framework(self) -> AgentFramework:
-        """Get the agent type associated with this processor."""
-        pass
-
-    @staticmethod
-    def parse_generic_key_value_string(text: str) -> Dict[str, str]:
-        """
-        Parse a string that has items of a dict with key-value pairs separated by '='.
-        Only splits on '=' signs, handling quoted strings properly.
-        """
-        pattern = r"(\w+)=('.*?'|\".*?\"|[^'\"=]*?)(?=\s+\w+=|\s*$)"
-        result = {}
-
-        matches = re.findall(pattern, text)
-        for key, value in matches:
-            # Clean up the key
-            key = key.strip()
-
-            # Clean up the value - remove surrounding quotes if present
-            if (value.startswith("'") and value.endswith("'")) or (
-                value.startswith('"') and value.endswith('"')
-            ):
-                value = value[1:-1]
-
-            # Store in result dictionary
-            result[key] = value
-
-        return result