Nathan Brake committed on
Commit 5e4d59b · unverified · 1 Parent(s): 5e252e7

telemetry (#48)

pyproject.toml CHANGED
@@ -35,6 +35,10 @@ tests = [
     "evaluate>=0.4.3",
 ]
 
+dev = [
+    "pre-commit>=4.1.0",
+]
+
 [project.urls]
 Documentation = "https://mozilla-ai.github.io/surf-spot-finder/"
 Issues = "https://github.com/mozilla-ai/surf-spot-finder/issues"
@@ -47,11 +51,6 @@ namespaces = false
 
 [tool.setuptools_scm]
 
-[dependency-groups]
-dev = [
-    "pre-commit>=4.1.0",
-]
-
 [project.scripts]
 surf-spot-finder = "surf_spot_finder.cli:main"
 surf-spot-finder-no-framework = "surf_spot_finder.no_framework:main"
src/surf_spot_finder/evaluation/evaluate.py CHANGED
@@ -3,21 +3,23 @@ import os
 import sys
 from textwrap import dedent
 from typing import Any, Dict, List, Optional
-from loguru import logger
-from fire import Fire
+
 import pandas as pd
+from any_agent import AnyAgent
+from any_agent.telemetry import TelemetryProcessor
+from any_agent.tracing import get_tracer_provider, setup_tracing
+from fire import Fire
+from loguru import logger
+
 from surf_spot_finder.config import (
     Config,
 )
-from surf_spot_finder.evaluation.telemetry import TelemetryProcessor
 from surf_spot_finder.evaluation.evaluators import (
     CheckpointEvaluator,
-    QuestionAnsweringSquadEvaluator,
     HypothesisEvaluator,
+    QuestionAnsweringSquadEvaluator,
 )
 from surf_spot_finder.evaluation.test_case import TestCase
-from any_agent import AnyAgent
-from any_agent.tracing import get_tracer_provider, setup_tracing
 
 logger.remove()
 logger = logger.opt(ansi=True)
src/surf_spot_finder/evaluation/telemetry/__init__.py DELETED
@@ -1,3 +0,0 @@
-from .telemetry import TelemetryProcessor
-
-__all__ = ["TelemetryProcessor"]
src/surf_spot_finder/evaluation/telemetry/langchain_telemetry.py DELETED
@@ -1,88 +0,0 @@
-from typing import Any, Dict, List
-import json
-from any_agent import AgentFramework
-from langchain_core.messages import BaseMessage
-
-
-from surf_spot_finder.evaluation.telemetry import TelemetryProcessor
-
-
-class LangchainTelemetryProcessor(TelemetryProcessor):
-    """Processor for Langchain agent telemetry data."""
-
-    def _get_agent_framework(self) -> AgentFramework:
-        return AgentFramework.LANGCHAIN
-
-    def extract_hypothesis_answer(self, trace: List[Dict[str, Any]]) -> str:
-        for span in reversed(trace):
-            if span["attributes"]["openinference.span.kind"] == "AGENT":
-                content = span["attributes"]["output.value"]
-                # Extract content from serialized langchain message
-                message = json.loads(content)["messages"][0]
-                message = self.parse_generic_key_value_string(message)
-                base_message = BaseMessage(content=message["content"], type="AGENT")
-                # Use the interpreted string for printing
-                final_text = base_message.text()
-                # Either decode escape sequences if they're present
-                try:
-                    final_text = final_text.encode().decode("unicode_escape")
-                except UnicodeDecodeError:
-                    # If that fails, the escape sequences might already be interpreted
-                    pass
-                return final_text
-
-        raise ValueError("No agent final answer found in trace")
-
-    def _extract_telemetry_data(self, telemetry: List[Dict[str, Any]]) -> List[Dict]:
-        """Extract LLM calls and tool calls from LangChain telemetry."""
-        calls = []
-
-        for span in telemetry:
-            if "attributes" not in span:
-                continue
-
-            attributes = span.get("attributes", {})
-            span_kind = attributes.get("openinference.span.kind", "")
-
-            # Collect LLM calls
-            if (
-                span_kind == "LLM"
-                and "llm.output_messages.0.message.content" in attributes
-            ):
-                llm_info = {
-                    "model": attributes.get("llm.model_name", "Unknown model"),
-                    "input": attributes.get("llm.input_messages.0.message.content", ""),
-                    "output": attributes.get(
-                        "llm.output_messages.0.message.content", ""
-                    ),
-                    "type": "reasoning",
-                }
-                calls.append(llm_info)
-
-            # Try to find tool calls
-            if "tool.name" in attributes or span.get("name", "").endswith("Tool"):
-                tool_info = {
-                    "tool_name": attributes.get(
-                        "tool.name", span.get("name", "Unknown tool")
-                    ),
-                    "status": "success"
-                    if span.get("status", {}).get("status_code") == "OK"
-                    else "error",
-                    "error": span.get("status", {}).get("description", None),
-                }
-
-                if "input.value" in attributes:
-                    try:
-                        input_value = json.loads(attributes["input.value"])
-                        tool_info["input"] = input_value
-                    except Exception:
-                        tool_info["input"] = attributes["input.value"]
-
-                if "output.value" in attributes:
-                    tool_info["output"] = self.parse_generic_key_value_string(
-                        json.loads(attributes["output.value"])["output"]
-                    )["content"]
-
-                calls.append(tool_info)
-
-        return calls
src/surf_spot_finder/evaluation/telemetry/openai_telemetry.py DELETED
@@ -1,106 +0,0 @@
-from typing import Any, Dict, List
-import json
-
-from any_agent import AgentFramework
-from loguru import logger
-from surf_spot_finder.evaluation.telemetry import TelemetryProcessor
-
-
-class OpenAITelemetryProcessor(TelemetryProcessor):
-    """Processor for OpenAI agent telemetry data."""
-
-    def _get_agent_framework(self) -> AgentFramework:
-        return AgentFramework.OPENAI
-
-    def extract_hypothesis_answer(self, trace: List[Dict[str, Any]]) -> str:
-        for span in reversed(trace):
-            # Looking for the final response that has the summary answer
-            if (
-                "attributes" in span
-                and span.get("attributes", {}).get("openinference.span.kind") == "LLM"
-            ):
-                output_key = (
-                    "llm.output_messages.0.message.contents.0.message_content.text"
-                )
-                if output_key in span["attributes"]:
-                    return span["attributes"][output_key]
-        logger.warning("No agent final answer found in trace")
-        return "NO FINAL ANSWER FOUND"
-
-    def _extract_telemetry_data(self, telemetry: List[Dict[str, Any]]) -> list:
-        """Extract LLM calls and tool calls from OpenAI telemetry."""
-        calls = []
-
-        for span in telemetry:
-            if "attributes" not in span:
-                continue
-
-            attributes = span.get("attributes", {})
-            span_kind = attributes.get("openinference.span.kind", "")
-
-            # Collect LLM interactions - look for direct message content first
-            if span_kind == "LLM":
-                # Initialize the LLM info dictionary
-                span_info = {}
-
-                # Try to get input message
-                input_key = "llm.input_messages.1.message.content"  # User message is usually at index 1
-                if input_key in attributes:
-                    span_info["input"] = attributes[input_key]
-
-                # Try to get output message directly
-                output_content = None
-                # Try in multiple possible locations
-                for key in [
-                    "llm.output_messages.0.message.content",
-                    "llm.output_messages.0.message.contents.0.message_content.text",
-                ]:
-                    if key in attributes:
-                        output_content = attributes[key]
-                        break
-
-                # If we found direct output content, use it
-                if output_content:
-                    span_info["output"] = output_content
-                    calls.append(span_info)
-            elif span_kind == "TOOL":
-                tool_name = attributes.get("tool.name", "Unknown tool")
-                tool_output = attributes.get("output.value", "")
-
-                span_info = {
-                    "tool_name": tool_name,
-                    "input": attributes.get("input.value", ""),
-                    "output": tool_output,
-                    # Can't add status yet because it isn't being set by openinference
-                    # "status": span.get("status", {}).get("status_code"),
-                }
-                span_info["input"] = json.loads(span_info["input"])
-
-                calls.append(span_info)
-
-        return calls
-
-
-# Backward compatibility functions that use the new class structure
-def extract_hypothesis_answer(
-    trace: List[Dict[str, Any]], agent_framework: AgentFramework
-) -> str:
-    """Extract the hypothesis agent final answer from the trace"""
-    processor = TelemetryProcessor.create(agent_framework)
-    return processor.extract_hypothesis_answer(trace)
-
-
-def parse_generic_key_value_string(text: str) -> Dict[str, str]:
-    """
-    Parse a string that has items of a dict with key-value pairs separated by '='.
-    Only splits on '=' signs, handling quoted strings properly.
-    """
-    return TelemetryProcessor.parse_generic_key_value_string(text)
-
-
-def extract_evidence(
-    telemetry: List[Dict[str, Any]], agent_framework: AgentFramework
-) -> str:
-    """Extract relevant telemetry evidence based on the agent type."""
-    processor = TelemetryProcessor.create(agent_framework)
-    return processor.extract_evidence(telemetry)
src/surf_spot_finder/evaluation/telemetry/smolagents_telemetry.py DELETED
@@ -1,104 +0,0 @@
-from typing import Any, Dict, List
-import json
-
-from any_agent import AgentFramework
-
-from surf_spot_finder.evaluation.telemetry import TelemetryProcessor
-
-
-class SmolagentsTelemetryProcessor(TelemetryProcessor):
-    """Processor for SmoL Agents telemetry data."""
-
-    def _get_agent_framework(self) -> AgentFramework:
-        return AgentFramework.SMOLAGENTS
-
-    def extract_hypothesis_answer(self, trace: List[Dict[str, Any]]) -> str:
-        for span in reversed(trace):
-            if span["attributes"]["openinference.span.kind"] == "AGENT":
-                content = span["attributes"]["output.value"]
-                return content
-
-        raise ValueError("No agent final answer found in trace")
-
-    def _extract_telemetry_data(self, telemetry: List[Dict[str, Any]]) -> List[Dict]:
-        """Extract LLM calls and tool calls from SmoL Agents telemetry."""
-        calls = []
-
-        for span in telemetry:
-            # Skip spans without attributes
-            if "attributes" not in span:
-                continue
-
-            attributes = span["attributes"]
-
-            # Extract tool information
-            if "tool.name" in attributes or span.get("name", "").startswith(
-                "SimpleTool"
-            ):
-                tool_info = {
-                    "tool_name": attributes.get(
-                        "tool.name", span.get("name", "Unknown tool")
-                    ),
-                    "status": "success"
-                    if span.get("status", {}).get("status_code") == "OK"
-                    else "error",
-                    "error": span.get("status", {}).get("description", None),
-                }
-
-                # Extract input if available
-                if "input.value" in attributes:
-                    try:
-                        input_value = json.loads(attributes["input.value"])
-                        if "kwargs" in input_value:
-                            # For SmoLAgents, the actual input is often in the kwargs field
-                            tool_info["input"] = input_value["kwargs"]
-                        else:
-                            tool_info["input"] = input_value
-                    except (json.JSONDecodeError, TypeError):
-                        tool_info["input"] = attributes["input.value"]
-
-                # Extract output if available
-                if "output.value" in attributes:
-                    try:
-                        # Try to parse JSON output
-                        output_value = (
-                            json.loads(attributes["output.value"])
-                            if isinstance(attributes["output.value"], str)
-                            else attributes["output.value"]
-                        )
-                        tool_info["output"] = output_value
-                    except (json.JSONDecodeError, TypeError):
-                        tool_info["output"] = attributes["output.value"]
-                else:
-                    tool_info["output"] = "No output found"
-
-                calls.append(tool_info)
-
-            # Extract LLM calls to see reasoning
-            elif "LiteLLMModel.__call__" in span.get("name", ""):
-                # The LLM output may be in different places depending on the implementation
-                output_content = None
-
-                # Try to get the output from the llm.output_messages.0.message.content attribute
-                if "llm.output_messages.0.message.content" in attributes:
-                    output_content = attributes["llm.output_messages.0.message.content"]
-
-                # Or try to parse it from the output.value as JSON
-                elif "output.value" in attributes:
-                    try:
-                        output_value = json.loads(attributes["output.value"])
-                        if "content" in output_value:
-                            output_content = output_value["content"]
-                    except (json.JSONDecodeError, TypeError):
-                        pass
-
-                if output_content:
-                    calls.append(
-                        {
-                            "model": attributes.get("llm.model_name", "Unknown model"),
-                            "output": output_content,
-                            "type": "reasoning",
-                        }
-                    )
-
-        return calls
src/surf_spot_finder/evaluation/telemetry/telemetry.py DELETED
@@ -1,125 +0,0 @@
-from typing import Any, Dict, List, ClassVar
-import json
-import re
-from abc import ABC, abstractmethod
-from any_agent import AgentFramework
-from loguru import logger
-
-
-class TelemetryProcessor(ABC):
-    """Base class for processing telemetry data from different agent types."""
-
-    MAX_EVIDENCE_LENGTH: ClassVar[int] = 400
-
-    @classmethod
-    def create(cls, agent_framework: AgentFramework) -> "TelemetryProcessor":
-        """Factory method to create the appropriate telemetry processor."""
-        if agent_framework == AgentFramework.LANGCHAIN:
-            from surf_spot_finder.evaluation.telemetry.langchain_telemetry import (
-                LangchainTelemetryProcessor,
-            )
-
-            return LangchainTelemetryProcessor()
-        elif agent_framework == AgentFramework.SMOLAGENTS:
-            from surf_spot_finder.evaluation.telemetry.smolagents_telemetry import (
-                SmolagentsTelemetryProcessor,
-            )
-
-            return SmolagentsTelemetryProcessor()
-        elif agent_framework == AgentFramework.OPENAI:
-            from surf_spot_finder.evaluation.telemetry.openai_telemetry import (
-                OpenAITelemetryProcessor,
-            )
-
-            return OpenAITelemetryProcessor()
-        else:
-            raise ValueError(f"Unsupported agent type {agent_framework}")
-
-    @staticmethod
-    def determine_agent_framework(trace: List[Dict[str, Any]]) -> AgentFramework:
-        """Determine the agent type based on the trace.
-        These are not really stable ways to find it, because we're waiting on some
-        reliable method for determining the agent type. This is a temporary solution.
-        """
-        for span in trace:
-            if "langchain" in span.get("attributes", {}).get("input.value", ""):
-                logger.info("Agent type is LANGCHAIN")
-                return AgentFramework.LANGCHAIN
-            if span.get("attributes", {}).get("smolagents.max_steps"):
-                logger.info("Agent type is SMOLAGENTS")
-                return AgentFramework.SMOLAGENTS
-            # This is extremely fragile but there currently isn't
-            # any specific key to indicate the agent type
-            if span.get("name") == "response":
-                logger.info("Agent type is OPENAI")
-                return AgentFramework.OPENAI
-        raise ValueError(
-            "Could not determine agent type from trace, or agent type not supported"
-        )
-
-    @abstractmethod
-    def extract_hypothesis_answer(self, trace: List[Dict[str, Any]]) -> str:
-        """Extract the hypothesis agent final answer from the trace."""
-        pass
-
-    @abstractmethod
-    def _extract_telemetry_data(self, telemetry: List[Dict[str, Any]]) -> List[Dict]:
-        """Extract the agent-specific data from telemetry."""
-        pass
-
-    def extract_evidence(self, telemetry: List[Dict[str, Any]]) -> str:
-        """Extract relevant telemetry evidence."""
-        calls = self._extract_telemetry_data(telemetry)
-        return self._format_evidence(calls)
-
-    def _format_evidence(self, calls: List[Dict]) -> str:
-        """Format extracted data into a standardized output format."""
-        evidence = f"## {self._get_agent_framework().name} Agent Execution\n\n"
-
-        for idx, call in enumerate(calls, start=1):
-            evidence += f"### Call {idx}\n"
-
-            # Truncate any values that are too long
-            call = {
-                k: (
-                    v[: self.MAX_EVIDENCE_LENGTH] + "..."
-                    if isinstance(v, str) and len(v) > self.MAX_EVIDENCE_LENGTH
-                    else v
-                )
-                for k, v in call.items()
-            }
-
-            # Use ensure_ascii=False to prevent escaping Unicode characters
-            evidence += json.dumps(call, indent=2, ensure_ascii=False) + "\n\n"
-
-        return evidence
-
-    @abstractmethod
-    def _get_agent_framework(self) -> AgentFramework:
-        """Get the agent type associated with this processor."""
-        pass
-
-    @staticmethod
-    def parse_generic_key_value_string(text: str) -> Dict[str, str]:
-        """
-        Parse a string that has items of a dict with key-value pairs separated by '='.
-        Only splits on '=' signs, handling quoted strings properly.
-        """
-        pattern = r"(\w+)=('.*?'|\".*?\"|[^'\"=]*?)(?=\s+\w+=|\s*$)"
-        result = {}
-
-        matches = re.findall(pattern, text)
-        for key, value in matches:
-            # Clean up the key
-            key = key.strip()
-
-            # Clean up the value - remove surrounding quotes if present
-            if (value.startswith("'") and value.endswith("'")) or (
-                value.startswith('"') and value.endswith('"')
-            ):
-                value = value[1:-1]
-
-            # Store in result dictionary
-            result[key] = value
-
-        return result