Nathan Brake
committed on
Refactor evaluation logging and save results to a new module (#49)
* Refactor evaluation logging and save results to a new module
* add div by zero check
* docs
pyproject.toml
CHANGED
@@ -9,11 +9,12 @@ license = {text = "Apache-2.0"}
 requires-python = ">=3.11"
 dynamic = ["version"]
 dependencies = [
-    "any-agent @ git+ssh://[email protected]/mozilla-ai/any-agent",
+    "any-agent[smolagents,mcp,openai,langchain,llama_index] @ git+ssh://[email protected]/mozilla-ai/any-agent",
     "fire",
     "loguru",
     "pydantic",
     "pyyaml",
+    "litellm",
 ]
 
 [project.optional-dependencies]
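The bracketed extras on any-agent pull in the optional agent-framework backends (smolagents, MCP, OpenAI, LangChain, LlamaIndex), and litellm becomes a direct dependency. A rough post-install sanity check, assuming the extras expose top-level modules with these names (the distribution-to-import-name mapping is an assumption, not taken from the repo):

import importlib.util

# Illustrative only: confirm the packages expected from the new extras are importable.
for module in ("smolagents", "openai", "langchain", "llama_index", "litellm"):
    found = importlib.util.find_spec(module) is not None
    print(f"{module}: {'installed' if found else 'missing'}")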
src/surf_spot_finder/evaluation/evaluate.py
CHANGED
@@ -1,15 +1,11 @@
 import json
-import os
-import sys
 from textwrap import dedent
 from typing import Any, Dict, List, Optional
 
-import pandas as pd
 from any_agent import AnyAgent
 from any_agent.telemetry import TelemetryProcessor
 from any_agent.tracing import get_tracer_provider, setup_tracing
 from fire import Fire
-from loguru import logger
 
 from surf_spot_finder.config import (
     Config,
@@ -20,10 +16,11 @@ from surf_spot_finder.evaluation.evaluators import (
     QuestionAnsweringSquadEvaluator,
 )
 from surf_spot_finder.evaluation.test_case import TestCase
+from surf_spot_finder.evaluation.results_saver import save_evaluation_results
+from surf_spot_finder.utils.logging import get_logger
 
-logger.remove()
-logger = logger.opt(ansi=True)
-logger.add(sys.stdout, colorize=True, format="{message}")
+# Replace the existing logger setup with the shared logger
+logger = get_logger()
 
 
 def run(test_case: TestCase, agent_config_path: str) -> str:
@@ -139,32 +136,22 @@ def evaluate_telemetry(test_case: TestCase, telemetry_path: str) -> bool:
     )
     output_message += "<green>=====================================</green>\n"
     logger.info(output_message)
-
-    if os.path.exists(test_case.output_path):
-        df = pd.read_json(test_case.output_path, orient="records", lines=True)
-    else:
-        df = pd.DataFrame()
-    df = pd.concat(
-        [
-            df,
-            pd.DataFrame(
-                [
-                    {
-                        "test_case_path": test_case.test_case_path,
-                        "output_message": output_message,
-                        "telemetry_path": telemetry_path,
-                        "hypothesis_answer": hypothesis_answer,
-                        "passed_checks": len(passed_checks),
-                        "failed_checks": len(failed_checks),
-                        "score": round(
-                            won_points / (won_points + missed_points) * 100, 2
-                        ),
-                    }
-                ]
-            ),
-        ]
+
+    if won_points + missed_points == 0:
+        raise ValueError("No points were defined in the test case")
+    score = won_points / (won_points + missed_points) * 100
+
+    # Save the evaluation results
+    save_evaluation_results(
+        test_case_path=test_case.test_case_path,
+        output_path=test_case.output_path,
+        output_message=output_message,
+        telemetry_path=telemetry_path,
+        hypothesis_answer=hypothesis_answer,
+        passed_checks=len(passed_checks),
+        failed_checks=len(failed_checks),
+        score=score,
     )
-    df.to_json(test_case.output_path, orient="records", lines=True)
 
 
 def evaluate(
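The new guard makes the failure explicit when a test case defines no checkpoints or ground-truth points, instead of letting the score computation raise a bare ZeroDivisionError. A standalone sketch of the same arithmetic (illustrative helper, not code from the repo):

def compute_score(won_points: int, missed_points: int) -> float:
    # Mirror the div-by-zero check added in evaluate_telemetry.
    if won_points + missed_points == 0:
        raise ValueError("No points were defined in the test case")
    return won_points / (won_points + missed_points) * 100

assert compute_score(3, 1) == 75.0  # 3 of 4 points -> 75%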
src/surf_spot_finder/evaluation/evaluators/CheckpointEvaluator.py
CHANGED
@@ -2,8 +2,11 @@ from typing import Dict, List, Any
 
 from surf_spot_finder.evaluation.evaluators.LLMEvaluator import LLMEvaluator
 from surf_spot_finder.evaluation.evaluators.schemas import EvaluationResult
-from ...
+from any_agent.telemetry import TelemetryProcessor
 from surf_spot_finder.evaluation.test_case import CheckpointCriteria
+from surf_spot_finder.utils.logging import get_logger
+
+logger = get_logger()
 
 
 class CheckpointEvaluator(LLMEvaluator):
@@ -27,6 +30,7 @@ class CheckpointEvaluator(LLMEvaluator):
         List of evaluation results
         """
         evidence = processor.extract_evidence(telemetry)
+        logger.info(f"""<yellow>Evidence\n{evidence}</yellow>\n""")
         results = []
 
         for checkpoint in checkpoints:
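The new evidence log relies on loguru's color markup: the shared sink is added with colorize=True and the shared logger is opted into tag parsing, so a message wrapped in <yellow>...</yellow> is rendered as terminal color rather than literal text. A minimal self-contained illustration of that pattern (it uses colors=True, the current name of the option the repo sets via ansi=True):

import sys
from loguru import logger

logger.remove()
logger.add(sys.stdout, colorize=True, format="{message}")

evidence = "step 1: searched for surf forecasts"  # stand-in value
# Tags are parsed only when the call (or logger) is opted into color markup.
logger.opt(colors=True).info(f"<yellow>Evidence\n{evidence}</yellow>")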
src/surf_spot_finder/evaluation/results_saver.py
ADDED
@@ -0,0 +1,60 @@
+import os
+import pandas as pd
+
+from surf_spot_finder.utils.logging import get_logger
+
+# Use the shared logger
+logger = get_logger()
+
+
+def save_evaluation_results(
+    test_case_path: str,
+    output_path: str,
+    output_message: str,
+    telemetry_path: str,
+    hypothesis_answer: str,
+    passed_checks: int,
+    failed_checks: int,
+    score: float,
+) -> None:
+    """
+    Save evaluation results to the specified output path.
+
+    Args:
+        test_case_path: Path to the test case file
+        output_path: Path to save the results
+        output_message: Formatted output message with evaluation details
+        telemetry_path: Path to the telemetry file used
+        hypothesis_answer: The extracted hypothesis answer
+        passed_checks: Number of passed checkpoints
+        failed_checks: Number of failed checkpoints
+        score: Evaluation score as a percentage
+    """
+    # See if the output_path file exists
+    if os.path.exists(output_path):
+        logger.info(f"Reading existing output from {output_path}")
+        df = pd.read_json(output_path, orient="records", lines=True)
+    else:
+        logger.info(f"Creating new output file at {output_path}")
+        df = pd.DataFrame()
+
+    df = pd.concat(
+        [
+            df,
+            pd.DataFrame(
+                [
+                    {
+                        "test_case_path": test_case_path,
+                        "output_message": output_message,
+                        "telemetry_path": telemetry_path,
+                        "hypothesis_answer": hypothesis_answer,
+                        "passed_checks": passed_checks,
+                        "failed_checks": failed_checks,
+                        "score": round(score, 2),
+                    }
+                ]
+            ),
+        ]
+    )
+    logger.info(f"Writing output to {output_path}")
+    df.to_json(output_path, orient="records", lines=True)
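For context, a minimal sketch of calling the new helper and reading back the appended record; the paths and values here are made up for illustration:

import pandas as pd

from surf_spot_finder.evaluation.results_saver import save_evaluation_results

save_evaluation_results(
    test_case_path="tests/test_case.yaml",      # hypothetical paths and values
    output_path="results.jsonl",
    output_message="3/4 checkpoints passed",
    telemetry_path="telemetry/run_001.json",
    hypothesis_answer="Pleasure Point, Santa Cruz",
    passed_checks=3,
    failed_checks=1,
    score=75.0,
)

# Each call appends one record to the JSON-lines file.
print(pd.read_json("results.jsonl", orient="records", lines=True)[["passed_checks", "failed_checks", "score"]])

Because the existing file is read back and re-concatenated on every call, repeated runs accumulate rows rather than overwriting earlier results.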
src/surf_spot_finder/utils/logging.py
ADDED
@@ -0,0 +1,14 @@
+import sys
+from loguru import logger
+
+# Remove default logger
+logger.remove()
+
+# Add custom colored logger
+logger = logger.opt(ansi=True)
+logger.add(sys.stdout, colorize=True, format="{message}")
+
+
+# Export configured logger
+def get_logger():
+    return logger
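Usage is then a two-liner in any module; loguru's logger is process-global, so every caller of get_logger() shares the single colorized stdout sink configured above (the message below is a made-up example):

from surf_spot_finder.utils.logging import get_logger

logger = get_logger()
logger.info("<green>Evaluation finished</green>")  # color tags render via the colorized sink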