from abc import ABC
import json
import re
from typing import Dict, List, Any, Optional, Union
from textwrap import dedent

from litellm import completion
from surf_spot_finder.evaluation.evaluators.schemas import EvaluationResult
from surf_spot_finder.evaluation.test_case import CheckpointCriteria


class LLMEvaluator(ABC):
    """Base class for evaluators that use LLM-as-judge"""

    def __init__(self, model: str):
        self.model = model

    def llm_evaluate_with_criterion(
        self,
        criteria: str,
        points: int,
        ground_truth_output: Optional[
            Union[List[CheckpointCriteria], Dict[str, Any]]
        ] = None,
        hypothesis_final_answer: Optional[str] = None,
        evidence: Optional[str] = None,
    ) -> EvaluationResult:
        """Evaluate a single criterion using LLM"""

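        # Assemble the judge prompt incrementally: the criterion always comes
        # first, followed by whichever of expected output, agent answer, and
        # telemetry evidence were provided.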
        prompt = dedent(f"""
        Evaluate if the following criterion was met {"based on the provided evidence" if evidence else "in the agent's answer"}.

        Criterion: {criteria}
        """)

        if ground_truth_output:
            prompt += dedent(f"""
            Expected output: {json.dumps(ground_truth_output)}
            """)
        if hypothesis_final_answer:
            prompt += dedent(f"""
            Agent's answer: {hypothesis_final_answer}
            """)

        if evidence:
            prompt += dedent(f"""
            Telemetry evidence:
            {evidence}
            """)

        prompt += f"""

        Based on the {"evidence" if evidence else "comparison between the expected output and the actual final answer"},
        was this criterion satisfied? Answer with:
        1. "passed": true or false
        2. "reason": Brief explanation for your decision
        """
        prompt += """
        Output valid JSON with these two fields only, in the format:
        ```json
        {
            "passed": true,
            "reason": "I have them"
        }
        ```
        """

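        # litellm.completion returns an OpenAI-compatible response object,
        # so the judge's verdict is read from choices[0].message.content.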
        response = completion(
            model=self.model, messages=[{"role": "user", "content": prompt}]
        )
        content = response.choices[0].message.content

        try:
            # Extract JSON from the response - looks for patterns like ```json {...} ``` or just {...}
            json_match = re.search(
                r"```(?:json)?\s*(\{.*?\})\s*```|(\{.*?\})", content, re.DOTALL
            )

            if json_match:
                # Use the first matching group that captured content
                json_str = next(group for group in json_match.groups() if group)
                evaluation = json.loads(json_str)
            else:
                # Fallback: try parsing the whole content as JSON
                evaluation = json.loads(content)

            evaluation["criteria"] = criteria
        except (json.JSONDecodeError, AttributeError, StopIteration) as e:
            evaluation = {
                "passed": False,
                "reason": f"Failed to evaluate due to parsing: {str(e)} \n Response: {content}",
                "criteria": criteria,
            }
        evaluation["points"] = points
        return EvaluationResult.model_validate(evaluation)
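

# --- Usage sketch (illustrative; not part of the original module) ---
# A minimal example of exercising LLMEvaluator end to end. The model name
# "gpt-4o-mini", the criterion, and the answer strings are assumptions chosen
# for demonstration; EvaluationResult is assumed to expose the "passed" and
# "reason" fields that llm_evaluate_with_criterion populates.
if __name__ == "__main__":

    class CheckpointLLMEvaluator(LLMEvaluator):
        """Concrete subclass for the sketch; LLMEvaluator declares no abstract methods."""

    evaluator = CheckpointLLMEvaluator(model="gpt-4o-mini")
    result = evaluator.llm_evaluate_with_criterion(
        criteria="The final answer names at least one specific surf spot.",
        points=1,
        hypothesis_final_answer="Ocean Beach in San Francisco looks good tomorrow morning.",
    )
    print(result.passed, result.reason)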