import json
import re
from abc import ABC
from textwrap import dedent
from typing import Any, Dict, List, Optional, Union

from litellm import completion

from surf_spot_finder.evaluation.evaluators.schemas import EvaluationResult
from surf_spot_finder.evaluation.test_case import CheckpointCriteria


class LLMEvaluator(ABC):
    """Base class for evaluators that use LLM-as-judge."""

    def __init__(self, model: str):
        self.model = model

    def llm_evaluate_with_criterion(
        self,
        criteria: str,
        points: int,
        ground_truth_output: Optional[
            Union[List[CheckpointCriteria], Dict[str, Any]]
        ] = None,
        hypothesis_final_answer: Optional[str] = None,
        evidence: Optional[str] = None,
    ) -> EvaluationResult:
        """Evaluate a single criterion using an LLM as judge."""
        prompt = dedent(f"""
            Evaluate if the following criterion was met {"based on the provided evidence" if evidence else "in the agent's answer"}.
            Criterion: {criteria}
            """)

        if ground_truth_output:
            prompt += dedent(f"""
                Expected output: {json.dumps(ground_truth_output)}
                """)
        if hypothesis_final_answer:
            prompt += dedent(f"""
                Agent's answer: {hypothesis_final_answer}
                """)
        if evidence:
            prompt += dedent(f"""
                Telemetry evidence:
                {evidence}
                """)

        prompt += f"""
        Based on the {"evidence" if evidence else "comparison between the expected output and the actual final answer"},
        was this criterion satisfied? Answer with:
        1. "passed": true or false
        2. "reason": Brief explanation for your decision
        """
prompt += """
Output valid JSON with these three fields only, in the format:
```json
{
"passed": true,
"reason": "I have them"
}
```
"""
        response = completion(
            model=self.model, messages=[{"role": "user", "content": prompt}]
        )
        content = response.choices[0].message.content

        try:
            # Extract JSON from the response - looks for patterns like
            # ```json {...} ``` or just {...}
            json_match = re.search(
                r"```(?:json)?\s*(\{.*?\})\s*```|(\{.*?\})", content, re.DOTALL
            )
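
            # Caveat: the bare {...} fallback is non-greedy, so it stops at the
            # first closing brace and would truncate a response containing
            # nested JSON objects; the flat two-field object requested above
            # keeps this safe.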
            if json_match:
                # Use the first matching group that captured content
                json_str = next(group for group in json_match.groups() if group)
                evaluation = json.loads(json_str)
            else:
                # Fallback: try parsing the whole content as JSON
                evaluation = json.loads(content)

            evaluation["criteria"] = criteria
        except (json.JSONDecodeError, AttributeError, StopIteration) as e:
            evaluation = {
                "passed": False,
                "reason": f"Failed to evaluate due to parsing: {str(e)} \n Response: {content}",
                "criteria": criteria,
            }

        evaluation["points"] = points
        return EvaluationResult.model_validate(evaluation)
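

# Usage sketch (illustrative): concrete evaluators are expected to subclass
# LLMEvaluator. The subclass name, model id, criterion, and evidence below
# are hypothetical stand-ins, not names defined by this module.
#
#     class CheckpointEvaluator(LLMEvaluator):
#         """Judges checkpoint criteria against telemetry evidence."""
#
#     evaluator = CheckpointEvaluator(model="openai/gpt-4o-mini")
#     result = evaluator.llm_evaluate_with_criterion(
#         criteria="The agent retrieved a wave forecast before answering",
#         points=1,
#         evidence="tool_call: get_wave_forecast(lat=43.4, lon=-2.8)",
#     )
#     print(result.passed, result.reason, result.points)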