from abc import ABC
import json
import re
from typing import Dict, List, Any, Optional, Union
from textwrap import dedent

from litellm import completion

from surf_spot_finder.evaluation.evaluators.schemas import EvaluationResult
from surf_spot_finder.evaluation.test_case import CheckpointCriteria


class LLMEvaluator(ABC):
    """Base class for evaluators that use LLM-as-judge"""

    def __init__(self, model: str):
        self.model = model

    def llm_evaluate_with_criterion(
        self,
        criteria: str,
        points: int,
        ground_truth_output: Optional[
            Union[List[CheckpointCriteria], Dict[str, Any]]
        ] = None,
        hypothesis_final_answer: Optional[str] = None,
        evidence: Optional[str] = None,
    ) -> EvaluationResult:
        """Evaluate a single criterion using an LLM as judge."""
        # Build the judge prompt, including only the context that was provided.
        prompt = dedent(f"""
        Evaluate if the following criterion was met {"based on the provided evidence" if evidence else "in the agent's answer"}.
        Criterion: {criteria}
        """)
        if ground_truth_output:
            prompt += dedent(f"""
            Expected output: {json.dumps(ground_truth_output)}
            """)
        if hypothesis_final_answer:
            prompt += dedent(f"""
            Agent's answer: {hypothesis_final_answer}
            """)
        if evidence:
            prompt += dedent(f"""
            Telemetry evidence:
            {evidence}
            """)
prompt += f""" | |
Based on the {"evidence" if evidence else "comparison between the expected output and the actual final answer"}, | |
was this criterion satisfied? Answer with: | |
1. "passed": true or false | |
2. "reason": Brief explanation for your decision | |
""" | |
prompt += """ | |
Output valid JSON with these three fields only, in the format: | |
```json | |
{ | |
"passed": true, | |
"reason": "I have them" | |
} | |
``` | |
""" | |
        # Ask the judge model for a verdict via litellm.
        response = completion(
            model=self.model, messages=[{"role": "user", "content": prompt}]
        )
        content = response.choices[0].message.content
        try:
            # Extract JSON from the response - looks for patterns like ```json {...} ``` or just {...}
            json_match = re.search(
                r"```(?:json)?\s*(\{.*?\})\s*```|(\{.*?\})", content, re.DOTALL
            )
            if json_match:
                # Use the first matching group that captured content
                json_str = next(group for group in json_match.groups() if group)
                evaluation = json.loads(json_str)
            else:
                # Fallback: try parsing the whole content as JSON
                evaluation = json.loads(content)
            evaluation["criteria"] = criteria
        except (json.JSONDecodeError, AttributeError, StopIteration) as e:
            evaluation = {
                "passed": False,
                "reason": f"Failed to evaluate due to a parsing error: {e}\nResponse: {content}",
                "criteria": criteria,
            }
        evaluation["points"] = points
        return EvaluationResult.model_validate(evaluation)
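

# --- Usage sketch (illustrative, not part of the original module) ---
# A minimal example of how a concrete evaluator might call
# llm_evaluate_with_criterion. The subclass, the checkpoint text, and the
# "ollama/llama3.1" model id are assumptions for illustration; litellm simply
# forwards whatever model string it is given, and the printed attributes assume
# EvaluationResult exposes the passed/reason/points fields validated above.


class ExampleCheckpointEvaluator(LLMEvaluator):
    """Hypothetical evaluator that judges a single checkpoint criterion."""

    def evaluate_checkpoint(
        self, criterion: str, points: int, final_answer: str
    ) -> EvaluationResult:
        # Delegate to the shared LLM-as-judge helper using only the final answer.
        return self.llm_evaluate_with_criterion(
            criteria=criterion,
            points=points,
            hypothesis_final_answer=final_answer,
        )


if __name__ == "__main__":
    # Assumes a local Ollama server (or any other litellm-supported backend) is reachable.
    evaluator = ExampleCheckpointEvaluator(model="ollama/llama3.1")
    result = evaluator.evaluate_checkpoint(
        criterion="The answer names at least one surf spot near the requested location.",
        points=1,
        final_answer="Praia do Norte in Nazaré is a well-known spot nearby.",
    )
    print(result.passed, result.reason, result.points)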