Nathan Brake committed on
Commit 7758a19 · unverified · 1 Parent(s): 94a64b0

Add SquadQA metric, split out the files and support for ollama LLM-as-judge (#35)

* Split out the files and support for ollama LLM-as-judge
* lint
* no test case
* rename to better answer
pyproject.toml CHANGED
@@ -53,6 +53,7 @@ tests = [
     "pytest>=8,<9",
     "pytest-sugar>=0.9.6",
     "debugpy>=1.8.13",
+    "evaluate>=0.4.3",
 ]

 # TODO maybe we don't want to keep this, or we want to swap this to Lumigator SDK
src/surf_spot_finder/evaluation/evaluate.py CHANGED
@@ -11,9 +11,10 @@ from surf_spot_finder.config import (
     Config,
 )
 from surf_spot_finder.evaluation.telemetry import TelemetryProcessor
-from surf_spot_finder.evaluation.utils import (
-    verify_checkpoints,
-    verify_hypothesis_answer,
+from surf_spot_finder.evaluation.evaluators import (
+    CheckpointEvaluator,
+    QuestionAnsweringSquadEvaluator,
+    HypothesisEvaluator,
 )
 from surf_spot_finder.evaluation.test_case import TestCase

@@ -55,28 +56,39 @@ def evaluate_telemetry(test_case: TestCase, telemetry_path: str) -> bool:
     # Extract the final answer from the telemetry
     processor = TelemetryProcessor.create(agent_type)
     hypothesis_answer = processor.extract_hypothesis_answer(trace=telemetry)
-    # Verify agent behavior against checkpoints using llm-as-a-judge
-    llm_judge = "openai/gpt-4o"
-    checkpoint_results = verify_checkpoints(
+
+    # Checkpoint evaluation
+    checkpoint_evaluator = CheckpointEvaluator(model=test_case.llm_judge)
+    checkpoint_results = checkpoint_evaluator.evaluate(
         telemetry=telemetry,
         checkpoints=test_case.checkpoints,
-        model=llm_judge,
         processor=processor,
     )

-    hypothesis_answer_results = verify_hypothesis_answer(
+    # Hypothesis answer evaluation
+    hypothesis_evaluator = HypothesisEvaluator(model=test_case.llm_judge)
+    hypothesis_answer_results = hypothesis_evaluator.evaluate(
         hypothesis_final_answer=hypothesis_answer,
         ground_truth_answer_dict=test_case.ground_truth,
         ground_truth_checkpoints=test_case.final_answer_criteria,
-        model=llm_judge,
+    )
+
+    # Direct answer evaluation (new)
+    direct_evaluator = QuestionAnsweringSquadEvaluator()
+    direct_results = direct_evaluator.evaluate(
+        hypothesis_answer=hypothesis_answer,
+        ground_truth_answer=test_case.ground_truth,
+    )
+
+    # Combine all results
+    verification_results = (
+        checkpoint_results + hypothesis_answer_results + direct_results
     )
     # Summarize results
     output_message = ""
     output_message += (
         f"""<yellow>Hypothesis Final answer extracted: {hypothesis_answer}</yellow>\n"""
     )
-
-    verification_results = checkpoint_results + hypothesis_answer_results
     failed_checks = [r for r in verification_results if not r.passed]
     passed_checks = [r for r in verification_results if r.passed]
     missed_points = sum([r.points for r in failed_checks])
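
For reference, the combined `verification_results` is a flat list of `EvaluationResult` objects (schema added below), so the summary logic in evaluate.py reduces to filtering and point sums. A small self-contained sketch of that tallying, with made-up results standing in for the three evaluators' outputs; `earned_points` is an illustrative name, not from the diff.

```python
from surf_spot_finder.evaluation.evaluators.schemas import EvaluationResult

# Hypothetical results for illustration only.
verification_results = [
    EvaluationResult(passed=True, reason="tool call present in trace",
                     criteria="Check if the agent used the get_area_lat_lon tool", points=1),
    EvaluationResult(passed=False, reason="no weather description found",
                     criteria="Check if the final answer describes the weather", points=1),
]

failed_checks = [r for r in verification_results if not r.passed]
passed_checks = [r for r in verification_results if r.passed]
missed_points = sum(r.points for r in failed_checks)
earned_points = sum(r.points for r in passed_checks)
print(f"{len(passed_checks)}/{len(verification_results)} checks passed, "
      f"{earned_points} points earned, {missed_points} missed")
```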
src/surf_spot_finder/evaluation/evaluators/CheckpointEvaluator.py ADDED
@@ -0,0 +1,40 @@
+from typing import Dict, List, Any
+
+from surf_spot_finder.evaluation.evaluators.LLMEvaluator import LLMEvaluator
+from surf_spot_finder.evaluation.evaluators.schemas import EvaluationResult
+from surf_spot_finder.evaluation.telemetry import TelemetryProcessor
+from surf_spot_finder.evaluation.test_case import CheckpointCriteria
+
+
+class CheckpointEvaluator(LLMEvaluator):
+    """Evaluates checkpoints against telemetry"""
+
+    def evaluate(
+        self,
+        telemetry: List[Dict[str, Any]],
+        checkpoints: List[CheckpointCriteria],
+        processor: TelemetryProcessor,
+    ) -> List[EvaluationResult]:
+        """
+        Verify each checkpoint against the telemetry data using LLM
+
+        Args:
+            telemetry: The telemetry data to evaluate
+            checkpoints: List of checkpoint criteria to verify
+            processor: Telemetry processor to extract evidence
+
+        Returns:
+            List of evaluation results
+        """
+        evidence = processor.extract_evidence(telemetry)
+        results = []
+
+        for checkpoint in checkpoints:
+            evaluation = self.llm_evaluate_with_criterion(
+                criteria=checkpoint.criteria,
+                points=checkpoint.points,
+                evidence=evidence,
+            )
+            results.append(evaluation)
+
+        return results
src/surf_spot_finder/evaluation/evaluators/HypothesisEvaluator.py ADDED
@@ -0,0 +1,29 @@
+from typing import Dict, List, Any
+from surf_spot_finder.evaluation.evaluators.LLMEvaluator import LLMEvaluator
+from surf_spot_finder.evaluation.evaluators.schemas import EvaluationResult
+from surf_spot_finder.evaluation.test_case import CheckpointCriteria
+
+
+class HypothesisEvaluator(LLMEvaluator):
+    """Evaluates the final answer against ground truth"""
+
+    def evaluate(
+        self,
+        hypothesis_final_answer: str,
+        ground_truth_answer_dict: Dict[str, Any],
+        ground_truth_checkpoints: List[CheckpointCriteria],
+    ) -> List[EvaluationResult]:
+        """Verify if the final answer meets all specified criteria"""
+        results = []
+
+        for criterion in ground_truth_checkpoints:
+            evaluation = self.llm_evaluate_with_criterion(
+                criteria=criterion.criteria,
+                points=criterion.points,
+                ground_truth_output=ground_truth_answer_dict,
+                hypothesis_final_answer=hypothesis_final_answer,
+            )
+
+            results.append(evaluation)
+
+        return results
src/surf_spot_finder/evaluation/evaluators/LLMEvaluator.py ADDED
@@ -0,0 +1,95 @@
+from abc import ABC
+import json
+import re
+from typing import Dict, List, Any, Optional, Union
+from textwrap import dedent
+
+from litellm import completion
+from surf_spot_finder.evaluation.evaluators.schemas import EvaluationResult
+from surf_spot_finder.evaluation.test_case import CheckpointCriteria
+
+
+class LLMEvaluator(ABC):
+    """Base class for evaluators that use LLM-as-judge"""
+
+    def __init__(self, model: str):
+        self.model = model
+
+    def llm_evaluate_with_criterion(
+        self,
+        criteria: str,
+        points: int,
+        ground_truth_output: Optional[
+            Union[List[CheckpointCriteria], Dict[str, Any]]
+        ] = None,
+        hypothesis_final_answer: Optional[str] = None,
+        evidence: Optional[str] = None,
+    ) -> EvaluationResult:
+        """Evaluate a single criterion using LLM"""
+
+        prompt = dedent(f"""
+        Evaluate if the following criterion was met {"based on the provided evidence" if evidence else "in the agent's answer"}.
+
+        Criterion: {criteria}
+        """)
+
+        if ground_truth_output:
+            prompt += dedent(f"""
+            Expected output: {json.dumps(ground_truth_output)}
+            """)
+        if hypothesis_final_answer:
+            prompt += dedent(f"""
+            Agent's answer: {hypothesis_final_answer}
+            """)
+
+        if evidence:
+            prompt += dedent(f"""
+            Telemetry evidence:
+            {evidence}
+            """)
+
+        prompt += f"""
+
+        Based on the {"evidence" if evidence else "comparison between the expected output and the actual final answer"},
+        was this criterion satisfied? Answer with:
+        1. "passed": true or false
+        2. "reason": Brief explanation for your decision
+        """
+        prompt += """
+        Output valid JSON with these three fields only, in the format:
+        ```json
+        {
+        "passed": true,
+        "reason": "I have them"
+        }
+        ```
+        """
+
+        response = completion(
+            model=self.model, messages=[{"role": "user", "content": prompt}]
+        )
+        content = response.choices[0].message.content
+
+        try:
+            # Extract JSON from the response - looks for patterns like ```json {...} ``` or just {...}
+            json_match = re.search(
+                r"```(?:json)?\s*(\{.*?\})\s*```|(\{.*?\})", content, re.DOTALL
+            )
+
+            if json_match:
+                # Use the first matching group that captured content
+                json_str = next(group for group in json_match.groups() if group)
+                evaluation = json.loads(json_str)
+            else:
+                # Fallback: try parsing the whole content as JSON
+                evaluation = json.loads(content)
+
+            evaluation["criteria"] = criteria
+        except (json.JSONDecodeError, AttributeError, StopIteration) as e:
+            evaluation = {
+                "passed": False,
+                "reason": f"Failed to evaluate due to parsing: {str(e)} \n Response: {content}",
+                "criteria": criteria,
+            }
+        evaluation["points"] = points
+        return EvaluationResult.model_validate(evaluation)
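
The base class keeps all prompt construction and JSON parsing in `llm_evaluate_with_criterion`, so a new evaluator only needs to decide what to feed it. A hypothetical subclass as a sketch of that extension point: `ToolUsageEvaluator`, its criterion string, and the sample tool log are invented for illustration and are not part of this commit.

```python
from typing import List

from surf_spot_finder.evaluation.evaluators.LLMEvaluator import LLMEvaluator
from surf_spot_finder.evaluation.evaluators.schemas import EvaluationResult


class ToolUsageEvaluator(LLMEvaluator):
    """Hypothetical evaluator: judges a raw tool-call log against one fixed criterion."""

    def evaluate(self, tool_log: str) -> List[EvaluationResult]:
        result = self.llm_evaluate_with_criterion(
            criteria="Check that every tool call returned without an error",
            points=1,
            evidence=tool_log,  # reuses the same evidence-based prompt branch
        )
        return [result]


# Usage mirrors the other evaluators: the judge model comes from the test case.
# evaluator = ToolUsageEvaluator(model="ollama/gemma3:4b-it-fp16")
# results = evaluator.evaluate(tool_log="get_area_lat_lon -> 43.37, -8.44 (ok)")
```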
src/surf_spot_finder/evaluation/evaluators/QuestionAnsweringSquadEvaluator.py ADDED
@@ -0,0 +1,41 @@
+from typing import List
+import evaluate
+from surf_spot_finder.evaluation.evaluators.schemas import EvaluationResult
+
+
+class QuestionAnsweringSquadEvaluator:
+    """Directly compares answers without using LLM-as-judge"""
+
+    def __init__(self):
+        self.metric = evaluate.load("squad")
+
+    def evaluate(
+        self, hypothesis_answer: str, ground_truth_answer: list
+    ) -> List[EvaluationResult]:
+        """Directly compare answers using simple matching"""
+
+        # format the answers so that they're dicts with 'id' and 'prediction' keys for hypo
+        # and the ref has id and answers keys
+        hypothesis_answer = [{"id": "1", "prediction_text": hypothesis_answer}]
+        ground_truth_answer = [
+            {
+                "id": "1",
+                "answers": {
+                    "answer_start": [0],
+                    "text": [ground_truth_answer[0]["value"]],
+                },
+            }
+        ]
+        # Use the SQuAD metric to compare answers
+        result = self.metric.compute(
+            predictions=hypothesis_answer, references=ground_truth_answer
+        )
+        print(result)
+
+        match = EvaluationResult(
+            passed=True if int(result["exact_match"]) == 1 else False,
+            reason=f"Partial Match (F1) score is {round(result['f1'], 2)}",
+            criteria="Is the answer a direct match?",
+            points=1,
+        )
+        return [match]
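
For context on the SQuAD metric used here: `evaluate.load("squad")` expects the prediction/reference dict shapes built above and reports `exact_match` and `f1` on a 0-100 scale. A standalone sketch of the inputs and output shape (requires the `evaluate` dependency added to pyproject.toml above; the answer strings are made up):

```python
import evaluate

squad = evaluate.load("squad")
predictions = [{"id": "1", "prediction_text": "Praia de Pantin"}]
references = [{"id": "1", "answers": {"answer_start": [0], "text": ["Praia de Pantin"]}}]

# Both scores are percentages, e.g. {'exact_match': 100.0, 'f1': 100.0}
result = squad.compute(predictions=predictions, references=references)
print(result)
```

Note that because the scores are percentages, an exact match on a single example comes back as 100.0 rather than 1.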
src/surf_spot_finder/evaluation/evaluators/__init__.py ADDED
@@ -0,0 +1,9 @@
+from .CheckpointEvaluator import CheckpointEvaluator
+from .QuestionAnsweringSquadEvaluator import QuestionAnsweringSquadEvaluator
+from .HypothesisEvaluator import HypothesisEvaluator
+
+__all__ = [
+    "CheckpointEvaluator",
+    "QuestionAnsweringSquadEvaluator",
+    "HypothesisEvaluator",
+]
src/surf_spot_finder/evaluation/evaluators/schemas.py ADDED
@@ -0,0 +1,11 @@
+from pydantic import BaseModel, ConfigDict
+
+
+class EvaluationResult(BaseModel):
+    """Represents the result of evaluating a criterion"""
+
+    model_config = ConfigDict(extra="forbid")
+    passed: bool
+    reason: str
+    criteria: str
+    points: int
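
A quick illustration of how this schema behaves with `extra="forbid"`: unknown keys raise a `ValidationError` rather than being silently dropped, so anything validated through `EvaluationResult.model_validate` must carry exactly these four fields. The field values below are made up.

```python
from pydantic import ValidationError

from surf_spot_finder.evaluation.evaluators.schemas import EvaluationResult

result = EvaluationResult.model_validate(
    {"passed": True, "reason": "criterion met", "criteria": "used the weather tool", "points": 1}
)

try:
    EvaluationResult.model_validate({**result.model_dump(), "score": 0.9})
except ValidationError as err:
    print(err)  # "score" is rejected because extra fields are forbidden
```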
src/surf_spot_finder/evaluation/test_case.py CHANGED
@@ -1,6 +1,7 @@
 from typing import Dict, List, Any
 from pydantic import BaseModel, Field, ConfigDict
 import yaml
+from litellm import validate_environment


 class InputModel(BaseModel):
@@ -26,6 +27,7 @@ class TestCase(BaseModel):
     input: InputModel
     ground_truth: List[Dict[str, Any]] = Field(default_factory=list)
     checkpoints: List[CheckpointCriteria] = Field(default_factory=list)
+    llm_judge: str
     final_answer_criteria: List[CheckpointCriteria] = Field(default_factory=list)
     test_case_path: str
     output_path: str = "output/results.json"
@@ -59,5 +61,6 @@
         ]

         test_case_dict["test_case_path"] = test_case_path
-
+        # verify that the llm_judge is a valid litellm model
+        validate_environment(test_case_dict["llm_judge"])
         return cls.model_validate(test_case_dict)
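
For reference, `litellm.validate_environment` takes a model string and reports which provider API keys are required and whether they are present in the environment; it does not call the model itself. A hedged sketch of what the check added above sees for the judge configured in alpha.yaml (the printed dict is an example of the expected shape, not captured output):

```python
from litellm import validate_environment

# Ollama models are served locally, so no provider API key should be reported missing.
check = validate_environment("ollama/gemma3:4b-it-fp16")
print(check)  # e.g. {'keys_in_environment': True, 'missing_keys': []}

# For a hosted judge such as "openai/gpt-4o", missing_keys would list
# OPENAI_API_KEY when it is not set in the environment.
```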
src/surf_spot_finder/evaluation/test_cases/alpha.yaml CHANGED
@@ -30,3 +30,6 @@ checkpoints:
     criteria: "Check if the agent used the get_area_lat_lon tool and it succeeded"
   - points: 1
     criteria: "Check if the final answer contains any description about the weather at the chosen location"
+
+
+llm_judge: "ollama/gemma3:4b-it-fp16"
src/surf_spot_finder/evaluation/utils.py DELETED
@@ -1,151 +0,0 @@
-import json
-from typing import Dict, List, Any, Optional
-import re
-
-from litellm import completion
-from textwrap import dedent
-
-from pydantic import BaseModel, ConfigDict
-from surf_spot_finder.evaluation.telemetry import TelemetryProcessor
-from surf_spot_finder.evaluation.test_case import CheckpointCriteria
-
-
-class EvaluationResult(BaseModel):
-    """Represents the result of evaluating a criterion"""
-
-    model_config = ConfigDict(extra="forbid")
-    passed: bool
-    reason: str
-    criteria: str
-    points: int
-
-
-def evaluate_criterion(
-    criteria: str,
-    model: str,
-    points: int,
-    ground_truth_output: Optional[List[CheckpointCriteria] | Dict[str, Any]] = None,
-    hypothesis_final_answer: Optional[str] = None,
-    evidence: Optional[str] = None,
-) -> EvaluationResult:
-    """Evaluate a single criterion using LLM"""
-
-    prompt = dedent(f"""
-    Evaluate if the following criterion was met {"based on the provided evidence" if evidence else "in the agent's answer"}.
-
-    Criterion: {criteria}
-    """)
-
-    if ground_truth_output:
-        prompt += dedent(f"""
-        Expected output: {json.dumps(ground_truth_output)}
-        """)
-    if hypothesis_final_answer:
-        prompt += dedent(f"""
-        Agent's answer: {hypothesis_final_answer}
-        """)
-
-    if evidence:
-        prompt += dedent(f"""
-        Telemetry evidence:
-        {evidence}
-        """)
-
-    prompt += f"""
-
-    Based on the {"evidence" if evidence else "comparison between the expected output and the actual final answer"},
-    was this criterion satisfied? Answer with:
-    1. "passed": true or false
-    2. "reason": Brief explanation for your decision
-    """
-    prompt += """
-    Output valid JSON with these three fields only, in the format:
-    ```json
-    {
-    "passed": true,
-    "reason": "I have them"
-    }
-    ```
-    """
-
-    response = completion(model=model, messages=[{"role": "user", "content": prompt}])
-
-    content = response.choices[0].message.content
-    try:
-        # Extract JSON from the response - looks for patterns like ```json {...} ``` or just {...}
-        # Claude helped me with this one, regex is hard
-        json_match = re.search(
-            r"```(?:json)?\s*(\{.*?\})\s*```|(\{.*?\})", content, re.DOTALL
-        )
-
-        if json_match:
-            # Use the first matching group that captured content
-            json_str = next(group for group in json_match.groups() if group)
-            evaluation = json.loads(json_str)
-        else:
-            # Fallback: try parsing the whole content as JSON
-            evaluation = json.loads(content)
-
-        evaluation["criteria"] = criteria
-    except (json.JSONDecodeError, AttributeError, StopIteration) as e:
-        evaluation = {
-            "passed": False,
-            "reason": f"Failed to evaluate due to parsing: {str(e)} \n Response: {content}",
-            "criteria": criteria,
-        }
-    evaluation["points"] = points
-    return EvaluationResult.model_validate(evaluation)
-
-
-def verify_checkpoints(
-    telemetry: List[Dict[str, Any]],
-    checkpoints: List[CheckpointCriteria],
-    model: str,
-    processor: TelemetryProcessor,
-) -> List[EvaluationResult]:
-    """Verify each checkpoint against the telemetry data using LLM
-    These checkpoints do not take the ground truth or hyupothesis
-    answers into account. They are only concerned with the trace and
-    the specific criteria mentioned.
-    """
-    results = []
-    evidence = processor.extract_evidence(telemetry)
-    print(evidence)
-    for checkpoint in checkpoints:
-        criteria = checkpoint.criteria
-
-        evaluation = evaluate_criterion(
-            criteria=criteria,
-            points=checkpoint.points,
-            model=model,
-            evidence=evidence,
-        )
-
-        results.append(evaluation)
-
-    return results
-
-
-def verify_hypothesis_answer(
-    hypothesis_final_answer: str,
-    ground_truth_answer_dict: Dict[str, Any],
-    ground_truth_checkpoints: List[CheckpointCriteria],
-    model: str,
-) -> List[EvaluationResult]:
-    """
-    Verify if the final answer meets all specified criteria
-    """
-    results = []
-
-    for criterion in ground_truth_checkpoints:
-        evaluation = evaluate_criterion(
-            criteria=criterion.criteria,
-            points=criterion.points,
-            ground_truth_output=ground_truth_answer_dict,
-            hypothesis_final_answer=hypothesis_final_answer,
-            model=model,
-        )
-
-        results.append(evaluation)
-
-    return results