gabykim committed on
Commit 1e72ff4 · 1 Parent(s): eede10d

multiple evaluation rounds for each test case

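For orientation, a minimal hedged sketch of how the reworked evaluate_single (see the first diff below) is meant to be driven after this change; the evaluator, case, and chat_result names are placeholders from the surrounding codebase, not defined in this commit:

# Hypothetical driver (placeholder names, not part of this commit): each test
# case is now scored num_rounds times; the per-round responses are kept as
# EvalRound entries and averaged by the aggregated_scores computed field.
async def run_case(evaluator, case, chat_result):
    result = await evaluator.evaluate_single(case, chat_result, num_rounds=3)
    assert len(result.eval_rounds) == 3
    return result.aggregated_scores  # mean chunk_relevance / answer_correctness / code_reference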
src/know_lang_bot/evaluation/chatbot_evaluation.py CHANGED
@@ -1,4 +1,4 @@
-from typing import List, Dict, Optional
+from typing import List
 from enum import Enum
 from pydantic import BaseModel, Field, computed_field
 from pydantic_ai import Agent
@@ -45,11 +45,40 @@ class EvalAgentResponse(MetricScores):
     """Raw response from evaluation agent"""
     feedback: str
 
+class EvalRound(BaseModel):
+    """Single evaluation round results"""
+    round_id: int
+    eval_response: EvalAgentResponse
+    timestamp: datetime.datetime
+
 class EvalResult(BaseModel):
-    """Evaluation result with scores and feedback"""
+    """Extended evaluation result with multiple rounds"""
     evaluator_model: str
     case: EvalCase
-    eval_response: EvalAgentResponse
+    eval_rounds: List[EvalRound]
+
+    @computed_field
+    def aggregated_scores(self) -> MetricScores:
+        """Calculate mean scores across rounds"""
+        chunk_relevance = EvalMetric.CHUNK_RELEVANCE.value
+        answer_correctness = EvalMetric.ANSWER_CORRECTNESS.value
+        code_reference = EvalMetric.CODE_REFERENCE.value
+
+        scores = {
+            chunk_relevance: [],
+            answer_correctness: [],
+            code_reference: []
+        }
+
+        for round in self.eval_rounds:
+            for metric in scores.keys():
+                scores[metric].append(getattr(round.eval_response, metric))
+
+        return MetricScores(
+            chunk_relevance=sum(scores[chunk_relevance]) / len(self.eval_rounds),
+            answer_correctness=sum(scores[answer_correctness]) / len(self.eval_rounds),
+            code_reference=sum(scores[code_reference]) / len(self.eval_rounds)
+        )
 
 class ChatBotEvaluationContext(EvalCase, ChatResult):
     pass
@@ -100,44 +129,38 @@ Format your response as JSON:
     async def evaluate_single(
         self,
         case: EvalCase,
-        chat_result: ChatResult
+        chat_result: ChatResult,
+        num_rounds: int = 1,
     ) -> EvalResult:
-        """Evaluate a single case"""
+        """Evaluate a single case for multiple rounds"""
+        eval_rounds = []
         # Prepare evaluation context
         eval_context = ChatBotEvaluationContext(
            **case.model_dump(),
            **chat_result.model_dump()
         )
 
-        # Get evaluation from the model
-        result = await self.eval_agent.run(
-            eval_context.model_dump_json(),
-        )
-        eval_response : EvalAgentResponse = result.data
+        for round_id in range(num_rounds):
+            # Get evaluation from the model
+            result = await self.eval_agent.run(
+                eval_context.model_dump_json(),
+            )
+
+            eval_rounds.append(EvalRound(
+                round_id=round_id,
+                eval_response=result.data,
+                timestamp=datetime.datetime.now()
+            ))
+
+            # Add delay between rounds to avoid rate limits
+            await asyncio.sleep(2)
 
         return EvalResult(
             case=case,
-            eval_response=eval_response,
+            eval_rounds=eval_rounds,
             evaluator_model=f"{self.config.evaluator.model_provider}:{self.config.evaluator.model_name}"
         )
 
-    async def evaluate_batch(
-        self,
-        cases: List[EvalCase],
-        process_chat_func,
-        max_concurrent: int = 2
-    ) -> List[EvalResult]:
-        """Run evaluation on multiple cases with concurrency control"""
-        semaphore = asyncio.Semaphore(max_concurrent)
-
-        async def eval_single_with_limit(case: EvalCase) -> EvalResult:
-            async with semaphore:
-                chat_result = await process_chat_func(case.question)
-                return await self.evaluate_single(case, chat_result)
-
-        return await asyncio.gather(
-            *[eval_single_with_limit(case) for case in cases]
-        )
 
 # src/transformers/quantizers/base.py
 TRANSFORMER_QUANTIZER_BASE_CASES = [
@@ -374,7 +397,7 @@ async def main():
             summary_list.append(eval_summary)
 
             import time
-            time.sleep(5) # Sleep for 5 seconds to avoid rate limiting
+            time.sleep(3) # Sleep for 3 seconds to avoid rate limiting
 
         except Exception:
             console.print_exception()
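As a sanity check on the averaging introduced above, here is a small self-contained sketch that mirrors what the new aggregated_scores computed field does; the stripped-down RoundScore / MultiRoundResult models are stand-ins for illustration, not the repo's EvalAgentResponse / EvalResult classes:

# Standalone sketch (stand-in models, not repo code): average per-metric
# scores across evaluation rounds, the same way aggregated_scores does.
from typing import List
from pydantic import BaseModel, computed_field

class RoundScore(BaseModel):            # stand-in for EvalAgentResponse
    chunk_relevance: float
    answer_correctness: float
    code_reference: float

class MultiRoundResult(BaseModel):      # stand-in for EvalResult
    rounds: List[RoundScore]

    @computed_field
    @property
    def mean_chunk_relevance(self) -> float:
        # sum of per-round scores divided by the number of rounds
        return sum(r.chunk_relevance for r in self.rounds) / len(self.rounds)

result = MultiRoundResult(rounds=[
    RoundScore(chunk_relevance=8.0, answer_correctness=7.0, code_reference=9.0),
    RoundScore(chunk_relevance=6.0, answer_correctness=9.0, code_reference=7.0),
])
print(result.mean_chunk_relevance)  # 7.0, the mean of 8.0 and 6.0

The mean alone hides per-round variance; keeping the raw EvalRound list on EvalResult is what lets the visualizer below report spread across rounds.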
src/know_lang_bot/evaluation/chatbot_evaluation_visualize.py CHANGED
@@ -11,6 +11,7 @@ class RetrievalMethod(str, Enum):
     EMBEDDING = "embedding"
     EMBEDDING_RERANKING = "embedding_reranking"
     EMBEDDING_WITH_CODE = "embedding_with_code"
+    OPENAI_EMBEDDING_WITH_CODE = "openai_embedding_with_code"
     EMBEDDING_RERANKING_WITH_CODE = "embedding_reranking_with_code"
 
 class ResultAnalyzer:
@@ -21,9 +22,10 @@ class ResultAnalyzer:
         # Map each method to its directory
         self.method_dirs = {
             RetrievalMethod.EMBEDDING: self.base_dir / RetrievalMethod.EMBEDDING.value,
-            RetrievalMethod.EMBEDDING_RERANKING: self.base_dir / RetrievalMethod.EMBEDDING_RERANKING.value,
+            # RetrievalMethod.EMBEDDING_RERANKING: self.base_dir / RetrievalMethod.EMBEDDING_RERANKING.value,
             RetrievalMethod.EMBEDDING_WITH_CODE: self.base_dir / RetrievalMethod.EMBEDDING_WITH_CODE.value,
-            RetrievalMethod.EMBEDDING_RERANKING_WITH_CODE: self.base_dir / RetrievalMethod.EMBEDDING_RERANKING_WITH_CODE.value,
+            RetrievalMethod.OPENAI_EMBEDDING_WITH_CODE: self.base_dir / RetrievalMethod.OPENAI_EMBEDDING_WITH_CODE.value,
+            # RetrievalMethod.EMBEDDING_RERANKING_WITH_CODE: self.base_dir / RetrievalMethod.EMBEDDING_RERANKING_WITH_CODE.value,
         }
 
     def load_results(self, file_path: Path) -> List[EvalSummary]:
@@ -36,17 +38,26 @@ class ResultAnalyzer:
         """Convert results to pandas DataFrame with flattened metrics"""
         rows = []
         for result in results:
-            row = {
+            # Basic metrics
+            base_row = {
                 "evaluator_model": result.evaluator_model,
                 "question": result.case.question,
                 "difficulty": result.case.difficulty,
-                "chunk_relevance": result.eval_response.chunk_relevance,
-                "answer_correctness": result.eval_response.answer_correctness,
-                "code_reference": result.eval_response.code_reference,
-                "weighted_total": result.eval_response.weighted_total,
-                "environment": getattr(result.case, 'environment', 'default') # Added environment
+                "environment": getattr(result.case, 'environment', 'default')
             }
-            rows.append(row)
+
+            # Add metrics for each round
+            for round in result.eval_rounds:
+                row = base_row.copy()
+                row.update({
+                    "round_id": round.round_id,
+                    "chunk_relevance": round.eval_response.chunk_relevance,
+                    "answer_correctness": round.eval_response.answer_correctness,
+                    "code_reference": round.eval_response.code_reference,
+                    "weighted_total": round.eval_response.weighted_total,
+                    "timestamp": round.timestamp
+                })
+                rows.append(row)
 
         return pd.DataFrame(rows)
@@ -71,8 +82,17 @@ class ResultAnalyzer:
         return f"{improvement:+.1f}%" if improvement else "0%"
 
     def get_stats_by_group(self, df: pd.DataFrame, group_by: str) -> pd.DataFrame:
-        """Calculate statistics grouped by specified column"""
-        return df.groupby(group_by).agg({
+        """Calculate statistics with round variance"""
+        # First get mean per question/round
+        question_means = df.groupby([group_by, "question"]).agg({
+            "chunk_relevance": "mean",
+            "answer_correctness": "mean",
+            "code_reference": "mean",
+            "weighted_total": "mean"
+        })
+
+        # Then get mean and std across questions
+        return question_means.groupby(level=0).agg({
             "chunk_relevance": ["mean", "std"],
             "answer_correctness": ["mean", "std"],
             "code_reference": ["mean", "std"],