multiple evaluation rounds for each test case
src/know_lang_bot/evaluation/chatbot_evaluation.py
CHANGED
@@ -1,4 +1,4 @@
-from typing import List
+from typing import List
 from enum import Enum
 from pydantic import BaseModel, Field, computed_field
 from pydantic_ai import Agent
@@ -45,11 +45,40 @@ class EvalAgentResponse(MetricScores):
     """Raw response from evaluation agent"""
     feedback: str
 
+class EvalRound(BaseModel):
+    """Single evaluation round results"""
+    round_id: int
+    eval_response: EvalAgentResponse
+    timestamp: datetime.datetime
+
 class EvalResult(BaseModel):
-    """
+    """Extended evaluation result with multiple rounds"""
     evaluator_model: str
     case: EvalCase
-    eval_response: EvalAgentResponse
+    eval_rounds: List[EvalRound]
+
+    @computed_field
+    def aggregated_scores(self) -> MetricScores:
+        """Calculate mean scores across rounds"""
+        chunk_relevance = EvalMetric.CHUNK_RELEVANCE.value
+        answer_correctness = EvalMetric.ANSWER_CORRECTNESS.value
+        code_reference = EvalMetric.CODE_REFERENCE.value
+
+        scores = {
+            chunk_relevance: [],
+            answer_correctness: [],
+            code_reference: []
+        }
+
+        for round in self.eval_rounds:
+            for metric in scores.keys():
+                scores[metric].append(getattr(round.eval_response, metric))
+
+        return MetricScores(
+            chunk_relevance=sum(scores[chunk_relevance]) / len(self.eval_rounds),
+            answer_correctness=sum(scores[answer_correctness]) / len(self.eval_rounds),
+            code_reference=sum(scores[code_reference]) / len(self.eval_rounds)
+        )
 
 class ChatBotEvaluationContext(EvalCase, ChatResult):
     pass
@@ -100,44 +129,38 @@ Format your response as JSON:
     async def evaluate_single(
         self,
         case: EvalCase,
-        chat_result: ChatResult
+        chat_result: ChatResult,
+        num_rounds: int = 1,
     ) -> EvalResult:
-        """Evaluate a single case"""
+        """Evaluate a single case for multiple rounds"""
+        eval_rounds = []
         # Prepare evaluation context
         eval_context = ChatBotEvaluationContext(
             **case.model_dump(),
             **chat_result.model_dump()
         )
 
-        # Get evaluation from the model
-        result = await self.eval_agent.run(
-            eval_context.model_dump_json(),
-        )
-
+        for round_id in range(num_rounds):
+            # Get evaluation from the model
+            result = await self.eval_agent.run(
+                eval_context.model_dump_json(),
+            )
+
+            eval_rounds.append(EvalRound(
+                round_id=round_id,
+                eval_response=result.data,
+                timestamp=datetime.datetime.now()
+            ))
+
+            # Add delay between rounds to avoid rate limits
+            await asyncio.sleep(2)
 
         return EvalResult(
            case=case,
-            eval_response=result.data,
+            eval_rounds=eval_rounds,
            evaluator_model=f"{self.config.evaluator.model_provider}:{self.config.evaluator.model_name}"
        )
 
-    async def evaluate_batch(
-        self,
-        cases: List[EvalCase],
-        process_chat_func,
-        max_concurrent: int = 2
-    ) -> List[EvalResult]:
-        """Run evaluation on multiple cases with concurrency control"""
-        semaphore = asyncio.Semaphore(max_concurrent)
-
-        async def eval_single_with_limit(case: EvalCase) -> EvalResult:
-            async with semaphore:
-                chat_result = await process_chat_func(case.question)
-                return await self.evaluate_single(case, chat_result)
-
-        return await asyncio.gather(
-            *[eval_single_with_limit(case) for case in cases]
-        )
 
 # src/transformers/quantizers/base.py
 TRANSFORMER_QUANTIZER_BASE_CASES = [
@@ -374,7 +397,7 @@ async def main():
             summary_list.append(eval_summary)
 
             import time
-            time.sleep(
+            time.sleep(3)  # Sleep to avoid rate limiting
 
     except Exception:
         console.print_exception()
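With evaluate_batch (and its semaphore-based concurrency) removed by this commit, callers drive the loop themselves. A hypothetical sequential driver, where evaluator, process_chat, and cases stand in for the project's real objects, might look like:

# Hypothetical driver loop after this change; `evaluator`, `process_chat`, and
# `cases` are placeholders for the project's real objects, not names from the diff.
import asyncio

async def run_evaluation(evaluator, process_chat, cases):
    results = []
    for case in cases:  # sequential on purpose, to stay under rate limits
        chat_result = await process_chat(case.question)
        result = await evaluator.evaluate_single(case, chat_result, num_rounds=3)
        results.append(result)
        print(case.question, result.aggregated_scores)  # mean scores over the 3 rounds
    return results

# asyncio.run(run_evaluation(evaluator, process_chat, cases))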
src/know_lang_bot/evaluation/chatbot_evaluation_visualize.py
CHANGED
@@ -11,6 +11,7 @@ class RetrievalMethod(str, Enum):
     EMBEDDING = "embedding"
     EMBEDDING_RERANKING = "embedding_reranking"
     EMBEDDING_WITH_CODE = "embedding_with_code"
+    OPENAI_EMBEDDING_WITH_CODE = "openai_embedding_with_code"
     EMBEDDING_RERANKING_WITH_CODE = "embedding_reranking_with_code"
 
 class ResultAnalyzer:
@@ -21,9 +22,10 @@ class ResultAnalyzer:
         # Map each method to its directory
         self.method_dirs = {
             RetrievalMethod.EMBEDDING: self.base_dir / RetrievalMethod.EMBEDDING.value,
-            RetrievalMethod.EMBEDDING_RERANKING: self.base_dir / RetrievalMethod.EMBEDDING_RERANKING.value,
+            # RetrievalMethod.EMBEDDING_RERANKING: self.base_dir / RetrievalMethod.EMBEDDING_RERANKING.value,
             RetrievalMethod.EMBEDDING_WITH_CODE: self.base_dir / RetrievalMethod.EMBEDDING_WITH_CODE.value,
-            RetrievalMethod.EMBEDDING_RERANKING_WITH_CODE: self.base_dir / RetrievalMethod.EMBEDDING_RERANKING_WITH_CODE.value,
+            RetrievalMethod.OPENAI_EMBEDDING_WITH_CODE: self.base_dir / RetrievalMethod.OPENAI_EMBEDDING_WITH_CODE.value,
+            # RetrievalMethod.EMBEDDING_RERANKING_WITH_CODE: self.base_dir / RetrievalMethod.EMBEDDING_RERANKING_WITH_CODE.value,
         }
 
     def load_results(self, file_path: Path) -> List[EvalSummary]:
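These hunks only change which result directories get scanned; how the mapping is consumed is not shown here. A hypothetical walk over it (the import path, the "*.json" glob, and the constructor call are assumptions for illustration, not taken from the diff) would be roughly:

# Hypothetical consumption of method_dirs; import path, "*.json" glob, and the
# constructor call are assumptions, not taken from the diff.
from pathlib import Path
from know_lang_bot.evaluation.chatbot_evaluation_visualize import ResultAnalyzer  # path assumed from file layout

analyzer = ResultAnalyzer(base_dir=Path("evaluations"))  # constructor signature assumed

for method, directory in analyzer.method_dirs.items():
    if not directory.exists():  # skip methods with no results on disk
        continue
    for result_file in sorted(directory.glob("*.json")):
        results = analyzer.load_results(result_file)
        print(method.value, result_file.name, len(results))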
@@ -36,17 +38,26 @@ class ResultAnalyzer:
         """Convert results to pandas DataFrame with flattened metrics"""
         rows = []
         for result in results:
-            row = {
+            # Basic metrics
+            base_row = {
                 "evaluator_model": result.evaluator_model,
                 "question": result.case.question,
                 "difficulty": result.case.difficulty,
-                "chunk_relevance": result.eval_response.chunk_relevance,
-                "answer_correctness": result.eval_response.answer_correctness,
-                "code_reference": result.eval_response.code_reference,
-                "weighted_total": result.eval_response.weighted_total,
-                "environment": getattr(result.case, 'environment', 'default') # Added environment
+                "environment": getattr(result.case, 'environment', 'default')
             }
-            rows.append(row)
+
+            # Add metrics for each round
+            for round in result.eval_rounds:
+                row = base_row.copy()
+                row.update({
+                    "round_id": round.round_id,
+                    "chunk_relevance": round.eval_response.chunk_relevance,
+                    "answer_correctness": round.eval_response.answer_correctness,
+                    "code_reference": round.eval_response.code_reference,
+                    "weighted_total": round.eval_response.weighted_total,
+                    "timestamp": round.timestamp
+                })
+                rows.append(row)
 
         return pd.DataFrame(rows)
 
@@ -71,8 +82,17 @@ class ResultAnalyzer:
         return f"{improvement:+.1f}%" if improvement else "0%"
 
     def get_stats_by_group(self, df: pd.DataFrame, group_by: str) -> pd.DataFrame:
-        """Calculate statistics
-        return df.groupby(group_by).agg({
+        """Calculate statistics with round variance"""
+        # First get mean per question/round
+        question_means = df.groupby([group_by, "question"]).agg({
+            "chunk_relevance": "mean",
+            "answer_correctness": "mean",
+            "code_reference": "mean",
+            "weighted_total": "mean"
+        })
+
+        # Then get mean and std across questions
+        return question_means.groupby(level=0).agg({
             "chunk_relevance": ["mean", "std"],
             "answer_correctness": ["mean", "std"],
             "code_reference": ["mean", "std"],