chat bot evaluation main entrypoint
src/know_lang_bot/evaluation/chatbot_evaluation.py
CHANGED
@@ -4,7 +4,7 @@ from pydantic import BaseModel, Field
 from pydantic_ai import Agent
 from know_lang_bot.config import AppConfig
 from know_lang_bot.utils.model_provider import create_pydantic_model
-from know_lang_bot.chat_bot.chat_graph import ChatResult
+from know_lang_bot.chat_bot.chat_graph import ChatResult, process_chat
 import json
 import asyncio
 
@@ -30,9 +30,9 @@ class EvalResult(BaseModel):
     polished_question: Optional[str] = None
 
 class EvalAgentResponse(BaseModel):
-    chunk_relevance: float
-    answer_correctness: float
-    code_reference: float
+    chunk_relevance: float = Field(ge=0.0, le=10.0, description="Score for chunk relevance")
+    answer_correctness: float = Field(ge=0.0, le=10.0, description="Score for answer correctness")
+    code_reference: float = Field(ge=0.0, le=10.0, description="Score for code reference quality")
     feedback: str
 
 
@@ -46,8 +46,8 @@ class ChatBotEvaluator:
         self.config = config
         self.eval_agent = Agent(
             create_pydantic_model(
-                model_provider=config.
-                model_name=config.
+                model_provider=config.evaluator.model_provider,
+                model_name=config.evaluator.model_name
             ),
             system_prompt=self._build_eval_prompt(),
             result_type=EvalAgentResponse
@@ -68,7 +68,15 @@ Evaluate the response based on these specific criteria:
 3. Code Reference Quality (0-1):
    - Does it properly cite specific code locations?
    - Are code references clear and relevant?
-"""
+
+Format your response as JSON:
+{
+    "chunk_relevance": score (from 0.0 to 10.0),
+    "answer_correctness": score (from 0.0 to 10.0),
+    "code_reference": score (from 0.0 to 10.0),
+    "feedback": "Brief explanation of scores"
+}
+"""
 
     async def evaluate_single(
         self,
@@ -84,7 +92,7 @@ Evaluate the response based on these specific criteria:
 
         # Get evaluation from the model
         result = await self.eval_agent.run(
-
+            eval_context.model_dump_json(),
         )
         metrics = result.data
 
@@ -326,11 +334,31 @@ TRANSFORMER_TRAINER_TEST_CASES = [
     )
 ]
 
-TRANSFORMER_TEST_CASES = [
+TRANSFORMER_TEST_CASES : List[EvalCase] = [
     *TRANSFORMER_QUANTIZER_BASE_CASES,
     *TRANSFORMER_QUANTIZER_AUTO_CASES,
     *TRANSFORMER_PIPELINE_BASE_TEST_CASES,
     *TRANSFORMER_PIPELINE_TEXT_GENERATION_TEST_CASES,
     *TRANSFORMER_LOGITS_PROCESSOR_TEST_CASES,
     *TRANSFORMER_TRAINER_TEST_CASES,
-]
+]
+
+
+async def main():
+    from rich.console import Console
+    from rich.pretty import Pretty
+    import chromadb
+    console = Console()
+    config = AppConfig()
+    evaluator = ChatBotEvaluator(config)
+    collection = chromadb.PersistentClient(path=str(config.db.persist_directory)).get_collection(name=config.db.collection_name)
+
+    for case in TRANSFORMER_TEST_CASES:
+        chat_result = await process_chat(question=case.question, collection=collection, config=config)
+        result = await evaluator.evaluate_single(case, chat_result)
+        console.print(Pretty(result.model_dump()))
+
+        break
+
+if __name__ == "__main__":
+    asyncio.run(main())