gabykim committed
Commit 56615c6 · 1 Parent(s): c436c2a

chat bot evaluation main entrypoint

src/know_lang_bot/evaluation/chatbot_evaluation.py CHANGED
@@ -4,7 +4,7 @@ from pydantic import BaseModel, Field
 from pydantic_ai import Agent
 from know_lang_bot.config import AppConfig
 from know_lang_bot.utils.model_provider import create_pydantic_model
-from know_lang_bot.chat_bot.chat_graph import ChatResult
+from know_lang_bot.chat_bot.chat_graph import ChatResult, process_chat
 import json
 import asyncio
 
@@ -30,9 +30,9 @@ class EvalResult(BaseModel):
     polished_question: Optional[str] = None
 
 class EvalAgentResponse(BaseModel):
-    chunk_relevance: float
-    answer_correctness: float
-    code_reference: float
+    chunk_relevance: float = Field(ge=0.0, le=10.0, description="Score for chunk relevance")
+    answer_correctness: float = Field(ge=0.0, le=10.0, description="Score for answer correctness")
+    code_reference: float = Field(ge=0.0, le=10.0, description="Score for code reference quality")
     feedback: str
 
 
@@ -46,8 +46,8 @@ class ChatBotEvaluator:
         self.config = config
         self.eval_agent = Agent(
             create_pydantic_model(
-                model_provider=config.llm.model_provider,
-                model_name=config.llm.model_name
+                model_provider=config.evaluator.model_provider,
+                model_name=config.evaluator.model_name
             ),
             system_prompt=self._build_eval_prompt(),
             result_type=EvalAgentResponse
@@ -68,7 +68,15 @@ Evaluate the response based on these specific criteria:
 3. Code Reference Quality (0-1):
    - Does it properly cite specific code locations?
    - Are code references clear and relevant?
-}"""
+
+Format your response as JSON:
+{
+    "chunk_relevance": score (from 0.0 to 10.0),
+    "answer_correctness": score (from 0.0 to 10.0),
+    "code_reference": score (from 0.0 to 10.0),
+    "feedback": "Brief explanation of scores"
+}
+"""
 
     async def evaluate_single(
         self,
@@ -84,7 +92,7 @@ Evaluate the response based on these specific criteria:
 
         # Get evaluation from the model
         result = await self.eval_agent.run(
-            json.dumps(eval_context),
+            eval_context.model_dump_json(),
         )
         metrics = result.data
 
@@ -326,11 +334,31 @@ TRANSFORMER_TRAINER_TEST_CASES = [
     )
 ]
 
-TRANSFORMER_TEST_CASES = [
+TRANSFORMER_TEST_CASES : List[EvalCase] = [
     *TRANSFORMER_QUANTIZER_BASE_CASES,
     *TRANSFORMER_QUANTIZER_AUTO_CASES,
     *TRANSFORMER_PIPELINE_BASE_TEST_CASES,
     *TRANSFORMER_PIPELINE_TEXT_GENERATION_TEST_CASES,
     *TRANSFORMER_LOGITS_PROCESSOR_TEST_CASES,
     *TRANSFORMER_TRAINER_TEST_CASES,
 ]
+
+
+async def main():
+    from rich.console import Console
+    from rich.pretty import Pretty
+    import chromadb
+    console = Console()
+    config = AppConfig()
+    evaluator = ChatBotEvaluator(config)
+    collection = chromadb.PersistentClient(path=str(config.db.persist_directory)).get_collection(name=config.db.collection_name)
+
+    for case in TRANSFORMER_TEST_CASES:
+        chat_result = await process_chat(question=case.question, collection=collection, config=config)
+        result = await evaluator.evaluate_single(case, chat_result)
+        console.print(Pretty(result.model_dump()))
+
+        break
+
+if __name__ == "__main__":
+    asyncio.run(main())
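
To sanity-check the new 0-10 score bounds without running the full pipeline, here is a minimal standalone sketch (not part of the commit): the EvalAgentResponse body is copied from the diff above, and the sample scores are made up for illustration. Assuming the package is installed from the src layout, the new entrypoint itself would run as `python -m know_lang_bot.evaluation.chatbot_evaluation`.

# Standalone sketch (not part of the commit): exercises the Field(ge=0.0, le=10.0)
# bounds added to EvalAgentResponse in this diff. Sample values are hypothetical.
from pydantic import BaseModel, Field, ValidationError

class EvalAgentResponse(BaseModel):
    chunk_relevance: float = Field(ge=0.0, le=10.0, description="Score for chunk relevance")
    answer_correctness: float = Field(ge=0.0, le=10.0, description="Score for answer correctness")
    code_reference: float = Field(ge=0.0, le=10.0, description="Score for code reference quality")
    feedback: str

# In-range scores validate and serialize the same way evaluate_single now
# serializes its eval_context, via model_dump_json().
ok = EvalAgentResponse(
    chunk_relevance=8.5, answer_correctness=9.0, code_reference=7.0,
    feedback="Cites the right chunks and file paths",
)
print(ok.model_dump_json())

# An out-of-range score now fails fast with a ValidationError instead of
# silently skewing aggregate results.
try:
    EvalAgentResponse(
        chunk_relevance=0.9, answer_correctness=11.0, code_reference=5.0,
        feedback="out-of-range example",
    )
except ValidationError as exc:
    print(f"{exc.error_count()} validation error(s)")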