gabykim committed
Commit 6c57077 · Parent: eb592fa

fix pydantic error

src/know_lang_bot/evaluation/chatbot_evaluation.py CHANGED
@@ -70,9 +70,9 @@ Evaluate the response based on these specific criteria:
 
 Format your response as JSON:
 {
-    "chunk_relevance": score (from 0.0 to 10.0),
-    "answer_correctness": score (from 0.0 to 10.0),
-    "code_reference": score (from 0.0 to 10.0),
+    "chunk_relevance": float type score (from 0.0f to 10.0f),
+    "answer_correctness": float type score (from 0.0f to 10.0f),
+    "code_reference": float type score (from 0.0f to 10.0f),
     "feedback": "Brief explanation of scores"
 }
 """
@@ -93,7 +93,12 @@ Format your response as JSON:
         result = await self.eval_agent.run(
             eval_context.model_dump_json(),
         )
-        metrics = result.data
+        eval_response : EvalAgentResponse = result.data
+        metrics = {
+            EvalMetric.CHUNK_RELEVANCE: eval_response.chunk_relevance,
+            EvalMetric.ANSWER_CORRECTNESS: eval_response.answer_correctness,
+            EvalMetric.CODE_REFERENCE: eval_response.code_reference
+        }
 
         # Calculate weighted score
         weights = {
@@ -103,7 +108,7 @@ Format your response as JSON:
         }
 
         total_score = sum(
-            metrics.model_dump()[metric] * weights[metric] * case.difficulty
+            metrics[metric] * weights[metric] * case.difficulty
             for metric in EvalMetric
         )
 
@@ -111,7 +116,7 @@ Format your response as JSON:
             case=case,
             metrics=metrics,
             total_score=total_score,
-            feedback=metrics.feedback,
+            feedback=eval_response.feedback,
         )
 
     async def evaluate_batch(
@@ -346,22 +351,36 @@ TRANSFORMER_TEST_CASES : List[EvalCase] = [
 async def main():
     from rich.console import Console
     from rich.pretty import Pretty
+    import json
     import chromadb
     console = Console()
     config = AppConfig()
     evaluator = ChatBotEvaluator(config)
     collection = chromadb.PersistentClient(path=str(config.db.persist_directory)).get_collection(name=config.db.collection_name)
 
+    final_results = []
+
     for case in TRANSFORMER_TEST_CASES:
         try:
             chat_result = await process_chat(question=case.question, collection=collection, config=config)
             result = await evaluator.evaluate_single(case, chat_result)
-            console.print(Pretty(result.model_dump()))
+
+            # Aggregate chat_result and result into a single JSON object
+            aggregated_result = {
+                "question": case.question,
+                "chat_result": chat_result.model_dump(),
+                "evaluation_result": result.model_dump()
+            }
+            final_results.append(aggregated_result)
 
-        except Exception as e:
+        except Exception:
             console.print_exception()
-            break
+
+    # Write the final JSON array to a file
+    with open("evaluation_results.json", "w") as f:
+        json.dump(final_results, f, indent=2)
 
+    console.print(Pretty(final_results))
 
 if __name__ == "__main__":
     asyncio.run(main())
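
For context on the fix: the old code indexed the agent's pydantic result with metrics.model_dump()[metric], while the new code copies the response fields into a plain dict keyed by EvalMetric before computing the weighted score. The sketch below is not from the repository; it only illustrates the shape this change assumes. The EvalAgentResponse fields and EvalMetric members are taken from the diff, but the base classes, weight values, and difficulty factor are placeholder assumptions.

# Minimal sketch (not from the repository) of the pieces this commit assumes.
# EvalAgentResponse fields and EvalMetric members mirror the diff; the base
# classes, weight values, and difficulty factor are illustrative placeholders.
from enum import Enum
from pydantic import BaseModel

class EvalMetric(str, Enum):
    CHUNK_RELEVANCE = "chunk_relevance"
    ANSWER_CORRECTNESS = "answer_correctness"
    CODE_REFERENCE = "code_reference"

class EvalAgentResponse(BaseModel):
    chunk_relevance: float
    answer_correctness: float
    code_reference: float
    feedback: str

def weighted_total(eval_response: EvalAgentResponse, difficulty: float = 1.0) -> float:
    # As in the commit: copy the response fields into a dict keyed by EvalMetric,
    # then take the weighted sum over all metrics.
    metrics = {
        EvalMetric.CHUNK_RELEVANCE: eval_response.chunk_relevance,
        EvalMetric.ANSWER_CORRECTNESS: eval_response.answer_correctness,
        EvalMetric.CODE_REFERENCE: eval_response.code_reference,
    }
    weights = {  # placeholder values; the real weights live in the evaluator
        EvalMetric.CHUNK_RELEVANCE: 0.3,
        EvalMetric.ANSWER_CORRECTNESS: 0.5,
        EvalMetric.CODE_REFERENCE: 0.2,
    }
    return sum(metrics[m] * weights[m] * difficulty for m in EvalMetric)

if __name__ == "__main__":
    response = EvalAgentResponse(
        chunk_relevance=8.0,
        answer_correctness=9.0,
        code_reference=7.0,
        feedback="Relevant chunks, accurate answer, good code references.",
    )
    print(weighted_total(response, difficulty=2.0))  # 8.0*0.3 + 9.0*0.5 + 7.0*0.2 = 8.3 -> 16.6

Keeping the scores in a plain dict keyed by the same enum that drives the sum avoids indexing a pydantic model dump with enum members, which is the part the commit replaces.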