fix pydantic error
src/know_lang_bot/evaluation/chatbot_evaluation.py
CHANGED
@@ -70,9 +70,9 @@ Evaluate the response based on these specific criteria:
 
 Format your response as JSON:
 {
-    "chunk_relevance": score (from 0.
-    "answer_correctness": score (from 0.
-    "code_reference": score (from 0.
+    "chunk_relevance": float type score (from 0.0f to 10.0f),
+    "answer_correctness": float type score (from 0.0f to 10.0f),
+    "code_reference": float type score (from 0.0f to 10.0f),
     "feedback": "Brief explanation of scores"
 }
 """
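The only change in this hunk is prompt wording: the scores are now requested explicitly as floats in a 0.0 to 10.0 range. Given the commit title, the likely point is that the agent's JSON reply is validated by a pydantic model with float fields, so a non-numeric score fails validation. A minimal, self-contained illustration with a stand-in model (the real response model is not part of this hunk):

from pydantic import BaseModel, ValidationError

class ScoreStandIn(BaseModel):
    # Stand-in for the real response model; a float field as the prompt implies.
    chunk_relevance: float

# A numeric score validates fine...
ScoreStandIn.model_validate_json('{"chunk_relevance": 8.5}')

# ...but prose such as "high", which a vaguer prompt can invite, does not.
try:
    ScoreStandIn.model_validate_json('{"chunk_relevance": "high"}')
except ValidationError as e:
    print(e)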
@@ -93,7 +93,12 @@ Format your response as JSON:
         result = await self.eval_agent.run(
             eval_context.model_dump_json(),
         )
-
+        eval_response : EvalAgentResponse = result.data
+        metrics = {
+            EvalMetric.CHUNK_RELEVANCE: eval_response.chunk_relevance,
+            EvalMetric.ANSWER_CORRECTNESS: eval_response.answer_correctness,
+            EvalMetric.CODE_REFERENCE: eval_response.code_reference
+        }
 
         # Calculate weighted score
         weights = {
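Not shown in this diff: the EvalAgentResponse model that result.data is annotated as, and the EvalMetric enum used to key the metrics dict. A sketch of what they presumably look like, with field and member names taken from the diff, float types matching the updated prompt, and the enum string values assumed:

from enum import Enum
from pydantic import BaseModel

class EvalMetric(str, Enum):
    # Member names come from the diff; the string values are assumptions.
    CHUNK_RELEVANCE = "chunk_relevance"
    ANSWER_CORRECTNESS = "answer_correctness"
    CODE_REFERENCE = "code_reference"

class EvalAgentResponse(BaseModel):
    # Float fields mirror the prompt's "float type score (from 0.0f to 10.0f)".
    chunk_relevance: float
    answer_correctness: float
    code_reference: float
    feedback: str

The result.data access pattern would be consistent with a pydantic_ai Agent built with result_type=EvalAgentResponse, which is where validation of the agent's JSON (and presumably the original error) would happen, but the agent construction is outside this diff.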
@@ -103,7 +108,7 @@ Format your response as JSON:
         }
 
         total_score = sum(
-            metrics
+            metrics[metric] * weights[metric] * case.difficulty
            for metric in EvalMetric
         )
 
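The weighted total above multiplies each metric's score by its weight and by the case's difficulty. A toy example of the arithmetic with made-up numbers (the real weights dict is defined just above this hunk and its values are not visible here):

# Hypothetical weights, scores, and difficulty, purely to illustrate the arithmetic.
weights = {"chunk_relevance": 0.4, "answer_correctness": 0.4, "code_reference": 0.2}
metrics = {"chunk_relevance": 8.0, "answer_correctness": 7.0, "code_reference": 9.0}
difficulty = 2  # stands in for case.difficulty, which scales every term

total_score = sum(metrics[m] * weights[m] * difficulty for m in metrics)
print(f"{total_score:.1f}")  # (3.2 + 2.8 + 1.8) * 2 -> 15.6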
@@ -111,7 +116,7 @@ Format your response as JSON:
             case=case,
             metrics=metrics,
             total_score=total_score,
-            feedback=
+            feedback=eval_response.feedback,
         )
 
     async def evaluate_batch(
@@ -346,22 +351,36 @@ TRANSFORMER_TEST_CASES : List[EvalCase] = [
 async def main():
     from rich.console import Console
     from rich.pretty import Pretty
+    import json
     import chromadb
     console = Console()
     config = AppConfig()
     evaluator = ChatBotEvaluator(config)
     collection = chromadb.PersistentClient(path=str(config.db.persist_directory)).get_collection(name=config.db.collection_name)
 
+    final_results = []
+
     for case in TRANSFORMER_TEST_CASES:
         try:
             chat_result = await process_chat(question=case.question, collection=collection, config=config)
             result = await evaluator.evaluate_single(case, chat_result)
-
+
+            # Aggregate chat_result and result into a single JSON object
+            aggregated_result = {
+                "question": case.question,
+                "chat_result": chat_result.model_dump(),
+                "evaluation_result": result.model_dump()
+            }
+            final_results.append(aggregated_result)
 
-        except Exception
+        except Exception:
             console.print_exception()
+
+    # Write the final JSON array to a file
+    with open("evaluation_results.json", "w") as f:
+        json.dump(final_results, f, indent=2)
 
-
+    console.print(Pretty(final_results))
 
 if __name__ == "__main__":
     asyncio.run(main())
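With this change, main() leaves behind evaluation_results.json, a list of {question, chat_result, evaluation_result} objects, and pretty-prints the same list to the console. A small follow-up sketch for reading the file back, assuming total_score survives result.model_dump() as a top-level key of evaluation_result, as the summary construction in the earlier hunk suggests:

import json

# Load the file written by main() and report a mean score across cases.
# Key names ("evaluation_result", "total_score") follow the aggregation above;
# everything else in each entry is whatever the dumped models contain.
with open("evaluation_results.json") as f:
    results = json.load(f)

scores = [entry["evaluation_result"]["total_score"] for entry in results]
if scores:
    print(f"{len(scores)} cases evaluated, mean total_score = {sum(scores) / len(scores):.2f}")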