gabykim committed on
Commit 3f42de6 · 1 Parent(s): 31e157d

evaluation visualization script

src/know_lang_bot/evaluation/chatbot_evaluation_visualize.py ADDED
@@ -0,0 +1,144 @@
from pathlib import Path
import json
import pandas as pd
from rich.console import Console
from rich.table import Table
from typing import List
from know_lang_bot.evaluation.chatbot_evaluation import EvalSummary


class ResultAnalyzer:
    def __init__(self, base_dir: Path):
        self.console = Console()
        self.embedding_dir = base_dir / "embedding"
        self.reranking_dir = base_dir / "embedding_reranking"

    def load_results(self, file_path: Path) -> List[EvalSummary]:
        """Load evaluation results from JSON file"""
        with open(file_path) as f:
            obj_list = json.load(f)
            return [EvalSummary.model_validate(obj) for obj in obj_list]

    def create_dataframe(self, results: List[EvalSummary]) -> pd.DataFrame:
        """Convert results to pandas DataFrame with flattened metrics"""
        rows = []
        for result in results:
            row = {
                "evaluator_model": result.evaluator_model,
                "question": result.case.question,
                "difficulty": result.case.difficulty,
                "chunk_relevance": result.eval_response.chunk_relevance,
                "answer_correctness": result.eval_response.answer_correctness,
                "code_reference": result.eval_response.code_reference,
                "weighted_total": result.eval_response.weighted_total
            }
            rows.append(row)

        return pd.DataFrame(rows)

    def analyze_results(self):
        """Analyze and display results comparison"""
        # Load all results
        all_results = {
            "embedding": [],
            "reranking": []
        }

        for file in self.embedding_dir.glob("*.json"):
            all_results["embedding"].extend(self.load_results(file))

        for file in self.reranking_dir.glob("*.json"):
            all_results["reranking"].extend(self.load_results(file))

        # Convert to DataFrames
        embedding_df = self.create_dataframe(all_results["embedding"])
        reranking_df = self.create_dataframe(all_results["reranking"])

        # Calculate statistics by evaluator model
        def get_model_stats(df: pd.DataFrame) -> pd.DataFrame:
            return df.groupby("evaluator_model").agg({
                "chunk_relevance": ["mean", "std"],
                "answer_correctness": ["mean", "std"],
                "code_reference": ["mean", "std"],
                "weighted_total": ["mean", "std"]
            }).round(2)

        embedding_stats = get_model_stats(embedding_df)
        reranking_stats = get_model_stats(reranking_df)

        # Display comparison tables
        self.display_comparison_table(embedding_stats, reranking_stats)
        self.display_improvement_metrics(embedding_df, reranking_df)

        # Save detailed results to CSV
        self.save_detailed_results(embedding_df, reranking_df)

    def display_comparison_table(self, embedding_stats: pd.DataFrame, reranking_stats: pd.DataFrame):
        """Display rich table comparing embedding and reranking results"""
        table = Table(title="Embedding vs Reranking Comparison")

        table.add_column("Metric", style="cyan")
        table.add_column("Model", style="magenta")
        table.add_column("Embedding", style="blue")
        table.add_column("Reranking", style="green")
        table.add_column("Improvement", style="yellow")

        metrics = ["chunk_relevance", "answer_correctness", "code_reference", "weighted_total"]

        for metric in metrics:
            for model in embedding_stats.index:
                emb_mean = embedding_stats.loc[model, (metric, "mean")]
                emb_std = embedding_stats.loc[model, (metric, "std")]
                rer_mean = reranking_stats.loc[model, (metric, "mean")]
                rer_std = reranking_stats.loc[model, (metric, "std")]

                improvement = ((rer_mean - emb_mean) / emb_mean * 100).round(1)

                table.add_row(
                    metric.replace("_", " ").title(),
                    model.split(":")[-1],
                    f"{emb_mean:.2f} ±{emb_std:.2f}",
                    f"{rer_mean:.2f} ±{rer_std:.2f}",
                    f"{improvement:+.1f}%" if improvement else "0%"
                )

        self.console.print(table)

    def display_improvement_metrics(self, embedding_df: pd.DataFrame, reranking_df: pd.DataFrame):
        """Display additional improvement metrics"""
        # Calculate improvements by difficulty
        difficulties = sorted(embedding_df["difficulty"].unique())

        table = Table(title="Improvements by Difficulty")
        table.add_column("Difficulty", style="cyan")
        table.add_column("Embedding", style="blue")
        table.add_column("Reranking", style="green")
        table.add_column("Improvement", style="yellow")

        for diff in difficulties:
            emb_score = embedding_df[embedding_df["difficulty"] == diff]["weighted_total"].mean()
            rer_score = reranking_df[reranking_df["difficulty"] == diff]["weighted_total"].mean()
            improvement = ((rer_score - emb_score) / emb_score * 100).round(1)

            table.add_row(
                str(diff),
                f"{emb_score:.2f}",
                f"{rer_score:.2f}",
                f"{improvement:+.1f}%"
            )

        self.console.print(table)

    def save_detailed_results(self, embedding_df: pd.DataFrame, reranking_df: pd.DataFrame):
        """Save detailed results to CSV"""
        # Add method column
        embedding_df["method"] = "embedding"
        reranking_df["method"] = "reranking"

        # Combine and save
        combined_df = pd.concat([embedding_df, reranking_df])
        combined_df.to_csv("evaluation_comparison.csv", index=False)
        self.console.print("\nDetailed results saved to evaluation_comparison.csv")


if __name__ == "__main__":
    analyzer = ResultAnalyzer(Path("evaluations"))
    analyzer.analyze_results()
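
For context, the script reads a base directory (here "evaluations") with "embedding" and "embedding_reranking" subdirectories, each containing JSON files that serialize lists of EvalSummary objects. The exact EvalSummary schema lives in know_lang_bot.evaluation.chatbot_evaluation; the sketch below only mirrors the fields that create_dataframe() accesses, so the record shape, field values, and model id are illustrative assumptions rather than the library's actual output.

import json
from pathlib import Path

# Hypothetical record mirroring only the fields create_dataframe() reads;
# the real EvalSummary model may have additional or differently named fields.
record = {
    "evaluator_model": "provider:example-model",
    "case": {
        "question": "How are code chunks embedded?",
        "difficulty": "medium",
    },
    "eval_response": {
        "chunk_relevance": 8.0,
        "answer_correctness": 7.5,
        "code_reference": 6.0,
        "weighted_total": 7.4,
    },
}

base = Path("evaluations")
for method in ("embedding", "embedding_reranking"):
    (base / method).mkdir(parents=True, exist_ok=True)
    (base / method / "sample.json").write_text(json.dumps([record], indent=2))

# With result files in place, the analyzer can be invoked as a module,
# assuming the package is installed:
#   python -m know_lang_bot.evaluation.chatbot_evaluation_visualize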