import json
from enum import Enum
from pathlib import Path
from typing import Dict, List

import pandas as pd
from rich.console import Console
from rich.table import Table

from knowlang.evaluation.chatbot_evaluation import EvalSummary


class RetrievalMethod(str, Enum):
    EMBEDDING = "embedding"
    EMBEDDING_RERANKING = "embedding_reranking"
    EMBEDDING_WITH_CODE = "embedding_with_code"
    OPENAI_EMBEDDING_WITH_CODE = "openai_embedding_with_code"
    VOYAGE_EMBEDDING_WITH_CODE = "voyage_embedding_with_code"  # value assumed to follow the naming pattern above
    EMBEDDING_RERANKING_WITH_CODE = "embedding_reranking_with_code"


class ResultAnalyzer:
    def __init__(self, base_dir: Path, baseline_method: RetrievalMethod = RetrievalMethod.EMBEDDING):
        self.console = Console()
        self.base_dir = base_dir
        self.baseline_method = baseline_method

        # Map every retrieval method to its results directory; directories
        # that do not exist are skipped when results are loaded.
        self.method_dirs = {
            method: self.base_dir / method.value
            for method in RetrievalMethod
        }
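        # Assumed on-disk layout: one subdirectory per method value under base_dir,
        # each holding one or more JSON result files, e.g.
        #   evaluations/embedding/*.json
        #   evaluations/embedding_with_code/*.json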
    def load_results(self, file_path: Path) -> List[EvalSummary]:
        """Load evaluation results from JSON file"""
        with open(file_path) as f:
            obj_list = json.load(f)
        return [EvalSummary.model_validate(obj) for obj in obj_list]

    def create_dataframe(self, results: List[EvalSummary]) -> pd.DataFrame:
"""Convert results to pandas DataFrame with flattened metrics""" |
        rows = []
        for result in results:
            base_row = {
                "evaluator_model": result.evaluator_model,
                "question": result.case.question,
                "difficulty": result.case.difficulty,
                "environment": getattr(result.case, "environment", "default"),
            }

            for eval_round in result.eval_rounds:
                row = base_row.copy()
                row.update({
                    "round_id": eval_round.round_id,
                    "chunk_relevance": eval_round.eval_response.chunk_relevance,
                    "answer_correctness": eval_round.eval_response.answer_correctness,
                    "code_reference": eval_round.eval_response.code_reference,
                    "weighted_total": eval_round.eval_response.weighted_total,
                    "timestamp": eval_round.timestamp,
                })
                rows.append(row)

        return pd.DataFrame(rows)

    def load_all_results(self) -> Dict[RetrievalMethod, pd.DataFrame]:
"""Load results for all available methods""" |
        results = {}
        for method in RetrievalMethod:
            method_dir = self.method_dirs.get(method)
            if method_dir and method_dir.exists():
                all_results = []
                for file in method_dir.glob("*.json"):
                    all_results.extend(self.load_results(file))
                if all_results:
                    results[method] = self.create_dataframe(all_results)
        return results

    def calculate_improvement(self, new_val: float, baseline_val: float) -> str:
"""Calculate and format improvement percentage""" |
        if baseline_val == 0:
            return "N/A"
        improvement = round((new_val - baseline_val) / baseline_val * 100, 1)
        return f"{improvement:+.1f}%" if improvement else "0%"

    def get_stats_by_group(self, df: pd.DataFrame, group_by: str) -> pd.DataFrame:
"""Calculate statistics with round variance""" |
        question_means = df.groupby([group_by, "question"]).agg({
            "chunk_relevance": "mean",
            "answer_correctness": "mean",
            "code_reference": "mean",
            "weighted_total": "mean",
        })

        return question_means.groupby(level=0).agg({
            "chunk_relevance": ["mean", "std"],
            "answer_correctness": ["mean", "std"],
            "code_reference": ["mean", "std"],
            "weighted_total": ["mean", "std"],
        }).round(2)

    def display_comparison_table(self, results: Dict[RetrievalMethod, pd.DataFrame]):
"""Display rich table comparing all methods""" |
        table = Table(title="Method Comparison by Evaluator Model")

        table.add_column("Metric", style="cyan")
        table.add_column("Model", style="magenta")
        for method in results.keys():
            table.add_column(method.value.replace("_", " ").title(), style="blue")
            if method != self.baseline_method:
                table.add_column(f"{method.value} Improvement", style="yellow")

        stats_by_method = {
            method: self.get_stats_by_group(df, "evaluator_model")
            for method, df in results.items()
        }

        metrics = ["chunk_relevance", "answer_correctness", "code_reference", "weighted_total"]
        baseline_stats = stats_by_method[self.baseline_method]

        for metric in metrics:
            for model in baseline_stats.index:
                row_data = [
                    metric.replace("_", " ").title(),
                    model.split(":")[-1],
                ]

                for method in results.keys():
                    stats = stats_by_method[method]
                    mean = stats.loc[model, (metric, "mean")]
                    std = stats.loc[model, (metric, "std")]
                    row_data.append(f"{mean:.2f} ±{std:.2f}")

                    if method != self.baseline_method:
                        baseline_mean = baseline_stats.loc[model, (metric, "mean")]
                        row_data.append(self.calculate_improvement(mean, baseline_mean))

                table.add_row(*row_data)

        self.console.print(table)

    def display_environment_comparison(self, results: Dict[RetrievalMethod, pd.DataFrame]):
"""Display comparison across different evaluation environments""" |
        table = Table(title="Method Comparison by Environment")

        table.add_column("Environment", style="cyan")
        for method in results.keys():
            table.add_column(method.value.replace("_", " ").title(), style="blue")
            if method != self.baseline_method:
                table.add_column(f"{method.value} Improvement", style="yellow")

        environments = sorted(set().union(*[
            set(df["environment"].unique())
            for df in results.values()
        ]))

        baseline_df = results[self.baseline_method]

        for env in environments:
            row_data = [env]

            for method in results.keys():
                df = results[method]
                env_score = df[df["environment"] == env]["weighted_total"].mean()
                row_data.append(f"{env_score:.2f}")

                if method != self.baseline_method:
                    baseline_score = baseline_df[
                        baseline_df["environment"] == env
                    ]["weighted_total"].mean()
                    row_data.append(self.calculate_improvement(env_score, baseline_score))

            table.add_row(*row_data)

        self.console.print(table)

    def analyze_results(self):
"""Analyze and display results comparison""" |
|
results = self.load_all_results() |
|
if not results: |
|
self.console.print("[red]No results found!") |
|
return |
|
|
|
|
|
self.display_comparison_table(results) |
|
self.console.print("\n") |
|
self.display_environment_comparison(results) |
|
|
|
|
|
self.save_detailed_results(results) |
|
|
|
def save_detailed_results(self, results: Dict[RetrievalMethod, pd.DataFrame]): |
|
"""Save detailed results to CSV""" |
        dfs = []
        for method, df in results.items():
            df = df.copy()
            df["method"] = method.value
            dfs.append(df)

        combined_df = pd.concat(dfs)
        output_path = self.base_dir / "evaluation_comparison.csv"
        combined_df.to_csv(output_path, index=False)
        self.console.print(f"\nDetailed results saved to {output_path}")


if __name__ == "__main__":
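    # Assumes per-method evaluation results have already been written under
    # ./evaluations/ (see RetrievalMethod values for the expected subdirectory names).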
    analyzer = ResultAnalyzer(
        Path("evaluations"),
        baseline_method=RetrievalMethod.EMBEDDING,
    )
    analyzer.analyze_results()