import pandas as pd

from leaderboard.src.backend.model_operations import SummaryGenerator, EvaluationModel
from envs import HEM_PATH, SOURCE_PATH
from leaderboard.src.backend.util import load_dataframe, format_results


class Evaluator:
    """Runs the end-to-end evaluation: generates summaries with the candidate
    model, scores them with the hallucination evaluation model (HEM), and
    formats the aggregate results for the leaderboard."""

    def __init__(self, model, revision, precision, num_fewshot, batch_size,
                 device, no_cache, limit, write_out=True,
                 output_base_path='logs'):
        self.model = model
        self.revision = revision
        self.precision = precision
        self.num_fewshot = num_fewshot
        self.batch_size = batch_size
        self.device = device
        self.no_cache = no_cache
        self.limit = limit
        self.write_out = write_out
        self.output_base_path = output_base_path
        self.summary_generator = SummaryGenerator(model, revision)
        self.eval_model = EvaluationModel(HEM_PATH)

    def evaluate(self):
        # Generate summaries for the source documents with the candidate model.
        df = load_dataframe(SOURCE_PATH)
        generated_summaries_df = self.summary_generator.generate_summaries(df)
        avg_summary_len = self.summary_generator.avg_length
        answer_rate = self.summary_generator.answer_rate

        # Score the generated summaries with the hallucination evaluation model.
        hallucination_scores = self.eval_model.evaluate_hallucination(generated_summaries_df)
        # Call the method; the bare attribute would bind the function itself, not its result.
        accuracy = self.eval_model.compute_accuracy()
        hallucination_rate = self.eval_model.hallucination_rate

        # Collect all metrics into the leaderboard result format.
        results = format_results(hallucination_scores, self.model, self.revision,
                                 self.precision, accuracy, hallucination_rate,
                                 answer_rate, avg_summary_len)
        return results
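

# Illustrative usage sketch only: the model name, revision, and runtime
# settings below are hypothetical placeholders, not values used by the
# leaderboard pipeline itself.
if __name__ == "__main__":
    evaluator = Evaluator(
        model="example-org/example-model",  # hypothetical model identifier
        revision="main",
        precision="float16",
        num_fewshot=0,
        batch_size=1,
        device="cuda:0",
        no_cache=True,
        limit=None,
    )
    print(evaluator.evaluate())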