import logging

import numpy as np
import pandas as pd
import spacy
from transformers import AutoModelForCausalLM, AutoTokenizer
from sentence_transformers import CrossEncoder

import src.backend.util as util

# Set up basic configuration for logging
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')

# Load spaCy model for word tokenization
nlp = spacy.load("en_core_web_sm")


def load_evaluation_model(model_path):
    """Load the evaluation model from the given path.

    Args:
        model_path (str): Path to the evaluation model.

    Returns:
        CrossEncoder: The evaluation model.
    """
    model = CrossEncoder(model_path)
    return model


class ModelLoadingException(Exception):
    """Exception raised for errors in loading a model.

    Attributes:
        model_id (str): The model identifier.
        revision (str): The model revision.
    """

    def __init__(self, model_id, revision, message="Error initializing model"):
        self.model_id = model_id
        self.revision = revision
        super().__init__(f"{message} id={model_id} revision={revision}")


class SummaryGenerator:
    """A class to generate summaries using a causal language model.

    Attributes:
        tokenizer (AutoTokenizer): Tokenizer for the model.
        model (AutoModelForCausalLM): The causal language model.
        summaries_df (DataFrame): DataFrame to store generated summaries.
        revision (str): Model revision.
        avg_length (float): Average length of summaries.
        answer_rate (float): Rate of non-empty summaries.
    """

    def __init__(self, model_id, revision):
        """
        Initializes the SummaryGenerator with a model.

        Args:
            model_id (str): Identifier for the model.
            revision (str): Revision of the model.
        """
        try:
            # `revision` must be passed as a keyword argument to from_pretrained.
            self.tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
            self.model = AutoModelForCausalLM.from_pretrained(model_id, revision=revision)
        except Exception as e:
            logging.error(f"Error initializing model with id {model_id} and revision {revision}: {e}")
            raise ModelLoadingException(model_id, revision) from e
        self.summaries_df = pd.DataFrame()
        self.revision = revision
        self.avg_length = None
        self.answer_rate = None
        self.error_rate = None

    def generate_summaries(self, df):
        """Generate summaries for a given DataFrame of source docs.

        Args:
            df (DataFrame): DataFrame containing source docs.

        Returns:
            summaries_df (DataFrame): Generated summaries by the model.
        """
        source, summary, dataset = [], [], []
        error_count = 0
        for index, row in df.iterrows():
            _source = row['text']
            _dataset = row['dataset']
            prompt = util.generate_prompt(_source)
            # The revision is fixed at load time; tokenizer calls, generate(),
            # and decode() do not accept a `revision` argument.
            inputs = self.tokenizer(prompt, return_tensors='pt',
                                    max_length=1024, truncation=True)
            try:
                # Greedy decoding (do_sample=False), so no temperature is needed.
                outputs = self.model.generate(**inputs, max_new_tokens=1024, do_sample=False)
                response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            except Exception as e:
                logging.error(f"Error at index {index}: {e}")
                response = ""
                error_count += 1

            summary.append(response)
            source.append(_source)
            dataset.append(_dataset)

        self.summaries_df = pd.DataFrame(list(zip(source, summary, dataset)),
                                         columns=["source", "summary", "dataset"])
        self._compute_avg_length()
        self._compute_answer_rate()
        self._compute_error_rate(error_count)
        return self.summaries_df

    def _compute_avg_length(self):
        """
        Compute the average length of non-empty summaries using spaCy.
""" total_words = 0 count = 0 for summary in self.summaries_df['summary']: if summary != "": doc = nlp(summary) words = [token.text for token in doc if token.is_alpha] total_words += len(words) count += 1 self.avg_length = 0 if count == 0 else total_words / count def _compute_answer_rate(self): """ Compute the rate of non-empty summaries. """ non_empty_count = sum(1 for summary in self.summaries_df['summary'] if summary) total_rows = len(self.summaries_df) self.answer_rate = 0 if total_rows == 0 else non_empty_count / total_rows def _compute_error_rate(self, count): """ Compute the error rate of summaries. """ total_rows = len(self.summaries_df) self.error_rate = 0 if total_rows == 0 else count / total_rows class EvaluationModel: """A class to evaluate generated summaries. Attributes: model (CrossEncoder): The evaluation model. scores (list): List of evaluation scores. accuracy (float): Accuracy of the summaries. hallucination_rate (float): Rate of hallucination in summaries. """ def __init__(self, model_path): """ Initializes the EvaluationModel with a CrossEncoder model. Args: model_path (str): Path to the CrossEncoder model. """ self.model = load_evaluation_model(model_path) self.scores = [] self.accuracy = None self.hallucination_rate = None def evaluate_hallucination(self, summaries_df): """ Evaluate the hallucination rate in summaries. This method updates the 'scores' attribute of the instance with the computed scores. Args: summaries_df (DataFrame): DataFrame containing source docs and summaries. Returns: list: List of hallucination scores. Also updates the 'scores' attribute of the instance. """ source_docs = np.array(summaries_df['source']) generated_summaries = np.array(summaries_df['summary']) try: scores = self.model.predict(source_docs, generated_summaries) self.scores = scores return self.scores except Exception as e: logging.error(f"Error evaluating hallucination: {e}") raise def compute_accuracy(self, threshold=0.5): """ Compute the accuracy of the evaluated summaries based on the previously calculated scores. This method relies on the 'scores' attribute being populated, typically via the 'evaluate_hallucination' method. Returns: float: Accuracy percentage. Also updates the 'accuracy' and 'hallucination_rate' attributes of the instance. Raises: ValueError: If scores have not been calculated prior to calling this method. """ if not self.scores: error_msg = "Scores not calculated. Call evaluate_hallucination() first." logging.error(error_msg) raise ValueError(error_msg) # Use threshold of 0.5 to compute accuracy num_above_threshold = sum(score >= threshold for score in self.scores) num_total = len(self.scores) if not num_total: raise ValueError("No scores available to compute accuracy.") self.accuracy = (num_above_threshold / num_total) * 100 self.hallucination_rate = 100 - self.accuracy return self.accuracy