# leaderboard/src/backend/model_operations.py
import logging
import numpy as np
import pandas as pd
import spacy
from transformers import AutoModelForCausalLM, AutoTokenizer
from sentence_transformers import CrossEncoder
import src.backend.util as util
# Set up basic configuration for logging
logging.basicConfig(level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s')
# Load spacy model for word tokenization
nlp = spacy.load("en_core_web_sm")
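# NOTE: "en_core_web_sm" is installed separately from spaCy itself; if it is
# missing, fetch it first with:  python -m spacy download en_core_web_sm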
def load_evaluation_model(model_path):
"""Load the evaluation model from the given path
Args:
model_path (str): Path to the evaluation model
Returns:
CrossEncoder: The evaluation model
"""
model = CrossEncoder(model_path)
return model
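# Illustrative use (hypothetical checkpoint id; the leaderboard passes in its
# own configured path):
#     eval_model = load_evaluation_model("cross-encoder/ms-marco-MiniLM-L-6-v2")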
class ModelLoadingException(Exception):
"""Exception raised for errors in loading a model.
Attributes:
model_id (str): The model identifier.
revision (str): The model revision.
"""
    def __init__(self, model_id, revision, message="Error initializing model"):
        self.model_id = model_id
        self.revision = revision
        super().__init__(f"{message} id={model_id} revision={revision}")
class SummaryGenerator:
"""A class to generate summaries using a causal language model.
Attributes:
tokenizer (AutoTokenizer): Tokenizer for the model.
model (AutoModelForCausalLM): The causal language model.
summaries_df (DataFrame): DataFrame to store generated summaries.
revision (str): Model revision.
        avg_length (float): Average word count of non-empty summaries.
        answer_rate (float): Rate of non-empty summaries.
        error_rate (float): Rate of generation errors across source docs.
    """
def __init__(self, model_id, revision):
"""
Initializes the SummaryGenerator with a model.
Args:
model_id (str): Identifier for the model.
revision (str): Revision of the model.
"""
try:
            # `revision` must be passed as a keyword argument; passing it
            # positionally misroutes it into the loader's *args.
            self.tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
            self.model = AutoModelForCausalLM.from_pretrained(model_id, revision=revision)
except Exception as e:
logging.error(f"Error initializing model with id {model_id} and revision {revision}: {e}")
raise ModelLoadingException(model_id, revision) from e
self.summaries_df = pd.DataFrame()
self.revision = revision
self.avg_length = None
self.answer_rate = None
self.error_rate = None
def generate_summaries(self, df):
"""Generate summaries for a given DataFrame of source docs.
Args:
df (DataFrame): DataFrame containing source docs.
        Returns:
            DataFrame: Columns ["source", "summary", "dataset"] holding the
                generated summaries alongside their source docs.
        """
source, summary, dataset = [], [], []
error_count = 0
for index, row in df.iterrows():
_source = row['text']
_dataset = row['dataset']
prompt = util.generate_prompt(_source)
            # The revision is fixed at load time; the tokenizer call, generate(),
            # and decode() do not accept a `revision` argument.
            inputs = self.tokenizer(prompt, return_tensors='pt',
                                    max_length=1024, truncation=True)
            try:
                # Greedy decoding; temperature is irrelevant when do_sample=False.
                outputs = self.model.generate(**inputs, max_new_tokens=1024,
                                              do_sample=False)
                response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            except Exception as e:
                logging.error(f"Error generating summary at index {index}: {e}")
                response = ""
                error_count += 1
summary.append(response)
source.append(_source)
dataset.append(_dataset)
self.summaries_df = pd.DataFrame(list(zip(source, summary, dataset)),
columns=["source", "summary", "dataset"])
self._compute_avg_length()
self._compute_answer_rate()
self._compute_error_rate(error_count)
return self.summaries_df
def _compute_avg_length(self):
"""
Compute the average length of non-empty summaries using SpaCy.
"""
total_words = 0
count = 0
for summary in self.summaries_df['summary']:
if summary != "":
doc = nlp(summary)
words = [token.text for token in doc if token.is_alpha]
total_words += len(words)
count += 1
self.avg_length = 0 if count == 0 else total_words / count
def _compute_answer_rate(self):
"""
Compute the rate of non-empty summaries.
"""
non_empty_count = sum(1 for summary in self.summaries_df['summary'] if summary)
total_rows = len(self.summaries_df)
self.answer_rate = 0 if total_rows == 0 else non_empty_count / total_rows
def _compute_error_rate(self, count):
"""
Compute the error rate of summaries.
"""
total_rows = len(self.summaries_df)
self.error_rate = 0 if total_rows == 0 else count / total_rows
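# Minimal usage sketch for SummaryGenerator (placeholder model id and data,
# not what the leaderboard actually submits):
#
#     docs = pd.DataFrame({"text": ["The quick brown fox jumps over the lazy dog."],
#                          "dataset": ["example"]})
#     generator = SummaryGenerator("gpt2", revision="main")
#     summaries = generator.generate_summaries(docs)
#     print(generator.avg_length, generator.answer_rate, generator.error_rate)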
class EvaluationModel:
"""A class to evaluate generated summaries.
Attributes:
model (CrossEncoder): The evaluation model.
scores (list): List of evaluation scores.
accuracy (float): Accuracy of the summaries.
hallucination_rate (float): Rate of hallucination in summaries.
"""
def __init__(self, model_path):
"""
Initializes the EvaluationModel with a CrossEncoder model.
Args:
model_path (str): Path to the CrossEncoder model.
"""
self.model = load_evaluation_model(model_path)
self.scores = []
self.accuracy = None
self.hallucination_rate = None
def evaluate_hallucination(self, summaries_df):
"""
Evaluate the hallucination rate in summaries. This method updates the 'scores' attribute
of the instance with the computed scores.
Args:
summaries_df (DataFrame): DataFrame containing source docs and summaries.
Returns:
list: List of hallucination scores. Also updates the 'scores' attribute of the instance.
"""
        source_docs = np.array(summaries_df['source'])
        generated_summaries = np.array(summaries_df['summary'])
        try:
            # CrossEncoder.predict expects a single list of (premise, hypothesis)
            # pairs, not two parallel arrays.
            pairs = list(zip(source_docs, generated_summaries))
            scores = self.model.predict(pairs)
            self.scores = scores
            return self.scores
except Exception as e:
logging.error(f"Error evaluating hallucination: {e}")
raise
def compute_accuracy(self, threshold=0.5):
"""
Compute the accuracy of the evaluated summaries based on the previously calculated scores.
This method relies on the 'scores' attribute being populated, typically via the
'evaluate_hallucination' method.
Returns:
float: Accuracy percentage. Also updates the 'accuracy' and 'hallucination_rate'
attributes of the instance.
Raises:
ValueError: If scores have not been calculated prior to calling this method.
"""
if not self.scores:
error_msg = "Scores not calculated. Call evaluate_hallucination() first."
logging.error(error_msg)
raise ValueError(error_msg)
# Use threshold of 0.5 to compute accuracy
num_above_threshold = sum(score >= threshold for score in self.scores)
num_total = len(self.scores)
if not num_total:
raise ValueError("No scores available to compute accuracy.")
self.accuracy = (num_above_threshold / num_total) * 100
self.hallucination_rate = 100 - self.accuracy
return self.accuracy
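if __name__ == "__main__":
    # End-to-end sketch wiring the two classes together. Both model ids below
    # are placeholders for illustration, not the leaderboard's configured
    # checkpoints.
    sample_docs = pd.DataFrame({
        "text": ["The Eiffel Tower is located in Paris and was completed in 1889."],
        "dataset": ["example"],
    })
    generator = SummaryGenerator("gpt2", revision="main")
    summaries = generator.generate_summaries(sample_docs)
    evaluator = EvaluationModel("cross-encoder/ms-marco-MiniLM-L-6-v2")
    evaluator.evaluate_hallucination(summaries)
    accuracy = evaluator.compute_accuracy(threshold=0.5)
    print(f"accuracy={accuracy:.1f}%  "
          f"hallucination_rate={evaluator.hallucination_rate:.1f}%")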