# https://www.mixedbread.ai/blog/mxbai-embed-large-v1
# https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1

import os

# Set up the Hugging Face cache locations BEFORE importing transformers/huggingface_hub,
# since these environment variables are read at import time
os.environ["HF_HUB_CACHE"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
os.environ["HUGGINGFACE_HUB_CACHE"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
os.environ["HF_HOME"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"

import pandas as pd
import numpy as np
from typing import Dict

import torch
from transformers import AutoModel, AutoTokenizer
from sentence_transformers.util import cos_sim
from accelerate import Accelerator  # Import from accelerate
from scipy.stats import zscore

# Initialize the Accelerator and use the device it manages
accelerator = Accelerator()
device = accelerator.device
print("Using accelerator device =", device)

# 1. Load the model and tokenizer
model_id_Retriever = 'mixedbread-ai/mxbai-embed-large-v1'
tokenizer_Retriever = AutoTokenizer.from_pretrained(model_id_Retriever)
modelRetriever = AutoModel.from_pretrained(model_id_Retriever)

# Accelerate prepares the model (e.g., moves it to the appropriate device)
modelRetriever = accelerator.prepare(modelRetriever)
# Define the transform_query function
def transform_query(queryText: str) -> str:
    """For retrieval, add the prompt for queryText (not for documents)."""
    return f'Represent this sentence for searching relevant passages: {queryText}'
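
# Example (illustrative query, prompt format as defined above):
#   transform_query('what is bread?')
#   -> 'Represent this sentence for searching relevant passages: what is bread?'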

# Define the pooling function
def pooling(outputs: torch.Tensor, inputs: Dict, strategy: str = 'cls') -> np.ndarray:
    if strategy == 'cls':
        # Take the embedding of the first ([CLS]) token of each document
        outputs = outputs[:, 0]
    elif strategy == 'mean':
        # Average the token embeddings, ignoring padding via the attention mask
        outputs = torch.sum(
            outputs * inputs["attention_mask"][:, :, None], dim=1
        ) / torch.sum(inputs["attention_mask"], dim=1, keepdim=True)
    else:
        raise NotImplementedError
    return outputs.detach().cpu().numpy()
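
# Shape sketch (illustrative sizes; mxbai-embed-large-v1 has hidden size 1024):
# for a batch of 5 tokenized docs padded to 12 tokens, `outputs` is (5, 12, 1024),
# and both pooling strategies reduce it to a (5, 1024) numpy array.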

def retrievePassageSimilarities(queryText, passages):
    # Create the docs list by adding the transformed queryText and then the passages
    docs = [transform_query(queryText)] + passages

    # 2. Encode the inputs (truncation guards against passages longer than the model's maximum length)
    inputs = tokenizer_Retriever(docs, padding=True, truncation=True, return_tensors='pt')

    # Move inputs to the right device using accelerator
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = modelRetriever(**inputs).last_hidden_state
        embeddings = pooling(outputs, inputs, 'cls')

    # Cosine similarity between the query embedding (row 0) and the passage embeddings
    similarities = cos_sim(embeddings[0], embeddings[1:])
    #print('similarities:', similarities)
    return similarities
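
# Usage sketch: the return value is a 1 x len(passages) matrix of cosine scores, e.g.
#   sims = retrievePassageSimilarities('A man is eating a piece of bread',
#                                      ['A man is eating food.', 'A man is riding a horse.'])
#   sims.shape  # -> (1, 2)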

def RAG_retrieval_Base(queryText, passages, min_threshold=0.0, max_num_passages=None):
    try:
        similarities = retrievePassageSimilarities(queryText, passages)

        # Create a DataFrame
        df = pd.DataFrame({
            'Passage': passages,
            'Similarity': similarities.flatten()  # Flatten the similarity tensor/array to ensure compatibility
        })

        # Filter the DataFrame based on the similarity threshold
        df_filtered = df[df['Similarity'] >= min_threshold]

        # If max_num_passages is specified, limit the number of passages returned
        if max_num_passages is not None:
            df_filtered = df_filtered.nlargest(max_num_passages, 'Similarity')

        # Return the filtered DataFrame, sorted by similarity
        df_filtered = df_filtered.sort_values(by='Similarity', ascending=False)
        return df_filtered
    except Exception as e:
        # Log the exception message or handle it as needed
        print(f"An error occurred: {e}")
        return pd.DataFrame()  # Return an empty DataFrame in case of error

def RAG_retrieval_Z_scores(queryText, passages, z_threshold=1.0, max_num_passages=None, min_threshold=0.5):
    try:
        # Encoding and similarity computation remains the same
        similarities = retrievePassageSimilarities(queryText, passages)

        # Calculate z-scores for similarities
        z_scores = zscore(similarities.flatten())

        # Create a DataFrame with passages, similarities, and z-scores
        df = pd.DataFrame({
            'Passage': passages,
            'Similarity': similarities.flatten(),
            'Z-Score': z_scores
        })

        # Filter passages based on the z-score threshold
        df_filtered = df[df['Z-Score'] >= z_threshold]

        if min_threshold:
            # Also filter on the minimum similarity threshold (applied to the
            # z-score-filtered frame, so both conditions must hold)
            df_filtered = df_filtered[df_filtered['Similarity'] >= min_threshold]

        # If max_num_passages is specified, limit the number of passages returned
        if max_num_passages is not None:
            df_filtered = df_filtered.nlargest(max_num_passages, 'Similarity')

        # Sort by similarity (or z-score if preferred)
        df_filtered = df_filtered.sort_values(by='Similarity', ascending=False)
        return df_filtered
    except Exception as e:
        # Log the exception message or handle it as needed
        print(f"An error occurred: {e}")
        return pd.DataFrame()  # Return an empty DataFrame in case of error
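
# Worked example (illustrative numbers): similarities [0.8, 0.5, 0.4, 0.3] have
# mean 0.5 and (population) std ~0.187, so their z-scores are roughly
# [1.60, 0.00, -0.53, -1.07]; with z_threshold=1.0 only the first passage
# survives the z-score filter.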

def RAG_retrieval_Percentile(queryText, passages, percentile=90, max_num_passages=None, min_threshold=0.5):
    try:
        # Encoding and similarity computation remains the same
        similarities = retrievePassageSimilarities(queryText, passages)

        # Determine threshold based on percentile
        threshold = np.percentile(similarities.flatten(), percentile)

        # Create a DataFrame
        df = pd.DataFrame({
            'Passage': passages,
            'Similarity': similarities.flatten()
        })

        # Filter using the percentile threshold
        df_filtered = df[df['Similarity'] >= threshold]

        if min_threshold:
            # Also filter on the minimum similarity threshold (applied to the
            # percentile-filtered frame, so both conditions must hold)
            df_filtered = df_filtered[df_filtered['Similarity'] >= min_threshold]

        # If max_num_passages is specified, limit the number of passages returned
        if max_num_passages is not None:
            df_filtered = df_filtered.nlargest(max_num_passages, 'Similarity')

        # Sort by similarity
        df_filtered = df_filtered.sort_values(by='Similarity', ascending=False)
        return df_filtered
    except Exception as e:
        # Log the exception message or handle it as needed
        print(f"An error occurred: {e}")
        return pd.DataFrame()  # Return an empty DataFrame in case of error
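
# Worked example (illustrative numbers): for similarities [0.8, 0.5, 0.4, 0.3],
# np.percentile(..., 90) interpolates to ~0.71, so only the 0.8 passage clears
# the percentile cut before the min_threshold filter is applied.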

def RAG_retrieval_TopK(queryText, passages, top_fraction=0.1, max_num_passages=None, min_threshold=0.5):
    try:
        # Encoding and similarity computation remains the same
        similarities = retrievePassageSimilarities(queryText, passages)

        # Calculate the number of passages to select based on the top fraction (at least one)
        num_passages_TopFraction = max(1, int(top_fraction * len(passages)))

        # Create a DataFrame
        df = pd.DataFrame({
            'Passage': passages,
            'Similarity': similarities.flatten()
        })

        # Select the top passages dynamically
        df_filtered = df.nlargest(num_passages_TopFraction, 'Similarity')

        if min_threshold:
            # Also filter on the minimum similarity threshold
            df_filtered = df_filtered[df_filtered['Similarity'] >= min_threshold]

        # If max_num_passages is specified, limit the number of passages returned
        if max_num_passages is not None:
            df_filtered = df_filtered.nlargest(max_num_passages, 'Similarity')

        # Sort by similarity
        df_filtered = df_filtered.sort_values(by='Similarity', ascending=False)
        return df_filtered
    except Exception as e:
        # Log the exception message or handle it as needed
        print(f"An error occurred: {e}")
        return pd.DataFrame()  # Return an empty DataFrame in case of error
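
# Worked example: with 4 passages and top_fraction=0.1, max(1, int(0.1 * 4)) = 1,
# so only the single best-scoring passage is kept before the min_threshold and
# max_num_passages limits are applied.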

if __name__ == '__main__':

    queryText = 'A man is eating a piece of bread'

    # Define the passages list
    passages = [
        "A man is eating food.",
        "A man is eating pasta.",
        "The girl is carrying a baby.",
        "A man is riding a horse.",
    ]
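
    # With this query (the pairing used on the model card), the food-related
    # passages are expected to score highest.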

    # df_retrieved = RAG_retrieval_Base(queryText, passages)
    # df_retrieved = RAG_retrieval_Base(queryText, passages, min_threshold=0.5)
    # df_retrieved = RAG_retrieval_Base(queryText, passages, max_num_passages=3)
    df_retrieved = RAG_retrieval_Base(queryText, passages, min_threshold=0.5, max_num_passages=3)
    # df_retrieved = RAG_retrieval_Z_scores(queryText, passages, z_threshold=1.0)
    # df_retrieved = RAG_retrieval_Z_scores(queryText, passages, z_threshold=1.0, max_num_passages=3)
    # df_retrieved = RAG_retrieval_Percentile(queryText, passages, percentile=80)
    # df_retrieved = RAG_retrieval_Percentile(queryText, passages, percentile=80, max_num_passages=3)
    # df_retrieved = RAG_retrieval_TopK(queryText, passages, top_fraction=0.2)
    # df_retrieved = RAG_retrieval_TopK(queryText, passages, top_fraction=0.2, max_num_passages=3)

    print(df_retrieved)

    # labelTriplesLIST_RAGGED = df_retrieved['Passage'].apply(lambda x: (x,)).tolist()

    print("end of computations")

# VERSION WITHOUT ACCELERATE
#
# # https://www.mixedbread.ai/blog/mxbai-embed-large-v1
# # https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1
#
# import os
#
# os.environ["HF_HUB_CACHE"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
# os.environ["HUGGINGFACE_HUB_CACHE"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
# os.environ["HF_HOME"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
#
# import pandas as pd
# import numpy as np
# from typing import Dict
#
# import torch
# from transformers import AutoModel, AutoTokenizer
# from sentence_transformers.util import cos_sim
#
# # For retrieval you need to pass this prompt. Please find out more in our blog post.
# def transform_queryText(queryText: str) -> str:
#     """For retrieval, add the prompt for queryText (not for documents)."""
#     return f'Represent this sentence for searching relevant passages: {queryText}'
#
# # The model works really well with cls pooling (default) but also with mean pooling.
# def pooling(outputs: torch.Tensor, inputs: Dict, strategy: str = 'cls') -> np.ndarray:
#     if strategy == 'cls':
#         outputs = outputs[:, 0]
#     elif strategy == 'mean':
#         outputs = torch.sum(
#             outputs * inputs["attention_mask"][:, :, None], dim=1
#         ) / torch.sum(inputs["attention_mask"], dim=1, keepdim=True)
#     else:
#         raise NotImplementedError
#     return outputs.detach().cpu().numpy()
#
# # 1. load model
# model_id = 'mixedbread-ai/mxbai-embed-large-v1'
# tokenizer = AutoTokenizer.from_pretrained(model_id)
# model = AutoModel.from_pretrained(model_id).cuda()
#
# queryText = 'A man is eating a piece of bread'
#
# # Define the passages list
# passages = [
#     "A man is eating food.",
#     "A man is eating pasta.",
#     "The girl is carrying a baby.",
#     "A man is riding a horse.",
# ]
#
# # Create the docs list by adding the transformed queryText and then the passages
# docs = [transform_queryText(queryText)] + passages
#
# # 2. encode
# inputs = tokenizer(docs, padding=True, return_tensors='pt')
# for k, v in inputs.items():
#     inputs[k] = v.cuda()
# outputs = model(**inputs).last_hidden_state
# embeddings = pooling(outputs, inputs, 'cls')
#
# similarities = cos_sim(embeddings[0], embeddings[1:])
# print('similarities:', similarities)
#
# # Create a DataFrame
# df = pd.DataFrame({
#     'Passage': passages,
#     'Similarity': similarities.flatten()  # Flatten the similarity tensor/array to ensure compatibility
# })
#
# # Display the DataFrame
# print(df)
#
# print("end of computations")