# https://www.mixedbread.ai/blog/mxbai-embed-large-v1
# https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1

import os

# Set up the Hugging Face cache locations BEFORE importing transformers/huggingface_hub,
# since these environment variables are read at import time
os.environ["HF_HUB_CACHE"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
os.environ["HUGGINGFACE_HUB_CACHE"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
os.environ["HF_HOME"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"

import pandas as pd
import numpy as np
from typing import Dict

import torch
from transformers import AutoModel, AutoTokenizer
from sentence_transformers.util import cos_sim
from accelerate import Accelerator  # Import from accelerate
from scipy.stats import zscore

# Initialize the Accelerator and use the device it manages
accelerator = Accelerator()
device = accelerator.device
print("Using accelerator device =", device)

# 1. Load the model and tokenizer
model_id_Retriever = 'mixedbread-ai/mxbai-embed-large-v1'
tokenizer_Retriever = AutoTokenizer.from_pretrained(model_id_Retriever)
modelRetriever = AutoModel.from_pretrained(model_id_Retriever)

# Accelerate prepares the model (e.g., moves it to the appropriate device)
modelRetriever = accelerator.prepare(modelRetriever)
# Define the transform_query function
def transform_query(queryText: str) -> str:
    """For retrieval, add the prompt for queryText (not for documents)."""
    return f'Represent this sentence for searching relevant passages: {queryText}'
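
# Example (illustrative query, prompt format as defined above):
#   transform_query('what is bread?')
#   -> 'Represent this sentence for searching relevant passages: what is bread?'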

# Define the pooling function
def pooling(outputs: torch.Tensor, inputs: Dict, strategy: str = 'cls') -> np.ndarray:
    if strategy == 'cls':
        # Take the embedding of the first ([CLS]) token of each document
        outputs = outputs[:, 0]
    elif strategy == 'mean':
        # Average the token embeddings, ignoring padding via the attention mask
        outputs = torch.sum(
            outputs * inputs["attention_mask"][:, :, None], dim=1
        ) / torch.sum(inputs["attention_mask"], dim=1, keepdim=True)
    else:
        raise NotImplementedError
    return outputs.detach().cpu().numpy()
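
# Shape sketch (illustrative sizes; mxbai-embed-large-v1 has hidden size 1024):
# for a batch of 5 tokenized docs padded to 12 tokens, `outputs` is (5, 12, 1024),
# and both pooling strategies reduce it to a (5, 1024) numpy array.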

def retrievePassageSimilarities(queryText, passages):
    # Create the docs list by adding the transformed queryText and then the passages
    docs = [transform_query(queryText)] + passages

    # 2. Encode the inputs (truncation guards against passages longer than the model's maximum length)
    inputs = tokenizer_Retriever(docs, padding=True, truncation=True, return_tensors='pt')

    # Move inputs to the right device using accelerator
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = modelRetriever(**inputs).last_hidden_state
        embeddings = pooling(outputs, inputs, 'cls')

    # Cosine similarity between the query embedding (row 0) and the passage embeddings
    similarities = cos_sim(embeddings[0], embeddings[1:])
    #print('similarities:', similarities)
    return similarities
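
# Usage sketch: the return value is a 1 x len(passages) matrix of cosine scores, e.g.
#   sims = retrievePassageSimilarities('A man is eating a piece of bread',
#                                      ['A man is eating food.', 'A man is riding a horse.'])
#   sims.shape  # -> (1, 2)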

def RAG_retrieval_Base(queryText, passages, min_threshold=0.0, max_num_passages=None):
    try:
        similarities = retrievePassageSimilarities(queryText, passages)

        # Create a DataFrame
        df = pd.DataFrame({
            'Passage': passages,
            'Similarity': similarities.flatten()  # Flatten the similarity tensor/array to ensure compatibility
        })

        # Filter the DataFrame based on the similarity threshold
        df_filtered = df[df['Similarity'] >= min_threshold]

        # If max_num_passages is specified, limit the number of passages returned
        if max_num_passages is not None:
            df_filtered = df_filtered.nlargest(max_num_passages, 'Similarity')

        # Return the filtered DataFrame, sorted by similarity
        df_filtered = df_filtered.sort_values(by='Similarity', ascending=False)
        return df_filtered
    except Exception as e:
        # Log the exception message or handle it as needed
        print(f"An error occurred: {e}")
        return pd.DataFrame()  # Return an empty DataFrame in case of error

def RAG_retrieval_Z_scores(queryText, passages, z_threshold=1.0, max_num_passages=None, min_threshold=0.5):
    try:
        # Encoding and similarity computation remains the same
        similarities = retrievePassageSimilarities(queryText, passages)

        # Calculate z-scores for similarities
        z_scores = zscore(similarities.flatten())

        # Create a DataFrame with passages, similarities, and z-scores
        df = pd.DataFrame({
            'Passage': passages,
            'Similarity': similarities.flatten(),
            'Z-Score': z_scores
        })

        # Filter passages based on the z-score threshold
        df_filtered = df[df['Z-Score'] >= z_threshold]

        if min_threshold:
            # Also filter on the minimum similarity threshold (applied to the
            # z-score-filtered frame, so both conditions must hold)
            df_filtered = df_filtered[df_filtered['Similarity'] >= min_threshold]

        # If max_num_passages is specified, limit the number of passages returned
        if max_num_passages is not None:
            df_filtered = df_filtered.nlargest(max_num_passages, 'Similarity')

        # Sort by similarity (or z-score if preferred)
        df_filtered = df_filtered.sort_values(by='Similarity', ascending=False)
        return df_filtered
    except Exception as e:
        # Log the exception message or handle it as needed
        print(f"An error occurred: {e}")
        return pd.DataFrame()  # Return an empty DataFrame in case of error
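
# Worked example (illustrative numbers): similarities [0.8, 0.5, 0.4, 0.3] have
# mean 0.5 and (population) std ~0.187, so their z-scores are roughly
# [1.60, 0.00, -0.53, -1.07]; with z_threshold=1.0 only the first passage
# survives the z-score filter.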

def RAG_retrieval_Percentile(queryText, passages, percentile=90, max_num_passages=None, min_threshold=0.5):
    try:
        # Encoding and similarity computation remains the same
        similarities = retrievePassageSimilarities(queryText, passages)

        # Determine threshold based on percentile
        threshold = np.percentile(similarities.flatten(), percentile)

        # Create a DataFrame
        df = pd.DataFrame({
            'Passage': passages,
            'Similarity': similarities.flatten()
        })

        # Filter using the percentile threshold
        df_filtered = df[df['Similarity'] >= threshold]

        if min_threshold:
            # Also filter on the minimum similarity threshold (applied to the
            # percentile-filtered frame, so both conditions must hold)
            df_filtered = df_filtered[df_filtered['Similarity'] >= min_threshold]

        # If max_num_passages is specified, limit the number of passages returned
        if max_num_passages is not None:
            df_filtered = df_filtered.nlargest(max_num_passages, 'Similarity')

        # Sort by similarity
        df_filtered = df_filtered.sort_values(by='Similarity', ascending=False)
        return df_filtered
    except Exception as e:
        # Log the exception message or handle it as needed
        print(f"An error occurred: {e}")
        return pd.DataFrame()  # Return an empty DataFrame in case of error
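
# Worked example (illustrative numbers): for similarities [0.8, 0.5, 0.4, 0.3],
# np.percentile(..., 90) interpolates to ~0.71, so only the 0.8 passage clears
# the percentile cut before the min_threshold filter is applied.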

def RAG_retrieval_TopK(queryText, passages, top_fraction=0.1, max_num_passages=None, min_threshold=0.5):
    try:
        # Encoding and similarity computation remains the same
        similarities = retrievePassageSimilarities(queryText, passages)

        # Calculate the number of passages to select based on the top fraction (at least one)
        num_passages_TopFraction = max(1, int(top_fraction * len(passages)))

        # Create a DataFrame
        df = pd.DataFrame({
            'Passage': passages,
            'Similarity': similarities.flatten()
        })

        # Select the top passages dynamically
        df_filtered = df.nlargest(num_passages_TopFraction, 'Similarity')

        if min_threshold:
            # Also filter on the minimum similarity threshold
            df_filtered = df_filtered[df_filtered['Similarity'] >= min_threshold]

        # If max_num_passages is specified, limit the number of passages returned
        if max_num_passages is not None:
            df_filtered = df_filtered.nlargest(max_num_passages, 'Similarity')

        # Sort by similarity
        df_filtered = df_filtered.sort_values(by='Similarity', ascending=False)
        return df_filtered
    except Exception as e:
        # Log the exception message or handle it as needed
        print(f"An error occurred: {e}")
        return pd.DataFrame()  # Return an empty DataFrame in case of error
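
# Worked example: with 4 passages and top_fraction=0.1, max(1, int(0.1 * 4)) = 1,
# so only the single best-scoring passage is kept before the min_threshold and
# max_num_passages limits are applied.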

if __name__ == '__main__':

    queryText = 'A man is eating a piece of bread'

    # Define the passages list
    passages = [
        "A man is eating food.",
        "A man is eating pasta.",
        "The girl is carrying a baby.",
        "A man is riding a horse.",
    ]
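
    # With this query (the pairing used on the model card), the food-related
    # passages are expected to score highest.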

    # df_retrieved = RAG_retrieval_Base(queryText, passages)
    # df_retrieved = RAG_retrieval_Base(queryText, passages, min_threshold=0.5)
    # df_retrieved = RAG_retrieval_Base(queryText, passages, max_num_passages=3)
    df_retrieved = RAG_retrieval_Base(queryText, passages, min_threshold=0.5, max_num_passages=3)
    # df_retrieved = RAG_retrieval_Z_scores(queryText, passages, z_threshold=1.0)
    # df_retrieved = RAG_retrieval_Z_scores(queryText, passages, z_threshold=1.0, max_num_passages=3)
    # df_retrieved = RAG_retrieval_Percentile(queryText, passages, percentile=80)
    # df_retrieved = RAG_retrieval_Percentile(queryText, passages, percentile=80, max_num_passages=3)
    # df_retrieved = RAG_retrieval_TopK(queryText, passages, top_fraction=0.2)
    # df_retrieved = RAG_retrieval_TopK(queryText, passages, top_fraction=0.2, max_num_passages=3)

    print(df_retrieved)

    # labelTriplesLIST_RAGGED = df_retrieved['Passage'].apply(lambda x: (x,)).tolist()

    print("end of computations")

# VERSION WITHOUT ACCELERATE
#
# # https://www.mixedbread.ai/blog/mxbai-embed-large-v1
# # https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1
#
# import os
#
# os.environ["HF_HUB_CACHE"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
# os.environ["HUGGINGFACE_HUB_CACHE"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
# os.environ["HF_HOME"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
#
# import pandas as pd
# import numpy as np
# from typing import Dict
#
# import torch
# from transformers import AutoModel, AutoTokenizer
# from sentence_transformers.util import cos_sim
#
# # For retrieval you need to pass this prompt. Please find out more in our blog post.
# def transform_queryText(queryText: str) -> str:
#     """For retrieval, add the prompt for queryText (not for documents)."""
#     return f'Represent this sentence for searching relevant passages: {queryText}'
#
# # The model works really well with cls pooling (default) but also with mean pooling.
# def pooling(outputs: torch.Tensor, inputs: Dict, strategy: str = 'cls') -> np.ndarray:
#     if strategy == 'cls':
#         outputs = outputs[:, 0]
#     elif strategy == 'mean':
#         outputs = torch.sum(
#             outputs * inputs["attention_mask"][:, :, None], dim=1
#         ) / torch.sum(inputs["attention_mask"], dim=1, keepdim=True)
#     else:
#         raise NotImplementedError
#     return outputs.detach().cpu().numpy()
#
# # 1. load model
# model_id = 'mixedbread-ai/mxbai-embed-large-v1'
# tokenizer = AutoTokenizer.from_pretrained(model_id)
# model = AutoModel.from_pretrained(model_id).cuda()
#
# queryText = 'A man is eating a piece of bread'
#
# # Define the passages list
# passages = [
#     "A man is eating food.",
#     "A man is eating pasta.",
#     "The girl is carrying a baby.",
#     "A man is riding a horse.",
# ]
#
# # Create the docs list by adding the transformed queryText and then the passages
# docs = [transform_queryText(queryText)] + passages
#
# # 2. encode
# inputs = tokenizer(docs, padding=True, return_tensors='pt')
# for k, v in inputs.items():
#     inputs[k] = v.cuda()
# outputs = model(**inputs).last_hidden_state
# embeddings = pooling(outputs, inputs, 'cls')
#
# similarities = cos_sim(embeddings[0], embeddings[1:])
# print('similarities:', similarities)
#
# # Create a DataFrame
# df = pd.DataFrame({
#     'Passage': passages,
#     'Similarity': similarities.flatten()  # Flatten the similarity tensor/array to ensure compatibility
# })
#
# # Display the DataFrame
# print(df)
#
# print("end of computations")