# https://www.mixedbread.ai/blog/mxbai-embed-large-v1
# https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1

import os

# Set up environment variables for Hugging Face caching.
# These must be set before transformers/huggingface_hub are imported,
# otherwise the default cache location is used.
os.environ["HF_HUB_CACHE"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
os.environ["HUGGINGFACE_HUB_CACHE"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
os.environ["HF_HOME"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"

from typing import Dict

import numpy as np
import pandas as pd
import torch
from accelerate import Accelerator
from scipy.stats import zscore
from sentence_transformers.util import cos_sim
from transformers import AutoModel, AutoTokenizer

# Initialize the Accelerator and use the device it manages
accelerator = Accelerator()
device = accelerator.device
print("Using accelerator device =", device)

# 1. Load the model and tokenizer
model_id_Retriever = 'mixedbread-ai/mxbai-embed-large-v1'
tokenizer_Retriever = AutoTokenizer.from_pretrained(model_id_Retriever)
modelRetriever = AutoModel.from_pretrained(model_id_Retriever)

# Accelerate prepares the model (e.g., moves it to the appropriate device)
modelRetriever = accelerator.prepare(modelRetriever)


def transform_query(queryText: str) -> str:
    """For retrieval, add the prompt for queryText (not for documents)."""
    return f'Represent this sentence for searching relevant passages: {queryText}'


def pooling(outputs: torch.Tensor, inputs: Dict, strategy: str = 'cls') -> np.ndarray:
    """Pool token embeddings into one vector per input ('cls' or 'mean')."""
    if strategy == 'cls':
        # Use the embedding of the first ([CLS]) token
        outputs = outputs[:, 0]
    elif strategy == 'mean':
        # Average the token embeddings, ignoring padding via the attention mask
        outputs = torch.sum(
            outputs * inputs["attention_mask"][:, :, None], dim=1
        ) / torch.sum(inputs["attention_mask"], dim=1, keepdim=True)
    else:
        raise NotImplementedError
    return outputs.detach().cpu().numpy()
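
# A minimal illustration of the two pooling strategies (toy values, not model
# output): for a hidden-state batch of shape (batch, seq_len, hidden), 'cls'
# keeps the first token's vector and 'mean' averages the real (non-padding)
# tokens using the attention mask, e.g.:
#
#   hidden = torch.ones(1, 3, 4)
#   toy_inputs = {"attention_mask": torch.tensor([[1, 1, 0]])}  # 3rd token is padding
#   pooling(hidden, toy_inputs, 'cls').shape   # -> (1, 4)
#   pooling(hidden, toy_inputs, 'mean')        # averages only the two real tokens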

def retrievePassageSimilarities(queryText, passages):
    # Create the docs list: the transformed queryText first, then the passages
    docs = [transform_query(queryText)] + passages

    # 2. Encode the inputs and move them to the right device
    inputs = tokenizer_Retriever(docs, padding=True, return_tensors='pt')
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = modelRetriever(**inputs).last_hidden_state
    embeddings = pooling(outputs, inputs, 'cls')

    # Cosine similarity between the query (row 0) and every passage
    similarities = cos_sim(embeddings[0], embeddings[1:])
    #print('similarities:', similarities)
    return similarities


def RAG_retrieval_Base(queryText, passages, min_threshold=0.0, max_num_passages=None):
    try:
        similarities = retrievePassageSimilarities(queryText, passages)

        # Create a DataFrame; flatten the similarity tensor/array for compatibility
        df = pd.DataFrame({
            'Passage': passages,
            'Similarity': similarities.flatten()
        })

        # Filter the DataFrame based on the similarity threshold
        df_filtered = df[df['Similarity'] >= min_threshold]

        # If max_num_passages is specified, limit the number of passages returned
        if max_num_passages is not None:
            df_filtered = df_filtered.nlargest(max_num_passages, 'Similarity')

        df_filtered = df_filtered.sort_values(by='Similarity', ascending=False)
        return df_filtered

    except Exception as e:
        # Log the exception message or handle it as needed
        print(f"An error occurred: {e}")
        return pd.DataFrame()  # Return an empty DataFrame in case of error


def RAG_retrieval_Z_scores(queryText, passages, z_threshold=1.0, max_num_passages=None, min_threshold=0.5):
    try:
        similarities = retrievePassageSimilarities(queryText, passages)

        # Calculate z-scores for the similarities
        z_scores = zscore(similarities.flatten())

        # Create a DataFrame with passages, similarities, and z-scores
        df = pd.DataFrame({
            'Passage': passages,
            'Similarity': similarities.flatten(),
            'Z-Score': z_scores
        })

        # Filter passages based on the z-score threshold
        df_filtered = df[df['Z-Score'] >= z_threshold]

        if min_threshold is not None:
            # Also filter on the minimum similarity threshold
            df_filtered = df_filtered[df_filtered['Similarity'] >= min_threshold]

        # If max_num_passages is specified, limit the number of passages returned
        if max_num_passages is not None:
            df_filtered = df_filtered.nlargest(max_num_passages, 'Similarity')

        # Sort by similarity (or z-score if preferred)
        df_filtered = df_filtered.sort_values(by='Similarity', ascending=False)
        return df_filtered

    except Exception as e:
        # Log the exception message or handle it as needed
        print(f"An error occurred: {e}")
        return pd.DataFrame()  # Return an empty DataFrame in case of error
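
# Worked example of the z-score filter above (illustrative numbers): for
# similarities [0.8, 0.7, 0.3, 0.2] the mean is 0.5 and the population standard
# deviation is ~0.255, so scipy's zscore gives roughly [1.18, 0.78, -0.78, -1.18].
# With z_threshold=1.0 only the 0.8 passage passes the z-score filter, and the
# min_threshold check is then applied on top of that selection.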

def RAG_retrieval_Percentile(queryText, passages, percentile=90, max_num_passages=None, min_threshold=0.5):
    try:
        similarities = retrievePassageSimilarities(queryText, passages)

        # Determine the threshold from the requested percentile
        threshold = np.percentile(similarities.flatten(), percentile)

        # Create a DataFrame
        df = pd.DataFrame({
            'Passage': passages,
            'Similarity': similarities.flatten()
        })

        # Filter using the percentile threshold
        df_filtered = df[df['Similarity'] >= threshold]

        if min_threshold is not None:
            # Also filter on the minimum similarity threshold
            df_filtered = df_filtered[df_filtered['Similarity'] >= min_threshold]

        # If max_num_passages is specified, limit the number of passages returned
        if max_num_passages is not None:
            df_filtered = df_filtered.nlargest(max_num_passages, 'Similarity')

        # Sort by similarity
        df_filtered = df_filtered.sort_values(by='Similarity', ascending=False)
        return df_filtered

    except Exception as e:
        # Log the exception message or handle it as needed
        print(f"An error occurred: {e}")
        return pd.DataFrame()  # Return an empty DataFrame in case of error


def RAG_retrieval_TopK(queryText, passages, top_fraction=0.1, max_num_passages=None, min_threshold=0.5):
    try:
        similarities = retrievePassageSimilarities(queryText, passages)

        # Number of passages to select based on the top fraction (at least one)
        num_passages_TopFraction = max(1, int(top_fraction * len(passages)))

        # Create a DataFrame
        df = pd.DataFrame({
            'Passage': passages,
            'Similarity': similarities.flatten()
        })

        # Select the top passages dynamically
        df_filtered = df.nlargest(num_passages_TopFraction, 'Similarity')

        if min_threshold is not None:
            # Also filter on the minimum similarity threshold
            df_filtered = df_filtered[df_filtered['Similarity'] >= min_threshold]

        # If max_num_passages is specified, limit the number of passages returned
        if max_num_passages is not None:
            df_filtered = df_filtered.nlargest(max_num_passages, 'Similarity')

        # Sort by similarity
        df_filtered = df_filtered.sort_values(by='Similarity', ascending=False)
        return df_filtered

    except Exception as e:
        # Log the exception message or handle it as needed
        print(f"An error occurred: {e}")
        return pd.DataFrame()  # Return an empty DataFrame in case of error


if __name__ == '__main__':
    queryText = 'A man is eating a piece of bread'

    # Define the passages list
    passages = [
        "A man is eating food.",
        "A man is eating pasta.",
        "The girl is carrying a baby.",
        "A man is riding a horse.",
    ]

    #df_retrieved = RAG_retrieval_Base(queryText, passages)
    #df_retrieved = RAG_retrieval_Base(queryText, passages, min_threshold=0.5)
    #df_retrieved = RAG_retrieval_Base(queryText, passages, max_num_passages=3)
    df_retrieved = RAG_retrieval_Base(queryText, passages, min_threshold=0.5, max_num_passages=3)
    #df_retrieved = RAG_retrieval_Z_scores(queryText, passages, z_threshold=1.0)
    #df_retrieved = RAG_retrieval_Z_scores(queryText, passages, z_threshold=1.0, max_num_passages=3)
    #df_retrieved = RAG_retrieval_Percentile(queryText, passages, percentile=80)
    #df_retrieved = RAG_retrieval_Percentile(queryText, passages, percentile=80, max_num_passages=3)
    #df_retrieved = RAG_retrieval_TopK(queryText, passages, top_fraction=0.2)
    #df_retrieved = RAG_retrieval_TopK(queryText, passages, top_fraction=0.2, max_num_passages=3)

    print(df_retrieved)

    #labelTriplesLIST_RAGGED = df_retrieved['Passage'].apply(lambda x: (x,)).tolist()

    print("end of computations")
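
# The four helpers above differ only in how the cutoff is chosen:
# RAG_retrieval_Base applies an absolute similarity threshold,
# RAG_retrieval_Z_scores keeps statistical outliers, RAG_retrieval_Percentile
# keeps everything above a percentile, and RAG_retrieval_TopK keeps a fixed
# fraction. A quick way to compare them side by side on the same inputs
# (hypothetical snippet, to be run inside the __main__ block above):
#
#   for fn, kwargs in [
#       (RAG_retrieval_Base, {"min_threshold": 0.5}),
#       (RAG_retrieval_Z_scores, {"z_threshold": 1.0}),
#       (RAG_retrieval_Percentile, {"percentile": 80}),
#       (RAG_retrieval_TopK, {"top_fraction": 0.2}),
#   ]:
#       print(fn.__name__, fn(queryText, passages, **kwargs), sep="\n")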

# VERSION WITHOUT ACCELERATE
#
# #https://www.mixedbread.ai/blog/mxbai-embed-large-v1
# #https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1
#
# import os
#
# os.environ["HF_HUB_CACHE"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
# os.environ["HUGGINGFACE_HUB_CACHE"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
# os.environ["HF_HOME"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
#
# import pandas as pd
# import numpy as np
# from typing import Dict
#
# import torch
# from transformers import AutoModel, AutoTokenizer
# from sentence_transformers.util import cos_sim
#
# # For retrieval you need to pass this prompt. Please find out more in the blog post.
# def transform_queryText(queryText: str) -> str:
#     """For retrieval, add the prompt for queryText (not for documents)."""
#     return f'Represent this sentence for searching relevant passages: {queryText}'
#
# # The model works really well with cls pooling (default) but also with mean pooling.
# def pooling(outputs: torch.Tensor, inputs: Dict, strategy: str = 'cls') -> np.ndarray:
#     if strategy == 'cls':
#         outputs = outputs[:, 0]
#     elif strategy == 'mean':
#         outputs = torch.sum(
#             outputs * inputs["attention_mask"][:, :, None], dim=1
#         ) / torch.sum(inputs["attention_mask"], dim=1, keepdim=True)
#     else:
#         raise NotImplementedError
#     return outputs.detach().cpu().numpy()
#
# # 1. load model
# model_id = 'mixedbread-ai/mxbai-embed-large-v1'
# tokenizer = AutoTokenizer.from_pretrained(model_id)
# model = AutoModel.from_pretrained(model_id).cuda()
#
# queryText = 'A man is eating a piece of bread'
#
# # Define the passages list
# passages = [
#     "A man is eating food.",
#     "A man is eating pasta.",
#     "The girl is carrying a baby.",
#     "A man is riding a horse.",
# ]
#
# # Create the docs list by adding the transformed queryText and then the passages
# docs = [transform_queryText(queryText)] + passages
#
# # 2. encode
# inputs = tokenizer(docs, padding=True, return_tensors='pt')
# for k, v in inputs.items():
#     inputs[k] = v.cuda()
# outputs = model(**inputs).last_hidden_state
# embeddings = pooling(outputs, inputs, 'cls')
#
# similarities = cos_sim(embeddings[0], embeddings[1:])
# print('similarities:', similarities)
#
# # Create a DataFrame
# df = pd.DataFrame({
#     'Passage': passages,
#     'Similarity': similarities.flatten()  # Flatten the similarity tensor/array to ensure compatibility
# })
#
# # Display the DataFrame
# print(df)
#
# print("end of computations")