Spaces:

jrc-ai
/

MultiNER-simplified

Running

File size: 11,788 Bytes

232b620

# https://www.mixedbread.ai/blog/mxbai-embed-large-v1
# https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1

import os
import time
import pandas as pd
import numpy as np
from typing import Dict

import torch
from transformers import AutoModel, AutoTokenizer
from sentence_transformers.util import cos_sim
from accelerate import Accelerator  # Import from accelerate
from scipy.stats import zscore

# Point the Hugging Face cache environment variables at the shared cache directory.
# NOTE(review): these are assigned after `transformers` has already been imported
# at the top of this file; if the Hugging Face libraries resolve their cache paths
# at import time, the assignments may come too late to take effect - confirm.
os.environ.update(
    dict.fromkeys(
        ("HF_HUB_CACHE", "HUGGINGFACE_HUB_CACHE", "HF_HOME"),
        "/eos/jeodpp/home/users/consose/cache/huggingface/hub",
    )
)

# Let Accelerate pick the device; tokenized inputs are moved onto it before
# every forward pass.
accelerator = Accelerator()
device = accelerator.device
print("Using accelerator device =", device)


# 1. Load the retriever tokenizer and model
model_id_Retriever = 'mixedbread-ai/mxbai-embed-large-v1'
tokenizer_Retriever = AutoTokenizer.from_pretrained(model_id_Retriever)

# The freshly loaded model is handed straight to Accelerate, which prepares it
# (e.g. device placement) before it is used.
modelRetriever = accelerator.prepare(AutoModel.from_pretrained(model_id_Retriever))




# Define the transform_query function
def transform_query(queryText: str) -> str:
    """Return *queryText* prefixed with the retrieval instruction prompt.

    The prompt is meant for the query only, not for the documents.
    """
    prompt_template = 'Represent this sentence for searching relevant passages: {}'
    return prompt_template.format(queryText)

# Define the pooling function
def pooling(outputs: torch.Tensor, inputs: Dict, strategy: str = 'cls') -> np.ndarray:
    """Collapse per-token model outputs into one vector per sequence.

    Args:
        outputs: Model output tensor; indexed as ``outputs[:, 0]`` for CLS
            pooling and summed over ``dim=1`` for mean pooling.
        inputs: Tokenizer output; only ``inputs["attention_mask"]`` is read,
            and only for the ``'mean'`` strategy.
        strategy: ``'cls'`` keeps the first position of every sequence;
            ``'mean'`` averages positions weighted by the attention mask.

    Returns:
        The pooled vectors, detached, moved to CPU and converted to NumPy.

    Raises:
        NotImplementedError: For any other ``strategy`` value.
    """
    if strategy == 'cls':
        return outputs[:, 0].detach().cpu().numpy()

    if strategy == 'mean':
        mask = inputs["attention_mask"]
        # Zero out masked positions, then divide by the per-sequence mask sum.
        masked_total = (outputs * mask.unsqueeze(2)).sum(dim=1)
        token_counts = mask.sum(dim=1, keepdim=True)
        return (masked_total / token_counts).detach().cpu().numpy()

    raise NotImplementedError


def retrievePassageSimilarities(queryText, passages):
    """Score every passage against the query with the retriever model.

    The query is wrapped with the retrieval prompt (``transform_query``), then
    the query and all passages are tokenized and encoded together in a single
    forward pass. CLS pooling yields one embedding per text.

    Args:
        queryText: The search query.
        passages: List of passage strings to compare with the query.

    Returns:
        The result of ``cos_sim`` between the query embedding and the passage
        embeddings; callers flatten it to get one score per passage.
    """
    # The query goes first so that embeddings[0] is the query vector.
    docs = [transform_query(queryText)] + passages

    # truncation=True caps every text at the tokenizer's maximum length, so a
    # single over-long passage is shortened instead of being fed to the model
    # at a length it may not support (which would fail the whole batch).
    inputs = tokenizer_Retriever(docs, padding=True, truncation=True, return_tensors='pt')

    # Move the tokenized batch onto the device chosen by Accelerate.
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = modelRetriever(**inputs).last_hidden_state
    embeddings = pooling(outputs, inputs, 'cls')

    return cos_sim(embeddings[0], embeddings[1:])



def RAG_retrieval_Base(queryText, passages, min_threshold=0.0, max_num_passages=None):
    """Return the passages whose similarity to the query reaches a fixed threshold.

    Args:
        queryText: The search query.
        passages: List of passage strings to rank.
        min_threshold: Passages with similarity below this value are dropped.
        max_num_passages: If not None, keep at most this many of the most
            similar passages.

    Returns:
        A DataFrame with columns ``'Passage'`` and ``'Similarity'``, sorted by
        similarity in descending order. If anything fails, the error is
        printed and an empty DataFrame with the same columns is returned, so
        callers can still access ``df['Passage']`` safely.
    """
    try:
        similarities = retrievePassageSimilarities(queryText, passages)

        df = pd.DataFrame({
            'Passage': passages,
            # Flatten so there is exactly one similarity value per passage.
            'Similarity': similarities.flatten(),
        })

        # Keep only passages that reach the similarity threshold.
        df_filtered = df[df['Similarity'] >= min_threshold]

        # Optionally cap the number of passages returned.
        if max_num_passages is not None:
            df_filtered = df_filtered.nlargest(max_num_passages, 'Similarity')

        return df_filtered.sort_values(by='Similarity', ascending=False)

    except Exception as e:
        # Best-effort retrieval: report the problem and hand back an empty
        # result that still has the expected columns.
        print(f"An error occurred: {e}")
        return pd.DataFrame(columns=['Passage', 'Similarity'])



def RAG_retrieval_Z_scores(queryText, passages, z_threshold=1.0, max_num_passages=None, min_threshold=0.5):
    """Return the passages whose similarity stands out from the rest by z-score.

    Args:
        queryText: The search query.
        passages: List of passage strings to rank.
        z_threshold: Minimum z-score (computed over all similarities of this
            call) a passage must reach.
        max_num_passages: If not None, keep at most this many of the most
            similar passages.
        min_threshold: Additional minimum similarity, applied on top of the
            z-score filter. A falsy value (None or 0) disables it.

    Returns:
        A DataFrame with columns ``'Passage'``, ``'Similarity'`` and
        ``'Z-Score'``, sorted by similarity in descending order. If anything
        fails, the error is printed and an empty DataFrame with the same
        columns is returned.
    """
    try:
        similarities = retrievePassageSimilarities(queryText, passages)
        flat_similarities = similarities.flatten()

        # NOTE(review): with a single passage or identical similarities the
        # z-scores are presumably NaN, so nothing passes the filter below -
        # confirm this is the desired behaviour.
        z_scores = zscore(flat_similarities)

        df = pd.DataFrame({
            'Passage': passages,
            'Similarity': flat_similarities,
            'Z-Score': z_scores,
        })

        # Filter passages based on the z-score threshold.
        df_filtered = df[df['Z-Score'] >= z_threshold]

        if min_threshold:
            # Narrow the z-score selection further. Filtering from `df` here
            # would silently discard the z-score filter applied above.
            df_filtered = df_filtered[df_filtered['Similarity'] >= min_threshold]

        # Optionally cap the number of passages returned.
        if max_num_passages is not None:
            df_filtered = df_filtered.nlargest(max_num_passages, 'Similarity')

        return df_filtered.sort_values(by='Similarity', ascending=False)

    except Exception as e:
        # Best-effort retrieval: report the problem and hand back an empty
        # result that still has the expected columns.
        print(f"An error occurred: {e}")
        return pd.DataFrame(columns=['Passage', 'Similarity', 'Z-Score'])




def RAG_retrieval_Percentile(queryText, passages, percentile=90, max_num_passages=None, min_threshold=0.5):
    """Return the passages whose similarity lies at or above a percentile.

    Args:
        queryText: The search query.
        passages: List of passage strings to rank.
        percentile: Percentile (as passed to ``np.percentile``) of this call's
            similarities that a passage must reach.
        max_num_passages: If not None, keep at most this many of the most
            similar passages.
        min_threshold: Additional minimum similarity, applied on top of the
            percentile filter. A falsy value (None or 0) disables it.

    Returns:
        A DataFrame with columns ``'Passage'`` and ``'Similarity'``, sorted by
        similarity in descending order. If anything fails, the error is
        printed and an empty DataFrame with the same columns is returned.
    """
    try:
        similarities = retrievePassageSimilarities(queryText, passages)
        flat_similarities = similarities.flatten()

        # The cut-off is relative to the similarities of this call.
        threshold = np.percentile(flat_similarities, percentile)

        df = pd.DataFrame({
            'Passage': passages,
            'Similarity': flat_similarities,
        })

        # Filter using the percentile threshold.
        df_filtered = df[df['Similarity'] >= threshold]

        if min_threshold:
            # Narrow the percentile selection further. Filtering from `df`
            # here would silently discard the percentile filter applied above.
            df_filtered = df_filtered[df_filtered['Similarity'] >= min_threshold]

        # Optionally cap the number of passages returned.
        if max_num_passages is not None:
            df_filtered = df_filtered.nlargest(max_num_passages, 'Similarity')

        return df_filtered.sort_values(by='Similarity', ascending=False)

    except Exception as e:
        # Best-effort retrieval: report the problem and hand back an empty
        # result that still has the expected columns.
        print(f"An error occurred: {e}")
        return pd.DataFrame(columns=['Passage', 'Similarity'])



def RAG_retrieval_TopK(queryText, passages, top_fraction=0.1, max_num_passages=None, min_threshold=0.5):
    """Return the top fraction of passages ranked by similarity to the query.

    Args:
        queryText: The search query.
        passages: List of passage strings to rank.
        top_fraction: Fraction of ``len(passages)`` to keep; at least one
            passage is always selected before the other filters apply.
        max_num_passages: If not None, keep at most this many of the most
            similar passages.
        min_threshold: Additional minimum similarity, applied on top of the
            top-fraction selection. A falsy value (None or 0) disables it.

    Returns:
        A DataFrame with columns ``'Passage'`` and ``'Similarity'``, sorted by
        similarity in descending order. If anything fails, the error is
        printed and an empty DataFrame with the same columns is returned.
    """
    try:
        similarities = retrievePassageSimilarities(queryText, passages)

        # Number of passages to keep; never fewer than one.
        num_passages_TopFraction = max(1, int(top_fraction * len(passages)))

        df = pd.DataFrame({
            'Passage': passages,
            'Similarity': similarities.flatten(),
        })

        # Select the top passages dynamically.
        df_filtered = df.nlargest(num_passages_TopFraction, 'Similarity')

        if min_threshold:
            # Narrow the top-fraction selection by the minimum similarity.
            df_filtered = df_filtered[df_filtered['Similarity'] >= min_threshold]

        # Optionally cap the number of passages returned.
        if max_num_passages is not None:
            df_filtered = df_filtered.nlargest(max_num_passages, 'Similarity')

        return df_filtered.sort_values(by='Similarity', ascending=False)

    except Exception as e:
        # Best-effort retrieval: report the problem and hand back an empty
        # result that still has the expected columns.
        print(f"An error occurred: {e}")
        return pd.DataFrame(columns=['Passage', 'Similarity'])



if __name__ == '__main__':
    # Smoke test: rank a handful of sentences against one query and print
    # the resulting DataFrame.
    demo_query = 'A man is eating a piece of bread'
    demo_passages = [
        "A man is eating food.",
        "A man is eating pasta.",
        "The girl is carrying a baby.",
        "A man is riding a horse.",
    ]

    # Alternative calls that can be swapped in for the one below:
    #   RAG_retrieval_Base(demo_query, demo_passages)
    #   RAG_retrieval_Base(demo_query, demo_passages, min_threshold=0.5)
    #   RAG_retrieval_Base(demo_query, demo_passages, max_num_passages=3)
    #   RAG_retrieval_Z_scores(demo_query, demo_passages, z_threshold=1.0)
    #   RAG_retrieval_Z_scores(demo_query, demo_passages, z_threshold=1.0, max_num_passages=3)
    #   RAG_retrieval_Percentile(demo_query, demo_passages, percentile=80)
    #   RAG_retrieval_Percentile(demo_query, demo_passages, percentile=80, max_num_passages=3)
    #   RAG_retrieval_TopK(demo_query, demo_passages, top_fraction=0.2)
    #   RAG_retrieval_TopK(demo_query, demo_passages, top_fraction=0.2, max_num_passages=3)
    retrieved = RAG_retrieval_Base(
        demo_query,
        demo_passages,
        min_threshold=0.5,
        max_num_passages=3,
    )
    print(retrieved)

    # Example post-processing of the result (kept for reference):
    #   labelTriplesLIST_RAGGED = retrieved['Passage'].apply(lambda x: (x,)).tolist()

    print("end of computations")

# VERSION WITHOUT ACCELERATE
#
# #https://www.mixedbread.ai/blog/mxbai-embed-large-v1
# #https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1
#
# import os
#
# os.environ["HF_HUB_CACHE"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
# os.environ["HUGGINGFACE_HUB_CACHE"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
# os.environ["HF_HOME"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
#
# import time
# import pandas as pd
# import numpy as np
#
#
#
# from typing import Dict
#
# import torch
# import numpy as np
# from transformers import AutoModel, AutoTokenizer
# from sentence_transformers.util import cos_sim
#
# # For retrieval you need to pass this prompt. Please find out more in our blog post.
# def transform_queryText(queryText: str) -> str:
#     """ For retrieval, add the prompt for queryText (not for documents).
#     """
#     return f'Represent this sentence for searching relevant passages: {queryText}'
#
# # The model works really well with cls pooling (default) but also with mean pooling.
# def pooling(outputs: torch.Tensor, inputs: Dict,  strategy: str = 'cls') -> np.ndarray:
#     if strategy == 'cls':
#         outputs = outputs[:, 0]
#     elif strategy == 'mean':
#         outputs = torch.sum(
#             outputs * inputs["attention_mask"][:, :, None], dim=1) / torch.sum(inputs["attention_mask"], dim=1, keepdim=True)
#     else:
#         raise NotImplementedError
#     return outputs.detach().cpu().numpy()
#
# # 1. load model
# model_id = 'mixedbread-ai/mxbai-embed-large-v1'
# tokenizer = AutoTokenizer.from_pretrained(model_id)
# model = AutoModel.from_pretrained(model_id).cuda()
#
# queryText = 'A man is eating a piece of bread'
#
# # Define the passages list
# passages = [
#     "A man is eating food.",
#     "A man is eating pasta.",
#     "The girl is carrying a baby.",
#     "A man is riding a horse.",
# ]
#
# # Create the docs list by adding the transformed queryText and then the passages
# docs = [transform_queryText(queryText)] + passages
#
# # 2. encode
# inputs = tokenizer(docs, padding=True, return_tensors='pt')
# for k, v in inputs.items():
#     inputs[k] = v.cuda()
# outputs = model(**inputs).last_hidden_state
# embeddings = pooling(outputs, inputs, 'cls')
#
# similarities = cos_sim(embeddings[0], embeddings[1:])
#
# print('similarities:', similarities)
#
#
# # Create a DataFrame
# df = pd.DataFrame({
#     'Passage': passages,
#     'Similarity': similarities.flatten()  # Flatten the similarity tensor/array to ensure compatibility
# })
#
# # Display the DataFrame
# print(df)
#
#
# print("end of computations")