import contextlib
import logging
import math
import os
from dataclasses import dataclass
from typing import Callable, List, Optional, Union

import numpy
import psutil
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

from relik.common.log import get_logger
from relik.common.utils import is_package_available
from relik.retriever.common.model_inputs import ModelInputs
from relik.retriever.data.base.datasets import BaseDataset
from relik.retriever.data.labels import Labels
from relik.retriever.indexers.base import BaseDocumentIndex
from relik.retriever.pytorch_modules import PRECISION_MAP, RetrievedSample
from relik.retriever.pytorch_modules.model import GoldenRetriever

if is_package_available("faiss"):
    import faiss
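    # `faiss.contrib.torch_utils` patches the FAISS index methods so that they
    # accept torch tensors in addition to numpy arrays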
    import faiss.contrib.torch_utils

logger = get_logger(__name__, level=logging.INFO)


@dataclass
class FaissOutput:
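    """Indices and distances returned by a FAISS search."""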
    indices: Union[torch.Tensor, numpy.ndarray]
    distances: Union[torch.Tensor, numpy.ndarray]


class FaissDocumentIndex(BaseDocumentIndex):
    DOCUMENTS_FILE_NAME = "documents.json"
    EMBEDDINGS_FILE_NAME = "embeddings.pt"
    INDEX_FILE_NAME = "index.faiss"

    def __init__(
        self,
        documents: Union[List[str], Labels],
        embeddings: Optional[Union[torch.Tensor, numpy.ndarray]] = None,
        index=None,
        index_type: str = "Flat",
        nprobe: int = 1,
        metric: int = faiss.METRIC_INNER_PRODUCT,
        normalize: bool = False,
        device: str = "cpu",
        name_or_dir: Optional[Union[str, os.PathLike]] = None,
        *args,
        **kwargs,
    ) -> None:
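        """
        A document index backed by a FAISS index.

        Args:
            documents (:obj:`Union[List[str], Labels]`):
                The documents to index.
            embeddings (:obj:`Union[torch.Tensor, numpy.ndarray]`, `optional`, defaults to None):
                Precomputed document embeddings, aligned with ``documents``.
            index (`optional`, defaults to None):
                An already-built FAISS index to use directly.
            index_type (:obj:`str`, `optional`, defaults to "Flat"):
                The FAISS index-factory string describing the index structure.
            nprobe (:obj:`int`, `optional`, defaults to 1):
                The number of inverted lists to probe at search time (IVF indexes only).
            metric (:obj:`int`, `optional`, defaults to faiss.METRIC_INNER_PRODUCT):
                The FAISS metric to use.
            normalize (:obj:`bool`, `optional`, defaults to False):
                Whether to L2-normalize the embeddings.
            device (:obj:`str`, `optional`, defaults to "cpu"):
                The device where the index is stored.
            name_or_dir (:obj:`Union[str, os.PathLike]`, `optional`, defaults to None):
                The name or directory of the index, forwarded to the base class.
        """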
        super().__init__(documents, embeddings, name_or_dir)

        if embeddings is not None and documents is not None:
            logger.info("Both documents and embeddings are provided.")
            if documents.get_label_size() != embeddings.shape[0]:
                raise ValueError(
                    "The number of documents and embeddings must be the same."
                )
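        # use one OpenMP thread per physical core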
        faiss.omp_set_num_threads(psutil.cpu_count(logical=False))

        # device to store the embeddings
        self.device = device
        # index params
        self.index_type = index_type
        self.nprobe = nprobe
        self.metric = metric
        self.normalize = normalize

        if index is not None:
            self.embeddings = index
            if self.device == "cuda":
                # use a single GPU
                faiss_resource = faiss.StandardGpuResources()
                self.embeddings = faiss.index_cpu_to_gpu(
                    faiss_resource, 0, self.embeddings
                )
        else:
            if embeddings is not None:
                # build the faiss index
                logger.info("Building the index from the embeddings.")
                self.embeddings = self._build_faiss_index(
                    embeddings=embeddings,
                    index_type=index_type,
                    nprobe=nprobe,
                    normalize=normalize,
                    metric=metric,
                )

    def _build_faiss_index(
        self,
        embeddings: Optional[Union[torch.Tensor, numpy.ndarray]],
        index_type: str,
        nprobe: int,
        normalize: bool,
        metric: int,
    ):
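        """
        Build a FAISS index from the given embeddings.

        The index structure is controlled by ``index_type`` (a FAISS
        index-factory string); an ``L2norm`` transform is prepended when
        ``normalize`` is set, and the index is moved to the GPU when
        ``self.device`` is ``"cuda"`` before being trained and filled.
        """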
        # normalization is applied only for the inner-product metric and numpy
        # inputs (faiss.normalize_L2 does not accept torch tensors); it is
        # implemented by prepending an `L2norm` transform to the factory string
        self.normalize = (
            normalize
            and metric == faiss.METRIC_INNER_PRODUCT
            and not isinstance(embeddings, torch.Tensor)
        )
        if self.normalize:
            index_type = f"L2norm,{index_type}"
        faiss_vector_size = embeddings.shape[1]

        # if self.device == "cpu":
        #     # experimental: swap in an HNSW-based coarse quantizer and derive
        #     # `nlist` from the vector size
        #     index_type = index_type.replace("x,", "x_HNSW32,")
        #     nlist = math.ceil(math.sqrt(faiss_vector_size)) * 4
        #     index_type = index_type.replace("x", str(nlist))

        logger.info(f"Building the index of type `{index_type}`.")
        self.embeddings = faiss.index_factory(faiss_vector_size, index_type, metric)
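        # NOTE: the factory string controls the index structure: "Flat" gives
        # exact search, while strings such as "IVF1024,Flat" or "HNSW32"
        # (examples, not defaults used here) give approximate search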

        # convert to GPU
        if self.device == "cuda":
            # use a single GPU
            faiss_resource = faiss.StandardGpuResources()
            self.embeddings = faiss.index_cpu_to_gpu(faiss_resource, 0, self.embeddings)
        else:
            # move to CPU if embeddings is a torch.Tensor
            embeddings = (
                embeddings.cpu() if isinstance(embeddings, torch.Tensor) else embeddings
            )
            # `efConstruction` only exists on HNSW indexes; guard the access
            if "HNSW" in index_type:
                self.embeddings.hnsw.efConstruction = 20

        # convert to float32 if embeddings is a torch.Tensor and is float16
        if isinstance(embeddings, torch.Tensor) and embeddings.dtype == torch.float16:
            embeddings = embeddings.float()

        logger.info("Training the index.")
        self.embeddings.train(embeddings)
        logger.info("Adding the embeddings to the index.")
        self.embeddings.add(embeddings)
        # `nprobe` only applies to IVF-style indexes
        if "IVF" in index_type:
            self.embeddings.nprobe = nprobe
        # `efSearch` only applies to HNSW indexes
        if "HNSW" in index_type:
            self.embeddings.hnsw.efSearch = 256

        # save parameters for saving/loading
        self.index_type = index_type
        self.metric = metric

        # drop the local reference to the raw embeddings to free memory
        del embeddings
        return self.embeddings

    def index(
        self,
        retriever: GoldenRetriever,
        documents: Optional[List[str]] = None,
        batch_size: int = 32,
        num_workers: int = 4,
        max_length: Optional[int] = None,
        collate_fn: Optional[Callable] = None,
        encoder_precision: Optional[Union[str, int]] = None,
        compute_on_cpu: bool = False,
        force_reindex: bool = False,
        *args,
        **kwargs,
    ) -> "FaissDocumentIndex":
""" | |
Index the documents using the encoder. | |
Args: | |
retriever (:obj:`torch.nn.Module`): | |
The encoder to be used for indexing. | |
documents (:obj:`List[str]`, `optional`, defaults to None): | |
The documents to be indexed. | |
batch_size (:obj:`int`, `optional`, defaults to 32): | |
The batch size to be used for indexing. | |
num_workers (:obj:`int`, `optional`, defaults to 4): | |
The number of workers to be used for indexing. | |
max_length (:obj:`int`, `optional`, defaults to None): | |
The maximum length of the input to the encoder. | |
collate_fn (:obj:`Callable`, `optional`, defaults to None): | |
The collate function to be used for batching. | |
encoder_precision (:obj:`Union[str, int]`, `optional`, defaults to None): | |
The precision to be used for the encoder. | |
compute_on_cpu (:obj:`bool`, `optional`, defaults to False): | |
Whether to compute the embeddings on CPU. | |
force_reindex (:obj:`bool`, `optional`, defaults to False): | |
Whether to force reindexing. | |
Returns: | |
:obj:`InMemoryIndexer`: The indexer object. | |
""" | |
        if self.embeddings is not None and not force_reindex:
            logger.info(
                "Embeddings are already present and `force_reindex` is `False`. Skipping indexing."
            )
            if documents is None:
                return self

        if collate_fn is None:
            tokenizer = retriever.passage_tokenizer

            def collate_fn(x):
                return ModelInputs(
                    tokenizer(
                        x,
                        padding=True,
                        return_tensors="pt",
                        truncation=True,
                        max_length=max_length or tokenizer.model_max_length,
                    )
                )

        if force_reindex:
            if documents is not None:
                self.documents.add_labels(documents)
            data = [k for k in self.documents.get_labels()]
        else:
            if documents is not None:
                data = [k for k in Labels(documents).get_labels()]
            else:
                return self

        dataloader = DataLoader(
            BaseDataset(name="passage", data=data),
            batch_size=batch_size,
            shuffle=False,
            num_workers=num_workers,
            pin_memory=False,
            collate_fn=collate_fn,
        )

        encoder = retriever.passage_encoder

        # Create an empty list to store the passage embeddings
        passage_embeddings: List[torch.Tensor] = []

        encoder_device = "cpu" if compute_on_cpu else self.device

        # autocast only accepts bare device types such as "cpu" or "cuda",
        # so strip any ordinal from the model device (e.g. "cuda:0" -> "cuda")
        device_type_for_autocast = str(encoder_device).split(":")[0]
        # autocast doesn't support CPU with dtypes other than bfloat16
        autocast_pssg_mngr = (
            contextlib.nullcontext()
            if device_type_for_autocast == "cpu"
            else (
                torch.autocast(
                    device_type=device_type_for_autocast,
                    dtype=PRECISION_MAP[encoder_precision],
                )
            )
        )
        with autocast_pssg_mngr:
            # Iterate through each batch in the dataloader
            for batch in tqdm(dataloader, desc="Indexing"):
                # Move the batch to the device
                batch: ModelInputs = batch.to(encoder_device)
                # Compute the passage embeddings
                passage_outs = encoder(**batch)
                # Append the passage embeddings to the list
                if self.device == "cpu":
                    passage_embeddings.extend([c.detach().cpu() for c in passage_outs])
                else:
                    passage_embeddings.extend([c for c in passage_outs])

        # move the passage embeddings to the CPU if not already done
        passage_embeddings = [c.detach().cpu() for c in passage_embeddings]
        # stack them into a single tensor
        passage_embeddings: torch.Tensor = torch.stack(passage_embeddings, dim=0)
        # convert to float32 for faiss (`Tensor.to` is not in place)
        passage_embeddings = passage_embeddings.to(PRECISION_MAP["float32"])

        # index the embeddings
        self.embeddings = self._build_faiss_index(
            embeddings=passage_embeddings,
            index_type=self.index_type,
            nprobe=self.nprobe,
            normalize=self.normalize,
            metric=self.metric,
        )
        # free up memory from the unused variable
        del passage_embeddings

        return self

    def search(self, query: torch.Tensor, k: int = 1) -> List[List[RetrievedSample]]:
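        """
        Search the index for the top ``k`` documents closest to each query.

        Args:
            query (:obj:`torch.Tensor`):
                A batch of query embeddings.
            k (:obj:`int`, `optional`, defaults to 1):
                The number of documents to retrieve for each query.

        Returns:
            :obj:`List[List[RetrievedSample]]`: For each query, the retrieved
            documents with their indices and scores.
        """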
        k = min(k, self.embeddings.ntotal)
        if self.normalize:
            # in-place L2 normalization (expects a numpy float32 array)
            faiss.normalize_L2(query)
        if isinstance(query, torch.Tensor) and self.device == "cpu":
            query = query.detach().cpu()
        # Retrieve the top k passage embeddings (distances, indices)
        retriever_out = self.embeddings.search(query, k)
        # indices of the retrieved documents (second element of the tuple)
        batch_top_k: List[List[int]] = retriever_out[1].detach().cpu().tolist()
        # scores of the retrieved documents (first element of the tuple)
        batch_scores: List[List[float]] = retriever_out[0].detach().cpu().tolist()
        # Retrieve the passages corresponding to the indices
        batch_passages = [
            [self.documents.get_label_from_index(i) for i in indices if i != -1]
            for indices in batch_top_k
        ]
        # build the output object
        batch_retrieved_samples = [
            [
                RetrievedSample(label=passage, index=index, score=score)
                for passage, index, score in zip(passages, indices, scores)
            ]
            for passages, indices, scores in zip(
                batch_passages, batch_top_k, batch_scores
            )
        ]
        return batch_retrieved_samples

    # def save(self, saving_dir: Union[str, os.PathLike]):
    #     """
    #     Save the indexer to disk.
    #
    #     Args:
    #         saving_dir (:obj:`Union[str, os.PathLike]`):
    #             The directory where the indexer will be saved.
    #     """
    #     saving_dir = Path(saving_dir)
    #     # save the passage embeddings
    #     index_path = saving_dir / self.INDEX_FILE_NAME
    #     logger.info(f"Saving passage embeddings to {index_path}")
    #     faiss.write_index(self.embeddings, str(index_path))
    #     # save the passage index
    #     documents_path = saving_dir / self.DOCUMENTS_FILE_NAME
    #     logger.info(f"Saving passage index to {documents_path}")
    #     self.documents.save(documents_path)

    # @classmethod
    # def load(
    #     cls,
    #     loading_dir: Union[str, os.PathLike],
    #     device: str = "cpu",
    #     document_file_name: Optional[str] = None,
    #     embedding_file_name: Optional[str] = None,
    #     index_file_name: Optional[str] = None,
    #     **kwargs,
    # ) -> "FaissDocumentIndex":
    #     loading_dir = Path(loading_dir)
    #     document_file_name = document_file_name or cls.DOCUMENTS_FILE_NAME
    #     embedding_file_name = embedding_file_name or cls.EMBEDDINGS_FILE_NAME
    #     index_file_name = index_file_name or cls.INDEX_FILE_NAME
    #     # load the documents
    #     documents_path = loading_dir / document_file_name
    #     if not documents_path.exists():
    #         raise ValueError(f"Document file `{documents_path}` does not exist.")
    #     logger.info(f"Loading documents from {documents_path}")
    #     documents = Labels.from_file(documents_path)
    #     index = None
    #     embeddings = None
    #     # try to load the index directly
    #     index_path = loading_dir / index_file_name
    #     if not index_path.exists():
    #         # fall back to the raw embeddings
    #         embedding_path = loading_dir / embedding_file_name
    #         if embedding_path.exists():
    #             logger.info(f"Loading embeddings from {embedding_path}")
    #             embeddings = torch.load(embedding_path, map_location="cpu")
    #         else:
    #             logger.warning(
    #                 f"Index file `{index_path}` and embedding file `{embedding_path}` do not exist."
    #             )
    #     else:
    #         logger.info(f"Loading index from {index_path}")
    #         index = faiss.read_index(str(index_path))
    #     return cls(
    #         documents=documents,
    #         embeddings=embeddings,
    #         index=index,
    #         device=device,
    #         **kwargs,
    #     )

    def get_embeddings_from_index(
        self, index: int
    ) -> Union[torch.Tensor, numpy.ndarray]:
        """
        Get the document vector from the index.

        Args:
            index (`int`):
                The index of the document.

        Returns:
            `torch.Tensor`: The document vector.
        """
        if self.embeddings is None:
            raise ValueError(
                "The documents must be indexed before they can be retrieved."
            )
        if index >= self.embeddings.ntotal:
            raise ValueError(
                f"The index {index} is out of bounds. The maximum index is {self.embeddings.ntotal - 1}."
            )
        return self.embeddings.reconstruct(index)
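

# A minimal usage sketch (illustrative only, not part of the original module):
# build an index from precomputed embeddings and query it. The document list,
# the 768-dimensional random vectors, and the query below are made-up
# placeholders, not values used anywhere in relik.
#
#     import numpy
#     import torch
#
#     docs = Labels(["first passage", "second passage"])
#     vectors = numpy.random.rand(2, 768).astype("float32")
#     doc_index = FaissDocumentIndex(documents=docs, embeddings=vectors)
#     samples = doc_index.search(torch.rand(1, 768), k=2)
#     print([sample.label for sample in samples[0]])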