"""
FAISS indexing utilities for similarity search.
This module provides utilities for building and searching FAISS indexes.
"""

import logging
from typing import Any, Dict, List, Optional, Tuple

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

from configs.config import Config
from utils.text_processing import validate_chunk_data

logger = logging.getLogger(__name__)


def build_faiss_index(embeddings: np.ndarray) -> faiss.IndexHNSWFlat:
"""
Build a FAISS HNSW index from embeddings for similarity search.
Args:
embeddings: Numpy array of embeddings
Returns:
FAISS HNSW index
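
    Example (a minimal sketch; the dimension here is arbitrary):
        >>> import numpy as np
        >>> vecs = np.random.rand(100, 384).astype("float32")
        >>> index = build_faiss_index(vecs)
        >>> index.ntotal
        100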
"""
    # FAISS expects float32, C-contiguous input; coerce defensively.
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
    dim = embeddings.shape[1]
    # Second argument is M, the number of neighbors per node in the HNSW graph.
    index = faiss.IndexHNSWFlat(dim, Config.FAISS_NEIGHBORS)
    index.hnsw.efConstruction = Config.FAISS_EF_CONSTRUCTION
    index.hnsw.efSearch = Config.FAISS_EF_SEARCH
    index.add(embeddings)
    return index


def retrieve_similar_chunks(
    query: str,
    index: faiss.IndexHNSWFlat,
    chunks_with_metadata: List[Dict[str, Any]],
    embedding_model: SentenceTransformer,
    k: Optional[int] = None,
    max_chunk_length: Optional[int] = None,
) -> List[Tuple[str, float, Dict[str, Any]]]:
"""
Retrieve top k similar chunks to the query from the FAISS index.
Args:
query: Search query
index: FAISS index
chunks_with_metadata: List of chunk dictionaries
embedding_model: SentenceTransformer model
k: Number of chunks to retrieve
max_chunk_length: Maximum length for returned chunks
Returns:
List of tuples (chunk_text, distance, metadata)
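
    Example (an illustrative sketch; the model name and the "page" metadata
    key are assumptions, not pinned by this module):
        model = SentenceTransformer("all-MiniLM-L6-v2")
        hits = retrieve_similar_chunks("What is HNSW?", index, chunks, model, k=5)
        for text, dist, meta in hits:
            print(dist, meta.get("page"), text[:80])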
"""
    if k is None:
        k = Config.DEFAULT_K_CHUNKS
    if max_chunk_length is None:
        max_chunk_length = Config.DEFAULT_CHUNK_SIZE

    # encode() can return a numpy array directly; no tensor round-trip needed.
    # FAISS requires float32, so coerce defensively.
    query_embedding = embedding_model.encode([query], convert_to_numpy=True)
    query_embedding = np.ascontiguousarray(query_embedding, dtype=np.float32)
    distances, indices = index.search(query_embedding, k)
    # FAISS pads results with -1 when fewer than k neighbors exist, so guard
    # the index and keep each distance paired with its original result position.
    valid_results = []
    for idx_pos, chunk_idx in enumerate(indices[0]):
        if 0 <= chunk_idx < len(chunks_with_metadata):
            chunk_text = chunks_with_metadata[chunk_idx]["text"][:max_chunk_length]
            # Only include chunks with meaningful content
            if chunk_text.strip():
                result = (
                    chunk_text,
                    float(distances[0][idx_pos]),  # cast to plain float to match annotation
                    chunks_with_metadata[chunk_idx]["metadata"],
                )
                if validate_chunk_data(result):
                    valid_results.append(result)
    return valid_results


def search_index_with_validation(
    query: str,
    index: faiss.IndexHNSWFlat,
    chunks_with_metadata: List[Dict[str, Any]],
    embedding_model: SentenceTransformer,
    k: Optional[int] = None,
    similarity_threshold: Optional[float] = None,
) -> List[Tuple[str, float, Dict[str, Any]]]:
"""
Search index with additional validation and filtering.
Args:
query: Search query
index: FAISS index
chunks_with_metadata: List of chunk dictionaries
embedding_model: SentenceTransformer model
k: Number of chunks to retrieve
similarity_threshold: Threshold for filtering results
Returns:
List of validated and filtered chunk tuples
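
    Example (an illustrative sketch; the threshold value is an assumption):
        results = search_index_with_validation(
            "summarize section 2", index, chunks, model, k=5,
            similarity_threshold=1.0,
        )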
"""
    if not query or len(query.strip()) < 3:
        return []

    if similarity_threshold is None:
        similarity_threshold = Config.SIMILARITY_THRESHOLD

    try:
        similar_chunks = retrieve_similar_chunks(
            query, index, chunks_with_metadata, embedding_model, k
        )
        # Keep only results closer than the threshold (L2: smaller is more similar)
        return [chunk for chunk in similar_chunks if chunk[1] < similarity_threshold]
    except Exception as e:
        logger.error("Error in index search: %s", e)
        return []


def get_index_stats(index: faiss.IndexHNSWFlat) -> Dict[str, Any]:
"""
Get statistics about the FAISS index.
Args:
index: FAISS index
Returns:
Dictionary with index statistics
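
    Example (a minimal sketch):
        stats = get_index_stats(index)
        print(stats["total_vectors"], stats["dimension"])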
"""
return {
"total_vectors": index.ntotal,
"dimension": index.d,
"index_type": type(index).__name__,
"ef_search": index.hnsw.efSearch,
"ef_construction": index.hnsw.efConstruction,
"is_trained": index.is_trained
}
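

if __name__ == "__main__":
    # Quick end-to-end smoke test of the module (a sketch, not part of the
    # library API). The model name "all-MiniLM-L6-v2" and the sample chunks
    # below are assumptions for illustration only.
    model = SentenceTransformer("all-MiniLM-L6-v2")
    chunks = [
        {"text": "FAISS is a library for efficient similarity search.",
         "metadata": {"page": 1}},
        {"text": "HNSW builds a layered proximity graph over the vectors.",
         "metadata": {"page": 2}},
    ]
    vectors = model.encode([c["text"] for c in chunks], convert_to_numpy=True)
    demo_index = build_faiss_index(vectors)
    print(get_index_stats(demo_index))
    for text, dist, meta in retrieve_similar_chunks(
        "What does FAISS do?", demo_index, chunks, model, k=2
    ):
        print(f"{dist:.3f}  p.{meta['page']}  {text}")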