Spaces:
Running
Running
Jatin Mehra
Add FAISS indexing utilities and enhance text processing functions for improved chunking and validation
447c09c
""" | |
FAISS indexing utilities for similarity search. | |
This module provides utilities for building and searching FAISS indexes. | |
""" | |
from typing import List, Tuple, Any, Dict | |
import numpy as np | |
import faiss | |
from sentence_transformers import SentenceTransformer | |
from configs.config import Config | |
from utils.text_processing import validate_chunk_data | |
def build_faiss_index(embeddings: np.ndarray) -> faiss.IndexHNSWFlat:
    """
    Build a FAISS HNSW index from embeddings for similarity search.

    The input is coerced to a C-contiguous float32 matrix before it is
    handed to FAISS, so float64 or non-contiguous arrays are accepted too.

    Args:
        embeddings: Array of shape (n_vectors, dim), one embedding per row

    Returns:
        FAISS HNSW index containing every row of ``embeddings``

    Raises:
        ValueError: If ``embeddings`` is not 2-D or its vector dimension is 0
    """
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)

    # Fail early with a readable message instead of an IndexError on shape[1].
    if vectors.ndim != 2:
        raise ValueError(
            "embeddings must be a 2-D array of shape (n_vectors, dim); "
            f"got a {vectors.ndim}-D array"
        )
    dim = vectors.shape[1]
    if dim == 0:
        raise ValueError("embeddings must have a non-zero vector dimension")

    index = faiss.IndexHNSWFlat(dim, Config.FAISS_NEIGHBORS)
    # HNSW tuning knobs come from the project configuration.
    index.hnsw.efConstruction = Config.FAISS_EF_CONSTRUCTION
    index.hnsw.efSearch = Config.FAISS_EF_SEARCH
    index.add(vectors)
    return index
def retrieve_similar_chunks(
    query: str,
    index: faiss.IndexHNSWFlat,
    chunks_with_metadata: List[Dict[str, Any]],
    embedding_model: SentenceTransformer,
    k: int = None,
    max_chunk_length: int = None
) -> List[Tuple[str, float, Dict[str, Any]]]:
    """
    Retrieve top k similar chunks to the query from the FAISS index.

    Args:
        query: Search query
        index: FAISS index
        chunks_with_metadata: List of chunk dictionaries; each entry is read
            through its "text" and "metadata" keys
        embedding_model: SentenceTransformer model used to embed the query
        k: Number of chunks to retrieve (defaults to Config.DEFAULT_K_CHUNKS)
        max_chunk_length: Maximum length for returned chunks
            (defaults to Config.DEFAULT_CHUNK_SIZE)

    Returns:
        List of tuples (chunk_text, distance, metadata), in the order the
        index returned them. ``distance`` is a plain Python float. Hits that
        are out of range, blank after truncation, or rejected by
        ``validate_chunk_data`` are dropped, so fewer than k items may come back.
    """
    # Nothing can match without chunks or indexed vectors; skip the encode/search.
    if not chunks_with_metadata or index.ntotal == 0:
        return []

    if k is None:
        k = Config.DEFAULT_K_CHUNKS
    if max_chunk_length is None:
        max_chunk_length = Config.DEFAULT_CHUNK_SIZE

    query_embedding = embedding_model.encode([query], convert_to_tensor=True).cpu().numpy()
    distances, indices = index.search(query_embedding, k)

    valid_results = []
    chunk_count = len(chunks_with_metadata)
    # Walk distances and ids in lockstep so each hit keeps its own distance
    # even when earlier hits are skipped.
    for distance, chunk_idx in zip(distances[0], indices[0]):
        # Skip ids that do not map to a chunk (FAISS reports -1 for missing hits).
        if not 0 <= chunk_idx < chunk_count:
            continue

        chunk = chunks_with_metadata[chunk_idx]
        chunk_text = chunk["text"][:max_chunk_length]
        # Only include chunks with meaningful content.
        if not chunk_text.strip():
            continue

        # Convert the numpy scalar to a built-in float so the result matches
        # the declared return type and stays JSON-serializable.
        result = (chunk_text, float(distance), chunk["metadata"])
        if validate_chunk_data(result):
            valid_results.append(result)

    return valid_results
def search_index_with_validation(
    query: str,
    index: faiss.IndexHNSWFlat,
    chunks_with_metadata: List[Dict[str, Any]],
    embedding_model: SentenceTransformer,
    k: int = None,
    similarity_threshold: float = None
) -> List[Tuple[str, float, Dict[str, Any]]]:
    """
    Run a guarded similarity search and keep only sufficiently close hits.

    Args:
        query: Search query; empty queries and queries shorter than three
            characters after stripping whitespace yield no results
        index: FAISS index
        chunks_with_metadata: List of chunk dictionaries
        embedding_model: SentenceTransformer model
        k: Number of chunks to retrieve
        similarity_threshold: Cut-off for the second tuple element; only hits
            strictly below it are kept (defaults to Config.SIMILARITY_THRESHOLD)

    Returns:
        List of validated and filtered chunk tuples; an empty list when the
        query is rejected or when the search raises
    """
    # Reject unusable queries before doing any work.
    if not query:
        return []
    if len(query.strip()) < 3:
        return []

    cutoff = similarity_threshold
    if cutoff is None:
        cutoff = Config.SIMILARITY_THRESHOLD

    try:
        candidates = retrieve_similar_chunks(
            query, index, chunks_with_metadata, embedding_model, k
        )

        # Keep only hits whose score falls strictly under the cut-off.
        kept = []
        for candidate in candidates:
            if candidate[1] < cutoff:
                kept.append(candidate)
        return kept
    except Exception as e:
        # Best-effort search: report the failure and fall back to no results.
        print(f"Error in index search: {e}")
        return []
def get_index_stats(index: faiss.IndexHNSWFlat) -> Dict[str, Any]:
    """
    Summarize a FAISS HNSW index as a plain dictionary.

    Args:
        index: FAISS index exposing ``ntotal``, ``d``, ``hnsw`` and ``is_trained``

    Returns:
        Dictionary with the keys "total_vectors", "dimension", "index_type",
        "ef_search", "ef_construction" and "is_trained"
    """
    stats: Dict[str, Any] = {}

    # Size and shape of the index.
    stats["total_vectors"] = index.ntotal
    stats["dimension"] = index.d
    stats["index_type"] = type(index).__name__

    # HNSW search/build parameters as currently set on the index.
    hnsw_params = index.hnsw
    stats["ef_search"] = hnsw_params.efSearch
    stats["ef_construction"] = hnsw_params.efConstruction

    stats["is_trained"] = index.is_trained
    return stats