Spaces:
				
			
			
	
			
			
					
		Running
		
	
	
	
			
			
	
	
	
	
		
		
					
		Running
		
	Jatin Mehra
				
			
		Add FAISS indexing utilities and enhance text processing functions for improved chunking and validation
		447c09c
		
		| """ | |
| FAISS indexing utilities for similarity search. | |
| This module provides utilities for building and searching FAISS indexes. | |
| """ | |
| from typing import List, Tuple, Any, Dict | |
| import numpy as np | |
| import faiss | |
| from sentence_transformers import SentenceTransformer | |
| from configs.config import Config | |
| from utils.text_processing import validate_chunk_data | |
| def build_faiss_index(embeddings: np.ndarray) -> faiss.IndexHNSWFlat: | |
| """ | |
| Build a FAISS HNSW index from embeddings for similarity search. | |
| Args: | |
| embeddings: Numpy array of embeddings | |
| Returns: | |
| FAISS HNSW index | |
| """ | |
| dim = embeddings.shape[1] | |
| index = faiss.IndexHNSWFlat(dim, Config.FAISS_NEIGHBORS) | |
| index.hnsw.efConstruction = Config.FAISS_EF_CONSTRUCTION | |
| index.hnsw.efSearch = Config.FAISS_EF_SEARCH | |
| index.add(embeddings) | |
| return index | |
| def retrieve_similar_chunks( | |
| query: str, | |
| index: faiss.IndexHNSWFlat, | |
| chunks_with_metadata: List[Dict[str, Any]], | |
| embedding_model: SentenceTransformer, | |
| k: int = None, | |
| max_chunk_length: int = None | |
| ) -> List[Tuple[str, float, Dict[str, Any]]]: | |
| """ | |
| Retrieve top k similar chunks to the query from the FAISS index. | |
| Args: | |
| query: Search query | |
| index: FAISS index | |
| chunks_with_metadata: List of chunk dictionaries | |
| embedding_model: SentenceTransformer model | |
| k: Number of chunks to retrieve | |
| max_chunk_length: Maximum length for returned chunks | |
| Returns: | |
| List of tuples (chunk_text, distance, metadata) | |
| """ | |
| if k is None: | |
| k = Config.DEFAULT_K_CHUNKS | |
| if max_chunk_length is None: | |
| max_chunk_length = Config.DEFAULT_CHUNK_SIZE | |
| query_embedding = embedding_model.encode([query], convert_to_tensor=True).cpu().numpy() | |
| distances, indices = index.search(query_embedding, k) | |
| # Ensure indices are within bounds and create mapping for correct distances | |
| valid_results = [] | |
| for idx_pos, chunk_idx in enumerate(indices[0]): | |
| if 0 <= chunk_idx < len(chunks_with_metadata): | |
| chunk_text = chunks_with_metadata[chunk_idx]["text"][:max_chunk_length] | |
| # Only include chunks with meaningful content | |
| if chunk_text.strip(): # Skip empty chunks | |
| result = ( | |
| chunk_text, | |
| distances[0][idx_pos], # Use original position for correct distance | |
| chunks_with_metadata[chunk_idx]["metadata"] | |
| ) | |
| if validate_chunk_data(result): | |
| valid_results.append(result) | |
| return valid_results | |
| def search_index_with_validation( | |
| query: str, | |
| index: faiss.IndexHNSWFlat, | |
| chunks_with_metadata: List[Dict[str, Any]], | |
| embedding_model: SentenceTransformer, | |
| k: int = None, | |
| similarity_threshold: float = None | |
| ) -> List[Tuple[str, float, Dict[str, Any]]]: | |
| """ | |
| Search index with additional validation and filtering. | |
| Args: | |
| query: Search query | |
| index: FAISS index | |
| chunks_with_metadata: List of chunk dictionaries | |
| embedding_model: SentenceTransformer model | |
| k: Number of chunks to retrieve | |
| similarity_threshold: Threshold for filtering results | |
| Returns: | |
| List of validated and filtered chunk tuples | |
| """ | |
| if not query or len(query.strip()) < 3: | |
| return [] | |
| if similarity_threshold is None: | |
| similarity_threshold = Config.SIMILARITY_THRESHOLD | |
| try: | |
| # Retrieve similar chunks | |
| similar_chunks = retrieve_similar_chunks( | |
| query, index, chunks_with_metadata, embedding_model, k | |
| ) | |
| # Filter by similarity threshold | |
| filtered_chunks = [ | |
| chunk for chunk in similar_chunks | |
| if chunk[1] < similarity_threshold | |
| ] | |
| return filtered_chunks | |
| except Exception as e: | |
| print(f"Error in index search: {e}") | |
| return [] | |
| def get_index_stats(index: faiss.IndexHNSWFlat) -> Dict[str, Any]: | |
| """ | |
| Get statistics about the FAISS index. | |
| Args: | |
| index: FAISS index | |
| Returns: | |
| Dictionary with index statistics | |
| """ | |
| return { | |
| "total_vectors": index.ntotal, | |
| "dimension": index.d, | |
| "index_type": type(index).__name__, | |
| "ef_search": index.hnsw.efSearch, | |
| "ef_construction": index.hnsw.efConstruction, | |
| "is_trained": index.is_trained | |
| } | |
