"""
FAISS indexing utilities for similarity search.

This module provides helpers for building HNSW indexes over text-chunk
embeddings and retrieving the most similar chunks for a query.
"""
from typing import Any, Dict, List, Optional, Tuple
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from configs.config import Config
from utils.text_processing import validate_chunk_data


def build_faiss_index(embeddings: np.ndarray) -> faiss.IndexHNSWFlat:
    """
    Build a FAISS HNSW index from embeddings for similarity search.
    
    Args:
        embeddings: Numpy array of embeddings
        
    Returns:
        FAISS HNSW index
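
    Example (a minimal sketch; assumes Config's FAISS settings are defined):

        >>> import numpy as np
        >>> vecs = np.random.rand(100, 384).astype("float32")
        >>> idx = build_faiss_index(vecs)
        >>> idx.ntotal
        100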
    """
    dim = embeddings.shape[1]
    # The second argument (M) controls how many neighbors each HNSW node keeps
    index = faiss.IndexHNSWFlat(dim, Config.FAISS_NEIGHBORS)
    index.hnsw.efConstruction = Config.FAISS_EF_CONSTRUCTION
    index.hnsw.efSearch = Config.FAISS_EF_SEARCH
    # FAISS requires contiguous float32 input; convert defensively
    index.add(np.ascontiguousarray(embeddings, dtype=np.float32))
    return index


def retrieve_similar_chunks(
    query: str,
    index: faiss.IndexHNSWFlat,
    chunks_with_metadata: List[Dict[str, Any]],
    embedding_model: SentenceTransformer,
    k: Optional[int] = None,
    max_chunk_length: Optional[int] = None
) -> List[Tuple[str, float, Dict[str, Any]]]:
    """
    Retrieve top k similar chunks to the query from the FAISS index.
    
    Args:
        query: Search query
        index: FAISS index
        chunks_with_metadata: List of chunk dictionaries
        embedding_model: SentenceTransformer model
        k: Number of chunks to retrieve (defaults to Config.DEFAULT_K_CHUNKS)
        max_chunk_length: Maximum character length of each returned chunk
            (defaults to Config.DEFAULT_CHUNK_SIZE); longer chunks are truncated
        
    Returns:
        List of tuples (chunk_text, distance, metadata)
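
    Example (illustrative; assumes an index, chunk list, and SentenceTransformer
    model built elsewhere):

        >>> results = retrieve_similar_chunks(
        ...     "how are the indexes built?", index, chunks_with_metadata, model, k=3
        ... )
        >>> for text, dist, meta in results:
        ...     print(f"{dist:.3f}", text[:60])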
    """
    if k is None:
        k = Config.DEFAULT_K_CHUNKS
    if max_chunk_length is None:
        max_chunk_length = Config.DEFAULT_CHUNK_SIZE
    
    # Encode the query; FAISS search expects a 2D float32 array
    query_embedding = embedding_model.encode([query], convert_to_numpy=True).astype(np.float32)
    distances, indices = index.search(query_embedding, k)
    
    # Ensure indices are within bounds and create mapping for correct distances
    valid_results = []
    for idx_pos, chunk_idx in enumerate(indices[0]):
        if 0 <= chunk_idx < len(chunks_with_metadata):
            chunk_text = chunks_with_metadata[chunk_idx]["text"][:max_chunk_length]
            # Skip chunks that are empty after truncation
            if chunk_text.strip():
                result = (
                    chunk_text,
                    distances[0][idx_pos],  # Use original position for correct distance
                    chunks_with_metadata[chunk_idx]["metadata"]
                )
                if validate_chunk_data(result):
                    valid_results.append(result)
    
    return valid_results


def search_index_with_validation(
    query: str,
    index: faiss.IndexHNSWFlat,
    chunks_with_metadata: List[Dict[str, Any]],
    embedding_model: SentenceTransformer,
    k: Optional[int] = None,
    similarity_threshold: Optional[float] = None
) -> List[Tuple[str, float, Dict[str, Any]]]:
    """
    Search index with additional validation and filtering.
    
    Args:
        query: Search query
        index: FAISS index
        chunks_with_metadata: List of chunk dictionaries
        embedding_model: SentenceTransformer model
        k: Number of chunks to retrieve (defaults to Config.DEFAULT_K_CHUNKS)
        similarity_threshold: Maximum L2 distance for a result to be kept
            (defaults to Config.SIMILARITY_THRESHOLD)
        
    Returns:
        List of validated and filtered chunk tuples
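
    Example (illustrative; same objects as in retrieve_similar_chunks, with
    Config.SIMILARITY_THRESHOLD acting as an L2-distance cutoff):

        >>> hits = search_index_with_validation(
        ...     "faiss hnsw parameters", index, chunks_with_metadata, model
        ... )
        >>> all(dist < Config.SIMILARITY_THRESHOLD for _, dist, _ in hits)
        True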
    """
    # Reject empty or trivially short queries up front
    if not query or len(query.strip()) < 3:
        return []
    
    if similarity_threshold is None:
        similarity_threshold = Config.SIMILARITY_THRESHOLD
    
    try:
        # Retrieve similar chunks
        similar_chunks = retrieve_similar_chunks(
            query, index, chunks_with_metadata, embedding_model, k
        )
        
        # Filter by distance: IndexHNSWFlat returns L2 distances, so smaller
        # values mean higher similarity
        filtered_chunks = [
            chunk for chunk in similar_chunks
            if chunk[1] < similarity_threshold
        ]
        
        return filtered_chunks
        
    except Exception as e:
        print(f"Error in index search: {e}")
        return []


def get_index_stats(index: faiss.IndexHNSWFlat) -> Dict[str, Any]:
    """
    Get statistics about the FAISS index.
    
    Args:
        index: FAISS index
        
    Returns:
        Dictionary with index statistics
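
    Example (illustrative values):

        >>> get_index_stats(index)  # doctest: +SKIP
        {'total_vectors': 100, 'dimension': 384, 'index_type': 'IndexHNSWFlat',
         'ef_search': 50, 'ef_construction': 200, 'is_trained': True}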
    """
    return {
        "total_vectors": index.ntotal,
        "dimension": index.d,
        "index_type": type(index).__name__,
        "ef_search": index.hnsw.efSearch,
        "ef_construction": index.hnsw.efConstruction,
        "is_trained": index.is_trained
    }
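

if __name__ == "__main__":
    # Minimal smoke test (a sketch): exercises index construction and the
    # stats helper with random vectors, so it runs without downloading an
    # embedding model. Assumes Config.FAISS_NEIGHBORS, FAISS_EF_CONSTRUCTION,
    # and FAISS_EF_SEARCH are defined, as the functions above already require.
    rng = np.random.default_rng(0)
    demo_vectors = rng.random((50, 384), dtype=np.float32)
    demo_index = build_faiss_index(demo_vectors)
    print(get_index_stats(demo_index))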
