Spaces:
Running
Running
File size: 4,496 Bytes
447c09c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 |
"""
FAISS indexing utilities for similarity search.
This module provides utilities for building and searching FAISS indexes.
"""
from typing import List, Tuple, Any, Dict
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from configs.config import Config
from utils.text_processing import validate_chunk_data
def build_faiss_index(embeddings: np.ndarray) -> faiss.IndexHNSWFlat:
    """
    Build a FAISS HNSW index from embeddings for similarity search.

    Args:
        embeddings: 2-D array-like of shape (n_vectors, dim). The data is
            converted to a C-contiguous float32 array before it is added,
            because that is the memory layout FAISS works with.

    Returns:
        FAISS HNSW index holding every row of ``embeddings``, configured
        with the neighbor count and ef parameters taken from ``Config``.

    Raises:
        ValueError: If ``embeddings`` is not two-dimensional.
    """
    # FAISS operates on row-major float32 matrices; coerce once up front so
    # float64 or non-contiguous inputs (e.g. slices, stacked lists) work too.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2:
        raise ValueError(
            "embeddings must be a 2-D array of shape (n_vectors, dim); "
            f"got shape {vectors.shape}"
        )

    dim = vectors.shape[1]
    index = faiss.IndexHNSWFlat(dim, Config.FAISS_NEIGHBORS)
    # Set the HNSW parameters before adding vectors: efConstruction is used
    # while the graph is being built.
    index.hnsw.efConstruction = Config.FAISS_EF_CONSTRUCTION
    index.hnsw.efSearch = Config.FAISS_EF_SEARCH
    index.add(vectors)
    return index
def retrieve_similar_chunks(
    query: str,
    index: faiss.IndexHNSWFlat,
    chunks_with_metadata: List[Dict[str, Any]],
    embedding_model: SentenceTransformer,
    k: int = None,
    max_chunk_length: int = None
) -> List[Tuple[str, float, Dict[str, Any]]]:
    """
    Look up the chunks closest to the query in the FAISS index.

    Args:
        query: Search query
        index: FAISS index to search
        chunks_with_metadata: Chunk dictionaries; each one is read through
            its "text" and "metadata" keys and is addressed by the
            positions the index returns
        embedding_model: SentenceTransformer model used to embed the query
        k: Number of neighbors to request; Config.DEFAULT_K_CHUNKS is used
            when None
        max_chunk_length: Character cap applied to each returned text;
            Config.DEFAULT_CHUNK_SIZE is used when None

    Returns:
        List of tuples (chunk_text, distance, metadata) in the order the
        index returned them. Positions outside the chunk list, texts that
        are empty after truncation and stripping, and tuples rejected by
        validate_chunk_data are left out, so fewer than k items may come
        back.
    """
    top_k = Config.DEFAULT_K_CHUNKS if k is None else k
    length_cap = (
        Config.DEFAULT_CHUNK_SIZE if max_chunk_length is None else max_chunk_length
    )

    # Embed the query as a one-row batch and hand it to the index as numpy.
    encoded = embedding_model.encode([query], convert_to_tensor=True)
    query_embedding = encoded.cpu().numpy()
    distances, indices = index.search(query_embedding, top_k)

    hits = []
    # Walk positions and distances in lockstep so each hit keeps the
    # distance that was reported for it.
    for chunk_idx, distance in zip(indices[0], distances[0]):
        # Skip anything that does not address a stored chunk (this also
        # covers negative positions).
        if chunk_idx < 0 or chunk_idx >= len(chunks_with_metadata):
            continue

        entry = chunks_with_metadata[chunk_idx]
        chunk_text = entry["text"][:length_cap]
        # Whitespace-only text carries no usable content.
        if not chunk_text.strip():
            continue

        candidate = (chunk_text, distance, entry["metadata"])
        if validate_chunk_data(candidate):
            hits.append(candidate)

    return hits
def search_index_with_validation(
    query: str,
    index: faiss.IndexHNSWFlat,
    chunks_with_metadata: List[Dict[str, Any]],
    embedding_model: SentenceTransformer,
    k: int = None,
    similarity_threshold: float = None
) -> List[Tuple[str, float, Dict[str, Any]]]:
    """
    Run a guarded search: validate the query, retrieve, then filter.

    Args:
        query: Search query; an empty query, or one with fewer than 3
            characters once stripped, yields no results
        index: FAISS index
        chunks_with_metadata: List of chunk dictionaries
        embedding_model: SentenceTransformer model
        k: Number of chunks to retrieve (passed through to
            retrieve_similar_chunks)
        similarity_threshold: Upper bound for the second element of each
            result tuple (the value returned by the index search); only
            results strictly below it are kept. Config.SIMILARITY_THRESHOLD
            is used when None

    Returns:
        List of chunk tuples that passed the threshold filter. An empty
        list is returned for a rejected query or when retrieval/filtering
        raises; in the latter case the error is printed.
    """
    # Reject queries that are missing or too short to be worth searching.
    if not query:
        return []
    if len(query.strip()) < 3:
        return []

    threshold = (
        Config.SIMILARITY_THRESHOLD
        if similarity_threshold is None
        else similarity_threshold
    )

    try:
        candidates = retrieve_similar_chunks(
            query, index, chunks_with_metadata, embedding_model, k
        )
        # Keep only the hits whose score is strictly under the threshold.
        accepted = []
        for hit in candidates:
            if hit[1] < threshold:
                accepted.append(hit)
        return accepted
    except Exception as e:
        # Best-effort search: report the problem and fall back to no results.
        print(f"Error in index search: {e}")
        return []
def get_index_stats(index: faiss.IndexHNSWFlat) -> Dict[str, Any]:
    """
    Summarize a FAISS HNSW index as a plain dictionary.

    Args:
        index: FAISS index exposing ``ntotal``, ``d``, ``hnsw`` and
            ``is_trained``

    Returns:
        Dictionary with the keys "total_vectors", "dimension",
        "index_type" (the index's class name), "ef_search",
        "ef_construction" and "is_trained"
    """
    stats: Dict[str, Any] = {}

    # Size and shape of the stored data.
    stats["total_vectors"] = index.ntotal
    stats["dimension"] = index.d
    stats["index_type"] = type(index).__name__

    # HNSW tuning parameters currently set on the index.
    hnsw_params = index.hnsw
    stats["ef_search"] = hnsw_params.efSearch
    stats["ef_construction"] = hnsw_params.efConstruction

    stats["is_trained"] = index.is_trained
    return stats
|