Jatin Mehra committed
Commit 447c09c · 1 Parent(s): 4dbeb79

Add FAISS indexing utilities and enhance text processing functions for improved chunking and validation

Files changed (2)
  1. utils/faiss_utils.py +146 -0
  2. utils/text_processing.py +196 -0
utils/faiss_utils.py ADDED
@@ -0,0 +1,146 @@
+ """
+ FAISS indexing utilities for similarity search.
+
+ This module provides utilities for building and searching FAISS indexes.
+ """
+
+ from typing import List, Tuple, Any, Dict, Optional
+ import numpy as np
+ import faiss
+ from sentence_transformers import SentenceTransformer
+
+ from configs.config import Config
+ from utils.text_processing import validate_chunk_data
+
+
+ def build_faiss_index(embeddings: np.ndarray) -> faiss.IndexHNSWFlat:
+     """
+     Build a FAISS HNSW index from embeddings for similarity search.
+
+     Args:
+         embeddings: Numpy array of embeddings
+
+     Returns:
+         FAISS HNSW index
+     """
+     dim = embeddings.shape[1]
+     index = faiss.IndexHNSWFlat(dim, Config.FAISS_NEIGHBORS)
+     index.hnsw.efConstruction = Config.FAISS_EF_CONSTRUCTION
+     index.hnsw.efSearch = Config.FAISS_EF_SEARCH
+     index.add(embeddings)
+     return index
+
+
+ def retrieve_similar_chunks(
+     query: str,
+     index: faiss.IndexHNSWFlat,
+     chunks_with_metadata: List[Dict[str, Any]],
+     embedding_model: SentenceTransformer,
+     k: Optional[int] = None,
+     max_chunk_length: Optional[int] = None
+ ) -> List[Tuple[str, float, Dict[str, Any]]]:
+     """
+     Retrieve the top k chunks most similar to the query from the FAISS index.
+
+     Args:
+         query: Search query
+         index: FAISS index
+         chunks_with_metadata: List of chunk dictionaries
+         embedding_model: SentenceTransformer model
+         k: Number of chunks to retrieve
+         max_chunk_length: Maximum length (in characters) of returned chunk text
+
+     Returns:
+         List of tuples (chunk_text, distance, metadata)
+     """
+     if k is None:
+         k = Config.DEFAULT_K_CHUNKS
+     if max_chunk_length is None:
+         max_chunk_length = Config.DEFAULT_CHUNK_SIZE
+
+     query_embedding = embedding_model.encode([query], convert_to_tensor=True).cpu().numpy()
+     distances, indices = index.search(query_embedding, k)
+
+     # Ensure indices are within bounds and keep each distance aligned with its chunk
+     valid_results = []
+     for idx_pos, chunk_idx in enumerate(indices[0]):
+         if 0 <= chunk_idx < len(chunks_with_metadata):
+             chunk_text = chunks_with_metadata[chunk_idx]["text"][:max_chunk_length]
+             # Only include chunks with meaningful content
+             if chunk_text.strip():  # Skip empty chunks
+                 result = (
+                     chunk_text,
+                     float(distances[0][idx_pos]),  # Cast numpy float32 to Python float so downstream isinstance checks pass
+                     chunks_with_metadata[chunk_idx]["metadata"]
+                 )
+                 if validate_chunk_data(result):
+                     valid_results.append(result)
+
+     return valid_results
+
+
+ def search_index_with_validation(
+     query: str,
+     index: faiss.IndexHNSWFlat,
+     chunks_with_metadata: List[Dict[str, Any]],
+     embedding_model: SentenceTransformer,
+     k: Optional[int] = None,
+     similarity_threshold: Optional[float] = None
+ ) -> List[Tuple[str, float, Dict[str, Any]]]:
+     """
+     Search the index with additional validation and filtering.
+
+     Args:
+         query: Search query
+         index: FAISS index
+         chunks_with_metadata: List of chunk dictionaries
+         embedding_model: SentenceTransformer model
+         k: Number of chunks to retrieve
+         similarity_threshold: Distance threshold for filtering results (lower is more similar)
+
+     Returns:
+         List of validated and filtered chunk tuples
+     """
+     if not query or len(query.strip()) < 3:
+         return []
+
+     if similarity_threshold is None:
+         similarity_threshold = Config.SIMILARITY_THRESHOLD
+
+     try:
+         # Retrieve similar chunks
+         similar_chunks = retrieve_similar_chunks(
+             query, index, chunks_with_metadata, embedding_model, k
+         )
+
+         # Filter by similarity threshold
+         filtered_chunks = [
+             chunk for chunk in similar_chunks
+             if chunk[1] < similarity_threshold
+         ]
+
+         return filtered_chunks
+
+     except Exception as e:
+         print(f"Error in index search: {e}")
+         return []
+
+
+ def get_index_stats(index: faiss.IndexHNSWFlat) -> Dict[str, Any]:
+     """
+     Get statistics about the FAISS index.
+
+     Args:
+         index: FAISS index
+
+     Returns:
+         Dictionary with index statistics
+     """
+     return {
+         "total_vectors": index.ntotal,
+         "dimension": index.d,
+         "index_type": type(index).__name__,
+         "ef_search": index.hnsw.efSearch,
+         "ef_construction": index.hnsw.efConstruction,
+         "is_trained": index.is_trained
+     }
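For orientation, a minimal usage sketch of these index utilities on their own (not part of the commit). It assumes Config supplies FAISS_NEIGHBORS, FAISS_EF_CONSTRUCTION, FAISS_EF_SEARCH, DEFAULT_K_CHUNKS, DEFAULT_CHUNK_SIZE and SIMILARITY_THRESHOLD, and uses a placeholder SentenceTransformer model name and a toy chunk list:

from sentence_transformers import SentenceTransformer
from utils.faiss_utils import build_faiss_index, retrieve_similar_chunks, get_index_stats

# Placeholder model and toy corpus; real callers would pass chunks produced by utils.text_processing.
model = SentenceTransformer("all-MiniLM-L6-v2")
chunks = [
    {"text": "FAISS builds approximate nearest-neighbour indexes over dense vectors.", "metadata": {"page": 1}},
    {"text": "HNSW trades memory for fast, high-recall similarity search.", "metadata": {"page": 2}},
]
embeddings = model.encode([c["text"] for c in chunks], convert_to_tensor=True).cpu().numpy()

index = build_faiss_index(embeddings)
print(get_index_stats(index))  # total_vectors, dimension, efSearch, efConstruction, ...

for text, distance, metadata in retrieve_similar_chunks("How does HNSW search work?", index, chunks, model, k=2):
    print(round(distance, 3), metadata, text)

Note that IndexHNSWFlat defaults to L2 distance, so smaller values mean more similar chunks; that is why search_index_with_validation keeps results whose distance falls below the threshold.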
utils/text_processing.py ADDED
@@ -0,0 +1,196 @@
+ """
+ Utility functions for text processing and embeddings.
+
+ This module contains utility functions for text processing, tokenization,
+ chunking, and embedding operations.
+ """
+
+ from typing import List, Dict, Any, Tuple, Optional
+ import numpy as np
+ from sentence_transformers import SentenceTransformer
+ from langchain_community.document_loaders import PyMuPDFLoader
+ from langchain.schema import Document
+
+ from configs.config import Config
+
+
+ def estimate_tokens(text: str) -> int:
+     """
+     Estimate the number of tokens in a text (rough approximation of ~4 characters per token).
+
+     Args:
+         text: Input text
+
+     Returns:
+         Estimated number of tokens
+     """
+     return len(text) // 4
+
+
+ def process_pdf_file(file_path: str) -> List[Document]:
+     """
+     Load a PDF file and extract its text with metadata.
+
+     Args:
+         file_path: Path to the PDF file
+
+     Returns:
+         List of Document objects with metadata
+
+     Raises:
+         FileNotFoundError: If the file doesn't exist
+     """
+     import os
+     if not os.path.exists(file_path):
+         raise FileNotFoundError(f"The file {file_path} does not exist.")
+
+     loader = PyMuPDFLoader(file_path)
+     documents = loader.load()
+     return documents
+
+
+ def chunk_text(documents: List[Document], max_length: Optional[int] = None) -> List[Dict[str, Any]]:
+     """
+     Split documents into chunks with metadata.
+
+     Args:
+         documents: List of Document objects
+         max_length: Maximum chunk length in characters (applied as an approximate token budget of max_length // 4)
+
+     Returns:
+         List of chunk dictionaries with text and metadata
+     """
+     if max_length is None:
+         max_length = Config.DEFAULT_CHUNK_SIZE
+
+     chunks = []
+
+     for doc in documents:
+         text = doc.page_content
+         metadata = doc.metadata
+         paragraphs = text.split("\n\n")
+         current_chunk = ""
+         current_metadata = metadata.copy()
+
+         for paragraph in paragraphs:
+             # Skip very short paragraphs
+             if len(paragraph.strip()) < Config.MIN_PARAGRAPH_LENGTH:
+                 continue
+
+             if estimate_tokens(current_chunk + paragraph) <= max_length // 4:
+                 current_chunk += paragraph + "\n\n"
+             else:
+                 # Only add chunks with meaningful content
+                 if current_chunk.strip() and len(current_chunk.strip()) > Config.MIN_CHUNK_LENGTH:
+                     chunks.append({
+                         "text": current_chunk.strip(),
+                         "metadata": current_metadata
+                     })
+                 current_chunk = paragraph + "\n\n"
+
+         # Add the last chunk if it has meaningful content
+         if current_chunk.strip() and len(current_chunk.strip()) > Config.MIN_CHUNK_LENGTH:
+             chunks.append({
+                 "text": current_chunk.strip(),
+                 "metadata": current_metadata
+             })
+
+     return chunks
+
+
+ def create_embeddings(chunks: List[Dict[str, Any]], model: SentenceTransformer) -> Tuple[np.ndarray, List[Dict[str, Any]]]:
+     """
+     Create embeddings for a list of chunk texts.
+
+     Args:
+         chunks: List of chunk dictionaries
+         model: SentenceTransformer model
+
+     Returns:
+         Tuple of (embeddings array, chunks)
+     """
+     texts = [chunk["text"] for chunk in chunks]
+     embeddings = model.encode(texts, show_progress_bar=True, convert_to_tensor=True)
+     return embeddings.cpu().numpy(), chunks
+
+
+ def filter_relevant_chunks(chunks_data: List[Tuple], threshold: Optional[float] = None) -> List[Tuple]:
+     """
+     Filter chunks based on a similarity threshold.
+
+     Args:
+         chunks_data: List of (text, score, metadata) tuples
+         threshold: Similarity threshold (lower distance is more similar)
+
+     Returns:
+         Filtered list of chunks
+     """
+     if threshold is None:
+         threshold = Config.SIMILARITY_THRESHOLD
+
+     return [chunk for chunk in chunks_data if len(chunk) >= 3 and chunk[1] < threshold]
+
+
+ def prepare_context_from_chunks(context_chunks: List[Tuple], max_tokens: Optional[int] = None) -> str:
+     """
+     Prepare a context string from chunk data.
+
+     Args:
+         context_chunks: List of (text, score, metadata) tuples
+         max_tokens: Maximum tokens for the context
+
+     Returns:
+         Formatted context string
+     """
+     if max_tokens is None:
+         max_tokens = Config.MAX_CONTEXT_TOKENS
+
+     # Sort chunks by relevance (lower distance = more relevant)
+     sorted_chunks = sorted(context_chunks, key=lambda x: x[1]) if context_chunks else []
+
+     # Filter out chunks with very high distance scores (low similarity)
+     relevant_chunks = filter_relevant_chunks(sorted_chunks)
+
+     context = ""
+     total_tokens = 0
+
+     for chunk, _, _ in relevant_chunks:
+         if chunk and chunk.strip():
+             chunk_tokens = estimate_tokens(chunk)
+             if total_tokens + chunk_tokens <= max_tokens:
+                 context += chunk + "\n\n"
+                 total_tokens += chunk_tokens
+             else:
+                 break
+
+     return context.strip() if context else "No initial context provided from preliminary search."
+
+
+ def validate_chunk_data(chunk_data: Any) -> bool:
+     """
+     Validate the chunk data structure.
+
+     Args:
+         chunk_data: Chunk data to validate
+
+     Returns:
+         True if valid, False otherwise
+     """
+     if not isinstance(chunk_data, (list, tuple)):
+         return False
+
+     if len(chunk_data) < 3:
+         return False
+
+     text, score, metadata = chunk_data[0], chunk_data[1], chunk_data[2]
+
+     if not isinstance(text, str) or not text.strip():
+         return False
+
+     if not isinstance(score, (int, float, np.floating)):  # accept numpy float scores coming from FAISS
+         return False
+
+     if not isinstance(metadata, dict):
+         return False
+
+     return True
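Taken together with faiss_utils.py above, a hedged end-to-end sketch of the intended pipeline (not part of the commit; the PDF path, model name, and Config values are assumptions):

from sentence_transformers import SentenceTransformer
from utils.text_processing import process_pdf_file, chunk_text, create_embeddings, prepare_context_from_chunks
from utils.faiss_utils import build_faiss_index, search_index_with_validation

model = SentenceTransformer("all-MiniLM-L6-v2")        # placeholder model
documents = process_pdf_file("docs/sample.pdf")        # placeholder path
chunks = chunk_text(documents)                         # paragraph-based chunks with per-page metadata
embeddings, chunks = create_embeddings(chunks, model)

index = build_faiss_index(embeddings)
hits = search_index_with_validation("What is the main contribution?", index, chunks, model)
context = prepare_context_from_chunks(hits)            # distance-sorted, token-budgeted context string
print(context)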