vinit5112 committed
Commit 82dac66 · Parent(s): 3092e52
qdrant changes

Files changed:
- backend/rag.py (+6 -0)
- backend/vector_store.py (+142 -110)
backend/rag.py
CHANGED
@@ -16,6 +16,12 @@ class RAG:
         # Setup Vector Store (Qdrant configuration is handled via environment variables)
         self.vector_store = VectorStore()
 
+        # Verify vector store is properly initialized
+        if not self.vector_store.verify_collection_health():
+            print("Warning: Vector store collection health check failed")
+        else:
+            print("Vector store initialized successfully")
+
         # Setup Text Splitter
         self.text_splitter = RecursiveCharacterTextSplitter(
             chunk_size=1000,
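For context, a minimal sketch (not part of the commit) of how the new startup check behaves, assuming RAG() takes no required constructor arguments, which this diff does not show:

    from backend.rag import RAG

    # Constructing RAG now runs the health check added above: it prints
    # "Vector store initialized successfully" when the Qdrant collection is
    # reachable, or "Warning: Vector store collection health check failed"
    # otherwise; initialization continues in both cases.
    rag = RAG()
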
backend/vector_store.py
CHANGED
@@ -6,6 +6,7 @@ import os
 import logging
 from typing import List, Dict, Any
 from dotenv import load_dotenv
+import time
 
 # Load environment variables
 load_dotenv()
@@ -35,7 +36,7 @@ class VectorStore:
         self.embedding_model = self._initialize_embedding_model()
 
         # Create collection with proper indices
-        self._create_collection_if_not_exists()
+        self._ensure_collection_exists()
 
     def _initialize_embedding_model(self):
         """Initialize the embedding model from a local directory"""
@@ -51,82 +52,30 @@ class VectorStore:
             print(f"Failed to load local model: {e}")
             raise RuntimeError("Failed to initialize embedding model from local path")
 
+    def _collection_exists_and_accessible(self) -> bool:
+        """
+        Check if collection exists and is accessible by trying to get its info.
+
+        Returns:
+            bool: True if collection exists and is accessible
+        """
+        try:
+            # Try to get collection info - this is more reliable than just listing collections
+            collection_info = self.client.get_collection(self.collection_name)
+            print(f"Collection '{self.collection_name}' exists and is accessible")
+            return True
+        except Exception as e:
+            print(f"Collection '{self.collection_name}' is not accessible: {e}")
+            return False
 
-
-    # def _initialize_embedding_model(self):
-    #     """Initialize the embedding model with offline support"""
-    #     try:
-    #         # Try to load the model normally first
-    #         print("Attempting to load sentence transformer model...")
-    #         model = SentenceTransformer("all-MiniLM-L6-v2")
-    #         print("Successfully loaded sentence transformer model")
-    #         return model
-
-    #     except Exception as e:
-    #         print(f"Failed to load model online: {e}")
-    #         print("Attempting to load model in offline mode...")
-
-    #         try:
-    #             # Try to load from cache with offline mode
-    #             import os
-    #             os.environ['TRANSFORMERS_OFFLINE'] = '1'
-    #             os.environ['HF_HUB_OFFLINE'] = '1'
-
-    #             model = SentenceTransformer("all-MiniLM-L6-v2", cache_folder=None)
-    #             print("Successfully loaded model in offline mode")
-    #             return model
-
-    #         except Exception as offline_error:
-    #             print(f"Failed to load model in offline mode: {offline_error}")
-
-    #             # Try to find a local cache directory
-    #             try:
-    #                 import transformers
-    #                 cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "huggingface", "transformers")
-    #                 if os.path.exists(cache_dir):
-    #                     print(f"Looking for cached model in: {cache_dir}")
-
-    #                     # Try to load from specific cache directory
-    #                     model = SentenceTransformer("all-MiniLM-L6-v2", cache_folder=cache_dir)
-    #                     print("Successfully loaded model from cache")
-    #                     return model
-
-    #             except Exception as cache_error:
-    #                 print(f"Failed to load from cache: {cache_error}")
-
-    #         # If all else fails, provide instructions
-    #         error_msg = """
-    #         Failed to initialize sentence transformer model. This is likely due to network connectivity issues.
-
-    #         Solutions:
-    #         1. Check your internet connection
-    #         2. If behind a corporate firewall, ensure huggingface.co is accessible
-    #         3. Pre-download the model when you have internet access by running:
-    #            python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('all-MiniLM-L6-v2')"
-    #         4. Or manually download the model and place it in your cache directory
-
-    #         For now, the application will not work without the embedding model.
-    #         """
-
-    #         print(error_msg)
-    #         raise RuntimeError(f"Cannot initialize embedding model: {str(e)}")
-
-    def _create_collection_if_not_exists(self) -> bool:
+    def _create_collection(self) -> bool:
         """
-        Create collection with proper indices
+        Create the collection with proper configuration.
 
         Returns:
-            bool: True if collection exists or was created successfully
+            bool: True if collection was created successfully
         """
         try:
-            # Check if collection exists
-            collections = self.client.get_collections()
-            collection_names = [col.name for col in collections.collections]
-            print("list of collections : ", collection_names)
-            if self.collection_name in collection_names:
-                print(f"Collection '{self.collection_name}' already exists")
-                return True
-
             print(f"Creating new collection: {self.collection_name}")
 
             # Vector size for all-MiniLM-L6-v2 is 384
@@ -145,6 +94,9 @@ class VectorStore:
                 ),
             )
 
+            # Wait a moment for collection to be fully created
+            time.sleep(1)
+
             # Create payload indices
             payload_indices = {
                 "document_id": PayloadSchemaType.KEYWORD,
@@ -152,11 +104,14 @@ class VectorStore:
             }
 
             for field_name, schema_type in payload_indices.items():
-                self.client.create_payload_index(
-                    collection_name=self.collection_name,
-                    field_name=field_name,
-                    field_schema=schema_type
-                )
+                try:
+                    self.client.create_payload_index(
+                        collection_name=self.collection_name,
+                        field_name=field_name,
+                        field_schema=schema_type
+                    )
+                except Exception as idx_error:
+                    print(f"Warning: Failed to create index for {field_name}: {idx_error}")
 
             print(f"Successfully created collection: {self.collection_name}")
             return True
@@ -166,47 +121,105 @@ class VectorStore:
             logger.error(error_msg, exc_info=True)
             print(error_msg)
             return False
-
-    def add_document(self, text: str, metadata: Dict = None) -> bool:
-        """Add a document to the vector store"""
+
+    def _ensure_collection_exists(self) -> bool:
+        """
+        Ensure collection exists and is accessible, create if necessary.
+
+        Returns:
+            bool: True if collection exists or was created successfully
+        """
         try:
-            # Generate embedding
-            embedding = self.embedding_model.encode([text])[0]
-
-            # Generate document ID
-            document_id = str(uuid.uuid4())
-
-            # Create payload with indexed fields
-            payload = {
-                "document_id": document_id,  # KEYWORD index
-                "content": text,  # TEXT index - stores the actual text content
-            }
-
-            # Add metadata fields if provided
-            if metadata:
-                payload.update(metadata)
+            # First, check if collection exists and is accessible
+            if self._collection_exists_and_accessible():
+                return True
 
-            # Create point
-            point = PointStruct(
-                id=document_id,
-                vector=embedding.tolist(),
-                payload=payload
-            )
+            # If not accessible, try to create it
+            print(f"Collection '{self.collection_name}' not found or not accessible, creating...")
+            return self._create_collection()
 
-            # Store in Qdrant
-            self.client.upsert(
-                collection_name=self.collection_name,
-                points=[point]
-            )
-
-            return True
         except Exception as e:
-            print(f"Error adding document: {e}")
+            error_msg = f"Failed to ensure collection exists: {str(e)}"
+            logger.error(error_msg, exc_info=True)
+            print(error_msg)
             return False
 
+    def add_document(self, text: str, metadata: Dict = None) -> bool:
+        """Add a document to the collection with retry logic"""
+        max_retries = 3
+        retry_delay = 1
+
+        for attempt in range(max_retries):
+            try:
+                # Ensure collection exists before adding document
+                if not self._collection_exists_and_accessible():
+                    print("Collection not accessible, trying to recreate...")
+                    if not self._create_collection():
+                        raise Exception("Failed to create collection")
+
+                # Generate embedding
+                embedding = self.embedding_model.encode([text])[0]
+
+                # Generate document ID
+                document_id = str(uuid.uuid4())
+
+                # Create payload with indexed fields
+                payload = {
+                    "document_id": document_id,  # KEYWORD index
+                    "content": text,  # TEXT index - stores the actual text content
+                }
+
+                # Add metadata fields if provided
+                if metadata:
+                    payload.update(metadata)
+
+                # Create point
+                point = PointStruct(
+                    id=document_id,
+                    vector=embedding.tolist(),
+                    payload=payload
+                )
+
+                # Store in Qdrant
+                result = self.client.upsert(
+                    collection_name=self.collection_name,
+                    points=[point]
+                )
+
+                # Check if upsert was successful
+                if hasattr(result, 'status') and result.status == 'completed':
+                    return True
+                elif hasattr(result, 'operation_id'):
+                    return True
+                else:
+                    print(f"Unexpected upsert result: {result}")
+                    return True  # Assume success if no error was raised
+
+            except Exception as e:
+                print(f"Error adding document (attempt {attempt + 1}/{max_retries}): {e}")
+                if "Not found" in str(e) and "doesn't exist" in str(e):
+                    # Collection doesn't exist, try to recreate
+                    print("Collection not found, attempting to recreate...")
+                    self._create_collection()
+
+                if attempt < max_retries - 1:
+                    print(f"Retrying in {retry_delay} seconds...")
+                    time.sleep(retry_delay)
+                    retry_delay *= 2  # Exponential backoff
+                else:
+                    print(f"Failed to add document after {max_retries} attempts")
+                    return False
+
+        return False
+
     def search_similar(self, query: str, limit: int = 5) -> List[Dict]:
-        """Search for similar documents"""
+        """Search for similar documents with error handling"""
         try:
+            # Ensure collection exists before searching
+            if not self._collection_exists_and_accessible():
+                print("Collection not accessible for search")
+                return []
+
             # Generate query embedding
             query_embedding = self.embedding_model.encode([query])[0]
 
@@ -246,4 +259,23 @@ class VectorStore:
             }
         except Exception as e:
             print(f"Error getting collection info: {e}")
-            return {}
+            return {}
+
+    def verify_collection_health(self) -> bool:
+        """Verify that the collection is healthy and accessible"""
+        try:
+            # Try to get collection info
+            info = self.get_collection_info()
+            if not info:
+                return False
+
+            # Try a simple search to verify functionality
+            test_results = self.search_similar("test query", limit=1)
+            # This should not fail even if no results are found
+
+            print(f"Collection health check passed. Points count: {info.get('points_count', 0)}")
+            return True
+
+        except Exception as e:
+            print(f"Collection health check failed: {e}")
+            return False
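For reference, a minimal usage sketch (not part of the commit) exercising the VectorStore methods touched here, assuming the Qdrant connection settings that VectorStore reads from the environment are already configured; the example text and metadata values are illustrative only:

    from backend.vector_store import VectorStore

    store = VectorStore()  # __init__ now calls _ensure_collection_exists()

    if store.verify_collection_health():
        # add_document retries up to 3 times with exponential backoff and
        # recreates the collection if Qdrant reports it missing
        ok = store.add_document("Qdrant stores embeddings.", metadata={"source": "example"})
        print("stored:", ok)

        # search_similar now returns [] instead of raising when the
        # collection is not accessible
        for hit in store.search_similar("vector database", limit=3):
            print(hit)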