WebashalarForML committed on
Commit
e0efaf2
·
verified ·
1 Parent(s): 16c4b55

Update retrival.py

Browse files
Files changed (1) hide show
  1. retrival.py +69 -65
retrival.py CHANGED
@@ -258,13 +258,14 @@ def split_text(documents: list[Document]):
258
  ####---------------------------------------------------- Creating and Storing Data in Vector DB ---------------------------------------------------####
259
  ########################################################################################################################################################
260
 
261
- def save_to_chroma(chunks: list[Document], name: str, tables: list[Document]):
262
- CHROMA_PATH = f"./VectorDB/chroma_{name}"
263
- TABLE_PATH = f"./TableDB/chroma_{name}"
 
264
  if os.path.exists(CHROMA_PATH):
265
  shutil.rmtree(CHROMA_PATH)
266
- if os.path.exists(TABLE_PATH):
267
- shutil.rmtree(TABLE_PATH)
268
 
269
  try:
270
  # Load the embedding model
@@ -279,19 +280,21 @@ def save_to_chroma(chunks: list[Document], name: str, tables: list[Document]):
279
  )
280
  print("Document database successfully saved.")
281
 
282
- # Create Chroma DB for tables if available [NOTE: Some of the data is converted to string because int and float show null if added]
283
- if tables:
284
- print("Creating table vector database...")
285
- tdb = Chroma.from_documents(
286
- documents=tables,
287
- embedding=embedding_function,
288
- persist_directory=TABLE_PATH,
289
- )
290
- print("Table database successfully saved.")
291
- else:
292
- tdb = None
293
-
294
- return db, tdb
 
 
295
  except Exception as e:
296
  print("Error while saving to Chroma:", e)
297
  return None
@@ -306,54 +309,54 @@ def save_to_chroma(chunks: list[Document], name: str, tables: list[Document]):
306
  ####----------------------------------------------------------- Updating Existing Data in Vector DB -----------------------------------------------####
307
  ########################################################################################################################################################
308
 
309
- def add_document_to_existing_db(new_documents: list[Document], db_name: str):
310
- CHROMA_PATH = f"./VectorDB/chroma_{db_name}"
311
-
312
- if not os.path.exists(CHROMA_PATH):
313
- print(f"Database '{db_name}' does not exist. Please create it first.")
314
- return
315
-
316
- try:
317
- embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
318
- #embedding_function = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
319
- db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)
320
-
321
- print("Adding new documents to the existing database...")
322
- chunks = split_text(new_documents)
323
- db.add_documents(chunks)
324
- db.persist()
325
- print("New documents added and database updated successfully.")
326
- except Exception as e:
327
- print("Error while adding documents to existing database:", e)
328
-
329
- def delete_chunks_by_source(chroma_path, source_to_delete):
330
- if not os.path.exists(chroma_path):
331
- print(f"Database at path '{chroma_path}' does not exist.")
332
- return
333
-
334
- try:
335
- #embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
336
- embedding_function = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
337
- db = Chroma(persist_directory=chroma_path, embedding_function=embedding_function)
338
 
339
- print(f"Retrieving all metadata to identify chunks with source '{source_to_delete}'...")
340
- metadata_list = db.get()["metadatas"]
 
341
 
342
- # Identify indices of chunks to delete
343
- indices_to_delete = [
344
- idx for idx, metadata in enumerate(metadata_list) if metadata.get("source") == source_to_delete
345
- ]
 
 
 
 
 
 
 
 
346
 
347
- if not indices_to_delete:
348
- print(f"No chunks found with source '{source_to_delete}'.")
349
- return
 
350
 
351
- print(f"Deleting {len(indices_to_delete)} chunks with source '{source_to_delete}'...")
352
- db.delete(indices=indices_to_delete)
353
- db.persist()
354
- print("Chunks deleted and database updated successfully.")
355
- except Exception as e:
356
- print(f"Error while deleting chunks by source: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
357
 
358
  # # update a data store
359
  # def update_data_store(file_path, db_name):
@@ -390,21 +393,22 @@ def generate_data_store(file_path, db_name):
390
  print(f"Filepath ===> {file_path} DB Name ====> {db_name}")
391
 
392
  try:
393
- documents,processed_documents,table_document = load_document(file_path)
394
  print("Documents loaded successfully.")
395
  except Exception as e:
396
  print(f"Error loading documents: {e}")
397
  return
398
 
399
  try:
400
- chunks = split_text(documents)
401
  print(f"Text split into {len(chunks)} chunks.")
402
  except Exception as e:
403
  print(f"Error splitting text: {e}")
404
  return
405
 
406
  try:
407
- asyncio.run(save_to_chroma(save_to_chroma(chunks, db_name, table_document)))
 
408
  print(f"Data saved to Chroma for database {db_name}.")
409
  except Exception as e:
410
  print(f"Error saving to Chroma: {e}")
 
258
  ####---------------------------------------------------- Creating and Storing Data in Vector DB ---------------------------------------------------####
259
  ########################################################################################################################################################
260
 
261
+ #def save_to_chroma(chunks: list[Document], name: str, tables: list[Document]):
262
+ def save_to_chroma(chunks: list[Document], name: str):
263
+ CHROMA_PATH = f"./VectorDB/chroma_{name}"
264
+ #TABLE_PATH = f"./TableDB/chroma_{name}"
265
  if os.path.exists(CHROMA_PATH):
266
  shutil.rmtree(CHROMA_PATH)
267
+ # if os.path.exists(TABLE_PATH):
268
+ # shutil.rmtree(TABLE_PATH)
269
 
270
  try:
271
  # Load the embedding model
 
280
  )
281
  print("Document database successfully saved.")
282
 
283
+ # # Create Chroma DB for tables if available [NOTE: Some of the data is converted to string because int and float show null if added]
284
+ # if tables:
285
+ # print("Creating table vector database...")
286
+ # tdb = Chroma.from_documents(
287
+ # documents=tables,
288
+ # embedding=embedding_function,
289
+ # persist_directory=TABLE_PATH,
290
+ # )
291
+ # print("Table database successfully saved.")
292
+ # else:
293
+ # tdb = None
294
+
295
+ #return db, tdb
296
+ return db
297
+
298
  except Exception as e:
299
  print("Error while saving to Chroma:", e)
300
  return None
 
309
  ####----------------------------------------------------------- Updating Existing Data in Vector DB -----------------------------------------------####
310
  ########################################################################################################################################################
311
 
312
+ # def add_document_to_existing_db(new_documents: list[Document], db_name: str):
313
+ # CHROMA_PATH = f"./VectorDB/chroma_{db_name}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
314
 
315
+ # if not os.path.exists(CHROMA_PATH):
316
+ # print(f"Database '{db_name}' does not exist. Please create it first.")
317
+ # return
318
 
319
+ # try:
320
+ # embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
321
+ # #embedding_function = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
322
+ # db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)
323
+
324
+ # print("Adding new documents to the existing database...")
325
+ # chunks = split_text(new_documents)
326
+ # db.add_documents(chunks)
327
+ # db.persist()
328
+ # print("New documents added and database updated successfully.")
329
+ # except Exception as e:
330
+ # print("Error while adding documents to existing database:", e)
331
 
332
+ # def delete_chunks_by_source(chroma_path, source_to_delete):
333
+ # if not os.path.exists(chroma_path):
334
+ # print(f"Database at path '{chroma_path}' does not exist.")
335
+ # return
336
 
337
+ # try:
338
+ # #embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
339
+ # embedding_function = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
340
+ # db = Chroma(persist_directory=chroma_path, embedding_function=embedding_function)
341
+
342
+ # print(f"Retrieving all metadata to identify chunks with source '{source_to_delete}'...")
343
+ # metadata_list = db.get()["metadatas"]
344
+
345
+ # # Identify indices of chunks to delete
346
+ # indices_to_delete = [
347
+ # idx for idx, metadata in enumerate(metadata_list) if metadata.get("source") == source_to_delete
348
+ # ]
349
+
350
+ # if not indices_to_delete:
351
+ # print(f"No chunks found with source '{source_to_delete}'.")
352
+ # return
353
+
354
+ # print(f"Deleting {len(indices_to_delete)} chunks with source '{source_to_delete}'...")
355
+ # db.delete(indices=indices_to_delete)
356
+ # db.persist()
357
+ # print("Chunks deleted and database updated successfully.")
358
+ # except Exception as e:
359
+ # print(f"Error while deleting chunks by source: {e}")
360
 
361
  # # update a data store
362
  # def update_data_store(file_path, db_name):
 
393
  print(f"Filepath ===> {file_path} DB Name ====> {db_name}")
394
 
395
  try:
396
+ documents,grouped_documents = load_document(file_path)
397
  print("Documents loaded successfully.")
398
  except Exception as e:
399
  print(f"Error loading documents: {e}")
400
  return
401
 
402
  try:
403
+ chunks = split_text(grouped_documents)
404
  print(f"Text split into {len(chunks)} chunks.")
405
  except Exception as e:
406
  print(f"Error splitting text: {e}")
407
  return
408
 
409
  try:
410
+ #asyncio.run(save_to_chroma(save_to_chroma(chunks, db_name, table_document)))
411
+ asyncio.run(save_to_chroma(save_to_chroma(chunks, db_name)))
412
  print(f"Data saved to Chroma for database {db_name}.")
413
  except Exception as e:
414
  print(f"Error saving to Chroma: {e}")