WebashalarForML committed on
Commit
6277cb8
·
verified ·
1 Parent(s): 61b6a84

Update retrival.py

Browse files
Files changed (1) hide show
  1. retrival.py +121 -73
retrival.py CHANGED
@@ -317,80 +317,119 @@ async def save_to_chroma(chunks: list[Document], name: str, tables: list[Documen
317
  ####----------------------------------------------------------- Updating Existing Data in Vector DB -----------------------------------------------####
318
  ########################################################################################################################################################
319
 
320
- # def add_document_to_existing_db(new_documents: list[Document], db_name: str):
321
- # CHROMA_PATH = f"./VectorDB/chroma_{db_name}"
322
-
323
- # if not os.path.exists(CHROMA_PATH):
324
- # print(f"Database '{db_name}' does not exist. Please create it first.")
325
- # return
326
-
327
- # try:
328
- # embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
329
- # #embedding_function = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
330
- # db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)
331
-
332
- # print("Adding new documents to the existing database...")
333
- # chunks = split_text(new_documents)
334
- # db.add_documents(chunks)
335
- # db.persist()
336
- # print("New documents added and database updated successfully.")
337
- # except Exception as e:
338
- # print("Error while adding documents to existing database:", e)
339
-
340
- # def delete_chunks_by_source(chroma_path, source_to_delete):
341
- # if not os.path.exists(chroma_path):
342
- # print(f"Database at path '{chroma_path}' does not exist.")
343
- # return
344
-
345
- # try:
346
- # #embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
347
- # embedding_function = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
348
- # db = Chroma(persist_directory=chroma_path, embedding_function=embedding_function)
349
-
350
- # print(f"Retrieving all metadata to identify chunks with source '{source_to_delete}'...")
351
- # metadata_list = db.get()["metadatas"]
352
-
353
- # # Identify indices of chunks to delete
354
- # indices_to_delete = [
355
- # idx for idx, metadata in enumerate(metadata_list) if metadata.get("source") == source_to_delete
356
- # ]
357
-
358
- # if not indices_to_delete:
359
- # print(f"No chunks found with source '{source_to_delete}'.")
360
- # return
361
-
362
- # print(f"Deleting {len(indices_to_delete)} chunks with source '{source_to_delete}'...")
363
- # db.delete(indices=indices_to_delete)
364
- # db.persist()
365
- # print("Chunks deleted and database updated successfully.")
366
- # except Exception as e:
367
- # print(f"Error while deleting chunks by source: {e}")
368
 
369
- # # update a data store
370
- # def update_data_store(file_path, db_name):
371
- # CHROMA_PATH = f"./VectorDB/chroma_{db_name}"
372
- # print(f"Filepath ===> {file_path} DB Name ====> {db_name}")
373
-
374
- # try:
375
- # documents,table_document = load_document(file_path)
376
- # print("Documents loaded successfully.")
377
- # except Exception as e:
378
- # print(f"Error loading documents: {e}")
379
- # return
380
-
381
- # try:
382
- # chunks = split_text(documents)
383
- # print(f"Text split into {len(chunks)} chunks.")
384
- # except Exception as e:
385
- # print(f"Error splitting text: {e}")
386
- # return
387
-
388
- # try:
389
- # asyncio.run(save_to_chroma(save_to_chroma(chunks, db_name, table_document)))
390
- # print(f"Data saved to Chroma for database {db_name}.")
391
- # except Exception as e:
392
- # print(f"Error saving to Chroma: {e}")
393
- # return
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
394
 
395
  ########################################################################################################################################################
396
  ####------------------------------------------------------- Combine Process of Load, Chunk and Store ----------------------------------------------####
@@ -423,3 +462,12 @@ async def generate_data_store(file_path, db_name):
423
  print(f"Error saving to Chroma: {e}")
424
  return
425
 
 
 
 
 
 
 
 
 
 
 
317
  ####----------------------------------------------------------- Updating Existing Data in Vector DB -----------------------------------------------####
318
  ########################################################################################################################################################
319
 
320
# adding document to Existing db
async def add_document_to_existing_db(new_chunks: list[Document], db_name: str, tables: list[Document]):
    """Append pre-chunked documents (and optional table documents) to an existing Chroma store.

    Args:
        new_chunks: Already-split Document chunks to add to the text vector DB.
        db_name: Name of the store; resolved to ./VectorDB/{db_name} and ./TableDB/{db_name}.
        tables: Table Documents to add to the table vector DB; may be empty.

    Returns:
        (db, tdb) on success — tdb is None when no tables were given;
        None when the document DB does not exist or an error occurred.
    """
    CHROMA_PATH = f"./VectorDB/{db_name}"
    TABLE_PATH = f"./TableDB/{db_name}"

    # The document DB must already exist — this function only appends to it.
    if not os.path.exists(CHROMA_PATH):
        print(f"Database '{db_name}' does not exist. Please create it first.")
        return

    try:
        # Load the embedding model.
        # NOTE(review): must match the model the DB was originally built with,
        # otherwise new vectors are incompatible with existing ones — confirm.
        embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2", show_progress=True)
        # embedding_function = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")

        # Open the existing document DB and append the new chunks.
        # [NOTE: Some of the data is converted to string because int and float show null if added]
        print("Creating document vector database...")
        db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)
        print("Persisting the document database...")
        db.add_documents(new_chunks)
        db.persist()
        print("Document database successfully saved.")

        # Create or extend the table DB only when table documents were supplied.
        if tables != []:
            print("Creating table vector database...")
            if not os.path.exists(TABLE_PATH):
                # First table upload for this store: build the table DB from scratch.
                print(f"Database '{db_name}' does not exist. Let's create it first.")
                print("Persisting the table database...")
                tdb = Chroma.from_documents(
                    documents=tables,
                    embedding=embedding_function,
                    persist_directory=TABLE_PATH,
                )
            else:
                tdb = Chroma(persist_directory=TABLE_PATH, embedding_function=embedding_function)
                print("Persisting the table database...")
                # BUG FIX: previously called db.add_documents/db.persist here, which
                # wrote the table documents into the *document* DB and left tdb stale.
                tdb.add_documents(tables)
                tdb.persist()
                print("Table database successfully saved.")
        else:
            tdb = None

        return db, tdb

    except Exception as e:
        print("Error while saving to Chroma:", e)
        return None
372
#delete chunks by logics
def delete_chunks_by_source(chroma_path, source_to_delete):
    """Delete every chunk whose metadata 'source' equals source_to_delete from a Chroma DB.

    Args:
        chroma_path: Filesystem path of the persisted Chroma database.
        source_to_delete: Value of the 'source' metadata field identifying chunks to remove.

    Returns:
        None. Progress and errors are reported via print().
    """
    if not os.path.exists(chroma_path):
        print(f"Database at path '{chroma_path}' does not exist.")
        return

    try:
        #embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
        # NOTE(review): this model differs from the one used when adding documents
        # (all-MiniLM-L6-v2) — harmless for deletion, but confirm which is canonical.
        embedding_function = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
        db = Chroma(persist_directory=chroma_path, embedding_function=embedding_function)

        print(f"Retrieving all metadata to identify chunks with source '{source_to_delete}'...")
        # db.get() returns parallel lists: "ids" and "metadatas".
        records = db.get()

        # BUG FIX: Chroma's delete() takes ids, not positional indices — the old
        # db.delete(indices=...) raised TypeError, which the except below swallowed,
        # so nothing was ever deleted. Collect the matching ids instead.
        ids_to_delete = [
            chunk_id
            for chunk_id, metadata in zip(records["ids"], records["metadatas"])
            if metadata.get("source") == source_to_delete
        ]

        if not ids_to_delete:
            print(f"No chunks found with source '{source_to_delete}'.")
            return

        print(f"Deleting {len(ids_to_delete)} chunks with source '{source_to_delete}'...")
        db.delete(ids=ids_to_delete)
        db.persist()
        print("Chunks deleted and database updated successfully.")
    except Exception as e:
        print(f"Error while deleting chunks by source: {e}")
402
########################################################################################################################################################
####-----------------------------------------------Combine Process of upload, Chunk and Store (FOR NEW DOC)----------------------------------------####
########################################################################################################################################################

# update a data store
async def update_data_store(file_path, db_name):
    """Load a file, split it into chunks, and append them to the existing vector store.

    Pipeline: load_document -> split_text -> add_document_to_existing_db.
    Each stage is wrapped so a failure is reported and aborts the pipeline.

    Args:
        file_path: Path of the uploaded file to ingest.
        db_name: Name of the existing vector store to update.

    Returns:
        None. Progress and errors are reported via print().
    """
    # NOTE: the actual DB path (./VectorDB/{db_name}) is resolved inside
    # add_document_to_existing_db; the old unused CHROMA_PATH here pointed at a
    # different, inconsistent location (./VectorDB/chroma_{db_name}) and was removed.
    print(f"Filepath ===> {file_path} DB Name ====> {db_name}")

    try:
        documents, processed_documents, table_document = load_document(file_path)
        #grouped_document,document = load_document(file_path)
        print("Documents loaded successfully.")
    except Exception as e:
        print(f"Error loading documents: {e}")
        return

    try:
        chunks = split_text(documents)
        print(f"Text split into {len(chunks)} chunks.")
    except Exception as e:
        print(f"Error splitting text: {e}")
        return

    try:
        await add_document_to_existing_db(chunks, db_name, table_document)
        #await asyncio.run(save_to_chroma(chunks, db_name,table_document))
        print(f"Data saved to Chroma for database {db_name}.")
    except Exception as e:
        print(f"Error saving to Chroma: {e}")
        return
433
 
434
  ########################################################################################################################################################
435
  ####------------------------------------------------------- Combine Process of Load, Chunk and Store ----------------------------------------------####
 
462
  print(f"Error saving to Chroma: {e}")
463
  return
464
 
465
########################################################################################################################################################
####-------------------------------------------------------------------- Token counter -----------------------------------------------------------####
########################################################################################################################################################

def approximate_bpe_token_counter(text):
    """Roughly estimate a BPE-style token count for *text*.

    Counts one token per run of word characters and one per individual
    non-space punctuation/symbol character, approximating subword splits.
    """
    # \w+ grabs whole word runs; [^\w\s] grabs each punctuation mark separately.
    token_pattern = re.compile(r"\w+|[^\w\s]", re.UNICODE)
    return sum(1 for _ in token_pattern.finditer(text))