WebashalarForML committed on
Commit
6277cb8
·
verified ·
1 Parent(s): 61b6a84

Update retrival.py

Browse files
Files changed (1) hide show
  1. retrival.py +121 -73
retrival.py CHANGED
@@ -317,80 +317,119 @@ async def save_to_chroma(chunks: list[Document], name: str, tables: list[Documen
317
  ####----------------------------------------------------------- Updating Existing Data in Vector DB -----------------------------------------------####
318
  ########################################################################################################################################################
319
 
320
- # def add_document_to_existing_db(new_documents: list[Document], db_name: str):
321
- # CHROMA_PATH = f"./VectorDB/chroma_{db_name}"
322
-
323
- # if not os.path.exists(CHROMA_PATH):
324
- # print(f"Database '{db_name}' does not exist. Please create it first.")
325
- # return
326
-
327
- # try:
328
- # embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
329
- # #embedding_function = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
330
- # db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)
331
-
332
- # print("Adding new documents to the existing database...")
333
- # chunks = split_text(new_documents)
334
- # db.add_documents(chunks)
335
- # db.persist()
336
- # print("New documents added and database updated successfully.")
337
- # except Exception as e:
338
- # print("Error while adding documents to existing database:", e)
339
-
340
- # def delete_chunks_by_source(chroma_path, source_to_delete):
341
- # if not os.path.exists(chroma_path):
342
- # print(f"Database at path '{chroma_path}' does not exist.")
343
- # return
344
-
345
- # try:
346
- # #embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
347
- # embedding_function = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
348
- # db = Chroma(persist_directory=chroma_path, embedding_function=embedding_function)
349
-
350
- # print(f"Retrieving all metadata to identify chunks with source '{source_to_delete}'...")
351
- # metadata_list = db.get()["metadatas"]
352
-
353
- # # Identify indices of chunks to delete
354
- # indices_to_delete = [
355
- # idx for idx, metadata in enumerate(metadata_list) if metadata.get("source") == source_to_delete
356
- # ]
357
-
358
- # if not indices_to_delete:
359
- # print(f"No chunks found with source '{source_to_delete}'.")
360
- # return
361
-
362
- # print(f"Deleting {len(indices_to_delete)} chunks with source '{source_to_delete}'...")
363
- # db.delete(indices=indices_to_delete)
364
- # db.persist()
365
- # print("Chunks deleted and database updated successfully.")
366
- # except Exception as e:
367
- # print(f"Error while deleting chunks by source: {e}")
368
 
369
- # # update a data store
370
- # def update_data_store(file_path, db_name):
371
- # CHROMA_PATH = f"./VectorDB/chroma_{db_name}"
372
- # print(f"Filepath ===> {file_path} DB Name ====> {db_name}")
373
-
374
- # try:
375
- # documents,table_document = load_document(file_path)
376
- # print("Documents loaded successfully.")
377
- # except Exception as e:
378
- # print(f"Error loading documents: {e}")
379
- # return
380
-
381
- # try:
382
- # chunks = split_text(documents)
383
- # print(f"Text split into {len(chunks)} chunks.")
384
- # except Exception as e:
385
- # print(f"Error splitting text: {e}")
386
- # return
387
-
388
- # try:
389
- # asyncio.run(save_to_chroma(save_to_chroma(chunks, db_name, table_document)))
390
- # print(f"Data saved to Chroma for database {db_name}.")
391
- # except Exception as e:
392
- # print(f"Error saving to Chroma: {e}")
393
- # return
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
394
 
395
  ########################################################################################################################################################
396
  ####------------------------------------------------------- Combine Process of Load, Chunk and Store ----------------------------------------------####
@@ -423,3 +462,12 @@ async def generate_data_store(file_path, db_name):
423
  print(f"Error saving to Chroma: {e}")
424
  return
425
 
 
 
 
 
 
 
 
 
 
 
317
  ####----------------------------------------------------------- Updating Existing Data in Vector DB -----------------------------------------------####
318
  ########################################################################################################################################################
319
 
320
# adding document to Existing db
async def add_document_to_existing_db(new_chunks: list[Document], db_name: str, tables: list[Document]):
    """Append pre-chunked documents (and optional table documents) to an existing Chroma store.

    Args:
        new_chunks: Already-split Document chunks to add to the text vector DB.
        db_name: Name of the store; resolved to ./VectorDB/{db_name} and ./TableDB/{db_name}.
        tables: Table Documents to add to the table vector DB; may be empty.

    Returns:
        (db, tdb) on success — tdb is None when no tables were given;
        None when the document DB does not exist or an error occurred.
    """
    CHROMA_PATH = f"./VectorDB/{db_name}"
    TABLE_PATH = f"./TableDB/{db_name}"

    # The document DB must already exist — this function only appends to it.
    if not os.path.exists(CHROMA_PATH):
        print(f"Database '{db_name}' does not exist. Please create it first.")
        return

    try:
        # Load the embedding model.
        # NOTE(review): must match the model the DB was originally built with,
        # otherwise new vectors are incompatible with existing ones — confirm.
        embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2", show_progress=True)
        # embedding_function = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")

        # Open the existing document DB and append the new chunks.
        # [NOTE: Some of the data is converted to string because int and float show null if added]
        print("Creating document vector database...")
        db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)
        print("Persisting the document database...")
        db.add_documents(new_chunks)
        db.persist()
        print("Document database successfully saved.")

        # Create or extend the table DB only when table documents were supplied.
        if tables != []:
            print("Creating table vector database...")
            if not os.path.exists(TABLE_PATH):
                # First table upload for this store: build the table DB from scratch.
                print(f"Database '{db_name}' does not exist. Let's create it first.")
                print("Persisting the table database...")
                tdb = Chroma.from_documents(
                    documents=tables,
                    embedding=embedding_function,
                    persist_directory=TABLE_PATH,
                )
            else:
                tdb = Chroma(persist_directory=TABLE_PATH, embedding_function=embedding_function)
                print("Persisting the table database...")
                # BUG FIX: previously called db.add_documents/db.persist here, which
                # wrote the table documents into the *document* DB and left tdb stale.
                tdb.add_documents(tables)
                tdb.persist()
                print("Table database successfully saved.")
        else:
            tdb = None

        return db, tdb

    except Exception as e:
        print("Error while saving to Chroma:", e)
        return None
372
#delete chunks by logics
def delete_chunks_by_source(chroma_path, source_to_delete):
    """Delete every chunk whose metadata 'source' equals source_to_delete from a Chroma DB.

    Args:
        chroma_path: Filesystem path of the persisted Chroma database.
        source_to_delete: Value of the 'source' metadata field identifying chunks to remove.

    Returns:
        None. Progress and errors are reported via print().
    """
    if not os.path.exists(chroma_path):
        print(f"Database at path '{chroma_path}' does not exist.")
        return

    try:
        #embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
        # NOTE(review): this model differs from the one used when adding documents
        # (all-MiniLM-L6-v2) — harmless for deletion, but confirm which is canonical.
        embedding_function = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
        db = Chroma(persist_directory=chroma_path, embedding_function=embedding_function)

        print(f"Retrieving all metadata to identify chunks with source '{source_to_delete}'...")
        # db.get() returns parallel lists: "ids" and "metadatas".
        records = db.get()

        # BUG FIX: Chroma's delete() takes ids, not positional indices — the old
        # db.delete(indices=...) raised TypeError, which the except below swallowed,
        # so nothing was ever deleted. Collect the matching ids instead.
        ids_to_delete = [
            chunk_id
            for chunk_id, metadata in zip(records["ids"], records["metadatas"])
            if metadata.get("source") == source_to_delete
        ]

        if not ids_to_delete:
            print(f"No chunks found with source '{source_to_delete}'.")
            return

        print(f"Deleting {len(ids_to_delete)} chunks with source '{source_to_delete}'...")
        db.delete(ids=ids_to_delete)
        db.persist()
        print("Chunks deleted and database updated successfully.")
    except Exception as e:
        print(f"Error while deleting chunks by source: {e}")
402
########################################################################################################################################################
####-----------------------------------------------Combine Process of upload, Chunk and Store (FOR NEW DOC)----------------------------------------####
########################################################################################################################################################

# update a data store
async def update_data_store(file_path, db_name):
    """Load a file, split it into chunks, and append them to the existing vector store.

    Pipeline: load_document -> split_text -> add_document_to_existing_db.
    Each stage is wrapped so a failure is reported and aborts the pipeline.

    Args:
        file_path: Path of the uploaded file to ingest.
        db_name: Name of the existing vector store to update.

    Returns:
        None. Progress and errors are reported via print().
    """
    # NOTE: the actual DB path (./VectorDB/{db_name}) is resolved inside
    # add_document_to_existing_db; the old unused CHROMA_PATH here pointed at a
    # different, inconsistent location (./VectorDB/chroma_{db_name}) and was removed.
    print(f"Filepath ===> {file_path} DB Name ====> {db_name}")

    try:
        documents, processed_documents, table_document = load_document(file_path)
        #grouped_document,document = load_document(file_path)
        print("Documents loaded successfully.")
    except Exception as e:
        print(f"Error loading documents: {e}")
        return

    try:
        chunks = split_text(documents)
        print(f"Text split into {len(chunks)} chunks.")
    except Exception as e:
        print(f"Error splitting text: {e}")
        return

    try:
        await add_document_to_existing_db(chunks, db_name, table_document)
        #await asyncio.run(save_to_chroma(chunks, db_name,table_document))
        print(f"Data saved to Chroma for database {db_name}.")
    except Exception as e:
        print(f"Error saving to Chroma: {e}")
        return
433
 
434
  ########################################################################################################################################################
435
  ####------------------------------------------------------- Combine Process of Load, Chunk and Store ----------------------------------------------####
 
462
  print(f"Error saving to Chroma: {e}")
463
  return
464
 
465
########################################################################################################################################################
####-------------------------------------------------------------------- Token counter -----------------------------------------------------------####
########################################################################################################################################################

def approximate_bpe_token_counter(text):
    """Roughly estimate a BPE-style token count for *text*.

    Counts one token per run of word characters and one per individual
    non-space punctuation/symbol character, approximating subword splits.
    """
    # \w+ grabs whole word runs; [^\w\s] grabs each punctuation mark separately.
    token_pattern = re.compile(r"\w+|[^\w\s]", re.UNICODE)
    return sum(1 for _ in token_pattern.finditer(text))