Update retrival.py
Browse files
retrival.py  +358 -75
retrival.py CHANGED

@@ -1,17 +1,19 @@
 from langchain_community.document_loaders import DirectoryLoader
-from langchain.embeddings import HuggingFaceEmbeddings 
-from langchain.text_splitter import RecursiveCharacterTextSplitter 
-from langchain.schema import Document 
+from langchain.embeddings import HuggingFaceEmbeddings
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.schema import Document
 from langchain_core.documents import Document
-from langchain_openai import OpenAIEmbeddings
 from langchain_community.vectorstores import Chroma
-import openai 
-import openai
 import os
 import shutil
+import asyncio
+from unstructured.partition.pdf import partition_pdf
+from unstructured.partition.auto import partition
+import pytesseract
+import os
+import re
 import uuid
-
-
+pytesseract.pytesseract.tesseract_cmd = (r'/usr/bin/tesseract')
 
 # Configurations
 UPLOAD_FOLDER = "./uploads"
@@ -19,86 +21,375 @@ VECTOR_DB_FOLDER = "./VectorDB"
 os.makedirs(UPLOAD_FOLDER, exist_ok=True)
 os.makedirs(VECTOR_DB_FOLDER, exist_ok=True)
 
-
+########################################################################################################################################################
+####--------------------------------------------------------------  Documnet Loader  ---------------------------------------------------------------####
+########################################################################################################################################################
+# Loaders for loading Document text, tables and images from any file format.
+#data_path=r"H:\DEV PATEL\2025\RAG Project\test_data\google data"
 def load_document(data_path):
+    processed_documents = []
+    element_content = []
+    table_document = []
+    #having different process for the pdf
+    for root, _, files in os.walk(data_path):
+        for file in files:
+            file_path = os.path.join(root, file)
+            doc_id = str(uuid.uuid4())  # Generate a unique ID for the document
+
+            print(f"Processing document ID: {doc_id}, Path: {file_path}")
+
+            try:
+                # Determine the file type based on extension
+                filename, file_extension = os.path.splitext(file.lower())
+                image_output = f"H:/DEV PATEL/2025/RAG Project/Images/{filename}/"
+                # Use specific partition techniques based on file extension
+                if file_extension == ".pdf":
+                    elements = partition_pdf(
+                        filename=file_path,
+                        strategy="hi_res",  # Use layout detection
+                        infer_table_structure=True,
+                        hi_res_model_name="yolox",
+                        extract_images_in_pdf=True,
+                        extract_image_block_types=["Image","Table"],
+                        extract_image_block_output_dir=image_output,
+                        show_progress=True,
+                        #chunking_strategy="by_title",
+                    )
+                else:
+                    # Default to auto partition if no specific handler is found
+                    elements = partition(
+                        filename=file_path,
+                        strategy="hi_res",
+                        infer_table_structure=True,
+                        show_progress=True,
+                        #chunking_strategy="by_title"
+                    )
+            except Exception as e:
+                print(f"Failed to process document {file_path}: {e}")
+                continue
+            categorized_content = {
+                "tables": {"content": [], "Metadata": []},
+                "images": {"content": [], "Metadata": []},
+                "text": {"content": [], "Metadata": []},
+                "text2": {"content": [], "Metadata": []}
+            }
+            element_content.append(elements)
+            CNT=1
+            for chunk in elements:
+                # Safely extract metadata and text
+                chunk_type = str(type(chunk))
+                chunk_metadata = chunk.metadata.to_dict() if chunk.metadata else {}
+                chunk_text = getattr(chunk, "text", None)
+
+                # Separate content into categories
+                #if "Table" in chunk_type:
+                if any(
+                    keyword in chunk_type
+                    for keyword in [
+                        "Table",
+                        "TableChunk"]):
+                    categorized_content["tables"]["content"].append(chunk_text)
+                    categorized_content["tables"]["Metadata"].append(chunk_metadata)
+
+                    #test1
+                    TABLE_DATA=f"Table number {CNT} "+chunk_metadata.get("text_as_html", "")+" "
+                    CNT+=1
+                    categorized_content["text"]["content"].append(TABLE_DATA)
+                    categorized_content["text"]["Metadata"].append(chunk_metadata)
+
+                elif "Image" in chunk_type:
+                    categorized_content["images"]["content"].append(chunk_text)
+                    categorized_content["images"]["Metadata"].append(chunk_metadata)
+                elif any(
+                    keyword in chunk_type
+                    for keyword in [
+                        "CompositeElement",
+                        "Text",
+                        "NarrativeText",
+                        "Title",
+                        "Header",
+                        "Footer",
+                        "FigureCaption",
+                        "ListItem",
+                        "UncategorizedText",
+                        "Formula",
+                        "CodeSnippet",
+                        "Address",
+                        "EmailAddress",
+                        "PageBreak",
+                    ]
+                ):
+                    categorized_content["text"]["content"].append(chunk_text)
+                    categorized_content["text"]["Metadata"].append(chunk_metadata)
+
+                else:
+                    continue
+            # Append processed document
+            processed_documents.append({
+                "doc_id": doc_id,
+                "source": file_path,
+                **categorized_content,
+            })
+
+    # Loop over tables and match text from the same document and page
+
+    for doc in processed_documents:
+        cnt=1 # count for storing number of the table
+        for table_metadata in doc.get("tables", {}).get("Metadata", []):
+            page_number = table_metadata.get("page_number")
+            source = doc.get("source")
+            page_content = ""
+
+            for text_metadata, text_content in zip(
+                doc.get("text", {}).get("Metadata", []),
+                doc.get("text", {}).get("content", [])
+            ):
+                page_number2 = text_metadata.get("page_number")
+                source2 = doc.get("source")
+
+                if source == source2 and page_number == page_number2:
+                    print(f"Matching text found for source: {source}, page: {page_number}")
+                    page_content += f"{text_content} "  # Concatenate text with a space
+
+            # Add the matched content to the table metadata
+            table_metadata["page_content"] =f"Table number {cnt} "+table_metadata.get("text_as_html", "")+" "+page_content.strip()  # Remove trailing spaces and have the content proper here
+            table_metadata["text_as_html"] = table_metadata.get("text_as_html", "") # we are also storing it seperatly
+            table_metadata["Table_number"] = cnt  # addiing the table number it will be use in retrival
+            cnt+=1
+
+            # Custom loader of document which will store the table along with the text on that page specifically
+            # making document of each table with its content
+            unique_id = str(uuid.uuid4())
+            table_document.append(
+                Document(
+
+                    id =unique_id, # Add doc_id directly
+                    page_content=table_metadata.get("page_content", ""),  # Get page_content from metadata, default to empty string if missing
+                    metadata={
+                        "source": doc["source"],
+                        "text_as_html": table_metadata.get("text_as_html", ""),
+                        "filetype": table_metadata.get("filetype", ""),
+                        "page_number": str(table_metadata.get("page_number", 0)),  # Default to 0 if missing
+                        "image_path": table_metadata.get("image_path", ""),
+                        "file_directory": table_metadata.get("file_directory", ""),
+                        "filename": table_metadata.get("filename", ""),
+                        "Table_number": str(table_metadata.get("Table_number", 0))  # Default to 0 if missing
+                    }
+                )
+            )
+
+    # Initialize a structure to group content by doc_id
+    grouped_by_doc_id = defaultdict(lambda: {
+        "text_content": [],
+        "metadata": None,  # Metadata will only be set once per doc_id
+    })
+
+    for doc in processed_documents:
+        doc_id = doc.get("doc_id")
+        source = doc.get("source")
+        text_content = doc.get("text", {}).get("content", [])
+        metadata_list = doc.get("text", {}).get("Metadata", [])
+
+        # Merge text content
+        grouped_by_doc_id[doc_id]["text_content"].extend(text_content)
+
+        # Set metadata (if not already set)
+        if grouped_by_doc_id[doc_id]["metadata"] is None and metadata_list:
+            metadata = metadata_list[0]  # Assuming metadata is consistent
+            grouped_by_doc_id[doc_id]["metadata"] = {
+                "source": source,
+                "filetype": metadata.get("filetype"),
+                "file_directory": metadata.get("file_directory"),
+                "filename": metadata.get("filename"),
+                "languages": str(metadata.get("languages")),
+            }
+
+    # Convert grouped content into Document objects
+    grouped_documents = []
+    for doc_id, data in grouped_by_doc_id.items():
+        grouped_documents.append(
+            Document(
+                id=doc_id,
+                page_content=" ".join(data["text_content"]).strip(),
+                metadata=data["metadata"],
+            )
+        )
+
+    # Output the grouped documents
+    for document in grouped_documents:
+        print(document)
 
-
+
+    #Dirctory loader for loading the text data only to specific db
     loader = DirectoryLoader(data_path, glob="*.*")
-
-
-
+    documents = loader.load()
+
+    # update the metadata adding filname to the met
+    for doc in documents:
+        unique_id = str(uuid.uuid4())
+        doc.id = unique_id
+        path=doc.metadata.get("source")
+        match = re.search(r'([^\\]+\.[^\\]+)$', path)
+        doc.metadata.update({"filename":match.group(1)})
+
+    return documents,grouped_documents
+#documents,processed_documents,table_document = load_document(data_path)
+
+
+########################################################################################################################################################
+####-------------------------------------------------------------- Chunking the Text  --------------------------------------------------------------####
+########################################################################################################################################################
 
-# Creating the chunks of Data from the knowledge
 def split_text(documents: list[Document]):
     text_splitter = RecursiveCharacterTextSplitter(
-        chunk_size
-        chunk_overlap
-        length_function
+        chunk_size=1000,
+        chunk_overlap=500,
+        length_function=len,
         add_start_index=True,
-    )
-    chunks = text_splitter.split_documents(documents)
+    )
+    chunks = text_splitter.split_documents(documents) # splitting the document into chunks
+    for index in chunks:
+        index.metadata["start_index"]=str(index.metadata["start_index"]) # the converstion of int metadata to str was done to store it in sqlite3
     print(f"Split {len(documents)} documents into {len(chunks)} chunks.")
-
     return chunks
 
-
-
-
-#     CHROMA_PATH = f"./VectorDB/chroma_{name}"
-#     # Clear out the database first.
-#     if os.path.exists(CHROMA_PATH):
-#         shutil.rmtree(CHROMA_PATH)
-
-#     # Initialize SBERT embedding function
-#     embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
-#     db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)
-
-#     # Add documents and persist the database
-#     db.add_documents(chunks)
-#     db.persist()
-#     # Return the database instance or a success status
-#     return db
+########################################################################################################################################################
+####---------------------------------------------------- Creating and Storeing Data in Vector DB  --------------------------------------------------####
+########################################################################################################################################################
 
-def save_to_chroma(chunks: list[Document], name: str):
+def save_to_chroma(chunks: list[Document], name: str, tables: list[Document]):
     CHROMA_PATH = f"./VectorDB/chroma_{name}"
-
-    # Clear out the database first
+    TABLE_PATH = f"./TableDB/chroma_{name}"
     if os.path.exists(CHROMA_PATH):
         shutil.rmtree(CHROMA_PATH)
-
+    if os.path.exists(TABLE_PATH):
+        shutil.rmtree(TABLE_PATH)
+
+    try:
+        # Load the embedding model
+        #embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
+        embedding_function = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
+        # Create Chroma DB for documents using from_documents [NOTE: Some of the data is converted to string because int and float show null if added]
+        print("Creating document vector database...")
+        db = Chroma.from_documents(
+            documents=chunks,
+            embedding=embedding_function,
+            persist_directory=CHROMA_PATH,
+        )
+        print("Document database successfully saved.")
+
+        # Create Chroma DB for tables if available [NOTE: Some of the data is converted to string because int and float show null if added]
+        if tables:
+            print("Creating table vector database...")
+            tdb = Chroma.from_documents(
+                documents=tables,
+                embedding=embedding_function,
+                persist_directory=TABLE_PATH,
+            )
+            print("Table database successfully saved.")
+        else:
+            tdb = None
+
+        return db, tdb
+    except Exception as e:
+        print("Error while saving to Chroma:", e)
+        return None
+
+# def get_unique_sources(chroma_path):
+#     db = Chroma(persist_directory=chroma_path)
+#     metadata_list = db.get()["metadatas"]
+#     unique_sources = {metadata["source"] for metadata in metadata_list if "source" in metadata}
+#     return list(unique_sources)
+
+########################################################################################################################################################
+####----------------------------------------------------------- Updating Existing Data in Vector DB  -----------------------------------------------####
+########################################################################################################################################################
+
+def add_document_to_existing_db(new_documents: list[Document], db_name: str):
+    CHROMA_PATH = f"./VectorDB/chroma_{db_name}"
+
+    if not os.path.exists(CHROMA_PATH):
+        print(f"Database '{db_name}' does not exist. Please create it first.")
+        return
+
     try:
-        # Initialize SBERT embedding function
         embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
+        #embedding_function = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
         db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)
-
-
-
+
+        print("Adding new documents to the existing database...")
+        chunks = split_text(new_documents)
         db.add_documents(chunks)
-        print("Persisting the database...")
         db.persist()
-        print("
-
-        return db
+        print("New documents added and database updated successfully.")
     except Exception as e:
-        print("Error while
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        print("Error while adding documents to existing database:", e)
+
+def delete_chunks_by_source(chroma_path, source_to_delete):
+    if not os.path.exists(chroma_path):
+        print(f"Database at path '{chroma_path}' does not exist.")
+        return
+
+    try:
+        #embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
+        embedding_function = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
+        db = Chroma(persist_directory=chroma_path, embedding_function=embedding_function)
+
+        print(f"Retrieving all metadata to identify chunks with source '{source_to_delete}'...")
+        metadata_list = db.get()["metadatas"]
+
+        # Identify indices of chunks to delete
+        indices_to_delete = [
+            idx for idx, metadata in enumerate(metadata_list) if metadata.get("source") == source_to_delete
+        ]
+
+        if not indices_to_delete:
+            print(f"No chunks found with source '{source_to_delete}'.")
+            return
+
+        print(f"Deleting {len(indices_to_delete)} chunks with source '{source_to_delete}'...")
+        db.delete(indices=indices_to_delete)
+        db.persist()
+        print("Chunks deleted and database updated successfully.")
+    except Exception as e:
+        print(f"Error while deleting chunks by source: {e}")
+
+# # update a data store
+# def update_data_store(file_path, db_name):
+#     CHROMA_PATH = f"./VectorDB/chroma_{db_name}"
+#     print(f"Filepath ===> {file_path}  DB Name ====> {db_name}")
+
+#     try:
+#         documents,table_document = load_document(file_path)
+#         print("Documents loaded successfully.")
+#     except Exception as e:
+#         print(f"Error loading documents: {e}")
+#         return
+
+#     try:
+#         chunks = split_text(documents)
+#         print(f"Text split into {len(chunks)} chunks.")
+#     except Exception as e:
+#         print(f"Error splitting text: {e}")
+#         return
+
+#     try:
+#         asyncio.run(save_to_chroma(save_to_chroma(chunks, db_name, table_document)))
+#         print(f"Data saved to Chroma for database {db_name}.")
+#     except Exception as e:
+#         print(f"Error saving to Chroma: {e}")
+#         return
+
+########################################################################################################################################################
+####------------------------------------------------------- Combine Process of Load, Chunk and Store  ----------------------------------------------####
+########################################################################################################################################################
+
+def generate_data_store(file_path, db_name):
     CHROMA_PATH = f"./VectorDB/chroma_{db_name}"
-    print(f"
+    print(f"Filepath ===> {file_path}  DB Name ====> {db_name}")
+
     try:
-        documents = load_document(file_path)
+        documents,processed_documents,table_document = load_document(file_path)
         print("Documents loaded successfully.")
     except Exception as e:
         print(f"Error loading documents: {e}")
@@ -112,17 +403,9 @@ def generate_data_store(file_path,db_name):
         return
 
     try:
-        asyncio.run(save_to_chroma(chunks, db_name))
+        asyncio.run(save_to_chroma(save_to_chroma(chunks, db_name, table_document)))
         print(f"Data saved to Chroma for database {db_name}.")
     except Exception as e:
         print(f"Error saving to Chroma: {e}")
         return
-# def main():
-#     data_path = "H:\\DEV PATEL\\RAG Project\\data1"
-#     db_name = "Product_data"
-#     generate_data_store(data_path,db_name)
-
-# if __name__ == "__main__":
-#     main()
-
 
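For context on the new loader: the commit routes PDFs through unstructured's hi_res partitioner and everything else through partition(). A minimal standalone sketch of the same call pattern, assuming the unstructured PDF extras are installed and a local sample.pdf exists (the file name and the printing loop are illustrative, not part of the commit):

from unstructured.partition.pdf import partition_pdf

# Partition one PDF the way load_document does, minus image extraction.
elements = partition_pdf(
    filename="sample.pdf",          # hypothetical input file
    strategy="hi_res",              # layout-detection pipeline, as in the commit
    infer_table_structure=True,     # keeps table HTML in element metadata
    extract_images_in_pdf=True,
)
# Each element carries a type name (Table, NarrativeText, ...) that the
# commit's categorizer matches on, plus page-level metadata.
for el in elements:
    meta = el.metadata.to_dict() if el.metadata else {}
    print(type(el).__name__, meta.get("page_number"), (el.text or "")[:60])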
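The reworked split_text pins the splitter to chunk_size=1000 with chunk_overlap=500 and casts each chunk's start_index to str so Chroma's sqlite-backed metadata store accepts it. A self-contained sketch of that step, with a made-up demo document:

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,       # same settings as the commit's split_text
    chunk_overlap=500,
    length_function=len,
    add_start_index=True,
)
docs = [Document(page_content="lorem ipsum " * 500, metadata={"source": "demo.txt"})]
chunks = splitter.split_documents(docs)
for c in chunks:
    c.metadata["start_index"] = str(c.metadata["start_index"])  # int -> str for sqlite storage
print(f"Split {len(docs)} documents into {len(chunks)} chunks.")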
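save_to_chroma now persists two stores per knowledge base: ./VectorDB/chroma_{name} for text chunks and ./TableDB/chroma_{name} for per-table documents. A hedged retrieval sketch against both stores; db_name and the queries are hypothetical, and the stores are assumed to have been built with the same mxbai embedding model used at creation time:

from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

db_name = "Product_data"  # hypothetical store name
embedding = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")

# Reopen the two persisted stores created by save_to_chroma.
text_db = Chroma(persist_directory=f"./VectorDB/chroma_{db_name}", embedding_function=embedding)
table_db = Chroma(persist_directory=f"./TableDB/chroma_{db_name}", embedding_function=embedding)

for hit in text_db.similarity_search("summarise the revenue section", k=3):
    print(hit.metadata.get("filename"), hit.page_content[:80])
for hit in table_db.similarity_search("revenue by quarter", k=2):
    # Table documents carry Table_number and text_as_html in their metadata.
    print("Table", hit.metadata.get("Table_number"), hit.metadata.get("source"))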