from langchain_community.document_loaders import DirectoryLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_community.vectorstores import Chroma
import os
import shutil
import asyncio
from unstructured.partition.pdf import partition_pdf
from unstructured.partition.auto import partition
import pytesseract
import re
import uuid
from collections import defaultdict

pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'

# Configurations
UPLOAD_FOLDER = "./uploads"
VECTOR_DB_FOLDER = "./VectorDB"
IMAGE_DB_FOLDER = "./Images"
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
os.makedirs(VECTOR_DB_FOLDER, exist_ok=True)
os.makedirs(IMAGE_DB_FOLDER, exist_ok=True)

########################################################################################################################################################
####--------------------------------------------------------------  Document Loader  ----------------------------------------------------------------####
########################################################################################################################################################
# Loads document text, tables and images from any supported file format.
#data_path=r"H:\DEV PATEL\2025\RAG Project\test_data\google data"
def load_document(data_path):
    processed_documents = []
    element_content = []
    table_document = []
    # PDFs get a dedicated partitioning path; all other formats fall back to auto-partition.
    for root, _, files in os.walk(data_path):
        for file in files:
            file_path = os.path.join(root, file)
            doc_id = str(uuid.uuid4())  # Generate a unique ID for the document

            print(f"Processing document ID: {doc_id}, Path: {file_path}")

            try:
                # Determine the file type based on extension
                filename, file_extension = os.path.splitext(file.lower())
                image_output = f"./Images/{filename}/"
                # Use specific partition techniques based on file extension
                if file_extension == ".pdf":
                    elements = partition_pdf(
                        filename=file_path,
                        strategy="hi_res",  # Use layout detection
                        infer_table_structure=True,
                        hi_res_model_name="yolox",
                        extract_images_in_pdf=True,
                        extract_image_block_types=["Image","Table"],
                        extract_image_block_output_dir=image_output,
                        show_progress=True,
                        #chunking_strategy="by_title",
                    )
                else:
                    # Default to auto partition if no specific handler is found
                    elements = partition(
                        filename=file_path,
                        strategy="hi_res",
                        infer_table_structure=True,
                        show_progress=True,
                        #chunking_strategy="by_title"
                    )
            except Exception as e:
                print(f"Failed to process document {file_path}: {e}")
                continue
            categorized_content = {
                "tables": {"content": [], "Metadata": []},
                "images": {"content": [], "Metadata": []},
                "text": {"content": [], "Metadata": []},
            }
            element_content.append(elements)
            CNT = 1  # running counter so tables can be numbered in the text stream
            for chunk in elements:
                # Safely extract metadata and text
                chunk_type = str(type(chunk))
                chunk_metadata = chunk.metadata.to_dict() if chunk.metadata else {}                
                chunk_text = getattr(chunk, "text", None)

                # Separate content into categories
                if any(
                    keyword in chunk_type
                    for keyword in [
                        "Table",
                        "TableChunk"]):                    
                    categorized_content["tables"]["content"].append(chunk_text)
                    categorized_content["tables"]["Metadata"].append(chunk_metadata)
                    
                    # Also push the table (as HTML) into the text stream so the
                    # table content gets embedded alongside the page text.
                    TABLE_DATA = f"Table number {CNT} " + chunk_metadata.get("text_as_html", "") + " "
                    CNT += 1
                    categorized_content["text"]["content"].append(TABLE_DATA)
                    categorized_content["text"]["Metadata"].append(chunk_metadata) 

                elif "Image" in chunk_type:
                    categorized_content["images"]["content"].append(chunk_text)
                    categorized_content["images"]["Metadata"].append(chunk_metadata)
                elif any(
                    keyword in chunk_type
                    for keyword in [
                        "CompositeElement",
                        "Text",
                        "NarrativeText",
                        "Title",
                        "Header",
                        "Footer",
                        "FigureCaption",
                        "ListItem",
                        "UncategorizedText",
                        "Formula",
                        "CodeSnippet",
                        "Address",
                        "EmailAddress",
                        "PageBreak",
                    ]
                ):
                    categorized_content["text"]["content"].append(chunk_text)
                    categorized_content["text"]["Metadata"].append(chunk_metadata)                     
                                
                else:
                    continue
            # Append processed document
            processed_documents.append({
                "doc_id": doc_id,
                "source": file_path,
                **categorized_content,
            })
            
    # Loop over tables and match text from the same document and page
    
    '''
    for doc in processed_documents:
        cnt=1 # count for storing number of the table
        for table_metadata in doc.get("tables", {}).get("Metadata", []):
            page_number = table_metadata.get("page_number")
            source = doc.get("source")
            page_content = ""
    
            for text_metadata, text_content in zip(
                doc.get("text", {}).get("Metadata", []),
                doc.get("text", {}).get("content", [])
            ):
                page_number2 = text_metadata.get("page_number")
                source2 = doc.get("source")
            
                if source == source2 and page_number == page_number2:
                    print(f"Matching text found for source: {source}, page: {page_number}")
                    page_content += f"{text_content} "  # Concatenate text with a space
        
            # Add the matched content to the table metadata 
            table_metadata["page_content"] = f"Table number {cnt} " + table_metadata.get("text_as_html", "") + " " + page_content.strip()  # strip trailing spaces so the content is clean
            table_metadata["text_as_html"] = table_metadata.get("text_as_html", "")  # also stored separately
            table_metadata["Table_number"] = cnt  # table number, used later during retrieval
            cnt+=1
        
            # Build one Document per table, storing the table together with the
            # text from the same page.
            unique_id = str(uuid.uuid4())
            table_document.append(
                Document(
                    
                    id =unique_id, # Add doc_id directly
                    page_content=table_metadata.get("page_content", ""),  # Get page_content from metadata, default to empty string if missing
                    metadata={
                        "source": doc["source"],
                        "text_as_html": table_metadata.get("text_as_html", ""),
                        "filetype": table_metadata.get("filetype", ""),
                        "page_number": str(table_metadata.get("page_number", 0)),  # Default to 0 if missing
                        "image_path": table_metadata.get("image_path", ""),
                        "file_directory": table_metadata.get("file_directory", ""),
                        "filename": table_metadata.get("filename", ""),
                        "Table_number": str(table_metadata.get("Table_number", 0))  # Default to 0 if missing
                    }
                )
            )
    '''

    # Initialize a structure to group content by doc_id
    grouped_by_doc_id = defaultdict(lambda: {
        "text_content": [],
        "metadata": None,  # Metadata will only be set once per doc_id
    })

    for doc in processed_documents:
        doc_id = doc.get("doc_id")
        source = doc.get("source")
        text_content = doc.get("text", {}).get("content", [])
        metadata_list = doc.get("text", {}).get("Metadata", [])

        # Merge text content
        grouped_by_doc_id[doc_id]["text_content"].extend(text_content)

        # Set metadata (if not already set)
        if grouped_by_doc_id[doc_id]["metadata"] is None and metadata_list:
            metadata = metadata_list[0]  # Assuming metadata is consistent
            grouped_by_doc_id[doc_id]["metadata"] = {
                "source": source,
                "filetype": metadata.get("filetype"),
                "file_directory": metadata.get("file_directory"),
                "filename": metadata.get("filename"),
                "languages": str(metadata.get("languages")),
            }

    # Convert grouped content into Document objects
    grouped_documents = []
    for doc_id, data in grouped_by_doc_id.items():
        grouped_documents.append(
            Document(
                id=doc_id,
                page_content=" ".join(data["text_content"]).strip(),
                metadata=data["metadata"],
            )
        )

    # Output the grouped documents
    for document in grouped_documents:
        print(document)


    # DirectoryLoader alternative: loads plain text only into a dedicated DB (kept for reference)
    '''
    loader = DirectoryLoader(data_path, glob="*.*")
    documents = loader.load()
    
    # update the metadata, adding the filename
    for doc in documents:   
        unique_id = str(uuid.uuid4()) 
        doc.id = unique_id  
        path=doc.metadata.get("source")
        match = re.search(r'([^\\]+\.[^\\]+)$', path)
        doc.metadata.update({"filename":match.group(1)})
    return documents
    '''
    return grouped_documents
#grouped_documents = load_document(data_path)
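
# Minimal usage sketch (not called anywhere in this module): load a directory
# and inspect the grouped Documents. UPLOAD_FOLDER is reused here purely as an
# example path; any directory containing parsable files would work.
def _preview_loaded_documents(data_path: str = UPLOAD_FOLDER, max_chars: int = 200):
    docs = load_document(data_path)
    for doc in docs:
        print(doc.id, doc.metadata.get("filename"))
        print(doc.page_content[:max_chars])
    return docs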


########################################################################################################################################################
####-------------------------------------------------------------- Chunking the Text  --------------------------------------------------------------####
########################################################################################################################################################

def split_text(documents: list[Document]):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=500,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)  # split each document into overlapping chunks
    for chunk in chunks:
        chunk.metadata["start_index"] = str(chunk.metadata["start_index"])  # cast int metadata to str so it stores cleanly in Chroma's SQLite backend
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")
    return chunks
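
# Quick sanity-check sketch (not part of the pipeline): with chunk_size=1000 and
# chunk_overlap=500, consecutive chunks share up to half their text. The toy
# Document below is hypothetical.
def _demo_split_text():
    toy = Document(page_content="lorem ipsum dolor sit amet " * 200, metadata={"source": "demo.txt"})
    for chunk in split_text([toy])[:3]:
        print(chunk.metadata["start_index"], len(chunk.page_content))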

########################################################################################################################################################
####----------------------------------------------------- Creating and Storing Data in the Vector DB  -----------------------------------------------####
########################################################################################################################################################

#def save_to_chroma(chunks: list[Document], name: str, tables: list[Document]):
def save_to_chroma(chunks: list[Document], name: str):
    CHROMA_PATH = f"./VectorDB/chroma_{name}"
    #TABLE_PATH = f"./TableDB/chroma_{name}"
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)
    # if os.path.exists(TABLE_PATH):
    #     shutil.rmtree(TABLE_PATH)

    try:
        # Load the embedding model
        embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
        #embedding_function = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
        # Create the Chroma DB from the document chunks [NOTE: some metadata is converted to string because int and float values otherwise come back as null]
        print("Creating document vector database...")
        db = Chroma.from_documents(
            documents=chunks,
            embedding=embedding_function,
            persist_directory=CHROMA_PATH,
        )
        print("Document database successfully saved.")

        # # Create Chroma DB for tables if available [NOTE: Some of the data is converted to string because int and float show null if added]
        # if tables:
        #     print("Creating table vector database...")
        #     tdb = Chroma.from_documents(
        #         documents=tables,
        #         embedding=embedding_function,
        #         persist_directory=TABLE_PATH,
        #     )
        #     print("Table database successfully saved.")
        # else:
        #     tdb = None

        #return db, tdb
        return db
    
    except Exception as e:
        print("Error while saving to Chroma:", e)
        return None
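
# Read-back sketch (assumption: a DB was previously created by save_to_chroma
# with the same name and the same embedding model). Loads the persisted store
# and runs a similarity search; the query string is a placeholder.
def _query_chroma(name: str, query: str, k: int = 4):
    chroma_path = f"./VectorDB/chroma_{name}"
    embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    db = Chroma(persist_directory=chroma_path, embedding_function=embedding_function)
    return db.similarity_search(query, k=k)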

# def get_unique_sources(chroma_path):
#     db = Chroma(persist_directory=chroma_path)
#     metadata_list = db.get()["metadatas"]
#     unique_sources = {metadata["source"] for metadata in metadata_list if "source" in metadata}
#     return list(unique_sources)

########################################################################################################################################################
####----------------------------------------------------------- Updating Existing Data in Vector DB  -----------------------------------------------####
########################################################################################################################################################

# def add_document_to_existing_db(new_documents: list[Document], db_name: str):
#     CHROMA_PATH = f"./VectorDB/chroma_{db_name}"

#     if not os.path.exists(CHROMA_PATH):
#         print(f"Database '{db_name}' does not exist. Please create it first.")
#         return

#     try:
#         embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
#         #embedding_function = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
#         db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)

#         print("Adding new documents to the existing database...")
#         chunks = split_text(new_documents)
#         db.add_documents(chunks)
#         db.persist()
#         print("New documents added and database updated successfully.")
#     except Exception as e:
#         print("Error while adding documents to existing database:", e)

# def delete_chunks_by_source(chroma_path, source_to_delete):
#     if not os.path.exists(chroma_path):
#         print(f"Database at path '{chroma_path}' does not exist.")
#         return

#     try:
#         #embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
#         embedding_function = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
#         db = Chroma(persist_directory=chroma_path, embedding_function=embedding_function)

#         print(f"Retrieving all metadata to identify chunks with source '{source_to_delete}'...")
#         metadata_list = db.get()["metadatas"]

#         # Identify indices of chunks to delete
#         indices_to_delete = [
#             idx for idx, metadata in enumerate(metadata_list) if metadata.get("source") == source_to_delete
#         ]

#         if not indices_to_delete:
#             print(f"No chunks found with source '{source_to_delete}'.")
#             return

#         print(f"Deleting {len(indices_to_delete)} chunks with source '{source_to_delete}'...")
#         db.delete(indices=indices_to_delete)
#         db.persist()
#         print("Chunks deleted and database updated successfully.")
#     except Exception as e:
#         print(f"Error while deleting chunks by source: {e}")
        
# # update a data store        
# def update_data_store(file_path, db_name):
#     CHROMA_PATH = f"./VectorDB/chroma_{db_name}"
#     print(f"Filepath ===> {file_path}  DB Name ====> {db_name}")

#     try:
#         documents,table_document = load_document(file_path)
#         print("Documents loaded successfully.")
#     except Exception as e:
#         print(f"Error loading documents: {e}")
#         return

#     try:
#         chunks = split_text(documents)
#         print(f"Text split into {len(chunks)} chunks.")
#     except Exception as e:
#         print(f"Error splitting text: {e}")
#         return

#     try:
#         asyncio.run(save_to_chroma(save_to_chroma(chunks, db_name, table_document)))
#         print(f"Data saved to Chroma for database {db_name}.")
#     except Exception as e:
#         print(f"Error saving to Chroma: {e}")
#         return

########################################################################################################################################################
####------------------------------------------------------- Combine Process of Load, Chunk and Store  ----------------------------------------------####
########################################################################################################################################################

def generate_data_store(file_path, db_name):
    CHROMA_PATH = f"./VectorDB/chroma_{db_name}"
    print(f"Filepath ===> {file_path}  DB Name ====> {db_name}")

    try:
        #documents,grouped_documents = load_document(file_path)
        grouped_documents = load_document(file_path)
        print("Documents loaded successfully.")
    except Exception as e:
        print(f"Error loading documents: {e}")
        return

    try:
        chunks = split_text(grouped_documents)
        print(f"Text split into {len(chunks)} chunks.")
    except Exception as e:
        print(f"Error splitting text: {e}")
        return

    try:
        # save_to_chroma is synchronous, so call it directly
        # (wrapping its return value in asyncio.run would raise "a coroutine was expected").
        save_to_chroma(chunks, db_name)
        print(f"Data saved to Chroma for database {db_name}.")
    except Exception as e:
        print(f"Error saving to Chroma: {e}")
        return
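

if __name__ == "__main__":
    # Example invocation (sketch): the DB name "demo" is an assumption; point
    # UPLOAD_FOLDER at a folder of PDFs/DOCX/etc. before running.
    generate_data_store(UPLOAD_FOLDER, "demo")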