File size: 22,166 Bytes
894171e
263ccc7
 
 
894171e
 
 
 
263ccc7
 
 
 
 
 
894171e
8ce0321
bdbd2e3
8ce0321
263ccc7
894171e
 
 
 
f33b573
894171e
 
f33b573
894171e
263ccc7
 
 
 
f33b573
894171e
263ccc7
8ce0321
263ccc7
 
 
 
 
 
 
 
 
 
 
 
f33b573
263ccc7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8ce0321
263ccc7
8ce0321
263ccc7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8ce0321
263ccc7
 
8ce0321
263ccc7
 
 
 
 
 
 
 
 
 
 
 
 
 
894171e
263ccc7
 
 
 
 
 
 
 
 
8ce0321
 
f33b573
8ce0321
263ccc7
 
 
 
 
 
894171e
 
 
8ce0321
 
263ccc7
894171e
263ccc7
 
 
 
894171e
 
 
263ccc7
 
 
894171e
e0efaf2
8ce0321
1ca4c9d
8ce0321
894171e
 
8ce0321
 
263ccc7
 
 
8ce0321
2fa40d0
263ccc7
 
8ce0321
 
 
 
 
 
 
 
 
263ccc7
8ce0321
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e0efaf2
263ccc7
 
 
 
 
 
 
 
6277cb8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
263ccc7
6277cb8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
263ccc7
 
 
 
 
8ce0321
894171e
263ccc7
 
894171e
8ce0321
 
894171e
 
 
 
 
 
8ce0321
894171e
 
 
 
 
 
8ce0321
 
894171e
 
 
 
8ce0321
6277cb8
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
from langchain_community.document_loaders import DirectoryLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_core.documents import Document
from langchain_community.vectorstores import Chroma
import os
import shutil
import asyncio
from unstructured.partition.pdf import partition_pdf
from unstructured.partition.auto import partition
import pytesseract
import os
import re
import uuid
from langchain.schema import Document
from collections import defaultdict
# Tell pytesseract which Tesseract executable to invoke.
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'

# Configurations
UPLOAD_FOLDER = "./uploads"
VECTOR_DB_FOLDER = "./VectorDB"
IMAGE_DB_FOLDER = "./ImageDB"

# Make sure the working directories exist before anything reads or writes them.
for _folder in (UPLOAD_FOLDER, VECTOR_DB_FOLDER, IMAGE_DB_FOLDER):
    os.makedirs(_folder, exist_ok=True)

########################################################################################################################################################
####--------------------------------------------------------------  Document Loader  ---------------------------------------------------------------####
########################################################################################################################################################
# Loaders for loading Document text, tables and images from any file format.

def load_document(data_path):
    """Partition every file under ``data_path`` and build three document lists.

    Each file found by walking ``data_path`` is partitioned with
    ``partition_pdf`` (for ``.pdf``) or the generic ``partition`` (everything
    else). The resulting elements are sorted into tables, images and text by
    looking for keywords in the element's class name. Files whose
    partitioning raises are reported and skipped.

    Args:
        data_path: Directory that is walked recursively for input files.

    Returns:
        A 3-tuple ``(grouped_documents, documents, table_document)``:

        * ``grouped_documents`` - one ``Document`` per successfully
          partitioned file, holding all of its text (including an HTML
          rendering of each table) joined with spaces.
        * ``documents`` - the documents returned by
          ``DirectoryLoader(data_path, glob="*.*")``, each given a fresh id
          and a ``filename`` metadata entry.
        * ``table_document`` - one ``Document`` per detected table whose
          content is the table HTML followed by the text found on the same
          page.
    """
    # Class-name fragments that mark an element as plain text.
    text_keywords = (
        "CompositeElement",
        "Text",
        "NarrativeText",
        "Title",
        "Header",
        "Footer",
        "FigureCaption",
        "ListItem",
        "UncategorizedText",
        "Formula",
        "CodeSnippet",
        "Address",
        "EmailAddress",
        "PageBreak",
    )

    processed_documents = []
    table_document = []

    for root, _, files in os.walk(data_path):
        for file in files:
            file_path = os.path.join(root, file)
            doc_id = str(uuid.uuid4())  # Generate a unique ID for the document

            print(f"Processing document ID: {doc_id}, Path: {file_path}")

            try:
                # Determine the file type based on extension
                filename, file_extension = os.path.splitext(file.lower())
                image_output = f"./ImageDB/{filename}/"
                # PDFs get a dedicated call so images and tables are extracted to disk.
                if file_extension == ".pdf":
                    elements = partition_pdf(
                        filename=file_path,
                        strategy="hi_res",  # Use layout detection
                        infer_table_structure=True,
                        hi_res_model_name="yolox",
                        extract_images_in_pdf=True,
                        extract_image_block_types=["Image", "Table"],
                        extract_image_block_output_dir=image_output,
                        show_progress=True,
                    )
                else:
                    # Default to auto partition if no specific handler is found
                    elements = partition(
                        filename=file_path,
                        strategy="hi_res",
                        infer_table_structure=True,
                        show_progress=True,
                    )
            except Exception as e:
                print(f"Failed to process document {file_path}: {e}")
                continue

            categorized_content = {
                "tables": {"content": [], "Metadata": []},
                "images": {"content": [], "Metadata": []},
                "text": {"content": [], "Metadata": []},
            }
            table_index = 1  # running table number within this file
            for chunk in elements:
                # Safely extract metadata and text
                chunk_type = str(type(chunk))
                chunk_metadata = chunk.metadata.to_dict() if chunk.metadata else {}
                chunk_text = getattr(chunk, "text", None)

                # "Table" also covers "TableChunk", so a single check is enough.
                if "Table" in chunk_type:
                    categorized_content["tables"]["content"].append(chunk_text)
                    categorized_content["tables"]["Metadata"].append(chunk_metadata)

                    # Also feed the table's HTML into the text stream so it is
                    # part of the grouped per-file document.
                    table_html = chunk_metadata.get("text_as_html") or ""
                    table_data = f"Table number {table_index} " + table_html + " "
                    table_index += 1
                    categorized_content["text"]["content"].append(table_data)
                    categorized_content["text"]["Metadata"].append(chunk_metadata)
                elif "Image" in chunk_type:
                    categorized_content["images"]["content"].append(chunk_text)
                    categorized_content["images"]["Metadata"].append(chunk_metadata)
                elif any(keyword in chunk_type for keyword in text_keywords):
                    categorized_content["text"]["content"].append(chunk_text)
                    categorized_content["text"]["Metadata"].append(chunk_metadata)
                # Any other element type is ignored.

            # Append processed document
            processed_documents.append({
                "doc_id": doc_id,
                "source": file_path,
                **categorized_content,
            })

    # Build one Document per table, enriched with the text of the same page.
    for doc in processed_documents:
        source = doc.get("source")
        text_metadata_list = doc.get("text", {}).get("Metadata", [])
        text_content_list = doc.get("text", {}).get("content", [])

        cnt = 1  # table number within this document
        for table_metadata in doc.get("tables", {}).get("Metadata", []):
            page_number = table_metadata.get("page_number")
            page_content = ""

            # Both lists come from the same file, so only the page has to match.
            for text_metadata, text_content in zip(text_metadata_list, text_content_list):
                if text_metadata.get("page_number") == page_number:
                    print(f"Matching text found for source: {source}, page: {page_number}")
                    page_content += f"{text_content} "  # Concatenate text with a space

            table_html = table_metadata.get("text_as_html") or ""
            # Store the combined content and the table number on the metadata.
            table_metadata["page_content"] = f"Table number {cnt} " + table_html + " " + page_content.strip()
            table_metadata["text_as_html"] = table_html  # also kept separately
            table_metadata["Table_number"] = cnt  # used later during retrieval
            cnt += 1

            # Numeric metadata is stored as strings (see the notes where the
            # vector stores are created).
            table_document.append(
                Document(
                    id=str(uuid.uuid4()),
                    page_content=table_metadata.get("page_content", ""),
                    metadata={
                        "source": doc["source"],
                        "text_as_html": table_metadata.get("text_as_html", ""),
                        "filetype": table_metadata.get("filetype", ""),
                        "page_number": str(table_metadata.get("page_number", 0)),  # Default to 0 if missing
                        "image_path": table_metadata.get("image_path", ""),
                        "file_directory": table_metadata.get("file_directory", ""),
                        "filename": table_metadata.get("filename", ""),
                        "Table_number": str(table_metadata.get("Table_number", 0)),  # Default to 0 if missing
                    },
                )
            )

    # Every processed file has its own doc_id, so each one becomes exactly one
    # grouped Document containing all of its text.
    grouped_documents = []
    for doc in processed_documents:
        source = doc.get("source")
        text_content = doc.get("text", {}).get("content", [])
        metadata_list = doc.get("text", {}).get("Metadata", [])

        if metadata_list:
            first_metadata = metadata_list[0]  # Assuming metadata is consistent
            metadata = {
                "source": source,
                "file_directory": first_metadata.get("file_directory"),
                "filename": first_metadata.get("filename"),
            }
        else:
            # A file with no text elements still needs a metadata dict;
            # passing None to Document is not valid.
            metadata = {"source": source}

        grouped_documents.append(
            Document(
                id=doc.get("doc_id"),
                # Skip elements without text so the join cannot fail on None.
                page_content=" ".join(
                    text for text in text_content if text is not None
                ).strip(),
                metadata=metadata,
            )
        )

    # Directory loader for loading the text data only to a specific db
    loader = DirectoryLoader(data_path, glob="*.*")
    documents = loader.load()

    # Give each loaded document an id and record its bare file name.
    for doc in documents:
        doc.id = str(uuid.uuid4())
        path = doc.metadata.get("source") or ""
        # Split on both separators so Windows and POSIX style paths yield the
        # last path component instead of the whole path (or no match at all).
        doc.metadata.update({"filename": re.split(r"[\\/]", path)[-1]})

    return grouped_documents, documents, table_document
    
#grouped_documents = load_document(data_path)
#documents,processed_documents,table_document = load_document(data_path)


########################################################################################################################################################
####-------------------------------------------------------------- Chunking the Text  --------------------------------------------------------------####
########################################################################################################################################################

def split_text(documents: list[Document]):
    """Split documents into overlapping character chunks.

    Uses a ``RecursiveCharacterTextSplitter`` with a chunk size of 2000 and an
    overlap of 600 characters, recording each chunk's start offset.

    Args:
        documents: Documents to split.

    Returns:
        The list of chunk documents; each chunk's ``start_index`` metadata
        value is converted to a string.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=2000,
        chunk_overlap=600,
        length_function=len,
        add_start_index=True,
    )
    chunks = splitter.split_documents(documents)

    # The int metadata is stored as str so it can be kept in sqlite3.
    for chunk in chunks:
        chunk_meta = chunk.metadata
        chunk_meta["start_index"] = str(chunk_meta["start_index"])

    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")
    return chunks

########################################################################################################################################################
####----------------------------------------------------- Creating and Storing Data in Vector DB  --------------------------------------------------####
########################################################################################################################################################

#def save_to_chroma(chunks: list[Document], name: str, tables: list[Document]):
async def save_to_chroma(chunks: list[Document], name: str, tables: list[Document]):
    """Create fresh Chroma stores for text chunks and, if present, tables.

    Any existing store at ``./VectorDB/chroma_{name}`` or
    ``./TableDB/chroma_{name}`` is removed first, so the result contains only
    the supplied documents.

    Args:
        chunks: Text chunks to embed into the document store.
        name: Suffix used to build both store directories.
        tables: Table documents for the table store; when empty no table
            store is created.

    Returns:
        ``(db, tdb)`` on success, where ``tdb`` is ``None`` if there were no
        tables. ``None`` if anything raised while building the stores (the
        error is printed, not re-raised).
    """
    CHROMA_PATH = f"./VectorDB/chroma_{name}"
    TABLE_PATH = f"./TableDB/chroma_{name}"

    # Start from a clean slate: drop whatever was stored under this name.
    for stale_path in (CHROMA_PATH, TABLE_PATH):
        if os.path.exists(stale_path):
            shutil.rmtree(stale_path)

    try:
        # Load the embedding model
        embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2", show_progress=True)

        # Create Chroma DB for documents using from_documents
        # [NOTE: Some of the data is converted to string because int and float show null if added]
        print("Creating document vector database...")
        db = Chroma.from_documents(
            documents=chunks,
            embedding=embedding_function,
            persist_directory=CHROMA_PATH,
        )

        print("Persisting the document database...")
        db.persist()
        print("Document database successfully saved.")

        # Create Chroma DB for tables if available
        if tables:
            print("Creating table vector database...")
            tdb = Chroma.from_documents(
                documents=tables,
                embedding=embedding_function,
                persist_directory=TABLE_PATH,
            )
            print("Persisting the table database...")
            # Persist the table store itself (not the document store again).
            tdb.persist()
            print("Table database successfully saved.")
        else:
            tdb = None

        return db, tdb

    except Exception as e:
        print("Error while saving to Chroma:", e)
        return None

########################################################################################################################################################
####----------------------------------------------------------- Updating Existing Data in Vector DB  -----------------------------------------------####
########################################################################################################################################################

# adding document to Existing db
async def add_document_to_existing_db(new_chunks: list[Document], db_name: str, tables: list[Document]):
    """Append chunks (and tables) to already existing Chroma stores.

    The document store at ``./VectorDB/{db_name}`` must already exist. The
    table store at ``./TableDB/{db_name}`` is created on the fly when tables
    are supplied and it does not exist yet.

    NOTE(review): unlike ``save_to_chroma`` the paths here carry no
    ``chroma_`` prefix, so ``db_name`` presumably has to include it - confirm
    against the callers.

    Args:
        new_chunks: Text chunks to add to the document store.
        db_name: Directory name of the stores under ``./VectorDB`` and
            ``./TableDB``.
        tables: Table documents to add; when empty the table store is left
            untouched.

    Returns:
        ``(db, tdb)`` on success, where ``tdb`` is ``None`` if there were no
        tables. ``None`` if the document store does not exist or if anything
        raised (the error is printed, not re-raised).
    """
    CHROMA_PATH = f"./VectorDB/{db_name}"
    TABLE_PATH = f"./TableDB/{db_name}"
    if not os.path.exists(CHROMA_PATH):
        print(f"Database '{db_name}' does not exist. Please create it first.")
        return
    try:
        # Load the embedding model
        embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2", show_progress=True)

        # Open the existing document store and append the new chunks.
        # [NOTE: Some of the data is converted to string because int and float show null if added]
        print("Creating document vector database...")
        db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)
        print("Persisting the document database...")
        db.add_documents(new_chunks)
        db.persist()
        print("Document database successfully saved.")

        # Add the tables to the table store if available
        if tables:
            print("Creating table vector database...")
            if not os.path.exists(TABLE_PATH):
                print(f"Database '{db_name}' does not exist. Lets create it first.")
                print("Persisting the table database...")
                tdb = Chroma.from_documents(
                    documents=tables,
                    embedding=embedding_function,
                    persist_directory=TABLE_PATH,
                )
            else:
                tdb = Chroma(persist_directory=TABLE_PATH, embedding_function=embedding_function)
                print("Persisting the table database...")
                # Tables belong in the table store, not in the document store.
                tdb.add_documents(tables)
            tdb.persist()
            print("Table database successfully saved.")
        else:
            tdb = None

        return db, tdb

    except Exception as e:
        print("Error while saving to Chroma:", e)
        return None

#delete chunks by logics 
def delete_chunks_by_source(chroma_path, source_to_delete):
    """Delete every stored chunk whose ``source`` metadata equals a given value.

    Args:
        chroma_path: Directory of the persisted Chroma store.
        source_to_delete: Value compared against each chunk's ``source``
            metadata entry.

    Returns:
        None. Progress and errors are reported with ``print``; exceptions
        raised while opening or modifying the store are caught and printed.
    """
    if not os.path.exists(chroma_path):
        print(f"Database at path '{chroma_path}' does not exist.")
        return

    try:
        embedding_function = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
        db = Chroma(persist_directory=chroma_path, embedding_function=embedding_function)

        print(f"Retrieving all metadata to identify chunks with source '{source_to_delete}'...")
        stored = db.get()

        # Chroma deletes by record id, so collect the ids of matching chunks
        # rather than their positions in the result list.
        ids_to_delete = [
            chunk_id
            for chunk_id, metadata in zip(stored["ids"], stored["metadatas"])
            if metadata and metadata.get("source") == source_to_delete
        ]

        if not ids_to_delete:
            print(f"No chunks found with source '{source_to_delete}'.")
            return

        print(f"Deleting {len(ids_to_delete)} chunks with source '{source_to_delete}'...")
        db.delete(ids=ids_to_delete)
        db.persist()
        print("Chunks deleted and database updated successfully.")
    except Exception as e:
        print(f"Error while deleting chunks by source: {e}")
        
########################################################################################################################################################
####-----------------------------------------------Combine Process of upload, Chunk and Store  (FOR NEW DOC)----------------------------------------####
########################################################################################################################################################
        
# update a data store        
async def update_data_store(file_path, db_name):
    """Load, chunk and append the files under ``file_path`` to an existing store.

    Runs ``load_document`` -> ``split_text`` -> ``add_document_to_existing_db``.
    Each stage reports its own failure with ``print`` and stops the pipeline.

    Args:
        file_path: Directory passed to ``load_document``.
        db_name: Store name passed to ``add_document_to_existing_db``.

    Returns:
        None.
    """
    print(f"Filepath ===> {file_path}  DB Name ====> {db_name}")

    try:
        # load_document returns (grouped per-file docs, directory-loader docs, table docs);
        # only the grouped documents and the tables are stored.
        grouped_documents, _directory_documents, table_document = load_document(file_path)
        print("Documents loaded successfully.")
    except Exception as e:
        print(f"Error loading documents: {e}")
        return

    try:
        chunks = split_text(grouped_documents)
        print(f"Text split into {len(chunks)} chunks.")
    except Exception as e:
        print(f"Error splitting text: {e}")
        return

    try:
        result = await add_document_to_existing_db(chunks, db_name, table_document)
    except Exception as e:
        print(f"Error saving to Chroma: {e}")
        return

    # The callee reports its own errors and signals failure by returning None,
    # so success must not be announced unconditionally.
    if result is None:
        print(f"Data was not saved to Chroma for database {db_name}.")
        return
    print(f"Data saved to Chroma for database {db_name}.")

########################################################################################################################################################
####------------------------------------------------------- Combine Process of Load, Chunk and Store  ----------------------------------------------####
########################################################################################################################################################

async def generate_data_store(file_path, db_name):
    """Load, chunk and store the files under ``file_path`` in a new store.

    Runs ``load_document`` -> ``split_text`` -> ``save_to_chroma``. Each stage
    reports its own failure with ``print`` and stops the pipeline.

    Args:
        file_path: Directory passed to ``load_document``.
        db_name: Store name passed to ``save_to_chroma``.

    Returns:
        None.
    """
    print(f"Filepath ===> {file_path}  DB Name ====> {db_name}")

    try:
        # load_document returns (grouped per-file docs, directory-loader docs, table docs);
        # only the grouped documents and the tables are stored.
        grouped_documents, _directory_documents, table_document = load_document(file_path)
        print("Documents loaded successfully.")
    except Exception as e:
        print(f"Error loading documents: {e}")
        return

    try:
        chunks = split_text(grouped_documents)
        print(f"Text split into {len(chunks)} chunks.")
    except Exception as e:
        print(f"Error splitting text: {e}")
        return

    try:
        result = await save_to_chroma(chunks, db_name, table_document)
    except Exception as e:
        print(f"Error saving to Chroma: {e}")
        return

    # save_to_chroma prints its own errors and signals failure by returning
    # None, so success must not be announced unconditionally.
    if result is None:
        print(f"Data was not saved to Chroma for database {db_name}.")
        return
    print(f"Data saved to Chroma for database {db_name}.")
   
########################################################################################################################################################
####-------------------------------------------------------------------- Token counter   -----------------------------------------------------------####
########################################################################################################################################################

def approximate_bpe_token_counter(text):
    """Return a rough token count for ``text``.

    A token is either a run of word characters or a single character that is
    neither a word character nor whitespace; whitespace itself is not counted.
    """
    token_pattern = r"\w+|[^\w\s]"
    matches = re.finditer(token_pattern, text, re.UNICODE)
    return sum(1 for _ in matches)