WebashalarForML committed
Commit 5eb0b04 · verified · 1 Parent(s): 5c6971a

Update N.TXT

Files changed (1):
  1. N.TXT +427 -0

N.TXT CHANGED
@@ -296,4 +296,431 @@ def update_db(db_name):
  if __name__ == "__main__":
      app.run(debug=False, use_reloader=False)
+
+ RETRIVAL PY
+
+ from langchain_community.document_loaders import DirectoryLoader
+ from langchain.embeddings import HuggingFaceEmbeddings
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_core.documents import Document
+ from langchain_community.vectorstores import Chroma
+ import os
+ import shutil
+ import asyncio
+ from unstructured.partition.pdf import partition_pdf
+ from unstructured.partition.auto import partition
+ import pytesseract
+ import re
+ import uuid
+ from collections import defaultdict
+
+ pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'
+
+ # Configurations
+ UPLOAD_FOLDER = "./uploads"
+ VECTOR_DB_FOLDER = "./VectorDB"
+ IMAGE_DB_FOLDER = "./Images"
+ os.makedirs(UPLOAD_FOLDER, exist_ok=True)
+ os.makedirs(VECTOR_DB_FOLDER, exist_ok=True)
+
+ ########################################################################################################################################################
+ ####-------------------------------------------------------------- Document Loader ----------------------------------------------------------------####
+ ########################################################################################################################################################
+ # Loaders for extracting document text, tables and images from any file format.
+ #data_path=r"H:\DEV PATEL\2025\RAG Project\test_data\google data"
+ def load_document(data_path):
+     processed_documents = []
+     element_content = []
+     table_document = []
+     # PDFs get a dedicated partitioning path; everything else goes through the auto partitioner
+     for root, _, files in os.walk(data_path):
+         for file in files:
+             file_path = os.path.join(root, file)
+             doc_id = str(uuid.uuid4())  # Generate a unique ID for the document
+
+             print(f"Processing document ID: {doc_id}, Path: {file_path}")
+
+             try:
+                 # Determine the file type based on extension
+                 filename, file_extension = os.path.splitext(file.lower())
+                 image_output = f"./Images/{filename}/"
+                 # Use a specific partition technique based on file extension
+                 if file_extension == ".pdf":
+                     elements = partition_pdf(
+                         filename=file_path,
+                         strategy="hi_res",  # Use layout detection
+                         infer_table_structure=True,
+                         hi_res_model_name="yolox",
+                         extract_images_in_pdf=True,
+                         extract_image_block_types=["Image", "Table"],
+                         extract_image_block_output_dir=image_output,
+                         show_progress=True,
+                         #chunking_strategy="by_title",
+                     )
+                 else:
+                     # Default to auto partition if no specific handler is found
+                     elements = partition(
+                         filename=file_path,
+                         strategy="hi_res",
+                         infer_table_structure=True,
+                         show_progress=True,
+                         #chunking_strategy="by_title"
+                     )
+             except Exception as e:
+                 print(f"Failed to process document {file_path}: {e}")
+                 continue
+
+             categorized_content = {
+                 "tables": {"content": [], "Metadata": []},
+                 "images": {"content": [], "Metadata": []},
+                 "text": {"content": [], "Metadata": []},
+                 "text2": {"content": [], "Metadata": []}
+             }
+             element_content.append(elements)
+             CNT = 1
+             for chunk in elements:
+                 # Safely extract metadata and text
+                 chunk_type = str(type(chunk))
+                 chunk_metadata = chunk.metadata.to_dict() if chunk.metadata else {}
+                 chunk_text = getattr(chunk, "text", None)
+
+                 # Separate content into categories
+                 #if "Table" in chunk_type:
+                 if any(keyword in chunk_type for keyword in ["Table", "TableChunk"]):
+                     categorized_content["tables"]["content"].append(chunk_text)
+                     categorized_content["tables"]["Metadata"].append(chunk_metadata)
+
+                     # Also push the table (as HTML) into the text stream so it is embedded alongside the page text
+                     TABLE_DATA = f"Table number {CNT} " + chunk_metadata.get("text_as_html", "") + " "
+                     CNT += 1
+                     categorized_content["text"]["content"].append(TABLE_DATA)
+                     categorized_content["text"]["Metadata"].append(chunk_metadata)
+
+                 elif "Image" in chunk_type:
+                     categorized_content["images"]["content"].append(chunk_text)
+                     categorized_content["images"]["Metadata"].append(chunk_metadata)
+                 elif any(
+                     keyword in chunk_type
+                     for keyword in [
+                         "CompositeElement",
+                         "Text",
+                         "NarrativeText",
+                         "Title",
+                         "Header",
+                         "Footer",
+                         "FigureCaption",
+                         "ListItem",
+                         "UncategorizedText",
+                         "Formula",
+                         "CodeSnippet",
+                         "Address",
+                         "EmailAddress",
+                         "PageBreak",
+                     ]
+                 ):
+                     categorized_content["text"]["content"].append(chunk_text)
+                     categorized_content["text"]["Metadata"].append(chunk_metadata)
+
+                 else:
+                     continue
+
+             # Append processed document
+             processed_documents.append({
+                 "doc_id": doc_id,
+                 "source": file_path,
+                 **categorized_content,
+             })
+
+     # Loop over tables and match text from the same document and page (disabled)
+     '''
+     for doc in processed_documents:
+         cnt = 1  # count for storing the number of the table
+         for table_metadata in doc.get("tables", {}).get("Metadata", []):
+             page_number = table_metadata.get("page_number")
+             source = doc.get("source")
+             page_content = ""
+
+             for text_metadata, text_content in zip(
+                 doc.get("text", {}).get("Metadata", []),
+                 doc.get("text", {}).get("content", [])
+             ):
+                 page_number2 = text_metadata.get("page_number")
+                 source2 = doc.get("source")
+
+                 if source == source2 and page_number == page_number2:
+                     print(f"Matching text found for source: {source}, page: {page_number}")
+                     page_content += f"{text_content} "  # Concatenate text with a space
+
+             # Add the matched content to the table metadata
+             table_metadata["page_content"] = f"Table number {cnt} " + table_metadata.get("text_as_html", "") + " " + page_content.strip()  # Strip trailing spaces so the content is clean
+             table_metadata["text_as_html"] = table_metadata.get("text_as_html", "")  # also stored separately
+             table_metadata["Table_number"] = cnt  # the table number is used during retrieval
+             cnt += 1
+
+             # Custom loader which stores each table along with the text on that page specifically,
+             # making a Document for each table with its content
+             unique_id = str(uuid.uuid4())
+             table_document.append(
+                 Document(
+                     id=unique_id,  # Add doc_id directly
+                     page_content=table_metadata.get("page_content", ""),  # Get page_content from metadata, default to empty string if missing
+                     metadata={
+                         "source": doc["source"],
+                         "text_as_html": table_metadata.get("text_as_html", ""),
+                         "filetype": table_metadata.get("filetype", ""),
+                         "page_number": str(table_metadata.get("page_number", 0)),  # Default to 0 if missing
+                         "image_path": table_metadata.get("image_path", ""),
+                         "file_directory": table_metadata.get("file_directory", ""),
+                         "filename": table_metadata.get("filename", ""),
+                         "Table_number": str(table_metadata.get("Table_number", 0))  # Default to 0 if missing
+                     }
+                 )
+             )
+     '''
+
+     # Initialize a structure to group content by doc_id
+     grouped_by_doc_id = defaultdict(lambda: {
+         "text_content": [],
+         "metadata": None,  # Metadata will only be set once per doc_id
+     })
+
+     for doc in processed_documents:
+         doc_id = doc.get("doc_id")
+         source = doc.get("source")
+         text_content = doc.get("text", {}).get("content", [])
+         metadata_list = doc.get("text", {}).get("Metadata", [])
+
+         # Merge text content
+         grouped_by_doc_id[doc_id]["text_content"].extend(text_content)
+
+         # Set metadata (if not already set)
+         if grouped_by_doc_id[doc_id]["metadata"] is None and metadata_list:
+             metadata = metadata_list[0]  # Assuming metadata is consistent within a document
+             grouped_by_doc_id[doc_id]["metadata"] = {
+                 "source": source,
+                 "filetype": metadata.get("filetype"),
+                 "file_directory": metadata.get("file_directory"),
+                 "filename": metadata.get("filename"),
+                 "languages": str(metadata.get("languages")),
+             }
+
+     # Convert grouped content into Document objects
+     grouped_documents = []
+     for doc_id, data in grouped_by_doc_id.items():
+         grouped_documents.append(
+             Document(
+                 id=doc_id,
+                 page_content=" ".join(data["text_content"]).strip(),
+                 metadata=data["metadata"],
+             )
+         )
+
+     # Output the grouped documents
+     for document in grouped_documents:
+         print(document)
+
+     # Directory loader for loading only the text data into a specific db (disabled)
+     '''
+     loader = DirectoryLoader(data_path, glob="*.*")
+     documents = loader.load()
+
+     # update the metadata, adding the filename
+     for doc in documents:
+         unique_id = str(uuid.uuid4())
+         doc.id = unique_id
+         path = doc.metadata.get("source")
+         match = re.search(r'([^\\]+\.[^\\]+)$', path)
+         doc.metadata.update({"filename": match.group(1)})
+     return documents,
+     '''
+     return grouped_documents
+ #documents,processed_documents,table_document = load_document(data_path)
+
+
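A minimal usage sketch for the loader above (not part of the commit), assuming the imports and load_document defined above are in scope; "./uploads/demo_docs" is a hypothetical folder containing a few PDFs. Each returned item is one Document per source file whose page_content is the merged page text with tables injected as HTML strings.

    # Hypothetical smoke test for load_document; the folder path is an example only.
    docs = load_document("./uploads/demo_docs")
    for d in docs:
        print(d.id, d.metadata.get("filename"), f"{len(d.page_content)} chars")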
+ ########################################################################################################################################################
+ ####-------------------------------------------------------------- Chunking the Text --------------------------------------------------------------####
+ ########################################################################################################################################################
+
+ def split_text(documents: list[Document]):
+     text_splitter = RecursiveCharacterTextSplitter(
+         chunk_size=1000,
+         chunk_overlap=500,
+         length_function=len,
+         add_start_index=True,
+     )
+     chunks = text_splitter.split_documents(documents)  # split the documents into chunks
+     for index in chunks:
+         index.metadata["start_index"] = str(index.metadata["start_index"])  # int metadata is converted to str so it can be stored in sqlite3
+     print(f"Split {len(documents)} documents into {len(chunks)} chunks.")
+     return chunks
+
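A quick sketch of what split_text returns (not part of the commit); the Document below is synthetic, and the chunk size and overlap come from the splitter configured above.

    # Synthetic example: one long Document becomes several overlapping ~1000-character chunks.
    sample = Document(page_content="word " * 1000, metadata={"source": "demo.txt", "filename": "demo.txt"})
    chunks = split_text([sample])
    print(len(chunks))                        # more than one chunk for 5000 characters of text
    print(chunks[0].metadata["start_index"])  # "0" - stored as a string, as noted above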
+ ########################################################################################################################################################
+ ####---------------------------------------------------- Creating and Storing Data in Vector DB ---------------------------------------------------####
+ ########################################################################################################################################################
+
+ #def save_to_chroma(chunks: list[Document], name: str, tables: list[Document]):
+ def save_to_chroma(chunks: list[Document], name: str):
+     CHROMA_PATH = f"./VectorDB/chroma_{name}"
+     #TABLE_PATH = f"./TableDB/chroma_{name}"
+     if os.path.exists(CHROMA_PATH):
+         shutil.rmtree(CHROMA_PATH)
+     # if os.path.exists(TABLE_PATH):
+     #     shutil.rmtree(TABLE_PATH)
+
+     try:
+         # Load the embedding model
+         embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
+         #embedding_function = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
+         # Create the Chroma DB for documents using from_documents
+         # [NOTE: some metadata values are converted to strings because int and float values show up as null when added]
+         print("Creating document vector database...")
+         db = Chroma.from_documents(
+             documents=chunks,
+             embedding=embedding_function,
+             persist_directory=CHROMA_PATH,
+         )
+         print("Document database successfully saved.")
+
+         # # Create a Chroma DB for tables if available
+         # if tables:
+         #     print("Creating table vector database...")
+         #     tdb = Chroma.from_documents(
+         #         documents=tables,
+         #         embedding=embedding_function,
+         #         persist_directory=TABLE_PATH,
+         #     )
+         #     print("Table database successfully saved.")
+         # else:
+         #     tdb = None
+
+         #return db, tdb
+         return db
+
+     except Exception as e:
+         print("Error while saving to Chroma:", e)
+         return None
+
+ # def get_unique_sources(chroma_path):
+ #     db = Chroma(persist_directory=chroma_path)
+ #     metadata_list = db.get()["metadatas"]
+ #     unique_sources = {metadata["source"] for metadata in metadata_list if "source" in metadata}
+ #     return list(unique_sources)
+
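Once save_to_chroma has persisted a store, it can be reopened and queried; a hedged sketch, assuming a database was created under the name "demo" with the same all-MiniLM-L6-v2 embeddings (the name and the query are examples only, not part of the commit).

    # Reopen the persisted store and run a similarity search.
    from langchain.embeddings import HuggingFaceEmbeddings
    from langchain_community.vectorstores import Chroma

    embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    db = Chroma(persist_directory="./VectorDB/chroma_demo", embedding_function=embedding_function)
    results = db.similarity_search("What does the report say about revenue?", k=4)
    for doc in results:
        print(doc.metadata.get("source"), doc.page_content[:120])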
+ ########################################################################################################################################################
+ ####----------------------------------------------------------- Updating Existing Data in Vector DB -----------------------------------------------####
+ ########################################################################################################################################################
+
+ # def add_document_to_existing_db(new_documents: list[Document], db_name: str):
+ #     CHROMA_PATH = f"./VectorDB/chroma_{db_name}"
+
+ #     if not os.path.exists(CHROMA_PATH):
+ #         print(f"Database '{db_name}' does not exist. Please create it first.")
+ #         return
+
+ #     try:
+ #         embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
+ #         #embedding_function = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
+ #         db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)
+
+ #         print("Adding new documents to the existing database...")
+ #         chunks = split_text(new_documents)
+ #         db.add_documents(chunks)
+ #         db.persist()
+ #         print("New documents added and database updated successfully.")
+ #     except Exception as e:
+ #         print("Error while adding documents to existing database:", e)
+
+ # def delete_chunks_by_source(chroma_path, source_to_delete):
+ #     if not os.path.exists(chroma_path):
+ #         print(f"Database at path '{chroma_path}' does not exist.")
+ #         return
+
+ #     try:
+ #         #embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
+ #         embedding_function = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
+ #         db = Chroma(persist_directory=chroma_path, embedding_function=embedding_function)
+
+ #         print(f"Retrieving all metadata to identify chunks with source '{source_to_delete}'...")
+ #         metadata_list = db.get()["metadatas"]
+
+ #         # Identify indices of chunks to delete
+ #         indices_to_delete = [
+ #             idx for idx, metadata in enumerate(metadata_list) if metadata.get("source") == source_to_delete
+ #         ]
+
+ #         if not indices_to_delete:
+ #             print(f"No chunks found with source '{source_to_delete}'.")
+ #             return
+
+ #         print(f"Deleting {len(indices_to_delete)} chunks with source '{source_to_delete}'...")
+ #         db.delete(indices=indices_to_delete)
+ #         db.persist()
+ #         print("Chunks deleted and database updated successfully.")
+ #     except Exception as e:
+ #         print(f"Error while deleting chunks by source: {e}")
+
+ # # update a data store
+ # def update_data_store(file_path, db_name):
+ #     CHROMA_PATH = f"./VectorDB/chroma_{db_name}"
+ #     print(f"Filepath ===> {file_path} DB Name ====> {db_name}")
+
+ #     try:
+ #         documents, table_document = load_document(file_path)
+ #         print("Documents loaded successfully.")
+ #     except Exception as e:
+ #         print(f"Error loading documents: {e}")
+ #         return
+
+ #     try:
+ #         chunks = split_text(documents)
+ #         print(f"Text split into {len(chunks)} chunks.")
+ #     except Exception as e:
+ #         print(f"Error splitting text: {e}")
+ #         return
+
+ #     try:
+ #         asyncio.run(save_to_chroma(save_to_chroma(chunks, db_name, table_document)))
+ #         print(f"Data saved to Chroma for database {db_name}.")
+ #     except Exception as e:
+ #         print(f"Error saving to Chroma: {e}")
+ #         return
+
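The update helpers above are left commented out; for reference, a minimal sketch of the add-only path, assuming the target database already exists and was built with the same embedding model. The helper name and arguments are hypothetical and not part of the commit.

    def add_documents_to_existing_db(new_docs_path, db_name):
        # Hypothetical helper mirroring the disabled add_document_to_existing_db above.
        chroma_path = f"./VectorDB/chroma_{db_name}"
        embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
        db = Chroma(persist_directory=chroma_path, embedding_function=embedding_function)
        chunks = split_text(load_document(new_docs_path))
        db.add_documents(chunks)  # embeds and appends the new chunks to the existing collection
        print(f"Added {len(chunks)} chunks to '{db_name}'.")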
+ ########################################################################################################################################################
+ ####------------------------------------------------------- Combine Process of Load, Chunk and Store ----------------------------------------------####
+ ########################################################################################################################################################
+
+ def generate_data_store(file_path, db_name):
+     CHROMA_PATH = f"./VectorDB/chroma_{db_name}"
+     print(f"Filepath ===> {file_path} DB Name ====> {db_name}")
+
+     try:
+         #documents, grouped_documents = load_document(file_path)
+         grouped_documents = load_document(file_path)
+         print("Documents loaded successfully.")
+     except Exception as e:
+         print(f"Error loading documents: {e}")
+         return
+
+     try:
+         chunks = split_text(grouped_documents)
+         print(f"Text split into {len(chunks)} chunks.")
+     except Exception as e:
+         print(f"Error splitting text: {e}")
+         return
+
+     try:
+         #asyncio.run(save_to_chroma(save_to_chroma(chunks, db_name, table_document)))
+         # save_to_chroma is a regular (non-async) function, so call it directly
+         save_to_chroma(chunks, db_name)
+         print(f"Data saved to Chroma for database {db_name}.")
+     except Exception as e:
+         print(f"Error saving to Chroma: {e}")
+         return
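End-to-end usage sketch (not part of the commit); the upload folder and database name are placeholders.

    if __name__ == "__main__":
        # Build a vector store named "demo" from everything under ./uploads/demo_docs.
        generate_data_store("./uploads/demo_docs", "demo")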