	Update N.TXT
N.TXT CHANGED
@@ -296,4 +296,431 @@ def update_db(db_name):

if __name__ == "__main__":
    app.run(debug=False, use_reloader=False)
RETRIEVAL.PY

from langchain_community.document_loaders import DirectoryLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_community.vectorstores import Chroma
from unstructured.partition.pdf import partition_pdf
from unstructured.partition.auto import partition
from collections import defaultdict
import pytesseract
import os
import re
import shutil
import uuid
import asyncio

pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'

# Configuration
UPLOAD_FOLDER = "./uploads"
VECTOR_DB_FOLDER = "./VectorDB"
IMAGE_DB_FOLDER = "./Images"
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
os.makedirs(VECTOR_DB_FOLDER, exist_ok=True)

########################################################################################################################################################
####--------------------------------------------------------------  Document Loader  ---------------------------------------------------------------####
########################################################################################################################################################
# Loader for extracting document text, tables and images from any file format.
#data_path=r"H:\DEV PATEL\2025\RAG Project\test_data\google data"
def load_document(data_path):
    processed_documents = []
    element_content = []
    table_document = []
    # a different partitioning path is used for PDFs
    for root, _, files in os.walk(data_path):
        for file in files:
            file_path = os.path.join(root, file)
            doc_id = str(uuid.uuid4())  # Generate a unique ID for the document

            print(f"Processing document ID: {doc_id}, Path: {file_path}")

            try:
                # Determine the file type based on extension
                filename, file_extension = os.path.splitext(file.lower())
                image_output = f"./Images/{filename}/"
                # Use specific partition techniques based on file extension
                if file_extension == ".pdf":
                    elements = partition_pdf(
                        filename=file_path,
                        strategy="hi_res",  # Use layout detection
                        infer_table_structure=True,
                        hi_res_model_name="yolox",
                        extract_images_in_pdf=True,
                        extract_image_block_types=["Image", "Table"],
                        extract_image_block_output_dir=image_output,
                        show_progress=True,
                        #chunking_strategy="by_title",
                    )
                else:
                    # Default to auto partition if no specific handler is found
                    elements = partition(
                        filename=file_path,
                        strategy="hi_res",
                        infer_table_structure=True,
                        show_progress=True,
                        #chunking_strategy="by_title"
                    )
            except Exception as e:
                print(f"Failed to process document {file_path}: {e}")
                continue
            categorized_content = {
                "tables": {"content": [], "Metadata": []},
                "images": {"content": [], "Metadata": []},
                "text": {"content": [], "Metadata": []},
                "text2": {"content": [], "Metadata": []}
            }
            element_content.append(elements)
            CNT = 1
            for chunk in elements:
                # Safely extract metadata and text
                chunk_type = str(type(chunk))
                chunk_metadata = chunk.metadata.to_dict() if chunk.metadata else {}
                chunk_text = getattr(chunk, "text", None)

                # Separate content into categories
                #if "Table" in chunk_type:
                if any(
                    keyword in chunk_type
                    for keyword in ["Table", "TableChunk"]
                ):
                    categorized_content["tables"]["content"].append(chunk_text)
                    categorized_content["tables"]["Metadata"].append(chunk_metadata)

                    #test1
                    TABLE_DATA = f"Table number {CNT} " + chunk_metadata.get("text_as_html", "") + " "
                    CNT += 1
                    categorized_content["text"]["content"].append(TABLE_DATA)
                    categorized_content["text"]["Metadata"].append(chunk_metadata)

                elif "Image" in chunk_type:
                    categorized_content["images"]["content"].append(chunk_text)
                    categorized_content["images"]["Metadata"].append(chunk_metadata)
                elif any(
                    keyword in chunk_type
                    for keyword in [
                        "CompositeElement",
                        "Text",
                        "NarrativeText",
                        "Title",
                        "Header",
                        "Footer",
                        "FigureCaption",
                        "ListItem",
                        "UncategorizedText",
                        "Formula",
                        "CodeSnippet",
                        "Address",
                        "EmailAddress",
                        "PageBreak",
                    ]
                ):
                    categorized_content["text"]["content"].append(chunk_text)
                    categorized_content["text"]["Metadata"].append(chunk_metadata)

                else:
                    continue
            # Append processed document
            processed_documents.append({
                "doc_id": doc_id,
                "source": file_path,
                **categorized_content,
            })

    # Loop over tables and match text from the same document and page

    '''
    for doc in processed_documents:
        cnt = 1  # counter for numbering the tables
        for table_metadata in doc.get("tables", {}).get("Metadata", []):
            page_number = table_metadata.get("page_number")
            source = doc.get("source")
            page_content = ""

            for text_metadata, text_content in zip(
                doc.get("text", {}).get("Metadata", []),
                doc.get("text", {}).get("content", [])
            ):
                page_number2 = text_metadata.get("page_number")
                source2 = doc.get("source")

                if source == source2 and page_number == page_number2:
                    print(f"Matching text found for source: {source}, page: {page_number}")
                    page_content += f"{text_content} "  # Concatenate text with a space

            # Add the matched content to the table metadata
            table_metadata["page_content"] = f"Table number {cnt} " + table_metadata.get("text_as_html", "") + " " + page_content.strip()  # strip trailing spaces so the content is clean
            table_metadata["text_as_html"] = table_metadata.get("text_as_html", "")  # the HTML is also stored separately
            table_metadata["Table_number"] = cnt  # the table number is used later during retrieval
            cnt += 1

            # Custom loader step: store each table together with the text on its page,
            # building one Document per table
            unique_id = str(uuid.uuid4())
            table_document.append(
                Document(
                    id=unique_id,  # Add doc_id directly
                    page_content=table_metadata.get("page_content", ""),  # Get page_content from metadata, default to empty string if missing
                    metadata={
                        "source": doc["source"],
                        "text_as_html": table_metadata.get("text_as_html", ""),
                        "filetype": table_metadata.get("filetype", ""),
                        "page_number": str(table_metadata.get("page_number", 0)),  # Default to 0 if missing
                        "image_path": table_metadata.get("image_path", ""),
                        "file_directory": table_metadata.get("file_directory", ""),
                        "filename": table_metadata.get("filename", ""),
                        "Table_number": str(table_metadata.get("Table_number", 0))  # Default to 0 if missing
                    }
                )
            )
    '''

    # Initialize a structure to group content by doc_id
    grouped_by_doc_id = defaultdict(lambda: {
        "text_content": [],
        "metadata": None,  # Metadata will only be set once per doc_id
    })

    for doc in processed_documents:
        doc_id = doc.get("doc_id")
        source = doc.get("source")
        text_content = doc.get("text", {}).get("content", [])
        metadata_list = doc.get("text", {}).get("Metadata", [])

        # Merge text content
        grouped_by_doc_id[doc_id]["text_content"].extend(text_content)

        # Set metadata (if not already set)
        if grouped_by_doc_id[doc_id]["metadata"] is None and metadata_list:
            metadata = metadata_list[0]  # Assuming metadata is consistent
            grouped_by_doc_id[doc_id]["metadata"] = {
                "source": source,
                "filetype": metadata.get("filetype"),
                "file_directory": metadata.get("file_directory"),
                "filename": metadata.get("filename"),
                "languages": str(metadata.get("languages")),
            }

    # Convert grouped content into Document objects
    grouped_documents = []
    for doc_id, data in grouped_by_doc_id.items():
        grouped_documents.append(
            Document(
                id=doc_id,
                page_content=" ".join(data["text_content"]).strip(),
                metadata=data["metadata"],
            )
        )

    # Output the grouped documents
    for document in grouped_documents:
        print(document)

    # Directory loader for loading the text data only into a specific db
    '''
    loader = DirectoryLoader(data_path, glob="*.*")
    documents = loader.load()

    # update the metadata, adding the filename to it
    for doc in documents:
        unique_id = str(uuid.uuid4())
        doc.id = unique_id
        path = doc.metadata.get("source")
        match = re.search(r'([^\\]+\.[^\\]+)$', path)
        doc.metadata.update({"filename": match.group(1)})
    return documents
    '''
    return grouped_documents
#documents,processed_documents,table_document = load_document(data_path)

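# A minimal usage sketch (not part of the original script; the folder path is an assumption):
# load_document returns one grouped Document per input file, which can be inspected before chunking.
#
#   docs = load_document("./uploads/sample_docs")
#   for d in docs:
#       print(d.metadata.get("filename"), len(d.page_content))
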
########################################################################################################################################################
####-------------------------------------------------------------- Chunking the Text  --------------------------------------------------------------####
########################################################################################################################################################

def split_text(documents: list[Document]):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=500,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)  # split the documents into chunks
    for index in chunks:
        index.metadata["start_index"] = str(index.metadata["start_index"])  # the int metadata is converted to str so it can be stored in sqlite3
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")
    return chunks

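# Illustrative note (not in the original script): with chunk_size=1000 and chunk_overlap=500,
# consecutive chunks share roughly half their characters, so a 2,000-character document
# typically yields about three overlapping chunks, depending on where separators fall.
#
#   chunks = split_text(grouped_documents)
#   print(chunks[0].metadata["start_index"], chunks[0].page_content[:80])
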
########################################################################################################################################################
####---------------------------------------------------- Creating and Storing Data in Vector DB  ----------------------------------------------------####
########################################################################################################################################################

#def save_to_chroma(chunks: list[Document], name: str, tables: list[Document]):
def save_to_chroma(chunks: list[Document], name: str):
    CHROMA_PATH = f"./VectorDB/chroma_{name}"
    #TABLE_PATH = f"./TableDB/chroma_{name}"
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)
    # if os.path.exists(TABLE_PATH):
    #     shutil.rmtree(TABLE_PATH)

    try:
        # Load the embedding model
        embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
        #embedding_function = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
        # Create Chroma DB for documents using from_documents [NOTE: some metadata is converted to string because int and float values show up as null if added directly]
        print("Creating document vector database...")
        db = Chroma.from_documents(
            documents=chunks,
            embedding=embedding_function,
            persist_directory=CHROMA_PATH,
        )
        print("Document database successfully saved.")

        # # Create Chroma DB for tables if available [NOTE: some metadata is converted to string because int and float values show up as null if added directly]
        # if tables:
        #     print("Creating table vector database...")
        #     tdb = Chroma.from_documents(
        #         documents=tables,
        #         embedding=embedding_function,
        #         persist_directory=TABLE_PATH,
        #     )
        #     print("Table database successfully saved.")
        # else:
        #     tdb = None

        #return db, tdb
        return db

    except Exception as e:
        print("Error while saving to Chroma:", e)
        return None

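# A minimal retrieval sketch (not part of the original script), assuming a database created by
# save_to_chroma above and the same embedding model; the db name and query are placeholders.
#
#   embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
#   db = Chroma(persist_directory="./VectorDB/chroma_mydb", embedding_function=embedding_function)
#   results = db.similarity_search("What does table 1 summarise?", k=4)
#   for r in results:
#       print(r.metadata.get("filename"), r.page_content[:100])
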
# def get_unique_sources(chroma_path):
#     db = Chroma(persist_directory=chroma_path)
#     metadata_list = db.get()["metadatas"]
#     unique_sources = {metadata["source"] for metadata in metadata_list if "source" in metadata}
#     return list(unique_sources)

########################################################################################################################################################
####----------------------------------------------------------- Updating Existing Data in Vector DB  -----------------------------------------------####
########################################################################################################################################################

# def add_document_to_existing_db(new_documents: list[Document], db_name: str):
#     CHROMA_PATH = f"./VectorDB/chroma_{db_name}"

#     if not os.path.exists(CHROMA_PATH):
#         print(f"Database '{db_name}' does not exist. Please create it first.")
#         return

#     try:
#         embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
#         #embedding_function = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
#         db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)

#         print("Adding new documents to the existing database...")
#         chunks = split_text(new_documents)
#         db.add_documents(chunks)
#         db.persist()
#         print("New documents added and database updated successfully.")
#     except Exception as e:
#         print("Error while adding documents to existing database:", e)

# def delete_chunks_by_source(chroma_path, source_to_delete):
#     if not os.path.exists(chroma_path):
#         print(f"Database at path '{chroma_path}' does not exist.")
#         return

#     try:
#         #embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
#         embedding_function = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
#         db = Chroma(persist_directory=chroma_path, embedding_function=embedding_function)

#         print(f"Retrieving all metadata to identify chunks with source '{source_to_delete}'...")
#         metadata_list = db.get()["metadatas"]

#         # Identify indices of chunks to delete
#         indices_to_delete = [
#             idx for idx, metadata in enumerate(metadata_list) if metadata.get("source") == source_to_delete
#         ]

#         if not indices_to_delete:
#             print(f"No chunks found with source '{source_to_delete}'.")
#             return

#         print(f"Deleting {len(indices_to_delete)} chunks with source '{source_to_delete}'...")
#         db.delete(indices=indices_to_delete)
#         db.persist()
#         print("Chunks deleted and database updated successfully.")
#     except Exception as e:
#         print(f"Error while deleting chunks by source: {e}")

# # update a data store
# def update_data_store(file_path, db_name):
#     CHROMA_PATH = f"./VectorDB/chroma_{db_name}"
#     print(f"Filepath ===> {file_path}  DB Name ====> {db_name}")

#     try:
#         documents,table_document = load_document(file_path)
#         print("Documents loaded successfully.")
#     except Exception as e:
#         print(f"Error loading documents: {e}")
#         return

#     try:
#         chunks = split_text(documents)
#         print(f"Text split into {len(chunks)} chunks.")
#     except Exception as e:
#         print(f"Error splitting text: {e}")
#         return

#     try:
#         asyncio.run(save_to_chroma(save_to_chroma(chunks, db_name, table_document)))
#         print(f"Data saved to Chroma for database {db_name}.")
#     except Exception as e:
#         print(f"Error saving to Chroma: {e}")
#         return

########################################################################################################################################################
####------------------------------------------------------- Combine Process of Load, Chunk and Store  ----------------------------------------------####
########################################################################################################################################################

def generate_data_store(file_path, db_name):
    CHROMA_PATH = f"./VectorDB/chroma_{db_name}"
    print(f"Filepath ===> {file_path}  DB Name ====> {db_name}")

    try:
        #documents,grouped_documents = load_document(file_path)
        grouped_documents = load_document(file_path)
        print("Documents loaded successfully.")
    except Exception as e:
        print(f"Error loading documents: {e}")
        return

    try:
        chunks = split_text(grouped_documents)
        print(f"Text split into {len(chunks)} chunks.")
    except Exception as e:
        print(f"Error splitting text: {e}")
        return

    try:
        #asyncio.run(save_to_chroma(save_to_chroma(chunks, db_name, table_document)))
        save_to_chroma(chunks, db_name)  # save_to_chroma is synchronous, so it is called directly rather than via asyncio.run
        print(f"Data saved to Chroma for database {db_name}.")
    except Exception as e:
        print(f"Error saving to Chroma: {e}")
        return
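
# A minimal entry-point sketch (not in the original file); the folder path and database name
# below are assumptions for illustration only.
if __name__ == "__main__":
    # Build a vector store named "chroma_demo" from everything under ./uploads/demo
    generate_data_store("./uploads/demo", "demo")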