seanpedrickcase committed
Commit bafcf39 · 1 Parent(s): 3d18b9d

Fixed deprecated GitHub workflow functions. Applied linter and formatter to code throughout. Added tests for GUI load.

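Summary of the workflow fixes in this commit: deprecated v3 actions are bumped to v4 (actions/cache, actions/upload-artifact), and the retired libgl1-mesa-glx apt package is swapped for libgl1-mesa-dri. As an illustrative sketch only, adapted from the ci.yml hunk further below rather than a complete workflow file, an updated step now looks like:

    - name: Cache pip dependencies
      uses: actions/cache@v4   # previously actions/cache@v3 (deprecated)
      with:
        path: ~/.cache/pip
        key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }}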
.github/scripts/setup_test_data.py CHANGED
@@ -5,170 +5,199 @@ Creates dummy test files when example data is not available.
5
  """
6
 
7
  import os
8
- import sys
9
  import pandas as pd
10
- from pathlib import Path
11
 
12
  def create_directories():
13
  """Create necessary directories."""
14
- dirs = [
15
- 'example_data',
16
- 'example_data/example_outputs'
17
- ]
18
-
19
  for dir_path in dirs:
20
  os.makedirs(dir_path, exist_ok=True)
21
  print(f"Created directory: {dir_path}")
22
 
 
23
  def create_dummy_pdf():
24
  """Create a dummy PDF for testing."""
25
  try:
26
- from reportlab.pdfgen import canvas
27
  from reportlab.lib.pagesizes import letter
28
-
29
- pdf_path = 'example_data/example_of_emails_sent_to_a_professor_before_applying.pdf'
30
-
 
 
 
31
  c = canvas.Canvas(pdf_path, pagesize=letter)
32
- c.drawString(100, 750, 'This is a test document for redaction testing.')
33
- c.drawString(100, 700, 'Email: [email protected]')
34
- c.drawString(100, 650, 'Phone: 123-456-7890')
35
- c.drawString(100, 600, 'Name: John Doe')
36
- c.drawString(100, 550, 'Address: 123 Test Street, Test City, TC 12345')
37
  c.showPage()
38
-
39
  # Add second page
40
- c.drawString(100, 750, 'Second page content')
41
- c.drawString(100, 700, 'More test data: [email protected]')
42
- c.drawString(100, 650, 'Another phone: 987-654-3210')
43
  c.save()
44
-
45
  print(f"Created dummy PDF: {pdf_path}")
46
-
47
  except ImportError:
48
  print("ReportLab not available, skipping PDF creation")
49
  # Create a simple text file instead
50
- with open('example_data/example_of_emails_sent_to_a_professor_before_applying.pdf', 'w') as f:
 
 
 
51
  f.write("This is a dummy PDF file for testing")
52
  print("Created dummy text file instead of PDF")
53
 
 
54
  def create_dummy_csv():
55
  """Create dummy CSV files for testing."""
56
  # Main CSV
57
  csv_data = {
58
- 'Case Note': [
59
- 'Client visited for consultation regarding housing issues',
60
- 'Follow-up appointment scheduled for next week',
61
- 'Documentation submitted for review'
62
  ],
63
- 'Client': ['John Smith', 'Jane Doe', 'Bob Johnson'],
64
- 'Date': ['2024-01-15', '2024-01-16', '2024-01-17']
65
  }
66
  df = pd.DataFrame(csv_data)
67
- df.to_csv('example_data/combined_case_notes.csv', index=False)
68
  print("Created dummy CSV: example_data/combined_case_notes.csv")
69
-
70
  # Lambeth CSV
71
  lambeth_data = {
72
- 'text': [
73
- 'Lambeth 2030 vision document content',
74
- 'Our Future Our Lambeth strategic plan',
75
- 'Community engagement and development'
76
  ],
77
- 'page': [1, 2, 3]
78
  }
79
  df_lambeth = pd.DataFrame(lambeth_data)
80
- df_lambeth.to_csv('example_data/Lambeth_2030-Our_Future_Our_Lambeth.pdf.csv', index=False)
 
 
81
  print("Created dummy CSV: example_data/Lambeth_2030-Our_Future_Our_Lambeth.pdf.csv")
82
 
 
83
  def create_dummy_word_doc():
84
  """Create dummy Word document."""
85
  try:
86
  from docx import Document
87
-
88
  doc = Document()
89
- doc.add_heading('Test Document for Redaction', 0)
90
- doc.add_paragraph('This is a test document for redaction testing.')
91
- doc.add_paragraph('Contact Information:')
92
- doc.add_paragraph('Email: [email protected]')
93
- doc.add_paragraph('Phone: 123-456-7890')
94
- doc.add_paragraph('Name: John Doe')
95
- doc.add_paragraph('Address: 123 Test Street, Test City, TC 12345')
96
-
97
- doc.save('example_data/Bold minimalist professional cover letter.docx')
98
  print("Created dummy Word document")
99
-
100
  except ImportError:
101
  print("python-docx not available, skipping Word document creation")
102
 
 
103
  def create_allow_deny_lists():
104
  """Create dummy allow/deny lists."""
105
  # Allow lists
106
- allow_data = {'word': ['test', 'example', 'document']}
107
- pd.DataFrame(allow_data).to_csv('example_data/test_allow_list_graduate.csv', index=False)
108
- pd.DataFrame(allow_data).to_csv('example_data/test_allow_list_partnership.csv', index=False)
 
 
 
 
109
  print("Created allow lists")
110
-
111
  # Deny lists
112
- deny_data = {'word': ['sensitive', 'confidential', 'private']}
113
- pd.DataFrame(deny_data).to_csv('example_data/partnership_toolkit_redact_custom_deny_list.csv', index=False)
114
- pd.DataFrame(deny_data).to_csv('example_data/Partnership-Agreement-Toolkit_test_deny_list_para_single_spell.csv', index=False)
 
 
 
 
 
115
  print("Created deny lists")
116
-
117
  # Whole page redaction list
118
- page_data = {'page': [1, 2]}
119
- pd.DataFrame(page_data).to_csv('example_data/partnership_toolkit_redact_some_pages.csv', index=False)
 
 
120
  print("Created whole page redaction list")
121
 
 
122
  def create_ocr_output():
123
  """Create dummy OCR output CSV."""
124
  ocr_data = {
125
- 'file_name': ['test.pdf', 'test.pdf', 'test.pdf'],
126
- 'page_number': [1, 2, 3],
127
- 'text': [
128
- 'This is page 1 content with some text',
129
- 'This is page 2 content with different text',
130
- 'This is page 3 content with more text'
131
  ],
132
- 'confidence': [0.95, 0.92, 0.88]
133
  }
134
  df = pd.DataFrame(ocr_data)
135
- df.to_csv('example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv', index=False)
 
 
 
136
  print("Created dummy OCR output CSV")
137
 
 
138
  def create_dummy_image():
139
  """Create dummy image for testing."""
140
  try:
141
  from PIL import Image, ImageDraw, ImageFont
142
-
143
- img = Image.new('RGB', (800, 600), color='white')
144
  draw = ImageDraw.Draw(img)
145
-
146
  # Try to use a system font
147
  try:
148
- font = ImageFont.truetype('/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf', 20)
149
- except:
 
 
 
150
  try:
151
- font = ImageFont.truetype('/System/Library/Fonts/Arial.ttf', 20)
152
- except:
 
153
  font = ImageFont.load_default()
154
-
155
  # Add text to image
156
- draw.text((50, 50), 'Test Document for Redaction', fill='black', font=font)
157
- draw.text((50, 100), 'Email: [email protected]', fill='black', font=font)
158
- draw.text((50, 150), 'Phone: 123-456-7890', fill='black', font=font)
159
- draw.text((50, 200), 'Name: John Doe', fill='black', font=font)
160
- draw.text((50, 250), 'Address: 123 Test Street', fill='black', font=font)
161
-
162
- img.save('example_data/example_complaint_letter.jpg')
163
  print("Created dummy image")
164
-
165
  except ImportError:
166
  print("PIL not available, skipping image creation")
167
 
 
168
  def main():
169
  """Main setup function."""
170
  print("Setting up test data for GitHub Actions...")
171
-
172
  create_directories()
173
  create_dummy_pdf()
174
  create_dummy_csv()
@@ -176,12 +205,13 @@ def main():
176
  create_allow_deny_lists()
177
  create_ocr_output()
178
  create_dummy_image()
179
-
180
  print("\nTest data setup complete!")
181
  print("Created files:")
182
- for root, dirs, files in os.walk('example_data'):
183
  for file in files:
184
  print(f" {os.path.join(root, file)}")
185
 
 
186
  if __name__ == "__main__":
187
  main()
 
5
  """
6
 
7
  import os
8
+
9
  import pandas as pd
10
+
11
 
12
  def create_directories():
13
  """Create necessary directories."""
14
+ dirs = ["example_data", "example_data/example_outputs"]
15
+
 
 
 
16
  for dir_path in dirs:
17
  os.makedirs(dir_path, exist_ok=True)
18
  print(f"Created directory: {dir_path}")
19
 
20
+
21
  def create_dummy_pdf():
22
  """Create a dummy PDF for testing."""
23
  try:
 
24
  from reportlab.lib.pagesizes import letter
25
+ from reportlab.pdfgen import canvas
26
+
27
+ pdf_path = (
28
+ "example_data/example_of_emails_sent_to_a_professor_before_applying.pdf"
29
+ )
30
+
31
  c = canvas.Canvas(pdf_path, pagesize=letter)
32
+ c.drawString(100, 750, "This is a test document for redaction testing.")
33
+ c.drawString(100, 700, "Email: [email protected]")
34
+ c.drawString(100, 650, "Phone: 123-456-7890")
35
+ c.drawString(100, 600, "Name: John Doe")
36
+ c.drawString(100, 550, "Address: 123 Test Street, Test City, TC 12345")
37
  c.showPage()
38
+
39
  # Add second page
40
+ c.drawString(100, 750, "Second page content")
41
+ c.drawString(100, 700, "More test data: [email protected]")
42
+ c.drawString(100, 650, "Another phone: 987-654-3210")
43
  c.save()
44
+
45
  print(f"Created dummy PDF: {pdf_path}")
46
+
47
  except ImportError:
48
  print("ReportLab not available, skipping PDF creation")
49
  # Create a simple text file instead
50
+ with open(
51
+ "example_data/example_of_emails_sent_to_a_professor_before_applying.pdf",
52
+ "w",
53
+ ) as f:
54
  f.write("This is a dummy PDF file for testing")
55
  print("Created dummy text file instead of PDF")
56
 
57
+
58
  def create_dummy_csv():
59
  """Create dummy CSV files for testing."""
60
  # Main CSV
61
  csv_data = {
62
+ "Case Note": [
63
+ "Client visited for consultation regarding housing issues",
64
+ "Follow-up appointment scheduled for next week",
65
+ "Documentation submitted for review",
66
  ],
67
+ "Client": ["John Smith", "Jane Doe", "Bob Johnson"],
68
+ "Date": ["2024-01-15", "2024-01-16", "2024-01-17"],
69
  }
70
  df = pd.DataFrame(csv_data)
71
+ df.to_csv("example_data/combined_case_notes.csv", index=False)
72
  print("Created dummy CSV: example_data/combined_case_notes.csv")
73
+
74
  # Lambeth CSV
75
  lambeth_data = {
76
+ "text": [
77
+ "Lambeth 2030 vision document content",
78
+ "Our Future Our Lambeth strategic plan",
79
+ "Community engagement and development",
80
  ],
81
+ "page": [1, 2, 3],
82
  }
83
  df_lambeth = pd.DataFrame(lambeth_data)
84
+ df_lambeth.to_csv(
85
+ "example_data/Lambeth_2030-Our_Future_Our_Lambeth.pdf.csv", index=False
86
+ )
87
  print("Created dummy CSV: example_data/Lambeth_2030-Our_Future_Our_Lambeth.pdf.csv")
88
 
89
+
90
  def create_dummy_word_doc():
91
  """Create dummy Word document."""
92
  try:
93
  from docx import Document
94
+
95
  doc = Document()
96
+ doc.add_heading("Test Document for Redaction", 0)
97
+ doc.add_paragraph("This is a test document for redaction testing.")
98
+ doc.add_paragraph("Contact Information:")
99
+ doc.add_paragraph("Email: [email protected]")
100
+ doc.add_paragraph("Phone: 123-456-7890")
101
+ doc.add_paragraph("Name: John Doe")
102
+ doc.add_paragraph("Address: 123 Test Street, Test City, TC 12345")
103
+
104
+ doc.save("example_data/Bold minimalist professional cover letter.docx")
105
  print("Created dummy Word document")
106
+
107
  except ImportError:
108
  print("python-docx not available, skipping Word document creation")
109
 
110
+
111
  def create_allow_deny_lists():
112
  """Create dummy allow/deny lists."""
113
  # Allow lists
114
+ allow_data = {"word": ["test", "example", "document"]}
115
+ pd.DataFrame(allow_data).to_csv(
116
+ "example_data/test_allow_list_graduate.csv", index=False
117
+ )
118
+ pd.DataFrame(allow_data).to_csv(
119
+ "example_data/test_allow_list_partnership.csv", index=False
120
+ )
121
  print("Created allow lists")
122
+
123
  # Deny lists
124
+ deny_data = {"word": ["sensitive", "confidential", "private"]}
125
+ pd.DataFrame(deny_data).to_csv(
126
+ "example_data/partnership_toolkit_redact_custom_deny_list.csv", index=False
127
+ )
128
+ pd.DataFrame(deny_data).to_csv(
129
+ "example_data/Partnership-Agreement-Toolkit_test_deny_list_para_single_spell.csv",
130
+ index=False,
131
+ )
132
  print("Created deny lists")
133
+
134
  # Whole page redaction list
135
+ page_data = {"page": [1, 2]}
136
+ pd.DataFrame(page_data).to_csv(
137
+ "example_data/partnership_toolkit_redact_some_pages.csv", index=False
138
+ )
139
  print("Created whole page redaction list")
140
 
141
+
142
  def create_ocr_output():
143
  """Create dummy OCR output CSV."""
144
  ocr_data = {
145
+ "file_name": ["test.pdf", "test.pdf", "test.pdf"],
146
+ "page_number": [1, 2, 3],
147
+ "text": [
148
+ "This is page 1 content with some text",
149
+ "This is page 2 content with different text",
150
+ "This is page 3 content with more text",
151
  ],
152
+ "confidence": [0.95, 0.92, 0.88],
153
  }
154
  df = pd.DataFrame(ocr_data)
155
+ df.to_csv(
156
+ "example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv",
157
+ index=False,
158
+ )
159
  print("Created dummy OCR output CSV")
160
 
161
+
162
  def create_dummy_image():
163
  """Create dummy image for testing."""
164
  try:
165
  from PIL import Image, ImageDraw, ImageFont
166
+
167
+ img = Image.new("RGB", (800, 600), color="white")
168
  draw = ImageDraw.Draw(img)
169
+
170
  # Try to use a system font
171
  try:
172
+ font = ImageFont.truetype(
173
+ "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 20
174
+ )
175
+ except Exception as e:
176
+ print(f"Error loading DejaVuSans font: {e}")
177
  try:
178
+ font = ImageFont.truetype("/System/Library/Fonts/Arial.ttf", 20)
179
+ except Exception as e:
180
+ print(f"Error loading Arial font: {e}")
181
  font = ImageFont.load_default()
182
+
183
  # Add text to image
184
+ draw.text((50, 50), "Test Document for Redaction", fill="black", font=font)
185
+ draw.text((50, 100), "Email: [email protected]", fill="black", font=font)
186
+ draw.text((50, 150), "Phone: 123-456-7890", fill="black", font=font)
187
+ draw.text((50, 200), "Name: John Doe", fill="black", font=font)
188
+ draw.text((50, 250), "Address: 123 Test Street", fill="black", font=font)
189
+
190
+ img.save("example_data/example_complaint_letter.jpg")
191
  print("Created dummy image")
192
+
193
  except ImportError:
194
  print("PIL not available, skipping image creation")
195
 
196
+
197
  def main():
198
  """Main setup function."""
199
  print("Setting up test data for GitHub Actions...")
200
+
201
  create_directories()
202
  create_dummy_pdf()
203
  create_dummy_csv()
 
205
  create_allow_deny_lists()
206
  create_ocr_output()
207
  create_dummy_image()
208
+
209
  print("\nTest data setup complete!")
210
  print("Created files:")
211
+ for root, dirs, files in os.walk("example_data"):
212
  for file in files:
213
  print(f" {os.path.join(root, file)}")
214
 
215
+
216
  if __name__ == "__main__":
217
  main()
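For context, a minimal sketch of how a CI job might call this setup script before running the GUI load tests. The step names and the pytest invocation are assumptions for illustration and are not taken from the workflow files below:

    - name: Set up test data
      run: python .github/scripts/setup_test_data.py
    - name: Run tests
      run: python -m pytest   # test runner and arguments assumed for illustration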
.github/workflows/ci.yml CHANGED
@@ -49,7 +49,7 @@ jobs:
49
  python-version: ${{ matrix.python-version }}
50
 
51
  - name: Cache pip dependencies
52
- uses: actions/cache@v3
53
  with:
54
  path: ~/.cache/pip
55
  key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }}
@@ -63,7 +63,7 @@ jobs:
63
  tesseract-ocr \
64
  tesseract-ocr-eng \
65
  poppler-utils \
66
- libgl1-mesa-glx \
67
  libglib2.0-0 \
68
  libsm6 \
69
  libxext6 \
@@ -107,7 +107,7 @@ jobs:
107
  fail_ci_if_error: false
108
 
109
  - name: Upload test results
110
- uses: actions/upload-artifact@v3
111
  if: always()
112
  with:
113
  name: test-results-python-${{ matrix.python-version }}
@@ -141,7 +141,7 @@ jobs:
141
  tesseract-ocr \
142
  tesseract-ocr-eng \
143
  poppler-utils \
144
- libgl1-mesa-glx \
145
  libglib2.0-0
146
 
147
  - name: Download spaCy model
@@ -189,7 +189,7 @@ jobs:
189
  bandit -r . -f json -o bandit-report.json || true
190
 
191
  - name: Upload security report
192
- uses: actions/upload-artifact@v3
193
  if: always()
194
  with:
195
  name: security-report
@@ -222,7 +222,7 @@ jobs:
222
  twine check dist/*
223
 
224
  - name: Upload build artifacts
225
- uses: actions/upload-artifact@v3
226
  with:
227
  name: dist
228
  path: dist/
 
49
  python-version: ${{ matrix.python-version }}
50
 
51
  - name: Cache pip dependencies
52
+ uses: actions/cache@v4
53
  with:
54
  path: ~/.cache/pip
55
  key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }}
 
63
  tesseract-ocr \
64
  tesseract-ocr-eng \
65
  poppler-utils \
66
+ libgl1-mesa-dri \
67
  libglib2.0-0 \
68
  libsm6 \
69
  libxext6 \
 
107
  fail_ci_if_error: false
108
 
109
  - name: Upload test results
110
+ uses: actions/upload-artifact@v4
111
  if: always()
112
  with:
113
  name: test-results-python-${{ matrix.python-version }}
 
141
  tesseract-ocr \
142
  tesseract-ocr-eng \
143
  poppler-utils \
144
+ libgl1-mesa-dri \
145
  libglib2.0-0
146
 
147
  - name: Download spaCy model
 
189
  bandit -r . -f json -o bandit-report.json || true
190
 
191
  - name: Upload security report
192
+ uses: actions/upload-artifact@v4
193
  if: always()
194
  with:
195
  name: security-report
 
222
  twine check dist/*
223
 
224
  - name: Upload build artifacts
225
+ uses: actions/upload-artifact@v4
226
  with:
227
  name: dist
228
  path: dist/
.github/workflows/multi-os-test.yml CHANGED
@@ -36,7 +36,7 @@ jobs:
36
  tesseract-ocr \
37
  tesseract-ocr-eng \
38
  poppler-utils \
39
- libgl1-mesa-glx \
40
  libglib2.0-0
41
 
42
  - name: Install system dependencies (macOS)
 
36
  tesseract-ocr \
37
  tesseract-ocr-eng \
38
  poppler-utils \
39
+ libgl1-mesa-dri \
40
  libglib2.0-0
41
 
42
  - name: Install system dependencies (macOS)
.github/workflows/simple-test.yml CHANGED
@@ -25,7 +25,7 @@ jobs:
25
  tesseract-ocr \
26
  tesseract-ocr-eng \
27
  poppler-utils \
28
- libgl1-mesa-glx \
29
  libglib2.0-0
30
 
31
  - name: Install Python dependencies
 
25
  tesseract-ocr \
26
  tesseract-ocr-eng \
27
  poppler-utils \
28
+ libgl1-mesa-dri \
29
  libglib2.0-0
30
 
31
  - name: Install Python dependencies
.github/workflows/test.yml CHANGED
@@ -22,7 +22,7 @@ jobs:
22
  python-version: ${{ matrix.python-version }}
23
 
24
  - name: Cache pip dependencies
25
- uses: actions/cache@v3
26
  with:
27
  path: ~/.cache/pip
28
  key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }}
@@ -36,7 +36,7 @@ jobs:
36
  tesseract-ocr \
37
  tesseract-ocr-eng \
38
  poppler-utils \
39
- libgl1-mesa-glx \
40
  libglib2.0-0
41
 
42
  - name: Install Python dependencies
@@ -201,7 +201,7 @@ jobs:
201
  fail_ci_if_error: false
202
 
203
  - name: Upload test results
204
- uses: actions/upload-artifact@v3
205
  if: always()
206
  with:
207
  name: test-results-python-${{ matrix.python-version }}
 
22
  python-version: ${{ matrix.python-version }}
23
 
24
  - name: Cache pip dependencies
25
+ uses: actions/cache@v4
26
  with:
27
  path: ~/.cache/pip
28
  key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }}
 
36
  tesseract-ocr \
37
  tesseract-ocr-eng \
38
  poppler-utils \
39
+ libgl1-mesa-dri \
40
  libglib2.0-0
41
 
42
  - name: Install Python dependencies
 
201
  fail_ci_if_error: false
202
 
203
  - name: Upload test results
204
+ uses: actions/upload-artifact@v4
205
  if: always()
206
  with:
207
  name: test-results-python-${{ matrix.python-version }}
app.py CHANGED
The diff for this file is too large to render. See raw diff
 
cdk/app.py CHANGED
@@ -1,11 +1,12 @@
1
  import os
2
- from aws_cdk import (App, Environment)
3
 
4
- # Assuming these are still relevant for you
5
- from check_resources import check_and_set_context, CONTEXT_FILE
6
  from cdk_config import AWS_ACCOUNT_ID, AWS_REGION, RUN_USEAST_STACK, USE_CLOUDFRONT
7
- from cdk_stack import CdkStack, CdkStackCloudfront#, CdkStackMain
8
- from cdk_functions import load_context_from_file, create_basic_config_env
 
 
 
9
 
10
  # Initialize the CDK app
11
  app = App()
@@ -25,7 +26,9 @@ print("Running pre-check script to generate application context...")
25
  try:
26
  check_and_set_context()
27
  if not os.path.exists(CONTEXT_FILE):
28
- raise RuntimeError(f"check_and_set_context() finished, but {CONTEXT_FILE} was not created.")
 
 
29
  print(f"Context generated successfully at {CONTEXT_FILE}.")
30
  except Exception as e:
31
  raise RuntimeError(f"Failed to generate context via check_and_set_context(): {e}")
@@ -56,12 +59,11 @@ aws_env_regional = Environment(account=AWS_ACCOUNT_ID, region=AWS_REGION)
56
  # public_route_tables=regional_stack.params["public_route_tables"],
57
  # cross_region_references=True)
58
 
59
- regional_stack = CdkStack(app,
60
- "RedactionStack",
61
- env=aws_env_regional,
62
- cross_region_references=True)
63
 
64
- if USE_CLOUDFRONT == 'True' and RUN_USEAST_STACK == 'True':
65
  # Define the environment for the CloudFront stack (always us-east-1 for CF-level resources like WAFv2 WebACLs for CF)
66
  aws_env_us_east_1 = Environment(account=AWS_ACCOUNT_ID, region="us-east-1")
67
 
@@ -72,10 +74,10 @@ if USE_CLOUDFRONT == 'True' and RUN_USEAST_STACK == 'True':
72
  env=aws_env_us_east_1,
73
  alb_arn=regional_stack.params["alb_arn_output"],
74
  alb_sec_group_id=regional_stack.params["alb_security_group_id"],
75
- alb_dns_name=regional_stack.params["alb_dns_name"],
76
- cross_region_references=True
77
  )
78
 
79
 
80
  # Synthesize the CloudFormation template
81
- app.synth(validate_on_synthesis=True)
 
1
  import os
 
2
 
3
+ from aws_cdk import App, Environment
 
4
  from cdk_config import AWS_ACCOUNT_ID, AWS_REGION, RUN_USEAST_STACK, USE_CLOUDFRONT
5
+ from cdk_functions import create_basic_config_env, load_context_from_file
6
+ from cdk_stack import CdkStack, CdkStackCloudfront # , CdkStackMain
7
+
8
+ # Assuming these are still relevant for you
9
+ from check_resources import CONTEXT_FILE, check_and_set_context
10
 
11
  # Initialize the CDK app
12
  app = App()
 
26
  try:
27
  check_and_set_context()
28
  if not os.path.exists(CONTEXT_FILE):
29
+ raise RuntimeError(
30
+ f"check_and_set_context() finished, but {CONTEXT_FILE} was not created."
31
+ )
32
  print(f"Context generated successfully at {CONTEXT_FILE}.")
33
  except Exception as e:
34
  raise RuntimeError(f"Failed to generate context via check_and_set_context(): {e}")
 
59
  # public_route_tables=regional_stack.params["public_route_tables"],
60
  # cross_region_references=True)
61
 
62
+ regional_stack = CdkStack(
63
+ app, "RedactionStack", env=aws_env_regional, cross_region_references=True
64
+ )
 
65
 
66
+ if USE_CLOUDFRONT == "True" and RUN_USEAST_STACK == "True":
67
  # Define the environment for the CloudFront stack (always us-east-1 for CF-level resources like WAFv2 WebACLs for CF)
68
  aws_env_us_east_1 = Environment(account=AWS_ACCOUNT_ID, region="us-east-1")
69
 
 
74
  env=aws_env_us_east_1,
75
  alb_arn=regional_stack.params["alb_arn_output"],
76
  alb_sec_group_id=regional_stack.params["alb_security_group_id"],
77
+ alb_dns_name=regional_stack.params["alb_dns_name"],
78
+ cross_region_references=True,
79
  )
80
 
81
 
82
  # Synthesize the CloudFormation template
83
+ app.synth(validate_on_synthesis=True)
cdk/cdk_config.py CHANGED
@@ -1,28 +1,31 @@
1
  import os
2
  import tempfile
 
3
  from dotenv import load_dotenv
4
 
5
  # Set or retrieve configuration variables for CDK redaction deployment
6
 
7
- def get_or_create_env_var(var_name:str, default_value:str, print_val:bool=False):
8
- '''
 
9
  Get an environmental variable, and set it to a default value if it doesn't exist
10
- '''
11
  # Get the environment variable if it exists
12
  value = os.environ.get(var_name)
13
-
14
  # If it doesn't exist, set the environment variable to the default value
15
  if value is None:
16
  os.environ[var_name] = default_value
17
  value = default_value
18
 
19
- if print_val == True:
20
- print(f'The value of {var_name} is {value}')
21
-
22
  return value
23
 
24
- def ensure_folder_exists(output_folder:str):
25
- """Checks if the specified folder exists, creates it if not."""
 
26
 
27
  if not os.path.exists(output_folder):
28
  # Create the folder if it doesn't exist
@@ -31,10 +34,11 @@ def ensure_folder_exists(output_folder:str):
31
  else:
32
  print(f"The {output_folder} folder already exists.")
33
 
 
34
  def add_folder_to_path(folder_path: str):
35
- '''
36
  Check if a folder exists on your system. If so, get the absolute path and then add it to the system Path variable if it doesn't already exist. Function is only relevant for locally-created executable files based on this app (when using pyinstaller it creates a _internal folder that contains tesseract and poppler. These need to be added to the system path to enable the app to run)
37
- '''
38
 
39
  if os.path.exists(folder_path) and os.path.isdir(folder_path):
40
  print(folder_path, "folder exists.")
@@ -42,192 +46,295 @@ def add_folder_to_path(folder_path: str):
42
  # Resolve relative path to absolute path
43
  absolute_path = os.path.abspath(folder_path)
44
 
45
- current_path = os.environ['PATH']
46
  if absolute_path not in current_path.split(os.pathsep):
47
  full_path_extension = absolute_path + os.pathsep + current_path
48
- os.environ['PATH'] = full_path_extension
49
- #print(f"Updated PATH with: ", full_path_extension)
50
  else:
51
  print(f"Directory {folder_path} already exists in PATH.")
52
  else:
53
  print(f"Folder not found at {folder_path} - not added to PATH")
54
 
 
55
  ###
56
  # LOAD CONFIG FROM ENV FILE
57
  ###
58
- CONFIG_FOLDER = get_or_create_env_var('CONFIG_FOLDER', "config/")
59
 
60
  ensure_folder_exists(CONFIG_FOLDER)
61
 
62
  # If you have an aws_config env file in the config folder, you can load in app variables this way, e.g. 'config/cdk_config.env'
63
- CDK_CONFIG_PATH = get_or_create_env_var('CDK_CONFIG_PATH', 'config/cdk_config.env') # e.g. config/cdk_config.env
 
 
64
 
65
  if CDK_CONFIG_PATH:
66
  if os.path.exists(CDK_CONFIG_PATH):
67
  print(f"Loading CDK variables from config file {CDK_CONFIG_PATH}")
68
  load_dotenv(CDK_CONFIG_PATH)
69
- else: print("CDK config file not found at location:", CDK_CONFIG_PATH)
 
70
 
71
  ###
72
  # AWS OPTIONS
73
  ###
74
- AWS_REGION = get_or_create_env_var('AWS_REGION', '')
75
- AWS_ACCOUNT_ID = get_or_create_env_var('AWS_ACCOUNT_ID', '')
76
 
77
  ###
78
  # CDK OPTIONS
79
  ###
80
- CDK_PREFIX = get_or_create_env_var('CDK_PREFIX', '')
81
- CONTEXT_FILE = get_or_create_env_var('CONTEXT_FILE', 'cdk.context.json') # Define the CDK output context file name
82
- CDK_FOLDER = get_or_create_env_var('CDK_FOLDER', '') # FULL_PATH_TO_CDK_FOLDER_HERE (with forward slash)
83
- RUN_USEAST_STACK = get_or_create_env_var('RUN_USEAST_STACK', 'False')
 
 
 
 
84
 
85
  ### VPC and connections
86
- VPC_NAME = get_or_create_env_var('VPC_NAME', '')
87
- NEW_VPC_DEFAULT_NAME = get_or_create_env_var('NEW_VPC_DEFAULT_NAME', f'{CDK_PREFIX}vpc')
88
- NEW_VPC_CIDR = get_or_create_env_var('NEW_VPC_CIDR', '') # "10.0.0.0/24"
89
 
90
 
91
- EXISTING_IGW_ID = get_or_create_env_var('EXISTING_IGW_ID', '')
92
- SINGLE_NAT_GATEWAY_ID = get_or_create_env_var('SINGLE_NAT_GATEWAY_ID', '')
93
 
94
  ### SUBNETS / ROUTE TABLES / NAT GATEWAY
95
- PUBLIC_SUBNETS_TO_USE = get_or_create_env_var('PUBLIC_SUBNETS_TO_USE', '') # e.g. ['PublicSubnet1', 'PublicSubnet2']
96
- PUBLIC_SUBNET_CIDR_BLOCKS = get_or_create_env_var('PUBLIC_SUBNET_CIDR_BLOCKS', '') # e.g. ["10.0.1.0/24", "10.0.2.0/24"]
97
- PUBLIC_SUBNET_AVAILABILITY_ZONES = get_or_create_env_var('PUBLIC_SUBNET_AVAILABILITY_ZONES', '') # e.g. ["eu-east-1b", "eu-east1b"]
98
-
99
- PRIVATE_SUBNETS_TO_USE = get_or_create_env_var('PRIVATE_SUBNETS_TO_USE', '') # e.g. ['PrivateSubnet1', 'PrivateSubnet2']
100
- PRIVATE_SUBNET_CIDR_BLOCKS = get_or_create_env_var('PRIVATE_SUBNET_CIDR_BLOCKS', '') # e.g. ["10.0.1.0/24", "10.0.2.0/24"]
101
- PRIVATE_SUBNET_AVAILABILITY_ZONES = get_or_create_env_var('PRIVATE_SUBNET_AVAILABILITY_ZONES', '') # e.g. ["eu-east-1b", "eu-east1b"]
102
-
103
- ROUTE_TABLE_BASE_NAME = get_or_create_env_var('ROUTE_TABLE_BASE_NAME', f'{CDK_PREFIX}PrivateRouteTable')
104
- NAT_GATEWAY_EIP_NAME = get_or_create_env_var('NAT_GATEWAY_EIP_NAME', f"{CDK_PREFIX}NatGatewayEip")
105
- NAT_GATEWAY_NAME = get_or_create_env_var('NAT_GATEWAY_NAME', f"{CDK_PREFIX}NatGateway")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
 
107
  # IAM roles
108
- AWS_MANAGED_TASK_ROLES_LIST = get_or_create_env_var('AWS_MANAGED_TASK_ROLES_LIST', '["AmazonCognitoReadOnly", "service-role/AmazonECSTaskExecutionRolePolicy", "AmazonS3FullAccess", "AmazonTextractFullAccess", "ComprehendReadOnly", "AmazonDynamoDBFullAccess", "service-role/AWSAppSyncPushToCloudWatchLogs"]')
109
- POLICY_FILE_LOCATIONS = get_or_create_env_var('POLICY_FILE_LOCATIONS', '') # e.g. '["config/sts_permissions.json"]'
110
- POLICY_FILE_ARNS = get_or_create_env_var('POLICY_FILE_ARNS', '')
 
 
 
 
 
111
 
112
  # GITHUB REPO
113
- GITHUB_REPO_USERNAME = get_or_create_env_var('GITHUB_REPO_USERNAME', 'seanpedrick-case')
114
- GITHUB_REPO_NAME = get_or_create_env_var('GITHUB_REPO_NAME', 'doc_redaction')
115
- GITHUB_REPO_BRANCH = get_or_create_env_var('GITHUB_REPO_BRANCH', 'main')
116
 
117
  ### CODEBUILD
118
- CODEBUILD_ROLE_NAME = get_or_create_env_var('CODEBUILD_ROLE_NAME', f"{CDK_PREFIX}CodeBuildRole")
119
- CODEBUILD_PROJECT_NAME = get_or_create_env_var('CODEBUILD_PROJECT_NAME', f"{CDK_PREFIX}CodeBuildProject")
 
 
 
 
120
 
121
  ### ECR
122
- ECR_REPO_NAME = get_or_create_env_var('ECR_REPO_NAME', 'doc-redaction') # Beware - cannot have underscores and must be lower case
123
- ECR_CDK_REPO_NAME = get_or_create_env_var('ECR_CDK_REPO_NAME', f"{CDK_PREFIX}{ECR_REPO_NAME}".lower())
 
 
 
 
124
 
125
  ### S3
126
- S3_LOG_CONFIG_BUCKET_NAME = get_or_create_env_var('S3_LOG_CONFIG_BUCKET_NAME', f"{CDK_PREFIX}s3-logs".lower()) # S3 bucket names need to be lower case
127
- S3_OUTPUT_BUCKET_NAME = get_or_create_env_var('S3_OUTPUT_BUCKET_NAME', f"{CDK_PREFIX}s3-output".lower())
 
 
 
 
128
 
129
  ### KMS KEYS FOR S3 AND SECRETS MANAGER
130
- USE_CUSTOM_KMS_KEY = get_or_create_env_var('USE_CUSTOM_KMS_KEY', '1')
131
- CUSTOM_KMS_KEY_NAME = get_or_create_env_var('CUSTOM_KMS_KEY_NAME', f"alias/{CDK_PREFIX}kms-key".lower())
 
 
132
 
133
  ### ECS
134
- FARGATE_TASK_DEFINITION_NAME = get_or_create_env_var('FARGATE_TASK_DEFINITION_NAME', f"{CDK_PREFIX}FargateTaskDefinition")
135
- TASK_DEFINITION_FILE_LOCATION = get_or_create_env_var('TASK_DEFINITION_FILE_LOCATION', CDK_FOLDER + CONFIG_FOLDER + "task_definition.json")
136
-
137
- CLUSTER_NAME = get_or_create_env_var('CLUSTER_NAME', f"{CDK_PREFIX}Cluster")
138
- ECS_SERVICE_NAME = get_or_create_env_var('ECS_SERVICE_NAME', f"{CDK_PREFIX}ECSService")
139
- ECS_TASK_ROLE_NAME = get_or_create_env_var('ECS_TASK_ROLE_NAME', f"{CDK_PREFIX}TaskRole")
140
- ECS_TASK_EXECUTION_ROLE_NAME = get_or_create_env_var('ECS_TASK_EXECUTION_ROLE_NAME', f"{CDK_PREFIX}ExecutionRole")
141
- ECS_SECURITY_GROUP_NAME = get_or_create_env_var('ECS_SECURITY_GROUP_NAME', f"{CDK_PREFIX}SecurityGroupECS")
142
- ECS_LOG_GROUP_NAME = get_or_create_env_var('ECS_LOG_GROUP_NAME', f"/ecs/{ECS_SERVICE_NAME}-logs".lower())
143
-
144
- ECS_TASK_CPU_SIZE = get_or_create_env_var('ECS_TASK_CPU_SIZE', '1024')
145
- ECS_TASK_MEMORY_SIZE = get_or_create_env_var('ECS_TASK_MEMORY_SIZE', '4096')
146
- ECS_USE_FARGATE_SPOT = get_or_create_env_var('USE_FARGATE_SPOT', 'False')
147
- ECS_READ_ONLY_FILE_SYSTEM = get_or_create_env_var('ECS_READ_ONLY_FILE_SYSTEM', 'True')
 
 
 
 
 
 
 
 
 
 
 
 
148
 
149
  ### Cognito
150
- COGNITO_USER_POOL_NAME = get_or_create_env_var('COGNITO_USER_POOL_NAME', f"{CDK_PREFIX}UserPool")
151
- COGNITO_USER_POOL_CLIENT_NAME = get_or_create_env_var('COGNITO_USER_POOL_CLIENT_NAME', f"{CDK_PREFIX}UserPoolClient")
152
- COGNITO_USER_POOL_CLIENT_SECRET_NAME = get_or_create_env_var('COGNITO_USER_POOL_CLIENT_SECRET_NAME', f"{CDK_PREFIX}ParamCognitoSecret")
153
- COGNITO_USER_POOL_DOMAIN_PREFIX = get_or_create_env_var('COGNITO_USER_POOL_DOMAIN_PREFIX', "redaction-app-domain") # Should change this to something unique or you'll probably hit an error
 
 
 
 
 
 
 
 
154
 
155
  # Application load balancer
156
- ALB_NAME = get_or_create_env_var('ALB_NAME', f"{CDK_PREFIX}Alb"[-32:]) # Application load balancer name can be max 32 characters, so taking the last 32 characters of the suggested name
157
- ALB_NAME_SECURITY_GROUP_NAME = get_or_create_env_var('ALB_SECURITY_GROUP_NAME', f"{CDK_PREFIX}SecurityGroupALB")
158
- ALB_TARGET_GROUP_NAME = get_or_create_env_var('ALB_TARGET_GROUP_NAME', f"{CDK_PREFIX}-tg"[-32:]) # Max 32 characters
159
- EXISTING_LOAD_BALANCER_ARN = get_or_create_env_var('EXISTING_LOAD_BALANCER_ARN', '')
160
- EXISTING_LOAD_BALANCER_DNS = get_or_create_env_var('EXISTING_LOAD_BALANCER_ARN', 'placeholder_load_balancer_dns.net')
 
 
 
 
 
 
 
 
161
 
162
  ## CLOUDFRONT
163
- USE_CLOUDFRONT = get_or_create_env_var('USE_CLOUDFRONT', 'True')
164
- CLOUDFRONT_PREFIX_LIST_ID = get_or_create_env_var('CLOUDFRONT_PREFIX_LIST_ID', 'pl-93a247fa')
165
- CLOUDFRONT_GEO_RESTRICTION = get_or_create_env_var('CLOUDFRONT_GEO_RESTRICTION', '') # A country that Cloudfront restricts access to. See here: https://docs.aws.amazon.com/AmazonCloudFront/latest/DeveloperGuide/georestrictions.html
166
- CLOUDFRONT_DISTRIBUTION_NAME = get_or_create_env_var('CLOUDFRONT_DISTRIBUTION_NAME', f"{CDK_PREFIX}CfDist")
167
- CLOUDFRONT_DOMAIN = get_or_create_env_var('CLOUDFRONT_DOMAIN', "cloudfront_placeholder.net")
 
 
 
 
 
 
 
 
168
 
169
 
170
  # Certificate for Application load balancer (optional, for HTTPS and logins through the ALB)
171
- ACM_SSL_CERTIFICATE_ARN = get_or_create_env_var('ACM_SSL_CERTIFICATE_ARN', '')
172
- SSL_CERTIFICATE_DOMAIN = get_or_create_env_var('SSL_CERTIFICATE_DOMAIN', '') # e.g. example.com or www.example.com
 
 
173
 
174
  # This should be the CloudFront domain, the domain linked to your ACM certificate, or the DNS of your application load balancer in console afterwards
175
  if USE_CLOUDFRONT == "True":
176
- COGNITO_REDIRECTION_URL = get_or_create_env_var('COGNITO_REDIRECTION_URL', "https://" + CLOUDFRONT_DOMAIN)
 
 
177
  elif SSL_CERTIFICATE_DOMAIN:
178
- COGNITO_REDIRECTION_URL = get_or_create_env_var('COGNITO_REDIRECTION_URL', "https://" + SSL_CERTIFICATE_DOMAIN)
 
 
179
  else:
180
- COGNITO_REDIRECTION_URL = get_or_create_env_var('COGNITO_REDIRECTION_URL', "https://" + EXISTING_LOAD_BALANCER_DNS)
 
 
181
 
182
  # Custom headers e.g. if routing traffic through Cloudfront
183
- CUSTOM_HEADER = get_or_create_env_var('CUSTOM_HEADER', '') # Retrieving or setting CUSTOM_HEADER
184
- CUSTOM_HEADER_VALUE = get_or_create_env_var('CUSTOM_HEADER_VALUE', '') # Retrieving or setting CUSTOM_HEADER_VALUE
 
 
 
 
185
 
186
  # Firewall on top of load balancer
187
- LOAD_BALANCER_WEB_ACL_NAME = get_or_create_env_var('LOAD_BALANCER_WEB_ACL_NAME', f"{CDK_PREFIX}alb-web-acl")
 
 
188
 
189
  # Firewall on top of CloudFront
190
- WEB_ACL_NAME = get_or_create_env_var('WEB_ACL_NAME', f"{CDK_PREFIX}cloudfront-web-acl")
191
 
192
  ###
193
  # File I/O options
194
  ###
195
 
196
- OUTPUT_FOLDER = get_or_create_env_var('GRADIO_OUTPUT_FOLDER', 'output/') # 'output/'
197
- INPUT_FOLDER = get_or_create_env_var('GRADIO_INPUT_FOLDER', 'input/') # 'input/'
198
 
199
  # Allow for files to be saved in a temporary folder for increased security in some instances
200
- if OUTPUT_FOLDER == "TEMP" or INPUT_FOLDER == "TEMP":
201
  # Create a temporary directory
202
  with tempfile.TemporaryDirectory() as temp_dir:
203
- print(f'Temporary directory created at: {temp_dir}')
204
 
205
- if OUTPUT_FOLDER == "TEMP": OUTPUT_FOLDER = temp_dir + "/"
206
- if INPUT_FOLDER == "TEMP": INPUT_FOLDER = temp_dir + "/"
 
 
207
 
208
  ###
209
  # LOGGING OPTIONS
210
  ###
211
 
212
- SAVE_LOGS_TO_CSV = get_or_create_env_var('SAVE_LOGS_TO_CSV', 'True')
213
 
214
  ### DYNAMODB logs. Whether to save to DynamoDB, and the headers of the table
215
- SAVE_LOGS_TO_DYNAMODB = get_or_create_env_var('SAVE_LOGS_TO_DYNAMODB', 'True')
216
- ACCESS_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var('ACCESS_LOG_DYNAMODB_TABLE_NAME', f"{CDK_PREFIX}dynamodb-access-logs".lower())
217
- FEEDBACK_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var('FEEDBACK_LOG_DYNAMODB_TABLE_NAME', f"{CDK_PREFIX}dynamodb-feedback-logs".lower())
218
- USAGE_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var('USAGE_LOG_DYNAMODB_TABLE_NAME', f"{CDK_PREFIX}dynamodb-usage-logs".lower())
 
 
 
 
 
 
219
 
220
  ###
221
  # REDACTION OPTIONS
222
  ###
223
 
224
  # Get some environment variables and Launch the Gradio app
225
- COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
226
 
227
- GRADIO_SERVER_PORT = int(get_or_create_env_var('GRADIO_SERVER_PORT', '7860'))
228
 
229
  ###
230
  # WHOLE DOCUMENT API OPTIONS
231
  ###
232
 
233
- DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS = get_or_create_env_var('DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS', '7') # How many days into the past should whole document Textract jobs be displayed? After that, the data is not deleted from the Textract jobs csv, but it is just filtered out. Included to align with S3 buckets where the file outputs will be automatically deleted after X days.
 
 
 
1
  import os
2
  import tempfile
3
+
4
  from dotenv import load_dotenv
5
 
6
  # Set or retrieve configuration variables for CDK redaction deployment
7
 
8
+
9
+ def get_or_create_env_var(var_name: str, default_value: str, print_val: bool = False):
10
+ """
11
  Get an environmental variable, and set it to a default value if it doesn't exist
12
+ """
13
  # Get the environment variable if it exists
14
  value = os.environ.get(var_name)
15
+
16
  # If it doesn't exist, set the environment variable to the default value
17
  if value is None:
18
  os.environ[var_name] = default_value
19
  value = default_value
20
 
21
+ if print_val is True:
22
+ print(f"The value of {var_name} is {value}")
23
+
24
  return value
25
 
26
+
27
+ def ensure_folder_exists(output_folder: str):
28
+ """Checks if the specified folder exists, creates it if not."""
29
 
30
  if not os.path.exists(output_folder):
31
  # Create the folder if it doesn't exist
 
34
  else:
35
  print(f"The {output_folder} folder already exists.")
36
 
37
+
38
  def add_folder_to_path(folder_path: str):
39
+ """
40
  Check if a folder exists on your system. If so, get the absolute path and then add it to the system Path variable if it doesn't already exist. Function is only relevant for locally-created executable files based on this app (when using pyinstaller it creates a _internal folder that contains tesseract and poppler. These need to be added to the system path to enable the app to run)
41
+ """
42
 
43
  if os.path.exists(folder_path) and os.path.isdir(folder_path):
44
  print(folder_path, "folder exists.")
 
46
  # Resolve relative path to absolute path
47
  absolute_path = os.path.abspath(folder_path)
48
 
49
+ current_path = os.environ["PATH"]
50
  if absolute_path not in current_path.split(os.pathsep):
51
  full_path_extension = absolute_path + os.pathsep + current_path
52
+ os.environ["PATH"] = full_path_extension
53
+ # print(f"Updated PATH with: ", full_path_extension)
54
  else:
55
  print(f"Directory {folder_path} already exists in PATH.")
56
  else:
57
  print(f"Folder not found at {folder_path} - not added to PATH")
58
 
59
+
60
  ###
61
  # LOAD CONFIG FROM ENV FILE
62
  ###
63
+ CONFIG_FOLDER = get_or_create_env_var("CONFIG_FOLDER", "config/")
64
 
65
  ensure_folder_exists(CONFIG_FOLDER)
66
 
67
  # If you have an aws_config env file in the config folder, you can load in app variables this way, e.g. 'config/cdk_config.env'
68
+ CDK_CONFIG_PATH = get_or_create_env_var(
69
+ "CDK_CONFIG_PATH", "config/cdk_config.env"
70
+ ) # e.g. config/cdk_config.env
71
 
72
  if CDK_CONFIG_PATH:
73
  if os.path.exists(CDK_CONFIG_PATH):
74
  print(f"Loading CDK variables from config file {CDK_CONFIG_PATH}")
75
  load_dotenv(CDK_CONFIG_PATH)
76
+ else:
77
+ print("CDK config file not found at location:", CDK_CONFIG_PATH)
78
 
79
  ###
80
  # AWS OPTIONS
81
  ###
82
+ AWS_REGION = get_or_create_env_var("AWS_REGION", "")
83
+ AWS_ACCOUNT_ID = get_or_create_env_var("AWS_ACCOUNT_ID", "")
84
 
85
  ###
86
  # CDK OPTIONS
87
  ###
88
+ CDK_PREFIX = get_or_create_env_var("CDK_PREFIX", "")
89
+ CONTEXT_FILE = get_or_create_env_var(
90
+ "CONTEXT_FILE", "cdk.context.json"
91
+ ) # Define the CDK output context file name
92
+ CDK_FOLDER = get_or_create_env_var(
93
+ "CDK_FOLDER", ""
94
+ ) # FULL_PATH_TO_CDK_FOLDER_HERE (with forward slash)
95
+ RUN_USEAST_STACK = get_or_create_env_var("RUN_USEAST_STACK", "False")
96
 
97
  ### VPC and connections
98
+ VPC_NAME = get_or_create_env_var("VPC_NAME", "")
99
+ NEW_VPC_DEFAULT_NAME = get_or_create_env_var("NEW_VPC_DEFAULT_NAME", f"{CDK_PREFIX}vpc")
100
+ NEW_VPC_CIDR = get_or_create_env_var("NEW_VPC_CIDR", "") # "10.0.0.0/24"
101
 
102
 
103
+ EXISTING_IGW_ID = get_or_create_env_var("EXISTING_IGW_ID", "")
104
+ SINGLE_NAT_GATEWAY_ID = get_or_create_env_var("SINGLE_NAT_GATEWAY_ID", "")
105
 
106
  ### SUBNETS / ROUTE TABLES / NAT GATEWAY
107
+ PUBLIC_SUBNETS_TO_USE = get_or_create_env_var(
108
+ "PUBLIC_SUBNETS_TO_USE", ""
109
+ ) # e.g. ['PublicSubnet1', 'PublicSubnet2']
110
+ PUBLIC_SUBNET_CIDR_BLOCKS = get_or_create_env_var(
111
+ "PUBLIC_SUBNET_CIDR_BLOCKS", ""
112
+ ) # e.g. ["10.0.1.0/24", "10.0.2.0/24"]
113
+ PUBLIC_SUBNET_AVAILABILITY_ZONES = get_or_create_env_var(
114
+ "PUBLIC_SUBNET_AVAILABILITY_ZONES", ""
115
+ ) # e.g. ["eu-east-1b", "eu-east1b"]
116
+
117
+ PRIVATE_SUBNETS_TO_USE = get_or_create_env_var(
118
+ "PRIVATE_SUBNETS_TO_USE", ""
119
+ ) # e.g. ['PrivateSubnet1', 'PrivateSubnet2']
120
+ PRIVATE_SUBNET_CIDR_BLOCKS = get_or_create_env_var(
121
+ "PRIVATE_SUBNET_CIDR_BLOCKS", ""
122
+ ) # e.g. ["10.0.1.0/24", "10.0.2.0/24"]
123
+ PRIVATE_SUBNET_AVAILABILITY_ZONES = get_or_create_env_var(
124
+ "PRIVATE_SUBNET_AVAILABILITY_ZONES", ""
125
+ ) # e.g. ["eu-east-1b", "eu-east1b"]
126
+
127
+ ROUTE_TABLE_BASE_NAME = get_or_create_env_var(
128
+ "ROUTE_TABLE_BASE_NAME", f"{CDK_PREFIX}PrivateRouteTable"
129
+ )
130
+ NAT_GATEWAY_EIP_NAME = get_or_create_env_var(
131
+ "NAT_GATEWAY_EIP_NAME", f"{CDK_PREFIX}NatGatewayEip"
132
+ )
133
+ NAT_GATEWAY_NAME = get_or_create_env_var("NAT_GATEWAY_NAME", f"{CDK_PREFIX}NatGateway")
134
 
135
  # IAM roles
136
+ AWS_MANAGED_TASK_ROLES_LIST = get_or_create_env_var(
137
+ "AWS_MANAGED_TASK_ROLES_LIST",
138
+ '["AmazonCognitoReadOnly", "service-role/AmazonECSTaskExecutionRolePolicy", "AmazonS3FullAccess", "AmazonTextractFullAccess", "ComprehendReadOnly", "AmazonDynamoDBFullAccess", "service-role/AWSAppSyncPushToCloudWatchLogs"]',
139
+ )
140
+ POLICY_FILE_LOCATIONS = get_or_create_env_var(
141
+ "POLICY_FILE_LOCATIONS", ""
142
+ ) # e.g. '["config/sts_permissions.json"]'
143
+ POLICY_FILE_ARNS = get_or_create_env_var("POLICY_FILE_ARNS", "")
144
 
145
  # GITHUB REPO
146
+ GITHUB_REPO_USERNAME = get_or_create_env_var("GITHUB_REPO_USERNAME", "seanpedrick-case")
147
+ GITHUB_REPO_NAME = get_or_create_env_var("GITHUB_REPO_NAME", "doc_redaction")
148
+ GITHUB_REPO_BRANCH = get_or_create_env_var("GITHUB_REPO_BRANCH", "main")
149
 
150
  ### CODEBUILD
151
+ CODEBUILD_ROLE_NAME = get_or_create_env_var(
152
+ "CODEBUILD_ROLE_NAME", f"{CDK_PREFIX}CodeBuildRole"
153
+ )
154
+ CODEBUILD_PROJECT_NAME = get_or_create_env_var(
155
+ "CODEBUILD_PROJECT_NAME", f"{CDK_PREFIX}CodeBuildProject"
156
+ )
157
 
158
  ### ECR
159
+ ECR_REPO_NAME = get_or_create_env_var(
160
+ "ECR_REPO_NAME", "doc-redaction"
161
+ ) # Beware - cannot have underscores and must be lower case
162
+ ECR_CDK_REPO_NAME = get_or_create_env_var(
163
+ "ECR_CDK_REPO_NAME", f"{CDK_PREFIX}{ECR_REPO_NAME}".lower()
164
+ )
165
 
166
  ### S3
167
+ S3_LOG_CONFIG_BUCKET_NAME = get_or_create_env_var(
168
+ "S3_LOG_CONFIG_BUCKET_NAME", f"{CDK_PREFIX}s3-logs".lower()
169
+ ) # S3 bucket names need to be lower case
170
+ S3_OUTPUT_BUCKET_NAME = get_or_create_env_var(
171
+ "S3_OUTPUT_BUCKET_NAME", f"{CDK_PREFIX}s3-output".lower()
172
+ )
173
 
174
  ### KMS KEYS FOR S3 AND SECRETS MANAGER
175
+ USE_CUSTOM_KMS_KEY = get_or_create_env_var("USE_CUSTOM_KMS_KEY", "1")
176
+ CUSTOM_KMS_KEY_NAME = get_or_create_env_var(
177
+ "CUSTOM_KMS_KEY_NAME", f"alias/{CDK_PREFIX}kms-key".lower()
178
+ )
179
 
180
  ### ECS
181
+ FARGATE_TASK_DEFINITION_NAME = get_or_create_env_var(
182
+ "FARGATE_TASK_DEFINITION_NAME", f"{CDK_PREFIX}FargateTaskDefinition"
183
+ )
184
+ TASK_DEFINITION_FILE_LOCATION = get_or_create_env_var(
185
+ "TASK_DEFINITION_FILE_LOCATION", CDK_FOLDER + CONFIG_FOLDER + "task_definition.json"
186
+ )
187
+
188
+ CLUSTER_NAME = get_or_create_env_var("CLUSTER_NAME", f"{CDK_PREFIX}Cluster")
189
+ ECS_SERVICE_NAME = get_or_create_env_var("ECS_SERVICE_NAME", f"{CDK_PREFIX}ECSService")
190
+ ECS_TASK_ROLE_NAME = get_or_create_env_var(
191
+ "ECS_TASK_ROLE_NAME", f"{CDK_PREFIX}TaskRole"
192
+ )
193
+ ECS_TASK_EXECUTION_ROLE_NAME = get_or_create_env_var(
194
+ "ECS_TASK_EXECUTION_ROLE_NAME", f"{CDK_PREFIX}ExecutionRole"
195
+ )
196
+ ECS_SECURITY_GROUP_NAME = get_or_create_env_var(
197
+ "ECS_SECURITY_GROUP_NAME", f"{CDK_PREFIX}SecurityGroupECS"
198
+ )
199
+ ECS_LOG_GROUP_NAME = get_or_create_env_var(
200
+ "ECS_LOG_GROUP_NAME", f"/ecs/{ECS_SERVICE_NAME}-logs".lower()
201
+ )
202
+
203
+ ECS_TASK_CPU_SIZE = get_or_create_env_var("ECS_TASK_CPU_SIZE", "1024")
204
+ ECS_TASK_MEMORY_SIZE = get_or_create_env_var("ECS_TASK_MEMORY_SIZE", "4096")
205
+ ECS_USE_FARGATE_SPOT = get_or_create_env_var("USE_FARGATE_SPOT", "False")
206
+ ECS_READ_ONLY_FILE_SYSTEM = get_or_create_env_var("ECS_READ_ONLY_FILE_SYSTEM", "True")
207
 
208
  ### Cognito
209
+ COGNITO_USER_POOL_NAME = get_or_create_env_var(
210
+ "COGNITO_USER_POOL_NAME", f"{CDK_PREFIX}UserPool"
211
+ )
212
+ COGNITO_USER_POOL_CLIENT_NAME = get_or_create_env_var(
213
+ "COGNITO_USER_POOL_CLIENT_NAME", f"{CDK_PREFIX}UserPoolClient"
214
+ )
215
+ COGNITO_USER_POOL_CLIENT_SECRET_NAME = get_or_create_env_var(
216
+ "COGNITO_USER_POOL_CLIENT_SECRET_NAME", f"{CDK_PREFIX}ParamCognitoSecret"
217
+ )
218
+ COGNITO_USER_POOL_DOMAIN_PREFIX = get_or_create_env_var(
219
+ "COGNITO_USER_POOL_DOMAIN_PREFIX", "redaction-app-domain"
220
+ ) # Should change this to something unique or you'll probably hit an error
221
 
222
  # Application load balancer
223
+ ALB_NAME = get_or_create_env_var(
224
+ "ALB_NAME", f"{CDK_PREFIX}Alb"[-32:]
225
+ ) # Application load balancer name can be max 32 characters, so taking the last 32 characters of the suggested name
226
+ ALB_NAME_SECURITY_GROUP_NAME = get_or_create_env_var(
227
+ "ALB_SECURITY_GROUP_NAME", f"{CDK_PREFIX}SecurityGroupALB"
228
+ )
229
+ ALB_TARGET_GROUP_NAME = get_or_create_env_var(
230
+ "ALB_TARGET_GROUP_NAME", f"{CDK_PREFIX}-tg"[-32:]
231
+ ) # Max 32 characters
232
+ EXISTING_LOAD_BALANCER_ARN = get_or_create_env_var("EXISTING_LOAD_BALANCER_ARN", "")
233
+ EXISTING_LOAD_BALANCER_DNS = get_or_create_env_var(
234
+ "EXISTING_LOAD_BALANCER_ARN", "placeholder_load_balancer_dns.net"
235
+ )
236
 
237
  ## CLOUDFRONT
238
+ USE_CLOUDFRONT = get_or_create_env_var("USE_CLOUDFRONT", "True")
239
+ CLOUDFRONT_PREFIX_LIST_ID = get_or_create_env_var(
240
+ "CLOUDFRONT_PREFIX_LIST_ID", "pl-93a247fa"
241
+ )
242
+ CLOUDFRONT_GEO_RESTRICTION = get_or_create_env_var(
243
+ "CLOUDFRONT_GEO_RESTRICTION", ""
244
+ ) # A country that Cloudfront restricts access to. See here: https://docs.aws.amazon.com/AmazonCloudFront/latest/DeveloperGuide/georestrictions.html
245
+ CLOUDFRONT_DISTRIBUTION_NAME = get_or_create_env_var(
246
+ "CLOUDFRONT_DISTRIBUTION_NAME", f"{CDK_PREFIX}CfDist"
247
+ )
248
+ CLOUDFRONT_DOMAIN = get_or_create_env_var(
249
+ "CLOUDFRONT_DOMAIN", "cloudfront_placeholder.net"
250
+ )
251
 
252
 
253
  # Certificate for Application load balancer (optional, for HTTPS and logins through the ALB)
254
+ ACM_SSL_CERTIFICATE_ARN = get_or_create_env_var("ACM_SSL_CERTIFICATE_ARN", "")
255
+ SSL_CERTIFICATE_DOMAIN = get_or_create_env_var(
256
+ "SSL_CERTIFICATE_DOMAIN", ""
257
+ ) # e.g. example.com or www.example.com
258
 
259
  # This should be the CloudFront domain, the domain linked to your ACM certificate, or the DNS of your application load balancer in console afterwards
260
  if USE_CLOUDFRONT == "True":
261
+ COGNITO_REDIRECTION_URL = get_or_create_env_var(
262
+ "COGNITO_REDIRECTION_URL", "https://" + CLOUDFRONT_DOMAIN
263
+ )
264
  elif SSL_CERTIFICATE_DOMAIN:
265
+ COGNITO_REDIRECTION_URL = get_or_create_env_var(
266
+ "COGNITO_REDIRECTION_URL", "https://" + SSL_CERTIFICATE_DOMAIN
267
+ )
268
  else:
269
+ COGNITO_REDIRECTION_URL = get_or_create_env_var(
270
+ "COGNITO_REDIRECTION_URL", "https://" + EXISTING_LOAD_BALANCER_DNS
271
+ )
272
 
273
  # Custom headers e.g. if routing traffic through Cloudfront
274
+ CUSTOM_HEADER = get_or_create_env_var(
275
+ "CUSTOM_HEADER", ""
276
+ ) # Retrieving or setting CUSTOM_HEADER
277
+ CUSTOM_HEADER_VALUE = get_or_create_env_var(
278
+ "CUSTOM_HEADER_VALUE", ""
279
+ ) # Retrieving or setting CUSTOM_HEADER_VALUE
280
 
281
  # Firewall on top of load balancer
282
+ LOAD_BALANCER_WEB_ACL_NAME = get_or_create_env_var(
283
+ "LOAD_BALANCER_WEB_ACL_NAME", f"{CDK_PREFIX}alb-web-acl"
284
+ )
285
 
286
  # Firewall on top of CloudFront
287
+ WEB_ACL_NAME = get_or_create_env_var("WEB_ACL_NAME", f"{CDK_PREFIX}cloudfront-web-acl")
288
 
289
  ###
290
  # File I/O options
291
  ###
292
 
293
+ OUTPUT_FOLDER = get_or_create_env_var("GRADIO_OUTPUT_FOLDER", "output/") # 'output/'
294
+ INPUT_FOLDER = get_or_create_env_var("GRADIO_INPUT_FOLDER", "input/") # 'input/'
295
 
296
  # Allow for files to be saved in a temporary folder for increased security in some instances
297
+ if OUTPUT_FOLDER == "TEMP" or INPUT_FOLDER == "TEMP":
298
  # Create a temporary directory
299
  with tempfile.TemporaryDirectory() as temp_dir:
300
+ print(f"Temporary directory created at: {temp_dir}")
301
 
302
+ if OUTPUT_FOLDER == "TEMP":
303
+ OUTPUT_FOLDER = temp_dir + "/"
304
+ if INPUT_FOLDER == "TEMP":
305
+ INPUT_FOLDER = temp_dir + "/"
306
 
307
  ###
308
  # LOGGING OPTIONS
309
  ###
310
 
311
+ SAVE_LOGS_TO_CSV = get_or_create_env_var("SAVE_LOGS_TO_CSV", "True")
312
 
313
  ### DYNAMODB logs. Whether to save to DynamoDB, and the headers of the table
314
+ SAVE_LOGS_TO_DYNAMODB = get_or_create_env_var("SAVE_LOGS_TO_DYNAMODB", "True")
315
+ ACCESS_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var(
316
+ "ACCESS_LOG_DYNAMODB_TABLE_NAME", f"{CDK_PREFIX}dynamodb-access-logs".lower()
317
+ )
318
+ FEEDBACK_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var(
319
+ "FEEDBACK_LOG_DYNAMODB_TABLE_NAME", f"{CDK_PREFIX}dynamodb-feedback-logs".lower()
320
+ )
321
+ USAGE_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var(
322
+ "USAGE_LOG_DYNAMODB_TABLE_NAME", f"{CDK_PREFIX}dynamodb-usage-logs".lower()
323
+ )
324
 
325
  ###
326
  # REDACTION OPTIONS
327
  ###
328
 
329
  # Get some environment variables and Launch the Gradio app
330
+ COGNITO_AUTH = get_or_create_env_var("COGNITO_AUTH", "0")
331
 
332
+ GRADIO_SERVER_PORT = int(get_or_create_env_var("GRADIO_SERVER_PORT", "7860"))
333
 
334
  ###
335
  # WHOLE DOCUMENT API OPTIONS
336
  ###
337
 
338
+ DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS = get_or_create_env_var(
339
+ "DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS", "7"
340
+ ) # How many days into the past should whole document Textract jobs be displayed? After that, the data is not deleted from the Textract jobs csv, but it is just filtered out. Included to align with S3 buckets where the file outputs will be automatically deleted after X days.
cdk/cdk_functions.py CHANGED
@@ -1,34 +1,42 @@
1
- import boto3
2
- from botocore.exceptions import ClientError
3
  import json
4
  import os
 
 
 
5
  import pandas as pd
6
- import ipaddress
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  from constructs import Construct
8
  from dotenv import set_key
9
- from typing import List, Tuple, Optional, Dict, Any
10
- from aws_cdk import (
11
- App,
12
- CfnTag,
13
- aws_ec2 as ec2,
14
- aws_wafv2 as wafv2,
15
- aws_elasticloadbalancingv2 as elb,
16
- aws_elasticloadbalancingv2_actions as elb_act,
17
- aws_certificatemanager as acm, # You might need this if you were looking up a cert, but not strictly for ARN
18
- aws_cognito as cognito,
19
- aws_iam as iam,
20
- CfnOutput,
21
- Tags
22
- )
23
 
24
 
25
-
26
- from cdk_config import PUBLIC_SUBNETS_TO_USE, PRIVATE_SUBNETS_TO_USE, PUBLIC_SUBNET_CIDR_BLOCKS, PRIVATE_SUBNET_CIDR_BLOCKS, PUBLIC_SUBNET_AVAILABILITY_ZONES, PRIVATE_SUBNET_AVAILABILITY_ZONES, POLICY_FILE_LOCATIONS, NAT_GATEWAY_EIP_NAME, S3_LOG_CONFIG_BUCKET_NAME, S3_OUTPUT_BUCKET_NAME, ACCESS_LOG_DYNAMODB_TABLE_NAME, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, USAGE_LOG_DYNAMODB_TABLE_NAME, AWS_REGION
27
-
28
  # --- Function to load context from file ---
29
  def load_context_from_file(app: App, file_path: str):
30
  if os.path.exists(file_path):
31
- with open(file_path, 'r') as f:
32
  context_data = json.load(f)
33
  for key, value in context_data.items():
34
  app.node.set_context(key, value)
@@ -36,35 +44,47 @@ def load_context_from_file(app: App, file_path: str):
36
  else:
37
  print(f"Context file not found: {file_path}")
38
 
 
39
  # --- Helper to parse environment variables into lists ---
40
  def _get_env_list(env_var_name: str) -> List[str]:
41
  """Parses a comma-separated environment variable into a list of strings."""
42
- value = env_var_name[1:-1].strip().replace('\"', '').replace("\'","")
43
  if not value:
44
  return []
45
  # Split by comma and filter out any empty strings that might result from extra commas
46
- return [s.strip() for s in value.split(',') if s.strip()]
 
47
 
48
  # 1. Try to load CIDR/AZs from environment variables
49
- if PUBLIC_SUBNETS_TO_USE: PUBLIC_SUBNETS_TO_USE = _get_env_list(PUBLIC_SUBNETS_TO_USE)
50
- if PRIVATE_SUBNETS_TO_USE: PRIVATE_SUBNETS_TO_USE = _get_env_list(PRIVATE_SUBNETS_TO_USE)
 
 
 
 
 
 
 
 
 
 
 
 
 
51
 
52
- if PUBLIC_SUBNET_CIDR_BLOCKS: PUBLIC_SUBNET_CIDR_BLOCKS = _get_env_list("PUBLIC_SUBNET_CIDR_BLOCKS")
53
- if PUBLIC_SUBNET_AVAILABILITY_ZONES: PUBLIC_SUBNET_AVAILABILITY_ZONES = _get_env_list("PUBLIC_SUBNET_AVAILABILITY_ZONES")
54
- if PRIVATE_SUBNET_CIDR_BLOCKS: PRIVATE_SUBNET_CIDR_BLOCKS = _get_env_list("PRIVATE_SUBNET_CIDR_BLOCKS")
55
- if PRIVATE_SUBNET_AVAILABILITY_ZONES: PRIVATE_SUBNET_AVAILABILITY_ZONES = _get_env_list("PRIVATE_SUBNET_AVAILABILITY_ZONES")
56
 
57
- if POLICY_FILE_LOCATIONS: POLICY_FILE_LOCATIONS = _get_env_list(POLICY_FILE_LOCATIONS)
58
 
59
- def check_for_existing_role(role_name:str):
60
  try:
61
- iam = boto3.client('iam')
62
- #iam.get_role(RoleName=role_name)
63
-
64
  response = iam.get_role(RoleName=role_name)
65
- role = response['Role']['Arn']
66
 
67
- print("Response Role:", role)
68
 
69
  return True, role, ""
70
  except iam.exceptions.NoSuchEntityException:
@@ -72,12 +92,8 @@ def check_for_existing_role(role_name:str):
72
  except Exception as e:
73
  raise Exception("Getting information on IAM role failed due to:", e)
74
 
75
- import json
76
- from typing import List, Dict, Any, Union, Optional
77
- from aws_cdk import (
78
- aws_iam as iam,
79
- )
80
- from constructs import Construct
81
 
82
  # Assume POLICY_FILE_LOCATIONS is defined globally or passed as a default
83
  # For example:
@@ -93,11 +109,13 @@ def add_statement_to_policy(role: iam.IRole, policy_document: Dict[str, Any]):
93
  policy_document: A Python dictionary representing an IAM policy document.
94
  """
95
  # Ensure the loaded JSON is a valid policy document structure
96
- if 'Statement' not in policy_document or not isinstance(policy_document['Statement'], list):
97
- print(f"Warning: Policy document does not contain a 'Statement' list. Skipping.")
98
- return # Do not return role, just log and exit
 
 
99
 
100
- for statement_dict in policy_document['Statement']:
101
  try:
102
  # Create a CDK PolicyStatement from the dictionary
103
  cdk_policy_statement = iam.PolicyStatement.from_json(statement_dict)
@@ -106,13 +124,16 @@ def add_statement_to_policy(role: iam.IRole, policy_document: Dict[str, Any]):
106
  role.add_to_policy(cdk_policy_statement)
107
  print(f" - Added statement: {statement_dict.get('Sid', 'No Sid')}")
108
  except Exception as e:
109
- print(f"Warning: Could not process policy statement: {statement_dict}. Error: {e}")
 
 
 
110
 
111
  def add_custom_policies(
112
- scope: Construct, # Not strictly used here, but good practice if you expand to ManagedPolicies
113
  role: iam.IRole,
114
  policy_file_locations: Optional[List[str]] = None,
115
- custom_policy_text: Optional[str] = None
116
  ) -> iam.IRole:
117
  """
118
  Loads custom policies from JSON files or a string and attaches them to a CDK Role.
@@ -129,7 +150,7 @@ def add_custom_policies(
129
  if policy_file_locations is None:
130
  policy_file_locations = []
131
 
132
- current_source = "unknown source" # For error messages
133
 
134
  try:
135
  if policy_file_locations:
@@ -137,20 +158,26 @@ def add_custom_policies(
137
  for path in policy_file_locations:
138
  current_source = f"file: {path}"
139
  try:
140
- with open(path, 'r') as f:
141
  policy_document = json.load(f)
142
  print(f"Processing policy from {current_source}...")
143
  add_statement_to_policy(role, policy_document)
144
  except FileNotFoundError:
145
  print(f"Warning: Policy file not found at {path}. Skipping.")
146
  except json.JSONDecodeError as e:
147
- print(f"Warning: Invalid JSON in policy file {path}: {e}. Skipping.")
 
 
148
  except Exception as e:
149
- print(f"An unexpected error occurred processing policy from {path}: {e}. Skipping.")
 
 
150
 
151
  if custom_policy_text:
152
  current_source = "custom policy text string"
153
- print(f"Attempting to add policy from custom text to role {role.node.id}...")
 
 
154
  try:
155
  # *** FIX: Parse the JSON string into a Python dictionary ***
156
  policy_document = json.loads(custom_policy_text)
@@ -159,20 +186,28 @@ def add_custom_policies(
159
  except json.JSONDecodeError as e:
160
  print(f"Warning: Invalid JSON in custom_policy_text: {e}. Skipping.")
161
  except Exception as e:
162
- print(f"An unexpected error occurred processing policy from custom_policy_text: {e}. Skipping.")
 
 
163
 
164
  # You might want a final success message, but individual processing messages are also good.
165
  print(f"Finished processing custom policies for role {role.node.id}.")
166
 
167
  except Exception as e:
168
- print(f"An unhandled error occurred during policy addition for {current_source}: {e}")
 
 
169
 
170
  return role
171
 
 
172
  # Import the S3 Bucket class if you intend to return a CDK object later
173
  # from aws_cdk import aws_s3 as s3
174
 
175
- def check_s3_bucket_exists(bucket_name: str): # Return type hint depends on what you return
 
 
 
176
  """
177
  Checks if an S3 bucket with the given name exists and is accessible.
178
 
@@ -186,72 +221,89 @@ def check_s3_bucket_exists(bucket_name: str): # Return type hint depends on what
186
  or the bucket name for CDK lookups/creations.
187
  For this example, let's return the boolean and the name.
188
  """
189
- s3_client = boto3.client('s3')
190
  try:
191
  # Use head_bucket to check for existence and access
192
  s3_client.head_bucket(Bucket=bucket_name)
193
  print(f"Bucket '{bucket_name}' exists and is accessible.")
194
- return True, bucket_name # Return True and the bucket name
195
 
196
  except ClientError as e:
197
  # If a ClientError occurs, check the error code.
198
  # '404' means the bucket does not exist.
199
  # '403' means the bucket exists but you don't have permission.
200
- error_code = e.response['Error']['Code']
201
- if error_code == '404':
202
  print(f"Bucket '{bucket_name}' does not exist.")
203
  return False, None
204
- elif error_code == '403':
205
- # The bucket exists, but you can't access it.
206
- # Depending on your requirements, this might be treated as "exists"
207
- # or "not accessible for our purpose". For checking existence,
208
- # we'll say it exists here, but note the permission issue.
209
- # NOTE - when I tested this, it was returning 403 even for buckets that don't exist. So I will return False instead
210
- print(f"Bucket '{bucket_name}' returned 403, which indicates it may exist but is not accessible due to permissions, or that it doesn't exist. Returning False for existence just in case.")
211
- return False, bucket_name # It exists, even if not accessible
 
 
212
  else:
213
  # For other errors, it's better to raise the exception
214
  # to indicate something unexpected happened.
215
- print(f"An unexpected AWS ClientError occurred checking bucket '{bucket_name}': {e}")
 
 
216
  # Decide how to handle other errors - raising might be safer
217
- raise # Re-raise the original exception
218
  except Exception as e:
219
- print(f"An unexpected non-ClientError occurred checking bucket '{bucket_name}': {e}")
 
 
220
  # Decide how to handle other errors
221
- raise # Re-raise the original exception
 
222
 
223
  # Example usage in your check_resources.py:
224
  # exists, bucket_name_if_exists = check_s3_bucket_exists(log_bucket_name)
225
  # context_data[f"exists:{log_bucket_name}"] = exists
226
  # # You don't necessarily need to store the name in context if using from_bucket_name
227
 
 
228
  # Delete an S3 bucket
229
- def delete_s3_bucket(bucket_name:str):
230
- s3 = boto3.client('s3')
231
-
232
  try:
233
  # List and delete all objects
234
  response = s3.list_object_versions(Bucket=bucket_name)
235
- versions = response.get('Versions', []) + response.get('DeleteMarkers', [])
236
  for version in versions:
237
- s3.delete_object(Bucket=bucket_name, Key=version['Key'], VersionId=version['VersionId'])
238
-
 
 
239
  # Delete the bucket
240
  s3.delete_bucket(Bucket=bucket_name)
241
- return {'Status': 'SUCCESS'}
242
  except Exception as e:
243
- return {'Status': 'FAILED', 'Reason': str(e)}
 
244
 
245
  # Function to get subnet ID from subnet name
246
- def get_subnet_id(vpc:str, ec2_client:str, subnet_name:str):
247
- response = ec2_client.describe_subnets(Filters=[{'Name': 'vpc-id', 'Values': [vpc.vpc_id]}])
248
 
249
- for subnet in response['Subnets']:
250
- if subnet['Tags'] and any(tag['Key'] == 'Name' and tag['Value'] == subnet_name for tag in subnet['Tags']):
251
- return subnet['SubnetId']
252
-
253
  return None
254
 
 
255
  def check_ecr_repo_exists(repo_name: str) -> tuple[bool, dict]:
256
  """
257
  Checks if an ECR repository with the given name exists.
@@ -262,16 +314,16 @@ def check_ecr_repo_exists(repo_name: str) -> tuple[bool, dict]:
262
  Returns:
263
  True if the repository exists, False otherwise.
264
  """
265
- ecr_client = boto3.client('ecr')
266
  try:
267
  print("ecr repo_name to check:", repo_name)
268
  response = ecr_client.describe_repositories(repositoryNames=[repo_name])
269
  # If describe_repositories succeeds and returns a list of repositories,
270
  # and the list is not empty, the repository exists.
271
- return len(response['repositories']) > 0, response['repositories'][0]
272
  except ClientError as e:
273
  # Check for the specific error code indicating the repository doesn't exist
274
- if e.response['Error']['Code'] == 'RepositoryNotFoundException':
275
  return False, {}
276
  else:
277
  # Re-raise other exceptions to handle unexpected errors
@@ -279,8 +331,11 @@ def check_ecr_repo_exists(repo_name: str) -> tuple[bool, dict]:
279
  except Exception as e:
280
  print(f"An unexpected error occurred: {e}")
281
  return False, {}
282
-
283
- def check_codebuild_project_exists(project_name: str): # Adjust return type hint as needed
 
 
 
284
  """
285
  Checks if a CodeBuild project with the given name exists.
286
 
@@ -293,27 +348,35 @@ def check_codebuild_project_exists(project_name: str): # Adjust return type hint
293
  - The second element is the project ARN if found,
294
  None otherwise.
295
  """
296
- codebuild_client = boto3.client('codebuild')
297
  try:
298
  # Use batch_get_projects with a list containing the single project name
299
  response = codebuild_client.batch_get_projects(names=[project_name])
300
 
301
  # The response for batch_get_projects includes 'projects' (found)
302
  # and 'projectsNotFound' (not found).
303
- if response['projects']:
304
  # If the project is found in the 'projects' list
305
  print(f"CodeBuild project '{project_name}' found.")
306
- return True, response['projects'][0]['arn'] # Return True and the project details dict
307
- elif response['projectsNotFound'] and project_name in response['projectsNotFound']:
308
- # If the project name is explicitly in the 'projectsNotFound' list
309
- print(f"CodeBuild project '{project_name}' not found.")
310
- return False, None
 
 
 
 
 
 
311
  else:
312
  # This case is less expected for a single name lookup,
313
  # but could happen if there's an internal issue or the response
314
  # structure is slightly different than expected for an error.
315
  # It's safer to assume it wasn't found if not in 'projects'.
316
- print(f"CodeBuild project '{project_name}' not found (not in 'projects' list).")
 
 
317
  return False, None
318
 
319
  except ClientError as e:
@@ -321,50 +384,53 @@ def check_codebuild_project_exists(project_name: str): # Adjust return type hint
321
  # 'InvalidInputException' for a non-existent project name if the
322
  # name format is valid. It typically just lists it in projectsNotFound.
323
  # However, other ClientErrors are possible (e.g., permissions).
324
- print(f"An AWS ClientError occurred checking CodeBuild project '{project_name}': {e}")
 
 
325
  # Decide how to handle other ClientErrors - raising might be safer
326
- raise # Re-raise the original exception
327
  except Exception as e:
328
- print(f"An unexpected non-ClientError occurred checking CodeBuild project '{project_name}': {e}")
 
 
329
  # Decide how to handle other errors
330
- raise # Re-raise the original exception
 
331
 
332
  def get_vpc_id_by_name(vpc_name: str) -> Optional[str]:
333
  """
334
  Finds a VPC ID by its 'Name' tag.
335
  """
336
- ec2_client = boto3.client('ec2')
337
  try:
338
  response = ec2_client.describe_vpcs(
339
- Filters=[
340
- {'Name': 'tag:Name', 'Values': [vpc_name]}
341
- ]
342
  )
343
- if response and response['Vpcs']:
344
- vpc_id = response['Vpcs'][0]['VpcId']
345
  print(f"VPC '{vpc_name}' found with ID: {vpc_id}")
346
 
347
  # In get_vpc_id_by_name, after finding VPC ID:
348
 
349
  # Look for NAT Gateways in this VPC
350
- ec2_client = boto3.client('ec2')
351
  nat_gateways = []
352
  try:
353
  response = ec2_client.describe_nat_gateways(
354
  Filters=[
355
- {'Name': 'vpc-id', 'Values': [vpc_id]},
356
  # Optional: Add a tag filter if you consistently tag your NATs
357
  # {'Name': 'tag:Name', 'Values': [f"{prefix}-nat-gateway"]}
358
  ]
359
  )
360
- nat_gateways = response.get('NatGateways', [])
361
  except Exception as e:
362
- print(f"Warning: Could not describe NAT Gateways in VPC '{vpc_id}': {e}")
 
 
363
  # Decide how to handle this error - proceed or raise?
364
 
365
  # Decide how to identify the specific NAT Gateway you want to check for.
366
-
367
-
368
 
369
  return vpc_id, nat_gateways
370
  else:
@@ -374,6 +440,7 @@ def get_vpc_id_by_name(vpc_name: str) -> Optional[str]:
374
  print(f"An unexpected error occurred finding VPC '{vpc_name}': {e}")
375
  raise
376
 
 
377
  # --- Helper to fetch all existing subnets in a VPC once ---
378
  def _get_existing_subnets_in_vpc(vpc_id: str) -> Dict[str, Any]:
379
  """
@@ -381,21 +448,26 @@ def _get_existing_subnets_in_vpc(vpc_id: str) -> Dict[str, Any]:
381
  Returns a dictionary with 'by_name' (map of name to subnet data),
382
  'by_id' (map of id to subnet data), and 'cidr_networks' (list of ipaddress.IPv4Network).
383
  """
384
- ec2_client = boto3.client('ec2')
385
  existing_subnets_data = {
386
  "by_name": {}, # {subnet_name: {'id': 'subnet-id', 'cidr': 'x.x.x.x/x'}}
387
- "by_id": {}, # {subnet_id: {'name': 'subnet-name', 'cidr': 'x.x.x.x/x'}}
388
- "cidr_networks": [] # List of ipaddress.IPv4Network objects
389
  }
390
  try:
391
- response = ec2_client.describe_subnets(Filters=[{'Name': 'vpc-id', 'Values': [vpc_id]}])
392
- for s in response.get('Subnets', []):
393
- subnet_id = s['SubnetId']
394
- cidr_block = s.get('CidrBlock')
 
 
395
  # Extract 'Name' tag, which is crucial for lookup by name
396
- name_tag = next((tag['Value'] for tag in s.get('Tags', []) if tag['Key'] == 'Name'), None)
 
 
 
397
 
398
- subnet_info = {'id': subnet_id, 'cidr': cidr_block, 'name': name_tag}
399
 
400
  if name_tag:
401
  existing_subnets_data["by_name"][name_tag] = subnet_info
@@ -403,22 +475,35 @@ def _get_existing_subnets_in_vpc(vpc_id: str) -> Dict[str, Any]:
403
 
404
  if cidr_block:
405
  try:
406
- existing_subnets_data["cidr_networks"].append(ipaddress.ip_network(cidr_block, strict=False))
 
 
407
  except ValueError:
408
- print(f"Warning: Existing subnet {subnet_id} has an invalid CIDR: {cidr_block}. Skipping for overlap check.")
 
 
409
 
410
- print(f"Fetched {len(response.get('Subnets', []))} existing subnets from VPC '{vpc_id}'.")
 
 
411
  except Exception as e:
412
- print(f"Error describing existing subnets in VPC '{vpc_id}': {e}. Cannot perform full validation.")
413
- raise # Re-raise if this essential step fails
 
 
414
 
415
  return existing_subnets_data
416
 
 
417
  # --- Modified validate_subnet_creation_parameters to take pre-fetched data ---
418
  def validate_subnet_creation_parameters(
419
  vpc_id: str,
420
- proposed_subnets_data: List[Dict[str, str]], # e.g., [{'name': 'my-public-subnet', 'cidr': '10.0.0.0/24', 'az': 'us-east-1a'}]
421
- existing_aws_subnets_data: Dict[str, Any] # Pre-fetched data from _get_existing_subnets_in_vpc
 
 
 
 
422
  ) -> None:
423
  """
424
  Validates proposed subnet names and CIDR blocks against existing AWS subnets
@@ -440,9 +525,11 @@ def validate_subnet_creation_parameters(
440
  print("No proposed subnet data provided for validation. Skipping.")
441
  return
442
 
443
- print(f"--- Starting pre-synth validation for VPC '{vpc_id}' with proposed subnets ---")
 
 
444
 
445
- print("Existing subnet data:", pd.DataFrame(existing_aws_subnets_data['by_name']))
446
 
447
  existing_aws_subnet_names = set(existing_aws_subnets_data["by_name"].keys())
448
  existing_aws_cidr_networks = existing_aws_subnets_data["cidr_networks"]
@@ -452,27 +539,35 @@ def validate_subnet_creation_parameters(
452
  proposed_cidr_networks_seen: List[ipaddress.IPv4Network] = []
453
 
454
  for i, proposed_subnet in enumerate(proposed_subnets_data):
455
- subnet_name = proposed_subnet.get('name')
456
- cidr_block_str = proposed_subnet.get('cidr')
457
- availability_zone = proposed_subnet.get('az')
458
 
459
  if not all([subnet_name, cidr_block_str, availability_zone]):
460
- raise ValueError(f"Proposed subnet at index {i} is incomplete. Requires 'name', 'cidr', and 'az'.")
 
 
461
 
462
  # 1. Check for duplicate names within the proposed batch
463
  if subnet_name in proposed_names_seen:
464
- raise ValueError(f"Proposed subnet name '{subnet_name}' is duplicated within the input list.")
 
 
465
  proposed_names_seen.add(subnet_name)
466
 
467
  # 2. Check for duplicate names against existing AWS subnets
468
  if subnet_name in existing_aws_subnet_names:
469
- print(f"Proposed subnet name '{subnet_name}' already exists in VPC '{vpc_id}'.")
 
 
470
 
471
  # Parse proposed CIDR
472
  try:
473
  proposed_net = ipaddress.ip_network(cidr_block_str, strict=False)
474
  except ValueError as e:
475
- raise ValueError(f"Invalid CIDR format '{cidr_block_str}' for proposed subnet '{subnet_name}': {e}")
 
 
476
 
477
  # 3. Check for overlapping CIDRs within the proposed batch
478
  for existing_proposed_net in proposed_cidr_networks_seen:
@@ -494,14 +589,18 @@ def validate_subnet_creation_parameters(
494
 
495
  # If all checks pass for this subnet, add its network to the list for subsequent checks
496
  proposed_cidr_networks_seen.append(proposed_net)
497
- print(f"Validation successful for proposed subnet '{subnet_name}' with CIDR '{cidr_block_str}'.")
 
 
 
 
 
 
498
 
499
- print(f"--- All proposed subnets passed pre-synth validation checks for VPC '{vpc_id}'. ---")
500
 
501
  # --- Modified check_subnet_exists_by_name (Uses pre-fetched data) ---
502
  def check_subnet_exists_by_name(
503
- subnet_name: str,
504
- existing_aws_subnets_data: Dict[str, Any]
505
  ) -> Tuple[bool, Optional[str]]:
506
  """
507
  Checks if a subnet with the given name exists within the pre-fetched data.
@@ -519,51 +618,63 @@ def check_subnet_exists_by_name(
519
  subnet_info = existing_aws_subnets_data["by_name"].get(subnet_name)
520
  if subnet_info:
521
  print(f"Subnet '{subnet_name}' found with ID: {subnet_info['id']}")
522
- return True, subnet_info['id']
523
  else:
524
  print(f"Subnet '{subnet_name}' not found.")
525
  return False, None
526
 
 
527
  def create_nat_gateway(
528
  scope: Construct,
529
- public_subnet_for_nat: ec2.ISubnet, # Expects a proper ISubnet
530
  nat_gateway_name: str,
531
- nat_gateway_id_context_key: str
532
  ) -> str:
533
  """
534
  Creates a single NAT Gateway in the specified public subnet.
535
  It does not handle lookup from context; the calling stack should do that.
536
  Returns the CloudFormation Ref of the NAT Gateway ID.
537
  """
538
- print(f"Defining a new NAT Gateway '{nat_gateway_name}' in subnet '{public_subnet_for_nat.subnet_id}'.")
 
 
539
 
540
  # Create an Elastic IP for the NAT Gateway
541
- eip = ec2.CfnEIP(scope, NAT_GATEWAY_EIP_NAME,
542
- tags=[CfnTag(key="Name", value=NAT_GATEWAY_EIP_NAME)]
 
 
543
  )
544
 
545
  # Create the NAT Gateway
546
- nat_gateway_logical_id = nat_gateway_name.replace('-', '') + "NatGateway"
547
- nat_gateway = ec2.CfnNatGateway(scope, nat_gateway_logical_id,
 
 
548
  subnet_id=public_subnet_for_nat.subnet_id, # Associate with the public subnet
549
- allocation_id=eip.attr_allocation_id, # Associate with the EIP
550
- tags=[CfnTag(key="Name", value=nat_gateway_name)]
551
  )
552
  # The NAT GW depends on the EIP. The dependency on the subnet is implicit via subnet_id.
553
  nat_gateway.add_dependency(eip)
554
 
555
  # *** CRUCIAL: Use CfnOutput to export the ID after deployment ***
556
  # This is how you will get the ID to put into cdk.context.json
557
- CfnOutput(scope, "SingleNatGatewayIdOutput",
 
 
558
  value=nat_gateway.ref,
559
  description=f"Physical ID of the Single NAT Gateway. Add this to cdk.context.json under the key '{nat_gateway_id_context_key}'.",
560
- export_name=f"{scope.stack_name}-NatGatewayId" # Make export name unique
561
  )
562
 
563
- print(f"CDK: Defined new NAT Gateway '{nat_gateway.ref}'. Its physical ID will be available in the stack outputs after deployment.")
 
 
564
  # Return the tokenised reference for use within this synthesis
565
  return nat_gateway.ref
566
 
 
567
  def create_subnets(
568
  scope: Construct,
569
  vpc: ec2.IVpc,
@@ -573,7 +684,7 @@ def create_subnets(
573
  availability_zones: List[str],
574
  is_public: bool,
575
  internet_gateway_id: Optional[str] = None,
576
- single_nat_gateway_id: Optional[str] = None
577
  ) -> Tuple[List[ec2.CfnSubnet], List[ec2.CfnRouteTable]]:
578
  """
579
  Creates subnets using L2 constructs but returns the underlying L1 Cfn objects
@@ -581,11 +692,15 @@ def create_subnets(
581
  """
582
  # --- Validations remain the same ---
583
  if not (len(subnet_names) == len(cidr_blocks) == len(availability_zones) > 0):
584
- raise ValueError("Subnet names, CIDR blocks, and Availability Zones lists must be non-empty and match in length.")
 
 
585
  if is_public and not internet_gateway_id:
586
  raise ValueError("internet_gateway_id must be provided for public subnets.")
587
  if not is_public and not single_nat_gateway_id:
588
- raise ValueError("single_nat_gateway_id must be provided for private subnets when using a single NAT Gateway.")
 
 
589
 
590
  # --- We will populate these lists with the L1 objects to return ---
591
  created_subnets: List[ec2.CfnSubnet] = []
@@ -603,16 +718,16 @@ def create_subnets(
603
  vpc_id=vpc.vpc_id,
604
  cidr_block=cidr_blocks[i],
605
  availability_zone=availability_zones[i],
606
- map_public_ip_on_launch=is_public
607
  )
608
  Tags.of(subnet).add("Name", subnet_name)
609
  Tags.of(subnet).add("Type", subnet_type_tag)
610
-
611
  if is_public:
612
  # The subnet's route_table is automatically created by the L2 Subnet construct
613
  try:
614
  subnet.add_route(
615
- "DefaultInternetRoute", # A logical ID for the CfnRoute resource
616
  router_id=internet_gateway_id,
617
  router_type=ec2.RouterType.GATEWAY,
618
  # destination_cidr_block="0.0.0.0/0" is the default for this method
@@ -624,22 +739,25 @@ def create_subnets(
624
  try:
625
  # Using .add_route() for private subnets as well for consistency
626
  subnet.add_route(
627
- "DefaultNatRoute", # A logical ID for the CfnRoute resource
628
  router_id=single_nat_gateway_id,
629
  router_type=ec2.RouterType.NAT_GATEWAY,
630
  )
631
  except Exception as e:
632
  print("Could not create NAT gateway route for public subnet due to:", e)
633
- print(f"CDK: Defined private L2 subnet '{subnet_name}' and added NAT GW route.")
 
 
634
 
635
  route_table = subnet.route_table
636
-
637
  created_subnets.append(subnet)
638
  created_route_tables.append(route_table)
639
 
640
  return created_subnets, created_route_tables
641
-
642
- def ingress_rule_exists(security_group:str, peer:str, port:str):
 
643
  for rule in security_group.connections.security_groups:
644
  if port:
645
  if rule.peer == peer and rule.connection == port:
@@ -649,26 +767,32 @@ def ingress_rule_exists(security_group:str, peer:str, port:str):
649
  return True
650
  return False
651
 
652
- def check_for_existing_user_pool(user_pool_name:str):
 
653
  cognito_client = boto3.client("cognito-idp")
654
- list_pools_response = cognito_client.list_user_pools(MaxResults=60) # MaxResults up to 60
655
-
 
 
656
  # ListUserPools might require pagination if you have more than 60 pools
657
  # This simple example doesn't handle pagination, which could miss your pool
658
 
659
  existing_user_pool_id = ""
660
 
661
- for pool in list_pools_response.get('UserPools', []):
662
- if pool.get('Name') == user_pool_name:
663
- existing_user_pool_id = pool['Id']
664
- print(f"Found existing user pool by name '{user_pool_name}' with ID: {existing_user_pool_id}")
665
- break # Found the one we're looking for
 
 
666
 
667
  if existing_user_pool_id:
668
  return True, existing_user_pool_id, pool
669
  else:
670
  return False, "", ""
671
-
 
672
  def check_for_existing_user_pool_client(user_pool_id: str, user_pool_client_name: str):
673
  """
674
  Checks if a Cognito User Pool Client with the given name exists in the specified User Pool.
@@ -683,39 +807,39 @@ def check_for_existing_user_pool_client(user_pool_id: str, user_pool_client_name
683
  - False, "", {} otherwise.
684
  """
685
  cognito_client = boto3.client("cognito-idp")
686
- next_token = 'string'
687
-
688
 
689
  while True:
690
  try:
691
  response = cognito_client.list_user_pool_clients(
692
- UserPoolId=user_pool_id,
693
- MaxResults=60,
694
- NextToken=next_token
695
  )
696
  except cognito_client.exceptions.ResourceNotFoundException:
697
  print(f"Error: User pool with ID '{user_pool_id}' not found.")
698
  return False, "", {}
699
-
700
  except cognito_client.exceptions.InvalidParameterException:
701
  print(f"Error: No app clients for '{user_pool_id}' found.")
702
  return False, "", {}
703
-
704
  except Exception as e:
705
  print("Could not check User Pool clients due to:", e)
706
 
707
- for client in response.get('UserPoolClients', []):
708
- if client.get('ClientName') == user_pool_client_name:
709
- print(f"Found existing user pool client '{user_pool_client_name}' with ID: {client['ClientId']}")
710
- return True, client['ClientId'], client
 
 
711
 
712
- next_token = response.get('NextToken')
713
  if not next_token:
714
  break
715
 
716
  return False, "", {}
717
 
718
- def check_for_secret(secret_name: str, secret_value: dict=""):
 
719
  """
720
  Checks if a Secrets Manager secret with the given name exists.
721
  If it doesn't exist, it creates the secret.
@@ -741,8 +865,11 @@ def check_for_secret(secret_name: str, secret_value: dict=""):
741
  # Handle other potential exceptions during the get operation
742
  print(f"Error checking for secret '{secret_name}': {e}")
743
  return False, {}
744
-
745
- def check_alb_exists(load_balancer_name: str, region_name: str = None) -> tuple[bool, dict]:
 
 
 
746
  """
747
  Checks if an Application Load Balancer (ALB) with the given name exists.
748
 
@@ -759,18 +886,21 @@ def check_alb_exists(load_balancer_name: str, region_name: str = None) -> tuple[
759
  the LoadBalancers list from the describe_load_balancers response.
760
  """
761
  if region_name:
762
- elbv2_client = boto3.client('elbv2', region_name=region_name)
763
  else:
764
- elbv2_client = boto3.client('elbv2')
765
  try:
766
  response = elbv2_client.describe_load_balancers(Names=[load_balancer_name])
767
- if response['LoadBalancers']:
768
- return True, response['LoadBalancers'][0] # Return True and the first ALB object
 
 
 
769
  else:
770
  return False, {}
771
  except ClientError as e:
772
  # If the error indicates the ALB doesn't exist, return False
773
- if e.response['Error']['Code'] == 'LoadBalancerNotFound':
774
  return False, {}
775
  else:
776
  # Re-raise other exceptions
@@ -778,8 +908,11 @@ def check_alb_exists(load_balancer_name: str, region_name: str = None) -> tuple[
778
  except Exception as e:
779
  print(f"An unexpected error occurred: {e}")
780
  return False, {}
781
-
782
- def check_fargate_task_definition_exists(task_definition_name: str, region_name: str = None) -> tuple[bool, dict]:
 
 
 
783
  """
784
  Checks if a Fargate task definition with the given name exists.
785
 
@@ -796,17 +929,23 @@ def check_fargate_task_definition_exists(task_definition_name: str, region_name:
796
  taskDefinitions list from the describe_task_definition response.
797
  """
798
  if region_name:
799
- ecs_client = boto3.client('ecs', region_name=region_name)
800
  else:
801
- ecs_client = boto3.client('ecs')
802
  try:
803
- response = ecs_client.describe_task_definition(taskDefinition=task_definition_name)
 
 
804
  # If describe_task_definition succeeds, it returns the task definition.
805
  # We can directly return True and the task definition.
806
- return True, response['taskDefinition']
807
  except ClientError as e:
808
  # Check for the error code indicating the task definition doesn't exist.
809
- if e.response['Error']['Code'] == 'ClientException' and 'Task definition' in e.response['Message'] and 'does not exist' in e.response['Message']:
 
 
 
 
810
  return False, {}
811
  else:
812
  # Re-raise other exceptions.
@@ -814,8 +953,11 @@ def check_fargate_task_definition_exists(task_definition_name: str, region_name:
814
  except Exception as e:
815
  print(f"An unexpected error occurred: {e}")
816
  return False, {}
817
-
818
- def check_ecs_service_exists(cluster_name: str, service_name: str, region_name: str = None) -> tuple[bool, dict]:
 
 
 
819
  """
820
  Checks if an ECS service with the given name exists in the specified cluster.
821
 
@@ -832,20 +974,25 @@ def check_ecs_service_exists(cluster_name: str, service_name: str, region_name:
832
  None otherwise.
833
  """
834
  if region_name:
835
- ecs_client = boto3.client('ecs', region_name=region_name)
836
  else:
837
- ecs_client = boto3.client('ecs')
838
  try:
839
- response = ecs_client.describe_services(cluster=cluster_name, services=[service_name])
840
- if response['services']:
841
- return True, response['services'][0] # Return True and the first service object
 
 
 
 
 
842
  else:
843
  return False, {}
844
  except ClientError as e:
845
  # Check for the error code indicating the service doesn't exist.
846
- if e.response['Error']['Code'] == 'ClusterNotFoundException':
847
  return False, {}
848
- elif e.response['Error']['Code'] == 'ServiceNotFoundException':
849
  return False, {}
850
  else:
851
  # Re-raise other exceptions.
@@ -853,8 +1000,11 @@ def check_ecs_service_exists(cluster_name: str, service_name: str, region_name:
853
  except Exception as e:
854
  print(f"An unexpected error occurred: {e}")
855
  return False, {}
856
-
857
- def check_cloudfront_distribution_exists(distribution_name: str, region_name: str = None) -> tuple[bool, dict | None]:
 
 
 
858
  """
859
  Checks if a CloudFront distribution with the given name exists.
860
 
@@ -873,22 +1023,25 @@ def check_cloudfront_distribution_exists(distribution_name: str, region_name: st
873
  DistributionList from the ListDistributions response.
874
  """
875
  if region_name:
876
- cf_client = boto3.client('cloudfront', region_name=region_name)
877
  else:
878
- cf_client = boto3.client('cloudfront')
879
  try:
880
  response = cf_client.list_distributions()
881
- if 'Items' in response['DistributionList']:
882
- for distribution in response['DistributionList']['Items']:
883
  # CloudFront doesn't directly filter by name, so we have to iterate.
884
- if distribution['AliasSet']['Items'] and distribution['AliasSet']['Items'][0] == distribution_name:
 
 
 
885
  return True, distribution
886
  return False, None
887
  else:
888
  return False, None
889
  except ClientError as e:
890
  # If the error indicates the Distribution doesn't exist, return False
891
- if e.response['Error']['Code'] == 'NoSuchDistribution':
892
  return False, None
893
  else:
894
  # Re-raise other exceptions
@@ -897,19 +1050,22 @@ def check_cloudfront_distribution_exists(distribution_name: str, region_name: st
897
  print(f"An unexpected error occurred: {e}")
898
  return False, None
899
 
900
- def create_web_acl_with_common_rules(scope:Construct, web_acl_name: str, waf_scope:str="CLOUDFRONT"):
901
- '''
 
 
 
902
  Use CDK to create a web ACL based on an AWS common rule set with overrides.
903
  This function now expects a 'scope' argument, typically 'self' from your stack,
904
  as CfnWebACL requires a construct scope.
905
- '''
906
 
907
  # Create full list of rules
908
  rules = []
909
  aws_ruleset_names = [
910
  "AWSManagedRulesCommonRuleSet",
911
  "AWSManagedRulesKnownBadInputsRuleSet",
912
- "AWSManagedRulesAmazonIpReputationList"
913
  ]
914
 
915
  # Use a separate counter to assign unique priorities sequentially
@@ -917,7 +1073,7 @@ def create_web_acl_with_common_rules(scope:Construct, web_acl_name: str, waf_sco
917
 
918
  for aws_rule_name in aws_ruleset_names:
919
  current_rule_action_overrides = None
920
-
921
  # All managed rule groups need an override_action.
922
  # 'none' means use the managed rule group's default action.
923
  current_override_action = wafv2.CfnWebACL.OverrideActionProperty(none={})
@@ -929,9 +1085,7 @@ def create_web_acl_with_common_rules(scope:Construct, web_acl_name: str, waf_sco
929
  current_rule_action_overrides = [
930
  wafv2.CfnWebACL.RuleActionOverrideProperty(
931
  name="SizeRestrictions_BODY",
932
- action_to_use=wafv2.CfnWebACL.RuleActionProperty(
933
- allow={}
934
- )
935
  )
936
  ]
937
  # No need to set current_override_action here, it's already set above.
@@ -945,39 +1099,38 @@ def create_web_acl_with_common_rules(scope:Construct, web_acl_name: str, waf_sco
945
  managed_rule_group_statement=wafv2.CfnWebACL.ManagedRuleGroupStatementProperty(
946
  vendor_name="AWS",
947
  name=aws_rule_name,
948
- rule_action_overrides=current_rule_action_overrides
949
  )
950
  ),
951
  visibility_config=wafv2.CfnWebACL.VisibilityConfigProperty(
952
  cloud_watch_metrics_enabled=True,
953
  metric_name=aws_rule_name,
954
- sampled_requests_enabled=True
955
  ),
956
- override_action=current_override_action # THIS IS THE CRUCIAL PART FOR ALL MANAGED RULES
957
  )
958
 
959
  rules.append(rule_property)
960
 
961
  # Add the rate limit rule
962
- rate_limit_priority = priority_counter # Use the next available priority
963
- rules.append(wafv2.CfnWebACL.RuleProperty(
964
- name="RateLimitRule",
965
- priority=rate_limit_priority,
966
- statement=wafv2.CfnWebACL.StatementProperty(
967
- rate_based_statement=wafv2.CfnWebACL.RateBasedStatementProperty(
968
- limit=1000,
969
- aggregate_key_type="IP"
970
- )
971
- ),
972
- visibility_config=wafv2.CfnWebACL.VisibilityConfigProperty(
973
- cloud_watch_metrics_enabled=True,
974
- metric_name="RateLimitRule",
975
- sampled_requests_enabled=True
976
- ),
977
- action=wafv2.CfnWebACL.RuleActionProperty(
978
- block={}
979
  )
980
- ))
981
 
982
  web_acl = wafv2.CfnWebACL(
983
  scope,
@@ -988,16 +1141,19 @@ def create_web_acl_with_common_rules(scope:Construct, web_acl_name: str, waf_sco
988
  visibility_config=wafv2.CfnWebACL.VisibilityConfigProperty(
989
  cloud_watch_metrics_enabled=True,
990
  metric_name="webACL",
991
- sampled_requests_enabled=True
992
  ),
993
- rules=rules
994
  )
995
 
996
  CfnOutput(scope, "WebACLArn", value=web_acl.attr_arn)
997
 
998
  return web_acl
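 # Example usage from a CloudFront-facing stack (construct ID is hypothetical); the resulting
 # web_acl.attr_arn can then be passed to the distribution's web ACL setting:
 # web_acl = create_web_acl_with_common_rules(self, "redaction-app-web-acl", waf_scope="CLOUDFRONT")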
999
-
1000
- def check_web_acl_exists(web_acl_name: str, scope: str, region_name: str = None) -> tuple[bool, dict]:
 
 
 
1001
  """
1002
  Checks if a Web ACL with the given name and scope exists.
1003
 
@@ -1014,33 +1170,35 @@ def check_web_acl_exists(web_acl_name: str, scope: str, region_name: str = None)
1014
  - The second element is the Web ACL object (dictionary) if found,
1015
  None otherwise.
1016
  """
1017
- if scope not in ['CLOUDFRONT', 'REGIONAL']:
1018
  raise ValueError("Scope must be either 'CLOUDFRONT' or 'REGIONAL'")
1019
 
1020
- if scope == 'REGIONAL' and not region_name:
1021
  raise ValueError("Region name is required for REGIONAL scope")
1022
 
1023
- if scope == 'CLOUDFRONT':
1024
- region_name = 'us-east-1' # CloudFront scope requires us-east-1
1025
-
1026
  if region_name:
1027
- waf_client = boto3.client('wafv2', region_name=region_name)
1028
  else:
1029
- waf_client = boto3.client('wafv2')
1030
  try:
1031
  response = waf_client.list_web_acls(Scope=scope)
1032
- if 'WebACLs' in response:
1033
- for web_acl in response['WebACLs']:
1034
- if web_acl['Name'] == web_acl_name:
1035
  # Describe the Web ACL to get the full object.
1036
- describe_response = waf_client.describe_web_acl(Name=web_acl_name, Scope=scope)
1037
- return True, describe_response['WebACL']
 
 
1038
  return False, {}
1039
  else:
1040
  return False, {}
1041
  except ClientError as e:
1042
  # Check for the error code indicating the web ACL doesn't exist.
1043
- if e.response['Error']['Code'] == 'ResourceNotFoundException':
1044
  return False, {}
1045
  else:
1046
  # Re-raise other exceptions.
@@ -1048,23 +1206,30 @@ def check_web_acl_exists(web_acl_name: str, scope: str, region_name: str = None)
1048
  except Exception as e:
1049
  print(f"An unexpected error occurred: {e}")
1050
  return False, {}
1051
-
 
1052
  def add_alb_https_listener_with_cert(
1053
  scope: Construct,
1054
- logical_id: str, # A unique ID for this listener construct
1055
  alb: elb.ApplicationLoadBalancer,
1056
- acm_certificate_arn: Optional[str], # Optional: If None, no HTTPS listener will be created
1057
- default_target_group: elb.ITargetGroup, # Mandatory: The target group to forward traffic to
 
 
1058
  listener_port_https: int = 443,
1059
- listener_open_to_internet: bool = False, # Be cautious with True, ensure ALB security group restricts access
1060
  # --- Cognito Authentication Parameters ---
1061
  enable_cognito_auth: bool = False,
1062
  cognito_user_pool: Optional[cognito.IUserPool] = None,
1063
  cognito_user_pool_client: Optional[cognito.IUserPoolClient] = None,
1064
- cognito_user_pool_domain: Optional[str] = None, # E.g., "my-app-domain" for "my-app-domain.auth.region.amazoncognito.com"
1065
- cognito_auth_scope: Optional[str] = "openid profile email", # Default recommended scope
 
 
 
 
1066
  cognito_auth_on_unauthenticated_request: elb.UnauthenticatedAction = elb.UnauthenticatedAction.AUTHENTICATE,
1067
- stickiness_cookie_duration=None
1068
  # --- End Cognito Parameters ---
1069
  ) -> Optional[elb.ApplicationListener]:
1070
  """
@@ -1098,25 +1263,33 @@ def add_alb_https_listener_with_cert(
1098
  https_listener = None
1099
  if acm_certificate_arn:
1100
  certificates_list = [elb.ListenerCertificate.from_arn(acm_certificate_arn)]
1101
- print(f"Attempting to add ALB HTTPS listener on port {listener_port_https} with ACM certificate: {acm_certificate_arn}")
 
 
1102
 
1103
  # Determine the default action based on whether Cognito auth is enabled
1104
  default_action = None
1105
- if enable_cognito_auth == True:
1106
- if not all([cognito_user_pool, cognito_user_pool_client, cognito_user_pool_domain]):
 
 
1107
  raise ValueError(
1108
  "Cognito User Pool, Client, and Domain must be provided if enable_cognito_auth is True."
1109
  )
1110
- print(f"Enabling Cognito authentication with User Pool: {cognito_user_pool.user_pool_id}")
 
 
1111
 
1112
  default_action = elb_act.AuthenticateCognitoAction(
1113
- next=elb.ListenerAction.forward([default_target_group]), # After successful auth, forward to TG
 
 
1114
  user_pool=cognito_user_pool,
1115
  user_pool_client=cognito_user_pool_client,
1116
- user_pool_domain=cognito_user_pool_domain,
1117
  scope=cognito_auth_scope,
1118
- on_unauthenticated_request=cognito_auth_on_unauthenticated_request,
1119
- session_timeout=stickiness_cookie_duration
1120
  # Additional options you might want to configure:
1121
  # session_cookie_name="AWSELBCookies"
1122
  )
@@ -1130,7 +1303,7 @@ def add_alb_https_listener_with_cert(
1130
  port=listener_port_https,
1131
  open=listener_open_to_internet,
1132
  certificates=certificates_list,
1133
- default_action=default_action # Use the determined default action
1134
  )
1135
  print(f"ALB HTTPS listener on port {listener_port_https} defined.")
1136
  else:
@@ -1139,8 +1312,8 @@ def add_alb_https_listener_with_cert(
1139
  return https_listener
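 # Example usage (names, ARNs and constructs are hypothetical):
 # https_listener = add_alb_https_listener_with_cert(
 #     self, "AlbHttpsListener", alb,
 #     acm_certificate_arn=cert_arn,
 #     default_target_group=target_group,
 #     enable_cognito_auth=True,
 #     cognito_user_pool=user_pool,
 #     cognito_user_pool_client=user_pool_client,
 #     cognito_user_pool_domain="my-app-domain",
 # )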
1140
 
1141
 
1142
- def ensure_folder_exists(output_folder:str):
1143
- """Checks if the specified folder exists, creates it if not."""
1144
 
1145
  if not os.path.exists(output_folder):
1146
  # Create the folder if it doesn't exist
@@ -1149,62 +1322,70 @@ def ensure_folder_exists(output_folder:str):
1149
  else:
1150
  print(f"The {output_folder} folder already exists.")
1151
 
1152
- def create_basic_config_env(out_dir:str="config", S3_LOG_CONFIG_BUCKET_NAME=S3_LOG_CONFIG_BUCKET_NAME, S3_OUTPUT_BUCKET_NAME=S3_OUTPUT_BUCKET_NAME, ACCESS_LOG_DYNAMODB_TABLE_NAME=ACCESS_LOG_DYNAMODB_TABLE_NAME, FEEDBACK_LOG_DYNAMODB_TABLE_NAME=FEEDBACK_LOG_DYNAMODB_TABLE_NAME, USAGE_LOG_DYNAMODB_TABLE_NAME=USAGE_LOG_DYNAMODB_TABLE_NAME):
1153
- '''
 
 
 
 
 
 
 
 
1154
  Create a basic config.env file for the user to use with their newly deployed redaction app.
1155
- '''
1156
  variables = {
1157
- 'COGNITO_AUTH':'1',
1158
- 'RUN_AWS_FUNCTIONS':'1',
1159
- 'DISPLAY_FILE_NAMES_IN_LOGS':'False',
1160
- 'SESSION_OUTPUT_FOLDER':'True',
1161
- 'SAVE_LOGS_TO_DYNAMODB':'True',
1162
- 'SHOW_COSTS':'True',
1163
- 'SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS':'True',
1164
- 'LOAD_PREVIOUS_TEXTRACT_JOBS_S3':'True',
1165
- 'DOCUMENT_REDACTION_BUCKET':S3_LOG_CONFIG_BUCKET_NAME,
1166
- 'TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET':S3_OUTPUT_BUCKET_NAME,
1167
- 'ACCESS_LOG_DYNAMODB_TABLE_NAME':ACCESS_LOG_DYNAMODB_TABLE_NAME,
1168
- 'FEEDBACK_LOG_DYNAMODB_TABLE_NAME':FEEDBACK_LOG_DYNAMODB_TABLE_NAME,
1169
- 'USAGE_LOG_DYNAMODB_TABLE_NAME':USAGE_LOG_DYNAMODB_TABLE_NAME,
1170
- 'DISPLAY_FILE_NAMES_IN_LOGS':'False'
1171
  }
1172
 
1173
  # Write variables to .env file
1174
  ensure_folder_exists(out_dir + "/")
1175
- env_file_path = os.path.abspath(os.path.join(out_dir, 'config.env'))
1176
 
1177
  # It's good practice to ensure the file exists before calling set_key repeatedly.
1178
  # set_key will create it, but for a loop, it might be cleaner to ensure it's empty/exists once.
1179
  if not os.path.exists(env_file_path):
1180
- with open(env_file_path, 'w') as f:
1181
- pass # Create empty file
1182
 
1183
  for key, value in variables.items():
1184
  set_key(env_file_path, key, str(value), quote_mode="never")
1185
 
1186
  return variables
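 # Example usage: writes config/config.env using the bucket and DynamoDB table names from cdk_config:
 # create_basic_config_env(out_dir="config")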
1187
 
1188
- def start_codebuild_build(PROJECT_NAME:str, AWS_REGION:str = AWS_REGION):
1189
- '''
 
1190
  Start an existing Codebuild project build
1191
- '''
1192
 
1193
  # --- Initialize CodeBuild client ---
1194
- client = boto3.client('codebuild', region_name=AWS_REGION)
1195
 
1196
  try:
1197
  print(f"Attempting to start build for project: {PROJECT_NAME}")
1198
 
1199
- response = client.start_build(
1200
- projectName=PROJECT_NAME
1201
- )
1202
 
1203
- build_id = response['build']['id']
1204
  print(f"Successfully started build with ID: {build_id}")
1205
  print(f"Build ARN: {response['build']['arn']}")
1206
- print(f"Build URL (approximate - construct based on region and ID):")
1207
- print(f"https://{AWS_REGION}.console.aws.amazon.com/codesuite/codebuild/projects/{PROJECT_NAME}/build/{build_id.split(':')[-1]}/detail")
 
 
1208
 
1209
  # You can inspect the full response if needed
1210
  # print("\nFull response:")
@@ -1216,7 +1397,13 @@ def start_codebuild_build(PROJECT_NAME:str, AWS_REGION:str = AWS_REGION):
1216
  except Exception as e:
1217
  print(f"An unexpected error occurred: {e}")
1218
 
1219
- def upload_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str, RUN_AWS_FUNCTIONS:str = "1"):
 
 
 
 
 
 
1220
  """
1221
  Uploads a file from local machine to Amazon S3.
1222
 
@@ -1235,14 +1422,14 @@ def upload_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str, RUN
1235
  try:
1236
  if s3_bucket and local_file_paths:
1237
 
1238
- s3_client = boto3.client('s3', region_name=AWS_REGION)
1239
 
1240
  if isinstance(local_file_paths, str):
1241
  local_file_paths = [local_file_paths]
1242
 
1243
  for file in local_file_paths:
1244
  if s3_client:
1245
- #print(s3_client)
1246
  try:
1247
  # Get file name off file path
1248
  file_name = os.path.basename(file)
@@ -1251,18 +1438,24 @@ def upload_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str, RUN
1251
  print("S3 key: ", s3_key_full)
1252
 
1253
  s3_client.upload_file(file, s3_bucket, s3_key_full)
1254
- out_message = "File " + file_name + " uploaded successfully!"
 
 
1255
  print(out_message)
1256
-
1257
  except Exception as e:
1258
  out_message = f"Error uploading file(s): {e}"
1259
  print(out_message)
1260
 
1261
  final_out_message.append(out_message)
1262
- final_out_message_str = '\n'.join(final_out_message)
1263
 
1264
- else: final_out_message_str = "Could not connect to AWS."
1265
- else: final_out_message_str = "At least one essential variable is empty, could not upload to S3"
 
 
 
 
1266
  except Exception as e:
1267
  final_out_message_str = "Could not upload files to S3 due to: " + str(e)
1268
  print(final_out_message_str)
@@ -1271,23 +1464,19 @@ def upload_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str, RUN
1271
 
1272
  return final_out_message_str
1273
 
 
1274
  # Initialize ECS client
1275
  def start_ecs_task(cluster_name, service_name):
1276
- ecs_client = boto3.client('ecs')
1277
-
1278
  try:
1279
  # Update the service to set the desired count to 1
1280
- response = ecs_client.update_service(
1281
- cluster=cluster_name,
1282
- service=service_name,
1283
- desiredCount=1
1284
  )
1285
  return {
1286
  "statusCode": 200,
1287
- "body": f"Service {service_name} in cluster {cluster_name} has been updated to 1 task."
1288
  }
1289
  except Exception as e:
1290
- return {
1291
- "statusCode": 500,
1292
- "body": f"Error updating service: {str(e)}"
1293
- }
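 # Example usage (cluster and service names are hypothetical):
 # result = start_ecs_task("redaction-app-cluster", "redaction-app-service")
 # print(result["statusCode"], result["body"])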
 
1
+ import ipaddress
 
2
  import json
3
  import os
4
+ from typing import Any, Dict, List, Optional, Tuple
5
+
6
+ import boto3
7
  import pandas as pd
8
+ from aws_cdk import App, CfnOutput, CfnTag, Tags
9
+ from aws_cdk import aws_cognito as cognito
10
+ from aws_cdk import aws_ec2 as ec2
11
+ from aws_cdk import aws_elasticloadbalancingv2 as elb
12
+ from aws_cdk import aws_elasticloadbalancingv2_actions as elb_act
13
+ from aws_cdk import aws_iam as iam
14
+ from aws_cdk import aws_wafv2 as wafv2
15
+ from botocore.exceptions import ClientError
16
+ from cdk_config import (
17
+ ACCESS_LOG_DYNAMODB_TABLE_NAME,
18
+ AWS_REGION,
19
+ FEEDBACK_LOG_DYNAMODB_TABLE_NAME,
20
+ NAT_GATEWAY_EIP_NAME,
21
+ POLICY_FILE_LOCATIONS,
22
+ PRIVATE_SUBNET_AVAILABILITY_ZONES,
23
+ PRIVATE_SUBNET_CIDR_BLOCKS,
24
+ PRIVATE_SUBNETS_TO_USE,
25
+ PUBLIC_SUBNET_AVAILABILITY_ZONES,
26
+ PUBLIC_SUBNET_CIDR_BLOCKS,
27
+ PUBLIC_SUBNETS_TO_USE,
28
+ S3_LOG_CONFIG_BUCKET_NAME,
29
+ S3_OUTPUT_BUCKET_NAME,
30
+ USAGE_LOG_DYNAMODB_TABLE_NAME,
31
+ )
32
  from constructs import Construct
33
  from dotenv import set_key
34
 
35
 
 
 
 
36
  # --- Function to load context from file ---
37
  def load_context_from_file(app: App, file_path: str):
38
  if os.path.exists(file_path):
39
+ with open(file_path, "r") as f:
40
  context_data = json.load(f)
41
  for key, value in context_data.items():
42
  app.node.set_context(key, value)
 
44
  else:
45
  print(f"Context file not found: {file_path}")
46
 
47
+
48
  # --- Helper to parse environment variables into lists ---
49
  def _get_env_list(env_var_name: str) -> List[str]:
50
  """Parses a comma-separated environment variable into a list of strings."""
51
+ value = env_var_name[1:-1].strip().replace('"', "").replace("'", "")
52
  if not value:
53
  return []
54
  # Split by comma and filter out any empty strings that might result from extra commas
55
+ return [s.strip() for s in value.split(",") if s.strip()]
56
+
57
 
58
  # 1. Try to load CIDR/AZs from environment variables
59
+ if PUBLIC_SUBNETS_TO_USE:
60
+ PUBLIC_SUBNETS_TO_USE = _get_env_list(PUBLIC_SUBNETS_TO_USE)
61
+ if PRIVATE_SUBNETS_TO_USE:
62
+ PRIVATE_SUBNETS_TO_USE = _get_env_list(PRIVATE_SUBNETS_TO_USE)
63
+
64
+ if PUBLIC_SUBNET_CIDR_BLOCKS:
+     PUBLIC_SUBNET_CIDR_BLOCKS = _get_env_list(PUBLIC_SUBNET_CIDR_BLOCKS)
+ if PUBLIC_SUBNET_AVAILABILITY_ZONES:
+     PUBLIC_SUBNET_AVAILABILITY_ZONES = _get_env_list(PUBLIC_SUBNET_AVAILABILITY_ZONES)
+ if PRIVATE_SUBNET_CIDR_BLOCKS:
+     PRIVATE_SUBNET_CIDR_BLOCKS = _get_env_list(PRIVATE_SUBNET_CIDR_BLOCKS)
+ if PRIVATE_SUBNET_AVAILABILITY_ZONES:
+     PRIVATE_SUBNET_AVAILABILITY_ZONES = _get_env_list(PRIVATE_SUBNET_AVAILABILITY_ZONES)
74
 
75
+ if POLICY_FILE_LOCATIONS:
76
+ POLICY_FILE_LOCATIONS = _get_env_list(POLICY_FILE_LOCATIONS)
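 # Example of what _get_env_list produces for a list-style value (illustrative only);
 # it strips the surrounding brackets and quotes before splitting on commas:
 # _get_env_list("['10.0.1.0/24', '10.0.2.0/24']")  ->  ['10.0.1.0/24', '10.0.2.0/24']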
 
 
77
 
 
78
 
79
+ def check_for_existing_role(role_name: str):
80
  try:
81
+ iam = boto3.client("iam")
82
+ # iam.get_role(RoleName=role_name)
83
+
84
  response = iam.get_role(RoleName=role_name)
85
+ role = response["Role"]["Arn"]
86
 
87
+ print("Response Role:", role)
88
 
89
  return True, role, ""
90
  except iam.exceptions.NoSuchEntityException:
 
92
  except Exception as e:
93
  raise Exception("Getting information on IAM role failed due to:", e)
94
 
95
+
96
 
 
 
 
97
 
98
  # Assume POLICY_FILE_LOCATIONS is defined globally or passed as a default
99
  # For example:
 
109
  policy_document: A Python dictionary representing an IAM policy document.
110
  """
111
  # Ensure the loaded JSON is a valid policy document structure
112
+ if "Statement" not in policy_document or not isinstance(
113
+ policy_document["Statement"], list
114
+ ):
115
+ print("Warning: Policy document does not contain a 'Statement' list. Skipping.")
116
+ return # Do not return role, just log and exit
117
 
118
+ for statement_dict in policy_document["Statement"]:
119
  try:
120
  # Create a CDK PolicyStatement from the dictionary
121
  cdk_policy_statement = iam.PolicyStatement.from_json(statement_dict)
 
124
  role.add_to_policy(cdk_policy_statement)
125
  print(f" - Added statement: {statement_dict.get('Sid', 'No Sid')}")
126
  except Exception as e:
127
+ print(
128
+ f"Warning: Could not process policy statement: {statement_dict}. Error: {e}"
129
+ )
130
+
131
 
132
  def add_custom_policies(
133
+ scope: Construct, # Not strictly used here, but good practice if you expand to ManagedPolicies
134
  role: iam.IRole,
135
  policy_file_locations: Optional[List[str]] = None,
136
+ custom_policy_text: Optional[str] = None,
137
  ) -> iam.IRole:
138
  """
139
  Loads custom policies from JSON files or a string and attaches them to a CDK Role.
 
150
  if policy_file_locations is None:
151
  policy_file_locations = []
152
 
153
+ current_source = "unknown source" # For error messages
154
 
155
  try:
156
  if policy_file_locations:
 
158
  for path in policy_file_locations:
159
  current_source = f"file: {path}"
160
  try:
161
+ with open(path, "r") as f:
162
  policy_document = json.load(f)
163
  print(f"Processing policy from {current_source}...")
164
  add_statement_to_policy(role, policy_document)
165
  except FileNotFoundError:
166
  print(f"Warning: Policy file not found at {path}. Skipping.")
167
  except json.JSONDecodeError as e:
168
+ print(
169
+ f"Warning: Invalid JSON in policy file {path}: {e}. Skipping."
170
+ )
171
  except Exception as e:
172
+ print(
173
+ f"An unexpected error occurred processing policy from {path}: {e}. Skipping."
174
+ )
175
 
176
  if custom_policy_text:
177
  current_source = "custom policy text string"
178
+ print(
179
+ f"Attempting to add policy from custom text to role {role.node.id}..."
180
+ )
181
  try:
182
  # *** FIX: Parse the JSON string into a Python dictionary ***
183
  policy_document = json.loads(custom_policy_text)
 
186
  except json.JSONDecodeError as e:
187
  print(f"Warning: Invalid JSON in custom_policy_text: {e}. Skipping.")
188
  except Exception as e:
189
+ print(
190
+ f"An unexpected error occurred processing policy from custom_policy_text: {e}. Skipping."
191
+ )
192
 
193
  # You might want a final success message, but individual processing messages are also good.
194
  print(f"Finished processing custom policies for role {role.node.id}.")
195
 
196
  except Exception as e:
197
+ print(
198
+ f"An unhandled error occurred during policy addition for {current_source}: {e}"
199
+ )
200
 
201
  return role
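 # Example usage inside a stack's __init__ (names are hypothetical):
 # task_role = iam.Role(self, "TaskRole", assumed_by=iam.ServicePrincipal("ecs-tasks.amazonaws.com"))
 # add_custom_policies(self, task_role, policy_file_locations=POLICY_FILE_LOCATIONS)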
202
 
203
+
204
  # Import the S3 Bucket class if you intend to return a CDK object later
205
  # from aws_cdk import aws_s3 as s3
206
 
207
+
208
+ def check_s3_bucket_exists(
209
+ bucket_name: str,
210
+ ): # Return type hint depends on what you return
211
  """
212
  Checks if an S3 bucket with the given name exists and is accessible.
213
 
 
221
  or the bucket name for CDK lookups/creations.
222
  For this example, let's return the boolean and the name.
223
  """
224
+ s3_client = boto3.client("s3")
225
  try:
226
  # Use head_bucket to check for existence and access
227
  s3_client.head_bucket(Bucket=bucket_name)
228
  print(f"Bucket '{bucket_name}' exists and is accessible.")
229
+ return True, bucket_name # Return True and the bucket name
230
 
231
  except ClientError as e:
232
  # If a ClientError occurs, check the error code.
233
  # '404' means the bucket does not exist.
234
  # '403' means the bucket exists but you don't have permission.
235
+ error_code = e.response["Error"]["Code"]
236
+ if error_code == "404":
237
  print(f"Bucket '{bucket_name}' does not exist.")
238
  return False, None
239
+ elif error_code == "403":
240
+ # The bucket exists, but you can't access it.
241
+ # Depending on your requirements, this might be treated as "exists"
242
+ # or "not accessible for our purpose". For checking existence,
243
+ # we'll say it exists here, but note the permission issue.
244
+ # NOTE - when I tested this, it was returning 403 even for buckets that don't exist. So I will return False instead
245
+ print(
246
+ f"Bucket '{bucket_name}' returned 403, which indicates it may exist but is not accessible due to permissions, or that it doesn't exist. Returning False for existence just in case."
247
+ )
248
+ return False, bucket_name  # Not confirmed to exist; still return the name for reference
249
  else:
250
  # For other errors, it's better to raise the exception
251
  # to indicate something unexpected happened.
252
+ print(
253
+ f"An unexpected AWS ClientError occurred checking bucket '{bucket_name}': {e}"
254
+ )
255
  # Decide how to handle other errors - raising might be safer
256
+ raise # Re-raise the original exception
257
  except Exception as e:
258
+ print(
259
+ f"An unexpected non-ClientError occurred checking bucket '{bucket_name}': {e}"
260
+ )
261
  # Decide how to handle other errors
262
+ raise # Re-raise the original exception
263
+
264
 
265
  # Example usage in your check_resources.py:
266
  # exists, bucket_name_if_exists = check_s3_bucket_exists(log_bucket_name)
267
  # context_data[f"exists:{log_bucket_name}"] = exists
268
  # # You don't necessarily need to store the name in context if using from_bucket_name
269
 
270
+
271
  # Delete an S3 bucket
272
+ def delete_s3_bucket(bucket_name: str):
273
+ s3 = boto3.client("s3")
274
+
275
  try:
276
  # List and delete all objects
277
  response = s3.list_object_versions(Bucket=bucket_name)
278
+ versions = response.get("Versions", []) + response.get("DeleteMarkers", [])
279
  for version in versions:
280
+ s3.delete_object(
281
+ Bucket=bucket_name, Key=version["Key"], VersionId=version["VersionId"]
282
+ )
283
+
284
  # Delete the bucket
285
  s3.delete_bucket(Bucket=bucket_name)
286
+ return {"Status": "SUCCESS"}
287
  except Exception as e:
288
+ return {"Status": "FAILED", "Reason": str(e)}
289
+
290
 
291
  # Function to get subnet ID from subnet name
292
+ def get_subnet_id(vpc: ec2.IVpc, ec2_client, subnet_name: str):
293
+ response = ec2_client.describe_subnets(
294
+ Filters=[{"Name": "vpc-id", "Values": [vpc.vpc_id]}]
295
+ )
296
+
297
+ for subnet in response["Subnets"]:
298
+ if subnet["Tags"] and any(
299
+ tag["Key"] == "Name" and tag["Value"] == subnet_name
300
+ for tag in subnet["Tags"]
301
+ ):
302
+ return subnet["SubnetId"]
303
 
 
 
 
 
304
  return None
305
 
306
+
307
  def check_ecr_repo_exists(repo_name: str) -> tuple[bool, dict]:
308
  """
309
  Checks if an ECR repository with the given name exists.
 
314
  Returns:
315
  A tuple: (True, repository description dict) if the repository exists, (False, {}) otherwise.
316
  """
317
+ ecr_client = boto3.client("ecr")
318
  try:
319
  print("ecr repo_name to check:", repo_name)
320
  response = ecr_client.describe_repositories(repositoryNames=[repo_name])
321
  # If describe_repositories succeeds and returns a list of repositories,
322
  # and the list is not empty, the repository exists.
323
+ return len(response["repositories"]) > 0, response["repositories"][0]
324
  except ClientError as e:
325
  # Check for the specific error code indicating the repository doesn't exist
326
+ if e.response["Error"]["Code"] == "RepositoryNotFoundException":
327
  return False, {}
328
  else:
329
  # Re-raise other exceptions to handle unexpected errors
 
331
  except Exception as e:
332
  print(f"An unexpected error occurred: {e}")
333
  return False, {}
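 # Example usage (repository name is hypothetical):
 # repo_exists, repo_details = check_ecr_repo_exists("my-app-repo")
 # if repo_exists:
 #     print(repo_details["repositoryUri"])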
334
+
335
+
336
+ def check_codebuild_project_exists(
337
+ project_name: str,
338
+ ): # Adjust return type hint as needed
339
  """
340
  Checks if a CodeBuild project with the given name exists.
341
 
 
348
  - The second element is the project ARN if found,
349
  None otherwise.
350
  """
351
+ codebuild_client = boto3.client("codebuild")
352
  try:
353
  # Use batch_get_projects with a list containing the single project name
354
  response = codebuild_client.batch_get_projects(names=[project_name])
355
 
356
  # The response for batch_get_projects includes 'projects' (found)
357
  # and 'projectsNotFound' (not found).
358
+ if response["projects"]:
359
  # If the project is found in the 'projects' list
360
  print(f"CodeBuild project '{project_name}' found.")
361
+ return (
362
+ True,
363
+ response["projects"][0]["arn"],
364
+ )  # Return True and the project ARN
365
+ elif (
366
+ response["projectsNotFound"]
367
+ and project_name in response["projectsNotFound"]
368
+ ):
369
+ # If the project name is explicitly in the 'projectsNotFound' list
370
+ print(f"CodeBuild project '{project_name}' not found.")
371
+ return False, None
372
  else:
373
  # This case is less expected for a single name lookup,
374
  # but could happen if there's an internal issue or the response
375
  # structure is slightly different than expected for an error.
376
  # It's safer to assume it wasn't found if not in 'projects'.
377
+ print(
378
+ f"CodeBuild project '{project_name}' not found (not in 'projects' list)."
379
+ )
380
  return False, None
381
 
382
  except ClientError as e:
 
384
  # 'InvalidInputException' for a non-existent project name if the
385
  # name format is valid. It typically just lists it in projectsNotFound.
386
  # However, other ClientErrors are possible (e.g., permissions).
387
+ print(
388
+ f"An AWS ClientError occurred checking CodeBuild project '{project_name}': {e}"
389
+ )
390
  # Decide how to handle other ClientErrors - raising might be safer
391
+ raise # Re-raise the original exception
392
  except Exception as e:
393
+ print(
394
+ f"An unexpected non-ClientError occurred checking CodeBuild project '{project_name}': {e}"
395
+ )
396
  # Decide how to handle other errors
397
+ raise # Re-raise the original exception
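 # Example usage (project name is hypothetical); the second element is the project ARN when found:
 # project_exists, project_arn = check_codebuild_project_exists("my-app-image-build")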
398
+
399
 
400
  def get_vpc_id_by_name(vpc_name: str) -> Optional[str]:
401
  """
402
  Finds a VPC ID by its 'Name' tag.
403
  """
404
+ ec2_client = boto3.client("ec2")
405
  try:
406
  response = ec2_client.describe_vpcs(
407
+ Filters=[{"Name": "tag:Name", "Values": [vpc_name]}]
 
 
408
  )
409
+ if response and response["Vpcs"]:
410
+ vpc_id = response["Vpcs"][0]["VpcId"]
411
  print(f"VPC '{vpc_name}' found with ID: {vpc_id}")
412
 
413
  # In get_vpc_id_by_name, after finding VPC ID:
414
 
415
  # Look for NAT Gateways in this VPC
416
+ ec2_client = boto3.client("ec2")
417
  nat_gateways = []
418
  try:
419
  response = ec2_client.describe_nat_gateways(
420
  Filters=[
421
+ {"Name": "vpc-id", "Values": [vpc_id]},
422
  # Optional: Add a tag filter if you consistently tag your NATs
423
  # {'Name': 'tag:Name', 'Values': [f"{prefix}-nat-gateway"]}
424
  ]
425
  )
426
+ nat_gateways = response.get("NatGateways", [])
427
  except Exception as e:
428
+ print(
429
+ f"Warning: Could not describe NAT Gateways in VPC '{vpc_id}': {e}"
430
+ )
431
  # Decide how to handle this error - proceed or raise?
432
 
433
  # Decide how to identify the specific NAT Gateway you want to check for.
 
 
434
 
435
  return vpc_id, nat_gateways
436
  else:
 
440
  print(f"An unexpected error occurred finding VPC '{vpc_name}': {e}")
441
  raise
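 # Example usage (VPC name is hypothetical). When the VPC is found the function returns a
 # (vpc_id, nat_gateways) pair rather than just the ID:
 # vpc_id, nat_gateways = get_vpc_id_by_name("my-app-vpc")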
442
 
443
+
444
  # --- Helper to fetch all existing subnets in a VPC once ---
445
  def _get_existing_subnets_in_vpc(vpc_id: str) -> Dict[str, Any]:
446
  """
 
448
  Returns a dictionary with 'by_name' (map of name to subnet data),
449
  'by_id' (map of id to subnet data), and 'cidr_networks' (list of ipaddress.IPv4Network).
450
  """
451
+ ec2_client = boto3.client("ec2")
452
  existing_subnets_data = {
453
  "by_name": {}, # {subnet_name: {'id': 'subnet-id', 'cidr': 'x.x.x.x/x'}}
454
+ "by_id": {}, # {subnet_id: {'name': 'subnet-name', 'cidr': 'x.x.x.x/x'}}
455
+ "cidr_networks": [], # List of ipaddress.IPv4Network objects
456
  }
457
  try:
458
+ response = ec2_client.describe_subnets(
459
+ Filters=[{"Name": "vpc-id", "Values": [vpc_id]}]
460
+ )
461
+ for s in response.get("Subnets", []):
462
+ subnet_id = s["SubnetId"]
463
+ cidr_block = s.get("CidrBlock")
464
  # Extract 'Name' tag, which is crucial for lookup by name
465
+ name_tag = next(
466
+ (tag["Value"] for tag in s.get("Tags", []) if tag["Key"] == "Name"),
467
+ None,
468
+ )
469
 
470
+ subnet_info = {"id": subnet_id, "cidr": cidr_block, "name": name_tag}
471
 
472
  if name_tag:
473
  existing_subnets_data["by_name"][name_tag] = subnet_info
 
475
 
476
  if cidr_block:
477
  try:
478
+ existing_subnets_data["cidr_networks"].append(
479
+ ipaddress.ip_network(cidr_block, strict=False)
480
+ )
481
  except ValueError:
482
+ print(
483
+ f"Warning: Existing subnet {subnet_id} has an invalid CIDR: {cidr_block}. Skipping for overlap check."
484
+ )
485
 
486
+ print(
487
+ f"Fetched {len(response.get('Subnets', []))} existing subnets from VPC '{vpc_id}'."
488
+ )
489
  except Exception as e:
490
+ print(
491
+ f"Error describing existing subnets in VPC '{vpc_id}': {e}. Cannot perform full validation."
492
+ )
493
+ raise # Re-raise if this essential step fails
494
 
495
  return existing_subnets_data
496
 
497
+
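A short sketch of how the returned structure can be queried for name clashes and CIDR overlaps. The VPC ID, subnet name and CIDR below are made up, and boto3 credentials are assumed to be available.

import ipaddress

existing = _get_existing_subnets_in_vpc("vpc-0123456789abcdef0")  # illustrative VPC ID
proposed = ipaddress.ip_network("10.0.5.0/24", strict=False)
overlaps = any(proposed.overlaps(net) for net in existing["cidr_networks"])
name_taken = "redact-public-subnet-1" in existing["by_name"]  # illustrative name
print(f"CIDR overlap: {overlaps}, name already in use: {name_taken}")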
498
  # --- Modified validate_subnet_creation_parameters to take pre-fetched data ---
499
  def validate_subnet_creation_parameters(
500
  vpc_id: str,
501
+ proposed_subnets_data: List[
502
+ Dict[str, str]
503
+ ], # e.g., [{'name': 'my-public-subnet', 'cidr': '10.0.0.0/24', 'az': 'us-east-1a'}]
504
+ existing_aws_subnets_data: Dict[
505
+ str, Any
506
+ ], # Pre-fetched data from _get_existing_subnets_in_vpc
507
  ) -> None:
508
  """
509
  Validates proposed subnet names and CIDR blocks against existing AWS subnets
 
525
  print("No proposed subnet data provided for validation. Skipping.")
526
  return
527
 
528
+ print(
529
+ f"--- Starting pre-synth validation for VPC '{vpc_id}' with proposed subnets ---"
530
+ )
531
 
532
+ print("Existing subnet data:", pd.DataFrame(existing_aws_subnets_data["by_name"]))
533
 
534
  existing_aws_subnet_names = set(existing_aws_subnets_data["by_name"].keys())
535
  existing_aws_cidr_networks = existing_aws_subnets_data["cidr_networks"]
 
539
  proposed_cidr_networks_seen: List[ipaddress.IPv4Network] = []
540
 
541
  for i, proposed_subnet in enumerate(proposed_subnets_data):
542
+ subnet_name = proposed_subnet.get("name")
543
+ cidr_block_str = proposed_subnet.get("cidr")
544
+ availability_zone = proposed_subnet.get("az")
545
 
546
  if not all([subnet_name, cidr_block_str, availability_zone]):
547
+ raise ValueError(
548
+ f"Proposed subnet at index {i} is incomplete. Requires 'name', 'cidr', and 'az'."
549
+ )
550
 
551
  # 1. Check for duplicate names within the proposed batch
552
  if subnet_name in proposed_names_seen:
553
+ raise ValueError(
554
+ f"Proposed subnet name '{subnet_name}' is duplicated within the input list."
555
+ )
556
  proposed_names_seen.add(subnet_name)
557
 
558
  # 2. Check for duplicate names against existing AWS subnets
559
  if subnet_name in existing_aws_subnet_names:
560
+ print(
561
+ f"Proposed subnet name '{subnet_name}' already exists in VPC '{vpc_id}'."
562
+ )
563
 
564
  # Parse proposed CIDR
565
  try:
566
  proposed_net = ipaddress.ip_network(cidr_block_str, strict=False)
567
  except ValueError as e:
568
+ raise ValueError(
569
+ f"Invalid CIDR format '{cidr_block_str}' for proposed subnet '{subnet_name}': {e}"
570
+ )
571
 
572
  # 3. Check for overlapping CIDRs within the proposed batch
573
  for existing_proposed_net in proposed_cidr_networks_seen:
 
589
 
590
  # If all checks pass for this subnet, add its network to the list for subsequent checks
591
  proposed_cidr_networks_seen.append(proposed_net)
592
+ print(
593
+ f"Validation successful for proposed subnet '{subnet_name}' with CIDR '{cidr_block_str}'."
594
+ )
595
+
596
+ print(
597
+ f"--- All proposed subnets passed pre-synth validation checks for VPC '{vpc_id}'. ---"
598
+ )
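For reference, a sketch of the input shape the validator expects; the names, CIDRs and AZs are placeholders.

proposed_subnets = [
    {"name": "redact-public-1", "cidr": "10.0.1.0/24", "az": "eu-west-2a"},
    {"name": "redact-private-1", "cidr": "10.0.2.0/24", "az": "eu-west-2b"},
]
existing = _get_existing_subnets_in_vpc(vpc_id)
# Raises ValueError on duplicate names, invalid CIDRs or overlapping ranges; returns None on success
validate_subnet_creation_parameters(vpc_id, proposed_subnets, existing)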
599
 
 
600
 
601
  # --- Modified check_subnet_exists_by_name (Uses pre-fetched data) ---
602
  def check_subnet_exists_by_name(
603
+ subnet_name: str, existing_aws_subnets_data: Dict[str, Any]
 
604
  ) -> Tuple[bool, Optional[str]]:
605
  """
606
  Checks if a subnet with the given name exists within the pre-fetched data.
 
618
  subnet_info = existing_aws_subnets_data["by_name"].get(subnet_name)
619
  if subnet_info:
620
  print(f"Subnet '{subnet_name}' found with ID: {subnet_info['id']}")
621
+ return True, subnet_info["id"]
622
  else:
623
  print(f"Subnet '{subnet_name}' not found.")
624
  return False, None
625
 
626
+
627
  def create_nat_gateway(
628
  scope: Construct,
629
+ public_subnet_for_nat: ec2.ISubnet, # Expects a proper ISubnet
630
  nat_gateway_name: str,
631
+ nat_gateway_id_context_key: str,
632
  ) -> str:
633
  """
634
  Creates a single NAT Gateway in the specified public subnet.
635
  It does not handle lookup from context; the calling stack should do that.
636
  Returns the CloudFormation Ref of the NAT Gateway ID.
637
  """
638
+ print(
639
+ f"Defining a new NAT Gateway '{nat_gateway_name}' in subnet '{public_subnet_for_nat.subnet_id}'."
640
+ )
641
 
642
  # Create an Elastic IP for the NAT Gateway
643
+ eip = ec2.CfnEIP(
644
+ scope,
645
+ NAT_GATEWAY_EIP_NAME,
646
+ tags=[CfnTag(key="Name", value=NAT_GATEWAY_EIP_NAME)],
647
  )
648
 
649
  # Create the NAT Gateway
650
+ nat_gateway_logical_id = nat_gateway_name.replace("-", "") + "NatGateway"
651
+ nat_gateway = ec2.CfnNatGateway(
652
+ scope,
653
+ nat_gateway_logical_id,
654
  subnet_id=public_subnet_for_nat.subnet_id, # Associate with the public subnet
655
+ allocation_id=eip.attr_allocation_id, # Associate with the EIP
656
+ tags=[CfnTag(key="Name", value=nat_gateway_name)],
657
  )
658
  # The NAT GW depends on the EIP. The dependency on the subnet is implicit via subnet_id.
659
  nat_gateway.add_dependency(eip)
660
 
661
  # *** CRUCIAL: Use CfnOutput to export the ID after deployment ***
662
  # This is how you will get the ID to put into cdk.context.json
663
+ CfnOutput(
664
+ scope,
665
+ "SingleNatGatewayIdOutput",
666
  value=nat_gateway.ref,
667
  description=f"Physical ID of the Single NAT Gateway. Add this to cdk.context.json under the key '{nat_gateway_id_context_key}'.",
668
+ export_name=f"{scope.stack_name}-NatGatewayId", # Make export name unique
669
  )
670
 
671
+ print(
672
+ f"CDK: Defined new NAT Gateway '{nat_gateway.ref}'. Its physical ID will be available in the stack outputs after deployment."
673
+ )
674
  # Return the tokenised reference for use within this synthesis
675
  return nat_gateway.ref
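A sketch of the post-deploy step the CfnOutput comment refers to: copying the physical NAT Gateway ID into cdk.context.json. The key name and the ID value are assumptions; use whatever nat_gateway_id_context_key your stack passes in.

import json

with open("cdk.context.json", "r+") as f:
    ctx = json.load(f)
    # "id:NatGateway" mirrors the key used by check_and_set_context; the ID comes from the SingleNatGatewayIdOutput stack output
    ctx["id:NatGateway"] = "nat-0123456789abcdef0"
    f.seek(0)
    json.dump(ctx, f, indent=2)
    f.truncate()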
676
 
677
+
678
  def create_subnets(
679
  scope: Construct,
680
  vpc: ec2.IVpc,
 
684
  availability_zones: List[str],
685
  is_public: bool,
686
  internet_gateway_id: Optional[str] = None,
687
+ single_nat_gateway_id: Optional[str] = None,
688
  ) -> Tuple[List[ec2.CfnSubnet], List[ec2.CfnRouteTable]]:
689
  """
690
  Creates subnets using L2 constructs but returns the underlying L1 Cfn objects
 
692
  """
693
  # --- Validations remain the same ---
694
  if not (len(subnet_names) == len(cidr_blocks) == len(availability_zones) > 0):
695
+ raise ValueError(
696
+ "Subnet names, CIDR blocks, and Availability Zones lists must be non-empty and match in length."
697
+ )
698
  if is_public and not internet_gateway_id:
699
  raise ValueError("internet_gateway_id must be provided for public subnets.")
700
  if not is_public and not single_nat_gateway_id:
701
+ raise ValueError(
702
+ "single_nat_gateway_id must be provided for private subnets when using a single NAT Gateway."
703
+ )
704
 
705
  # --- We will populate these lists with the L1 objects to return ---
706
  created_subnets: List[ec2.CfnSubnet] = []
 
718
  vpc_id=vpc.vpc_id,
719
  cidr_block=cidr_blocks[i],
720
  availability_zone=availability_zones[i],
721
+ map_public_ip_on_launch=is_public,
722
  )
723
  Tags.of(subnet).add("Name", subnet_name)
724
  Tags.of(subnet).add("Type", subnet_type_tag)
725
+
726
  if is_public:
727
  # The subnet's route_table is automatically created by the L2 Subnet construct
728
  try:
729
  subnet.add_route(
730
+ "DefaultInternetRoute", # A logical ID for the CfnRoute resource
731
  router_id=internet_gateway_id,
732
  router_type=ec2.RouterType.GATEWAY,
733
  # destination_cidr_block="0.0.0.0/0" is the default for this method
 
739
  try:
740
  # Using .add_route() for private subnets as well for consistency
741
  subnet.add_route(
742
+ "DefaultNatRoute", # A logical ID for the CfnRoute resource
743
  router_id=single_nat_gateway_id,
744
  router_type=ec2.RouterType.NAT_GATEWAY,
745
  )
746
  except Exception as e:
747
  print("Could not create NAT gateway route for public subnet due to:", e)
748
+ print(
749
+ f"CDK: Defined private L2 subnet '{subnet_name}' and added NAT GW route."
750
+ )
751
 
752
  route_table = subnet.route_table
753
+
754
  created_subnets.append(subnet)
755
  created_route_tables.append(route_table)
756
 
757
  return created_subnets, created_route_tables
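An illustrative call from within a stack. The keyword names for the subnet-name and CIDR lists are inferred from the function body (that part of the signature is collapsed in this hunk), and all values are placeholders.

public_subnets, public_route_tables = create_subnets(
    scope=self,
    vpc=vpc,
    subnet_names=["redact-public-1", "redact-public-2"],  # parameter name inferred from the body
    cidr_blocks=["10.0.1.0/24", "10.0.2.0/24"],  # parameter name inferred from the body
    availability_zones=["eu-west-2a", "eu-west-2b"],
    is_public=True,
    internet_gateway_id=igw_id,  # from a context lookup
)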
758
+
759
+
760
+ def ingress_rule_exists(security_group: ec2.SecurityGroup, peer, port):  # inspects security_group.connections, so it needs the construct, not a string
761
  for rule in security_group.connections.security_groups:
762
  if port:
763
  if rule.peer == peer and rule.connection == port:
 
767
  return True
768
  return False
769
 
770
+
771
+ def check_for_existing_user_pool(user_pool_name: str):
772
  cognito_client = boto3.client("cognito-idp")
773
+ list_pools_response = cognito_client.list_user_pools(
774
+ MaxResults=60
775
+ ) # MaxResults up to 60
776
+
777
  # ListUserPools might require pagination if you have more than 60 pools
778
  # This simple example doesn't handle pagination, which could miss your pool
779
 
780
  existing_user_pool_id = ""
781
 
782
+ for pool in list_pools_response.get("UserPools", []):
783
+ if pool.get("Name") == user_pool_name:
784
+ existing_user_pool_id = pool["Id"]
785
+ print(
786
+ f"Found existing user pool by name '{user_pool_name}' with ID: {existing_user_pool_id}"
787
+ )
788
+ break # Found the one we're looking for
789
 
790
  if existing_user_pool_id:
791
  return True, existing_user_pool_id, pool
792
  else:
793
  return False, "", ""
794
+
795
+
796
  def check_for_existing_user_pool_client(user_pool_id: str, user_pool_client_name: str):
797
  """
798
  Checks if a Cognito User Pool Client with the given name exists in the specified User Pool.
 
807
  - False, "", {} otherwise.
808
  """
809
  cognito_client = boto3.client("cognito-idp")
810
+ next_token = "string"
 
811
 
812
  while True:
  try:
+ # Only send NextToken when continuing a previous page; the API rejects placeholder tokens
+ request_kwargs = {"UserPoolId": user_pool_id, "MaxResults": 60}
+ if next_token:
+ request_kwargs["NextToken"] = next_token
+ response = cognito_client.list_user_pool_clients(**request_kwargs)
817
  except cognito_client.exceptions.ResourceNotFoundException:
818
  print(f"Error: User pool with ID '{user_pool_id}' not found.")
819
  return False, "", {}
820
+
821
  except cognito_client.exceptions.InvalidParameterException:
822
  print(f"Error: No app clients for '{user_pool_id}' found.")
823
  return False, "", {}
824
+
825
  except Exception as e:
826
  print("Could not check User Pool clients due to:", e)
827
 
828
+ for client in response.get("UserPoolClients", []):
829
+ if client.get("ClientName") == user_pool_client_name:
830
+ print(
831
+ f"Found existing user pool client '{user_pool_client_name}' with ID: {client['ClientId']}"
832
+ )
833
+ return True, client["ClientId"], client
834
 
835
+ next_token = response.get("NextToken")
836
  if not next_token:
837
  break
838
 
839
  return False, "", {}
840
 
841
+
842
+ def check_for_secret(secret_name: str, secret_value: dict = ""):
843
  """
844
  Checks if a Secrets Manager secret with the given name exists.
845
  If it doesn't exist, it creates the secret.
 
865
  # Handle other potential exceptions during the get operation
866
  print(f"Error checking for secret '{secret_name}': {e}")
867
  return False, {}
868
+
869
+
870
+ def check_alb_exists(
871
+ load_balancer_name: str, region_name: str = None
872
+ ) -> tuple[bool, dict]:
873
  """
874
  Checks if an Application Load Balancer (ALB) with the given name exists.
875
 
 
886
  the LoadBalancers list from the describe_load_balancers response.
887
  """
888
  if region_name:
889
+ elbv2_client = boto3.client("elbv2", region_name=region_name)
890
  else:
891
+ elbv2_client = boto3.client("elbv2")
892
  try:
893
  response = elbv2_client.describe_load_balancers(Names=[load_balancer_name])
894
+ if response["LoadBalancers"]:
895
+ return (
896
+ True,
897
+ response["LoadBalancers"][0],
898
+ ) # Return True and the first ALB object
899
  else:
900
  return False, {}
901
  except ClientError as e:
902
  # If the error indicates the ALB doesn't exist, return False
903
+ if e.response["Error"]["Code"] == "LoadBalancerNotFound":
904
  return False, {}
905
  else:
906
  # Re-raise other exceptions
 
908
  except Exception as e:
909
  print(f"An unexpected error occurred: {e}")
910
  return False, {}
911
+
912
+
913
+ def check_fargate_task_definition_exists(
914
+ task_definition_name: str, region_name: str = None
915
+ ) -> tuple[bool, dict]:
916
  """
917
  Checks if a Fargate task definition with the given name exists.
918
 
 
929
  taskDefinitions list from the describe_task_definition response.
930
  """
931
  if region_name:
932
+ ecs_client = boto3.client("ecs", region_name=region_name)
933
  else:
934
+ ecs_client = boto3.client("ecs")
935
  try:
936
+ response = ecs_client.describe_task_definition(
937
+ taskDefinition=task_definition_name
938
+ )
939
  # If describe_task_definition succeeds, it returns the task definition.
940
  # We can directly return True and the task definition.
941
+ return True, response["taskDefinition"]
942
  except ClientError as e:
943
  # Check for the error code indicating the task definition doesn't exist.
944
+ if (
945
+ e.response["Error"]["Code"] == "ClientException"
946
+ and "Task definition" in e.response["Message"]
947
+ and "does not exist" in e.response["Message"]
948
+ ):
949
  return False, {}
950
  else:
951
  # Re-raise other exceptions.
 
953
  except Exception as e:
954
  print(f"An unexpected error occurred: {e}")
955
  return False, {}
956
+
957
+
958
+ def check_ecs_service_exists(
959
+ cluster_name: str, service_name: str, region_name: str = None
960
+ ) -> tuple[bool, dict]:
961
  """
962
  Checks if an ECS service with the given name exists in the specified cluster.
963
 
 
974
  None otherwise.
975
  """
976
  if region_name:
977
+ ecs_client = boto3.client("ecs", region_name=region_name)
978
  else:
979
+ ecs_client = boto3.client("ecs")
980
  try:
981
+ response = ecs_client.describe_services(
982
+ cluster=cluster_name, services=[service_name]
983
+ )
984
+ if response["services"]:
985
+ return (
986
+ True,
987
+ response["services"][0],
988
+ ) # Return True and the first service object
989
  else:
990
  return False, {}
991
  except ClientError as e:
992
  # Check for the error code indicating the service doesn't exist.
993
+ if e.response["Error"]["Code"] == "ClusterNotFoundException":
994
  return False, {}
995
+ elif e.response["Error"]["Code"] == "ServiceNotFoundException":
996
  return False, {}
997
  else:
998
  # Re-raise other exceptions.
 
1000
  except Exception as e:
1001
  print(f"An unexpected error occurred: {e}")
1002
  return False, {}
1003
+
1004
+
1005
+ def check_cloudfront_distribution_exists(
1006
+ distribution_name: str, region_name: str = None
1007
+ ) -> tuple[bool, dict | None]:
1008
  """
1009
  Checks if a CloudFront distribution with the given name exists.
1010
 
 
1023
  DistributionList from the ListDistributions response.
1024
  """
1025
  if region_name:
1026
+ cf_client = boto3.client("cloudfront", region_name=region_name)
1027
  else:
1028
+ cf_client = boto3.client("cloudfront")
1029
  try:
1030
  response = cf_client.list_distributions()
1031
+ if "Items" in response["DistributionList"]:
1032
+ for distribution in response["DistributionList"]["Items"]:
1033
  # CloudFront doesn't directly filter by name, so we have to iterate.
1034
+ if (
1035
+ distribution["AliasSet"]["Items"]
1036
+ and distribution["AliasSet"]["Items"][0] == distribution_name
1037
+ ):
1038
  return True, distribution
1039
  return False, None
1040
  else:
1041
  return False, None
1042
  except ClientError as e:
1043
  # If the error indicates the Distribution doesn't exist, return False
1044
+ if e.response["Error"]["Code"] == "NoSuchDistribution":
1045
  return False, None
1046
  else:
1047
  # Re-raise other exceptions
 
1050
  print(f"An unexpected error occurred: {e}")
1051
  return False, None
1052
 
1053
+
1054
+ def create_web_acl_with_common_rules(
1055
+ scope: Construct, web_acl_name: str, waf_scope: str = "CLOUDFRONT"
1056
+ ):
1057
+ """
1058
  Use CDK to create a web ACL based on an AWS common rule set with overrides.
1059
  This function now expects a 'scope' argument, typically 'self' from your stack,
1060
  as CfnWebACL requires a construct scope.
1061
+ """
1062
 
1063
  # Create full list of rules
1064
  rules = []
1065
  aws_ruleset_names = [
1066
  "AWSManagedRulesCommonRuleSet",
1067
  "AWSManagedRulesKnownBadInputsRuleSet",
1068
+ "AWSManagedRulesAmazonIpReputationList",
1069
  ]
1070
 
1071
  # Use a separate counter to assign unique priorities sequentially
 
1073
 
1074
  for aws_rule_name in aws_ruleset_names:
1075
  current_rule_action_overrides = None
1076
+
1077
  # All managed rule groups need an override_action.
1078
  # 'none' means use the managed rule group's default action.
1079
  current_override_action = wafv2.CfnWebACL.OverrideActionProperty(none={})
 
1085
  current_rule_action_overrides = [
1086
  wafv2.CfnWebACL.RuleActionOverrideProperty(
1087
  name="SizeRestrictions_BODY",
1088
+ action_to_use=wafv2.CfnWebACL.RuleActionProperty(allow={}),
 
 
1089
  )
1090
  ]
1091
  # No need to set current_override_action here, it's already set above.
 
1099
  managed_rule_group_statement=wafv2.CfnWebACL.ManagedRuleGroupStatementProperty(
1100
  vendor_name="AWS",
1101
  name=aws_rule_name,
1102
+ rule_action_overrides=current_rule_action_overrides,
1103
  )
1104
  ),
1105
  visibility_config=wafv2.CfnWebACL.VisibilityConfigProperty(
1106
  cloud_watch_metrics_enabled=True,
1107
  metric_name=aws_rule_name,
1108
+ sampled_requests_enabled=True,
1109
  ),
1110
+ override_action=current_override_action, # THIS IS THE CRUCIAL PART FOR ALL MANAGED RULES
1111
  )
1112
 
1113
  rules.append(rule_property)
1114
 
1115
  # Add the rate limit rule
1116
+ rate_limit_priority = priority_counter # Use the next available priority
1117
+ rules.append(
1118
+ wafv2.CfnWebACL.RuleProperty(
1119
+ name="RateLimitRule",
1120
+ priority=rate_limit_priority,
1121
+ statement=wafv2.CfnWebACL.StatementProperty(
1122
+ rate_based_statement=wafv2.CfnWebACL.RateBasedStatementProperty(
1123
+ limit=1000, aggregate_key_type="IP"
1124
+ )
1125
+ ),
1126
+ visibility_config=wafv2.CfnWebACL.VisibilityConfigProperty(
1127
+ cloud_watch_metrics_enabled=True,
1128
+ metric_name="RateLimitRule",
1129
+ sampled_requests_enabled=True,
1130
+ ),
1131
+ action=wafv2.CfnWebACL.RuleActionProperty(block={}),
 
1132
  )
1133
+ )
1134
 
1135
  web_acl = wafv2.CfnWebACL(
1136
  scope,
 
1141
  visibility_config=wafv2.CfnWebACL.VisibilityConfigProperty(
1142
  cloud_watch_metrics_enabled=True,
1143
  metric_name="webACL",
1144
+ sampled_requests_enabled=True,
1145
  ),
1146
+ rules=rules,
1147
  )
1148
 
1149
  CfnOutput(scope, "WebACLArn", value=web_acl.attr_arn)
1150
 
1151
  return web_acl
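A hedged sketch of attaching the returned CLOUDFRONT-scoped ACL to a distribution; the origin domain and construct IDs are placeholders, and the call is assumed to run inside a stack (`self`).

from aws_cdk import aws_cloudfront as cloudfront
from aws_cdk import aws_cloudfront_origins as origins

web_acl = create_web_acl_with_common_rules(self, "redaction-web-acl")
distribution = cloudfront.Distribution(
    self,
    "AppDistribution",
    default_behavior=cloudfront.BehaviorOptions(origin=origins.HttpOrigin("alb.example.com")),
    web_acl_id=web_acl.attr_arn,  # CLOUDFRONT-scoped ACLs are referenced by ARN
)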
1152
+
1153
+
1154
+ def check_web_acl_exists(
1155
+ web_acl_name: str, scope: str, region_name: str = None
1156
+ ) -> tuple[bool, dict]:
1157
  """
1158
  Checks if a Web ACL with the given name and scope exists.
1159
 
 
1170
  - The second element is the Web ACL object (dictionary) if found,
1171
  None otherwise.
1172
  """
1173
+ if scope not in ["CLOUDFRONT", "REGIONAL"]:
1174
  raise ValueError("Scope must be either 'CLOUDFRONT' or 'REGIONAL'")
1175
 
1176
+ if scope == "REGIONAL" and not region_name:
1177
  raise ValueError("Region name is required for REGIONAL scope")
1178
 
1179
+ if scope == "CLOUDFRONT":
1180
+ region_name = "us-east-1" # CloudFront scope requires us-east-1
1181
+
1182
  if region_name:
1183
+ waf_client = boto3.client("wafv2", region_name=region_name)
1184
  else:
1185
+ waf_client = boto3.client("wafv2")
1186
  try:
1187
  response = waf_client.list_web_acls(Scope=scope)
1188
+ if "WebACLs" in response:
1189
+ for web_acl in response["WebACLs"]:
1190
+ if web_acl["Name"] == web_acl_name:
1191
  # Describe the Web ACL to get the full object.
1192
+ describe_response = waf_client.describe_web_acl(
1193
+ Name=web_acl_name, Scope=scope
1194
+ )
1195
+ return True, describe_response["WebACL"]
1196
  return False, {}
1197
  else:
1198
  return False, {}
1199
  except ClientError as e:
1200
  # Check for the error code indicating the web ACL doesn't exist.
1201
+ if e.response["Error"]["Code"] == "ResourceNotFoundException":
1202
  return False, {}
1203
  else:
1204
  # Re-raise other exceptions.
 
1206
  except Exception as e:
1207
  print(f"An unexpected error occurred: {e}")
1208
  return False, {}
1209
+
1210
+
1211
  def add_alb_https_listener_with_cert(
1212
  scope: Construct,
1213
+ logical_id: str, # A unique ID for this listener construct
1214
  alb: elb.ApplicationLoadBalancer,
1215
+ acm_certificate_arn: Optional[
1216
+ str
1217
+ ], # Optional: If None, no HTTPS listener will be created
1218
+ default_target_group: elb.ITargetGroup, # Mandatory: The target group to forward traffic to
1219
  listener_port_https: int = 443,
1220
+ listener_open_to_internet: bool = False, # Be cautious with True, ensure ALB security group restricts access
1221
  # --- Cognito Authentication Parameters ---
1222
  enable_cognito_auth: bool = False,
1223
  cognito_user_pool: Optional[cognito.IUserPool] = None,
1224
  cognito_user_pool_client: Optional[cognito.IUserPoolClient] = None,
1225
+ cognito_user_pool_domain: Optional[
1226
+ str
1227
+ ] = None, # E.g., "my-app-domain" for "my-app-domain.auth.region.amazoncognito.com"
1228
+ cognito_auth_scope: Optional[
1229
+ str
1230
+ ] = "openid profile email", # Default recommended scope
1231
  cognito_auth_on_unauthenticated_request: elb.UnauthenticatedAction = elb.UnauthenticatedAction.AUTHENTICATE,
1232
+ stickiness_cookie_duration=None,
1233
  # --- End Cognito Parameters ---
1234
  ) -> Optional[elb.ApplicationListener]:
1235
  """
 
1263
  https_listener = None
1264
  if acm_certificate_arn:
1265
  certificates_list = [elb.ListenerCertificate.from_arn(acm_certificate_arn)]
1266
+ print(
1267
+ f"Attempting to add ALB HTTPS listener on port {listener_port_https} with ACM certificate: {acm_certificate_arn}"
1268
+ )
1269
 
1270
  # Determine the default action based on whether Cognito auth is enabled
1271
  default_action = None
1272
+ if enable_cognito_auth is True:
1273
+ if not all(
1274
+ [cognito_user_pool, cognito_user_pool_client, cognito_user_pool_domain]
1275
+ ):
1276
  raise ValueError(
1277
  "Cognito User Pool, Client, and Domain must be provided if enable_cognito_auth is True."
1278
  )
1279
+ print(
1280
+ f"Enabling Cognito authentication with User Pool: {cognito_user_pool.user_pool_id}"
1281
+ )
1282
 
1283
  default_action = elb_act.AuthenticateCognitoAction(
1284
+ next=elb.ListenerAction.forward(
1285
+ [default_target_group]
1286
+ ), # After successful auth, forward to TG
1287
  user_pool=cognito_user_pool,
1288
  user_pool_client=cognito_user_pool_client,
1289
+ user_pool_domain=cognito_user_pool_domain,
1290
  scope=cognito_auth_scope,
1291
+ on_unauthenticated_request=cognito_auth_on_unauthenticated_request,
1292
+ session_timeout=stickiness_cookie_duration,
1293
  # Additional options you might want to configure:
1294
  # session_cookie_name="AWSELBCookies"
1295
  )
 
1303
  port=listener_port_https,
1304
  open=listener_open_to_internet,
1305
  certificates=certificates_list,
1306
+ default_action=default_action, # Use the determined default action
1307
  )
1308
  print(f"ALB HTTPS listener on port {listener_port_https} defined.")
1309
  else:
 
1312
  return https_listener
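An illustrative call site following the signature above; the certificate ARN, Cognito constructs and domain are placeholders.

from aws_cdk import Duration

https_listener = add_alb_https_listener_with_cert(
    scope=self,
    logical_id="AlbHttpsListener",
    alb=alb,
    acm_certificate_arn="arn:aws:acm:eu-west-2:123456789012:certificate/example",
    default_target_group=target_group,
    enable_cognito_auth=True,
    cognito_user_pool=user_pool,
    cognito_user_pool_client=user_pool_client,
    cognito_user_pool_domain=user_pool_domain,
    stickiness_cookie_duration=Duration.hours(12),  # passed through as the Cognito session timeout
)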
1313
 
1314
 
1315
+ def ensure_folder_exists(output_folder: str):
1316
+ """Checks if the specified folder exists, creates it if not."""
1317
 
1318
  if not os.path.exists(output_folder):
1319
  # Create the folder if it doesn't exist
 
1322
  else:
1323
  print(f"The {output_folder} folder already exists.")
1324
 
1325
+
1326
+ def create_basic_config_env(
1327
+ out_dir: str = "config",
1328
+ S3_LOG_CONFIG_BUCKET_NAME=S3_LOG_CONFIG_BUCKET_NAME,
1329
+ S3_OUTPUT_BUCKET_NAME=S3_OUTPUT_BUCKET_NAME,
1330
+ ACCESS_LOG_DYNAMODB_TABLE_NAME=ACCESS_LOG_DYNAMODB_TABLE_NAME,
1331
+ FEEDBACK_LOG_DYNAMODB_TABLE_NAME=FEEDBACK_LOG_DYNAMODB_TABLE_NAME,
1332
+ USAGE_LOG_DYNAMODB_TABLE_NAME=USAGE_LOG_DYNAMODB_TABLE_NAME,
1333
+ ):
1334
+ """
1335
  Create a basic config.env file for the user to use with their newly deployed redaction app.
1336
+ """
1337
  variables = {
1338
+ "COGNITO_AUTH": "1",
1339
+ "RUN_AWS_FUNCTIONS": "1",
1340
+ "DISPLAY_FILE_NAMES_IN_LOGS": "False",
1341
+ "SESSION_OUTPUT_FOLDER": "True",
1342
+ "SAVE_LOGS_TO_DYNAMODB": "True",
1343
+ "SHOW_COSTS": "True",
1344
+ "SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS": "True",
1345
+ "LOAD_PREVIOUS_TEXTRACT_JOBS_S3": "True",
1346
+ "DOCUMENT_REDACTION_BUCKET": S3_LOG_CONFIG_BUCKET_NAME,
1347
+ "TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET": S3_OUTPUT_BUCKET_NAME,
1348
+ "ACCESS_LOG_DYNAMODB_TABLE_NAME": ACCESS_LOG_DYNAMODB_TABLE_NAME,
1349
+ "FEEDBACK_LOG_DYNAMODB_TABLE_NAME": FEEDBACK_LOG_DYNAMODB_TABLE_NAME,
1350
+ "USAGE_LOG_DYNAMODB_TABLE_NAME": USAGE_LOG_DYNAMODB_TABLE_NAME,
 
1351
  }
1352
 
1353
  # Write variables to .env file
1354
  ensure_folder_exists(out_dir + "/")
1355
+ env_file_path = os.path.abspath(os.path.join(out_dir, "config.env"))
1356
 
1357
  # It's good practice to ensure the file exists before calling set_key repeatedly.
1358
  # set_key will create it, but for a loop, it might be cleaner to ensure it's empty/exists once.
1359
  if not os.path.exists(env_file_path):
1360
+ with open(env_file_path, "w"):
1361
+ pass # Create empty file
1362
 
1363
  for key, value in variables.items():
1364
  set_key(env_file_path, key, str(value), quote_mode="never")
1365
 
1366
  return variables
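A sketch of consuming the generated file from the app side with python-dotenv (already a dependency here via set_key); the path assumes the default out_dir of "config".

import os
from dotenv import load_dotenv

load_dotenv("config/config.env")
print(os.environ.get("COGNITO_AUTH"), os.environ.get("DOCUMENT_REDACTION_BUCKET"))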
1367
 
1368
+
1369
+ def start_codebuild_build(PROJECT_NAME: str, AWS_REGION: str = AWS_REGION):
1370
+ """
1371
  Start an existing Codebuild project build
1372
+ """
1373
 
1374
  # --- Initialize CodeBuild client ---
1375
+ client = boto3.client("codebuild", region_name=AWS_REGION)
1376
 
1377
  try:
1378
  print(f"Attempting to start build for project: {PROJECT_NAME}")
1379
 
1380
+ response = client.start_build(projectName=PROJECT_NAME)
 
 
1381
 
1382
+ build_id = response["build"]["id"]
1383
  print(f"Successfully started build with ID: {build_id}")
1384
  print(f"Build ARN: {response['build']['arn']}")
1385
+ print("Build URL (approximate - construct based on region and ID):")
1386
+ print(
1387
+ f"https://{AWS_REGION}.console.aws.amazon.com/codesuite/codebuild/projects/{PROJECT_NAME}/build/{build_id.split(':')[-1]}/detail"
1388
+ )
1389
 
1390
  # You can inspect the full response if needed
1391
  # print("\nFull response:")
 
1397
  except Exception as e:
1398
  print(f"An unexpected error occurred: {e}")
1399
 
1400
+
1401
+ def upload_file_to_s3(
1402
+ local_file_paths: List[str],
1403
+ s3_key: str,
1404
+ s3_bucket: str,
1405
+ RUN_AWS_FUNCTIONS: str = "1",
1406
+ ):
1407
  """
1408
  Uploads a file from local machine to Amazon S3.
1409
 
 
1422
  try:
1423
  if s3_bucket and local_file_paths:
1424
 
1425
+ s3_client = boto3.client("s3", region_name=AWS_REGION)
1426
 
1427
  if isinstance(local_file_paths, str):
1428
  local_file_paths = [local_file_paths]
1429
 
1430
  for file in local_file_paths:
1431
  if s3_client:
1432
+ # print(s3_client)
1433
  try:
1434
  # Get file name off file path
1435
  file_name = os.path.basename(file)
 
1438
  print("S3 key: ", s3_key_full)
1439
 
1440
  s3_client.upload_file(file, s3_bucket, s3_key_full)
1441
+ out_message = (
1442
+ "File " + file_name + " uploaded successfully!"
1443
+ )
1444
  print(out_message)
1445
+
1446
  except Exception as e:
1447
  out_message = f"Error uploading file(s): {e}"
1448
  print(out_message)
1449
 
1450
  final_out_message.append(out_message)
1451
+ final_out_message_str = "\n".join(final_out_message)
1452
 
1453
+ else:
1454
+ final_out_message_str = "Could not connect to AWS."
1455
+ else:
1456
+ final_out_message_str = (
1457
+ "At least one essential variable is empty, could not upload to S3"
1458
+ )
1459
  except Exception as e:
1460
  final_out_message_str = "Could not upload files to S3 due to: " + str(e)
1461
  print(final_out_message_str)
 
1464
 
1465
  return final_out_message_str
1466
 
1467
+
1468
  # Initialize ECS client
1469
  def start_ecs_task(cluster_name, service_name):
1470
+ ecs_client = boto3.client("ecs")
1471
+
1472
  try:
1473
  # Update the service to set the desired count to 1
1474
+ ecs_client.update_service(
1475
+ cluster=cluster_name, service=service_name, desiredCount=1
 
 
1476
  )
1477
  return {
1478
  "statusCode": 200,
1479
+ "body": f"Service {service_name} in cluster {cluster_name} has been updated to 1 task.",
1480
  }
1481
  except Exception as e:
1482
+ return {"statusCode": 500, "body": f"Error updating service: {str(e)}"}
 
 
 
cdk/cdk_stack.py CHANGED
The diff for this file is too large to render. See raw diff
 
cdk/check_resources.py CHANGED
@@ -1,50 +1,85 @@
1
  import json
2
  import os
3
- from cdk_config import CDK_PREFIX, VPC_NAME, AWS_REGION, PUBLIC_SUBNETS_TO_USE, PRIVATE_SUBNETS_TO_USE, CODEBUILD_ROLE_NAME, ECS_TASK_ROLE_NAME, ECS_TASK_EXECUTION_ROLE_NAME, S3_LOG_CONFIG_BUCKET_NAME, S3_OUTPUT_BUCKET_NAME, ECR_CDK_REPO_NAME, CODEBUILD_PROJECT_NAME, ALB_NAME, COGNITO_USER_POOL_NAME, COGNITO_USER_POOL_CLIENT_NAME, COGNITO_USER_POOL_CLIENT_SECRET_NAME, WEB_ACL_NAME, CONTEXT_FILE, PUBLIC_SUBNET_CIDR_BLOCKS, PRIVATE_SUBNET_CIDR_BLOCKS, PUBLIC_SUBNET_AVAILABILITY_ZONES, PRIVATE_SUBNET_AVAILABILITY_ZONES, CDK_FOLDER, CDK_CONFIG_PATH # Import necessary config
4
- from cdk_functions import ( # Import your check functions (assuming they use Boto3)
5
- get_vpc_id_by_name,
6
- check_subnet_exists_by_name,
7
- check_for_existing_role,
8
- check_s3_bucket_exists,
9
- check_ecr_repo_exists,
10
- check_codebuild_project_exists,
11
  check_alb_exists,
 
 
 
12
  check_for_existing_user_pool,
13
  check_for_existing_user_pool_client,
14
  check_for_secret,
15
- check_cloudfront_distribution_exists,
 
16
  check_web_acl_exists,
17
- _get_existing_subnets_in_vpc,
18
- validate_subnet_creation_parameters
19
  # Add other check functions as needed
20
  )
21
 
22
- from typing import List, Dict, Any
23
-
24
- cdk_folder = CDK_FOLDER #<FULL_PATH_TO_CDK_FOLDER_HERE>
25
 
26
  # Full path needed to find config file
27
  os.environ["CDK_CONFIG_PATH"] = cdk_folder + CDK_CONFIG_PATH
28
 
 
29
  # --- Helper to parse environment variables into lists ---
30
  def _get_env_list(env_var_name: str) -> List[str]:
31
  """Parses a comma-separated environment variable into a list of strings."""
32
- value = env_var_name[1:-1].strip().replace('\"', '').replace("\'","")
33
  if not value:
34
  return []
35
  # Split by comma and filter out any empty strings that might result from extra commas
36
- return [s.strip() for s in value.split(',') if s.strip()]
37
-
38
-
39
- if PUBLIC_SUBNETS_TO_USE and not isinstance(PUBLIC_SUBNETS_TO_USE, list): PUBLIC_SUBNETS_TO_USE = _get_env_list(PUBLIC_SUBNETS_TO_USE)
40
- if PRIVATE_SUBNETS_TO_USE and not isinstance(PRIVATE_SUBNETS_TO_USE, list): PRIVATE_SUBNETS_TO_USE = _get_env_list(PRIVATE_SUBNETS_TO_USE)
41
- if PUBLIC_SUBNET_CIDR_BLOCKS and not isinstance(PUBLIC_SUBNET_CIDR_BLOCKS, list): PUBLIC_SUBNET_CIDR_BLOCKS = _get_env_list(PUBLIC_SUBNET_CIDR_BLOCKS)
42
- if PUBLIC_SUBNET_AVAILABILITY_ZONES and not isinstance(PUBLIC_SUBNET_AVAILABILITY_ZONES, list): PUBLIC_SUBNET_AVAILABILITY_ZONES = _get_env_list(PUBLIC_SUBNET_AVAILABILITY_ZONES)
43
- if PRIVATE_SUBNET_CIDR_BLOCKS and not isinstance(PRIVATE_SUBNET_CIDR_BLOCKS, list): PRIVATE_SUBNET_CIDR_BLOCKS = _get_env_list(PRIVATE_SUBNET_CIDR_BLOCKS)
44
- if PRIVATE_SUBNET_AVAILABILITY_ZONES and not isinstance(PRIVATE_SUBNET_AVAILABILITY_ZONES, list): PRIVATE_SUBNET_AVAILABILITY_ZONES = _get_env_list(PRIVATE_SUBNET_AVAILABILITY_ZONES)
45
 
46
  # Check for the existence of elements in your AWS environment to see if it's necessary to create new versions of the same
47
 
 
48
  def check_and_set_context():
49
  context_data = {}
50
 
@@ -58,7 +93,9 @@ def check_and_set_context():
58
  # For simplicity, let's just check if *any* NAT exists in the VPC
59
  # A more robust check would match by subnet, AZ, or a specific tag.
60
  context_data["exists:NatGateway"] = True
61
- context_data["id:NatGateway"] = nat_gateways[0]['NatGatewayId'] # Store the ID of the first one found
 
 
62
  else:
63
  context_data["exists:NatGateway"] = False
64
  context_data["id:NatGateway"] = None
@@ -66,9 +103,11 @@ def check_and_set_context():
66
  if not vpc_id:
67
  # If the VPC doesn't exist, you might not be able to check/create subnets.
68
  # Decide how to handle this: raise an error, set a flag, etc.
69
- raise RuntimeError(f"Required VPC '{VPC_NAME}' not found. Cannot proceed with subnet checks.")
 
 
70
 
71
- context_data["vpc_id"] = vpc_id # Store VPC ID in context
72
 
73
  # SUBNET CHECKS
74
  context_data: Dict[str, Any] = {}
@@ -80,14 +119,14 @@ def check_and_set_context():
80
  # Determine if full validation mode is possible/desired
81
  # It's 'desired' if CIDR/AZs are provided, and their lengths match the name lists.
82
  public_ready_for_full_validation = (
83
- len(PUBLIC_SUBNETS_TO_USE) > 0 and
84
- len(PUBLIC_SUBNET_CIDR_BLOCKS) == len(PUBLIC_SUBNETS_TO_USE) and
85
- len(PUBLIC_SUBNET_AVAILABILITY_ZONES) == len(PUBLIC_SUBNETS_TO_USE)
86
  )
87
  private_ready_for_full_validation = (
88
- len(PRIVATE_SUBNETS_TO_USE) > 0 and
89
- len(PRIVATE_SUBNET_CIDR_BLOCKS) == len(PRIVATE_SUBNETS_TO_USE) and
90
- len(PRIVATE_SUBNET_AVAILABILITY_ZONES) == len(PRIVATE_SUBNETS_TO_USE)
91
  )
92
 
93
  # Activate full validation if *any* type of subnet (public or private) has its full details provided.
@@ -96,27 +135,42 @@ def check_and_set_context():
96
  full_validation_mode = True
97
 
98
  # If some are ready but others aren't, print a warning or raise an error based on your strictness
99
- if public_ready_for_full_validation and not private_ready_for_full_validation and PRIVATE_SUBNETS_TO_USE:
100
- print("Warning: Public subnets have CIDRs/AZs, but private subnets do not. Only public will be fully validated/created with CIDRs.")
101
- if private_ready_for_full_validation and not public_ready_for_full_validation and PUBLIC_SUBNETS_TO_USE:
102
- print("Warning: Private subnets have CIDRs/AZs, but public subnets do not. Only private will be fully validated/created with CIDRs.")
 
 
 
 
 
 
 
 
 
 
 
 
103
 
104
  # Prepare data for validate_subnet_creation_parameters for all subnets that have full details
105
  if public_ready_for_full_validation:
106
  for i, name in enumerate(PUBLIC_SUBNETS_TO_USE):
107
- all_proposed_subnets_data.append({
108
- 'name': name,
109
- 'cidr': PUBLIC_SUBNET_CIDR_BLOCKS[i],
110
- 'az': PUBLIC_SUBNET_AVAILABILITY_ZONES[i]
111
- })
 
 
112
  if private_ready_for_full_validation:
113
  for i, name in enumerate(PRIVATE_SUBNETS_TO_USE):
114
- all_proposed_subnets_data.append({
115
- 'name': name,
116
- 'cidr': PRIVATE_SUBNET_CIDR_BLOCKS[i],
117
- 'az': PRIVATE_SUBNET_AVAILABILITY_ZONES[i]
118
- })
119
-
 
120
 
121
  print(f"Target VPC ID for Boto3 lookup: {vpc_id}")
122
 
@@ -125,22 +179,28 @@ def check_and_set_context():
125
  existing_aws_subnets = _get_existing_subnets_in_vpc(vpc_id)
126
  except Exception as e:
127
  print(f"Failed to fetch existing VPC subnets. Aborting. Error: {e}")
128
- raise SystemExit(1) # Exit immediately if we can't get baseline data
129
-
130
  print("\n--- Running Name-Only Subnet Existence Check Mode ---")
131
  # Fallback: check only by name using the existing data
132
  checked_public_subnets = {}
133
  if PUBLIC_SUBNETS_TO_USE:
134
  for subnet_name in PUBLIC_SUBNETS_TO_USE:
135
  print("subnet_name:", subnet_name)
136
- exists, subnet_id = check_subnet_exists_by_name(subnet_name, existing_aws_subnets)
137
- checked_public_subnets[subnet_name] = {"exists": exists, "id": subnet_id}
 
 
 
 
 
138
 
139
  # If the subnet exists, remove it from the proposed subnets list
140
- if checked_public_subnets[subnet_name]["exists"] == True:
141
  all_proposed_subnets_data = [
142
- subnet for subnet in all_proposed_subnets_data
143
- if subnet['name'] != subnet_name
 
144
  ]
145
 
146
  context_data["checked_public_subnets"] = checked_public_subnets
@@ -149,74 +209,86 @@ def check_and_set_context():
149
  if PRIVATE_SUBNETS_TO_USE:
150
  for subnet_name in PRIVATE_SUBNETS_TO_USE:
151
  print("subnet_name:", subnet_name)
152
- exists, subnet_id = check_subnet_exists_by_name(subnet_name, existing_aws_subnets)
153
- checked_private_subnets[subnet_name] = {"exists": exists, "id": subnet_id}
 
 
 
 
 
154
 
155
  # If the subnet exists, remove it from the proposed subnets list
156
- if checked_private_subnets[subnet_name]["exists"] == True:
157
  all_proposed_subnets_data = [
158
- subnet for subnet in all_proposed_subnets_data
159
- if subnet['name'] != subnet_name
 
160
  ]
161
 
162
  context_data["checked_private_subnets"] = checked_private_subnets
163
 
164
-
165
-
166
  print("\nName-only existence subnet check complete.\n")
167
 
168
  if full_validation_mode:
169
- print("\n--- Running in Full Subnet Validation Mode (CIDR/AZs provided) ---")
 
 
170
  try:
171
- validate_subnet_creation_parameters(vpc_id, all_proposed_subnets_data, existing_aws_subnets)
 
 
172
  print("\nPre-synth validation successful. Proceeding with CDK synth.\n")
173
 
174
  # Populate context_data for downstream CDK construct creation
175
  context_data["public_subnets_to_create"] = []
176
  if public_ready_for_full_validation:
177
  for i, name in enumerate(PUBLIC_SUBNETS_TO_USE):
178
- context_data["public_subnets_to_create"].append({
179
- 'name': name,
180
- 'cidr': PUBLIC_SUBNET_CIDR_BLOCKS[i],
181
- 'az': PUBLIC_SUBNET_AVAILABILITY_ZONES[i],
182
- 'is_public': True
183
- })
 
 
184
  context_data["private_subnets_to_create"] = []
185
  if private_ready_for_full_validation:
186
  for i, name in enumerate(PRIVATE_SUBNETS_TO_USE):
187
- context_data["private_subnets_to_create"].append({
188
- 'name': name,
189
- 'cidr': PRIVATE_SUBNET_CIDR_BLOCKS[i],
190
- 'az': PRIVATE_SUBNET_AVAILABILITY_ZONES[i],
191
- 'is_public': False
192
- })
 
 
193
 
194
  except (ValueError, Exception) as e:
195
  print(f"\nFATAL ERROR: Subnet parameter validation failed: {e}\n")
196
- raise SystemExit(1) # Exit if validation fails
197
 
198
  # Example checks and setting context values
199
  # IAM Roles
200
  role_name = CODEBUILD_ROLE_NAME
201
  exists, _, _ = check_for_existing_role(role_name)
202
- context_data[f"exists:{role_name}"] = exists # Use boolean
203
  if exists:
204
- _, role_arn, _ = check_for_existing_role(role_name) # Get ARN if needed
205
- context_data[f"arn:{role_name}"] = role_arn
206
 
207
  role_name = ECS_TASK_ROLE_NAME
208
  exists, _, _ = check_for_existing_role(role_name)
209
  context_data[f"exists:{role_name}"] = exists
210
  if exists:
211
- _, role_arn, _ = check_for_existing_role(role_name)
212
- context_data[f"arn:{role_name}"] = role_arn
213
 
214
  role_name = ECS_TASK_EXECUTION_ROLE_NAME
215
  exists, _, _ = check_for_existing_role(role_name)
216
  context_data[f"exists:{role_name}"] = exists
217
  if exists:
218
- _, role_arn, _ = check_for_existing_role(role_name)
219
- context_data[f"arn:{role_name}"] = role_arn
220
 
221
  # S3 Buckets
222
  bucket_name = S3_LOG_CONFIG_BUCKET_NAME
@@ -230,33 +302,36 @@ def check_and_set_context():
230
  exists, _ = check_s3_bucket_exists(output_bucket_name)
231
  context_data[f"exists:{output_bucket_name}"] = exists
232
  if exists:
233
- pass
234
 
235
  # ECR Repository
236
  repo_name = ECR_CDK_REPO_NAME
237
  exists, _ = check_ecr_repo_exists(repo_name)
238
  context_data[f"exists:{repo_name}"] = exists
239
  if exists:
240
- pass # from_repository_name is sufficient
241
 
242
  # CodeBuild Project
243
  project_name = CODEBUILD_PROJECT_NAME
244
  exists, _ = check_codebuild_project_exists(project_name)
245
  context_data[f"exists:{project_name}"] = exists
246
  if exists:
247
- # Need a way to get the ARN from the check function
248
- _, project_arn = check_codebuild_project_exists(project_name) # Assuming it returns ARN
249
- context_data[f"arn:{project_name}"] = project_arn
 
 
250
 
251
  # ALB (by name lookup)
252
  alb_name = ALB_NAME
253
  exists, _ = check_alb_exists(alb_name, region_name=AWS_REGION)
254
  context_data[f"exists:{alb_name}"] = exists
255
  if exists:
256
- _, alb_object = check_alb_exists(alb_name, region_name=AWS_REGION) # Assuming check returns object
 
 
257
  print("alb_object:", alb_object)
258
- context_data[f"arn:{alb_name}"] = alb_object['LoadBalancerArn']
259
-
260
 
261
  # Cognito User Pool (by name)
262
  user_pool_name = COGNITO_USER_POOL_NAME
@@ -267,10 +342,12 @@ def check_and_set_context():
267
 
268
  # Cognito User Pool Client (by name and pool ID) - requires User Pool ID from check
269
  if user_pool_id:
270
- user_pool_id_for_client_check = user_pool_id #context_data.get(f"id:{user_pool_name}") # Use ID from context
271
  user_pool_client_name = COGNITO_USER_POOL_CLIENT_NAME
272
  if user_pool_id_for_client_check:
273
- exists, client_id, _ = check_for_existing_user_pool_client(user_pool_client_name, user_pool_id_for_client_check)
 
 
274
  context_data[f"exists:{user_pool_client_name}"] = exists
275
  if exists:
276
  context_data[f"id:{user_pool_client_name}"] = client_id
@@ -281,10 +358,11 @@ def check_and_set_context():
281
  context_data[f"exists:{secret_name}"] = exists
282
  # You might not need the ARN if using from_secret_name_v2
283
 
284
-
285
  # WAF Web ACL (by name and scope)
286
  web_acl_name = WEB_ACL_NAME
287
- exists, _ = check_web_acl_exists(web_acl_name, scope="CLOUDFRONT") # Assuming check returns object
 
 
288
  context_data[f"exists:{web_acl_name}"] = exists
289
  if exists:
290
  _, existing_web_acl = check_web_acl_exists(web_acl_name, scope="CLOUDFRONT")
@@ -295,4 +373,3 @@ def check_and_set_context():
295
  json.dump(context_data, f, indent=2)
296
 
297
  print(f"Context data written to {CONTEXT_FILE}")
298
-
 
1
  import json
2
  import os
3
+ from typing import Any, Dict, List
4
+
5
+ from cdk_config import ( # Import necessary config
6
+ ALB_NAME,
7
+ AWS_REGION,
8
+ CDK_CONFIG_PATH,
9
+ CDK_FOLDER,
10
+ CODEBUILD_PROJECT_NAME,
11
+ CODEBUILD_ROLE_NAME,
12
+ COGNITO_USER_POOL_CLIENT_NAME,
13
+ COGNITO_USER_POOL_CLIENT_SECRET_NAME,
14
+ COGNITO_USER_POOL_NAME,
15
+ CONTEXT_FILE,
16
+ ECR_CDK_REPO_NAME,
17
+ ECS_TASK_EXECUTION_ROLE_NAME,
18
+ ECS_TASK_ROLE_NAME,
19
+ PRIVATE_SUBNET_AVAILABILITY_ZONES,
20
+ PRIVATE_SUBNET_CIDR_BLOCKS,
21
+ PRIVATE_SUBNETS_TO_USE,
22
+ PUBLIC_SUBNET_AVAILABILITY_ZONES,
23
+ PUBLIC_SUBNET_CIDR_BLOCKS,
24
+ PUBLIC_SUBNETS_TO_USE,
25
+ S3_LOG_CONFIG_BUCKET_NAME,
26
+ S3_OUTPUT_BUCKET_NAME,
27
+ VPC_NAME,
28
+ WEB_ACL_NAME,
29
+ )
30
+ from cdk_functions import ( # Import your check functions (assuming they use Boto3)
31
+ _get_existing_subnets_in_vpc,
32
  check_alb_exists,
33
+ check_codebuild_project_exists,
34
+ check_ecr_repo_exists,
35
+ check_for_existing_role,
36
  check_for_existing_user_pool,
37
  check_for_existing_user_pool_client,
38
  check_for_secret,
39
+ check_s3_bucket_exists,
40
+ check_subnet_exists_by_name,
41
  check_web_acl_exists,
42
+ get_vpc_id_by_name,
43
+ validate_subnet_creation_parameters,
44
  # Add other check functions as needed
45
  )
46
 
47
+ cdk_folder = CDK_FOLDER # <FULL_PATH_TO_CDK_FOLDER_HERE>
 
 
48
 
49
  # Full path needed to find config file
50
  os.environ["CDK_CONFIG_PATH"] = cdk_folder + CDK_CONFIG_PATH
51
 
52
+
53
  # --- Helper to parse environment variables into lists ---
54
  def _get_env_list(env_var_name: str) -> List[str]:
55
  """Parses a comma-separated environment variable into a list of strings."""
56
+ value = env_var_name[1:-1].strip().replace('"', "").replace("'", "")
57
  if not value:
58
  return []
59
  # Split by comma and filter out any empty strings that might result from extra commas
60
+ return [s.strip() for s in value.split(",") if s.strip()]
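For clarity, the parser above expects the raw value of a bracketed, comma-separated variable (it strips the first and last characters), for example:

print(_get_env_list('["subnet-a", "subnet-b"]'))    # -> ['subnet-a', 'subnet-b']
print(_get_env_list("[10.0.1.0/24, 10.0.2.0/24]"))  # -> ['10.0.1.0/24', '10.0.2.0/24']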
61
+
62
+
63
+ if PUBLIC_SUBNETS_TO_USE and not isinstance(PUBLIC_SUBNETS_TO_USE, list):
64
+ PUBLIC_SUBNETS_TO_USE = _get_env_list(PUBLIC_SUBNETS_TO_USE)
65
+ if PRIVATE_SUBNETS_TO_USE and not isinstance(PRIVATE_SUBNETS_TO_USE, list):
66
+ PRIVATE_SUBNETS_TO_USE = _get_env_list(PRIVATE_SUBNETS_TO_USE)
67
+ if PUBLIC_SUBNET_CIDR_BLOCKS and not isinstance(PUBLIC_SUBNET_CIDR_BLOCKS, list):
68
+ PUBLIC_SUBNET_CIDR_BLOCKS = _get_env_list(PUBLIC_SUBNET_CIDR_BLOCKS)
69
+ if PUBLIC_SUBNET_AVAILABILITY_ZONES and not isinstance(
70
+ PUBLIC_SUBNET_AVAILABILITY_ZONES, list
71
+ ):
72
+ PUBLIC_SUBNET_AVAILABILITY_ZONES = _get_env_list(PUBLIC_SUBNET_AVAILABILITY_ZONES)
73
+ if PRIVATE_SUBNET_CIDR_BLOCKS and not isinstance(PRIVATE_SUBNET_CIDR_BLOCKS, list):
74
+ PRIVATE_SUBNET_CIDR_BLOCKS = _get_env_list(PRIVATE_SUBNET_CIDR_BLOCKS)
75
+ if PRIVATE_SUBNET_AVAILABILITY_ZONES and not isinstance(
76
+ PRIVATE_SUBNET_AVAILABILITY_ZONES, list
77
+ ):
78
+ PRIVATE_SUBNET_AVAILABILITY_ZONES = _get_env_list(PRIVATE_SUBNET_AVAILABILITY_ZONES)
79
 
80
  # Check for the existence of elements in your AWS environment to see if it's necessary to create new versions of the same
81
 
82
+
83
  def check_and_set_context():
84
  context_data = {}
85
 
 
93
  # For simplicity, let's just check if *any* NAT exists in the VPC
94
  # A more robust check would match by subnet, AZ, or a specific tag.
95
  context_data["exists:NatGateway"] = True
96
+ context_data["id:NatGateway"] = nat_gateways[0][
97
+ "NatGatewayId"
98
+ ] # Store the ID of the first one found
99
  else:
100
  context_data["exists:NatGateway"] = False
101
  context_data["id:NatGateway"] = None
 
103
  if not vpc_id:
104
  # If the VPC doesn't exist, you might not be able to check/create subnets.
105
  # Decide how to handle this: raise an error, set a flag, etc.
106
+ raise RuntimeError(
107
+ f"Required VPC '{VPC_NAME}' not found. Cannot proceed with subnet checks."
108
+ )
109
 
110
+ context_data["vpc_id"] = vpc_id # Store VPC ID in context
111
 
112
  # SUBNET CHECKS
113
  context_data: Dict[str, Any] = {}
 
119
  # Determine if full validation mode is possible/desired
120
  # It's 'desired' if CIDR/AZs are provided, and their lengths match the name lists.
121
  public_ready_for_full_validation = (
122
+ len(PUBLIC_SUBNETS_TO_USE) > 0
123
+ and len(PUBLIC_SUBNET_CIDR_BLOCKS) == len(PUBLIC_SUBNETS_TO_USE)
124
+ and len(PUBLIC_SUBNET_AVAILABILITY_ZONES) == len(PUBLIC_SUBNETS_TO_USE)
125
  )
126
  private_ready_for_full_validation = (
127
+ len(PRIVATE_SUBNETS_TO_USE) > 0
128
+ and len(PRIVATE_SUBNET_CIDR_BLOCKS) == len(PRIVATE_SUBNETS_TO_USE)
129
+ and len(PRIVATE_SUBNET_AVAILABILITY_ZONES) == len(PRIVATE_SUBNETS_TO_USE)
130
  )
131
 
132
  # Activate full validation if *any* type of subnet (public or private) has its full details provided.
 
135
  full_validation_mode = True
136
 
137
  # If some are ready but others aren't, print a warning or raise an error based on your strictness
138
+ if (
139
+ public_ready_for_full_validation
140
+ and not private_ready_for_full_validation
141
+ and PRIVATE_SUBNETS_TO_USE
142
+ ):
143
+ print(
144
+ "Warning: Public subnets have CIDRs/AZs, but private subnets do not. Only public will be fully validated/created with CIDRs."
145
+ )
146
+ if (
147
+ private_ready_for_full_validation
148
+ and not public_ready_for_full_validation
149
+ and PUBLIC_SUBNETS_TO_USE
150
+ ):
151
+ print(
152
+ "Warning: Private subnets have CIDRs/AZs, but public subnets do not. Only private will be fully validated/created with CIDRs."
153
+ )
154
 
155
  # Prepare data for validate_subnet_creation_parameters for all subnets that have full details
156
  if public_ready_for_full_validation:
157
  for i, name in enumerate(PUBLIC_SUBNETS_TO_USE):
158
+ all_proposed_subnets_data.append(
159
+ {
160
+ "name": name,
161
+ "cidr": PUBLIC_SUBNET_CIDR_BLOCKS[i],
162
+ "az": PUBLIC_SUBNET_AVAILABILITY_ZONES[i],
163
+ }
164
+ )
165
  if private_ready_for_full_validation:
166
  for i, name in enumerate(PRIVATE_SUBNETS_TO_USE):
167
+ all_proposed_subnets_data.append(
168
+ {
169
+ "name": name,
170
+ "cidr": PRIVATE_SUBNET_CIDR_BLOCKS[i],
171
+ "az": PRIVATE_SUBNET_AVAILABILITY_ZONES[i],
172
+ }
173
+ )
174
 
175
  print(f"Target VPC ID for Boto3 lookup: {vpc_id}")
176
 
 
179
  existing_aws_subnets = _get_existing_subnets_in_vpc(vpc_id)
180
  except Exception as e:
181
  print(f"Failed to fetch existing VPC subnets. Aborting. Error: {e}")
182
+ raise SystemExit(1) # Exit immediately if we can't get baseline data
183
+
184
  print("\n--- Running Name-Only Subnet Existence Check Mode ---")
185
  # Fallback: check only by name using the existing data
186
  checked_public_subnets = {}
187
  if PUBLIC_SUBNETS_TO_USE:
188
  for subnet_name in PUBLIC_SUBNETS_TO_USE:
189
  print("subnet_name:", subnet_name)
190
+ exists, subnet_id = check_subnet_exists_by_name(
191
+ subnet_name, existing_aws_subnets
192
+ )
193
+ checked_public_subnets[subnet_name] = {
194
+ "exists": exists,
195
+ "id": subnet_id,
196
+ }
197
 
198
  # If the subnet exists, remove it from the proposed subnets list
199
+ if checked_public_subnets[subnet_name]["exists"] is True:
200
  all_proposed_subnets_data = [
201
+ subnet
202
+ for subnet in all_proposed_subnets_data
203
+ if subnet["name"] != subnet_name
204
  ]
205
 
206
  context_data["checked_public_subnets"] = checked_public_subnets
 
209
  if PRIVATE_SUBNETS_TO_USE:
210
  for subnet_name in PRIVATE_SUBNETS_TO_USE:
211
  print("subnet_name:", subnet_name)
212
+ exists, subnet_id = check_subnet_exists_by_name(
213
+ subnet_name, existing_aws_subnets
214
+ )
215
+ checked_private_subnets[subnet_name] = {
216
+ "exists": exists,
217
+ "id": subnet_id,
218
+ }
219
 
220
  # If the subnet exists, remove it from the proposed subnets list
221
+ if checked_private_subnets[subnet_name]["exists"] is True:
222
  all_proposed_subnets_data = [
223
+ subnet
224
+ for subnet in all_proposed_subnets_data
225
+ if subnet["name"] != subnet_name
226
  ]
227
 
228
  context_data["checked_private_subnets"] = checked_private_subnets
229
 
 
 
230
  print("\nName-only existence subnet check complete.\n")
231
 
232
  if full_validation_mode:
233
+ print(
234
+ "\n--- Running in Full Subnet Validation Mode (CIDR/AZs provided) ---"
235
+ )
236
  try:
237
+ validate_subnet_creation_parameters(
238
+ vpc_id, all_proposed_subnets_data, existing_aws_subnets
239
+ )
240
  print("\nPre-synth validation successful. Proceeding with CDK synth.\n")
241
 
242
  # Populate context_data for downstream CDK construct creation
243
  context_data["public_subnets_to_create"] = []
244
  if public_ready_for_full_validation:
245
  for i, name in enumerate(PUBLIC_SUBNETS_TO_USE):
246
+ context_data["public_subnets_to_create"].append(
247
+ {
248
+ "name": name,
249
+ "cidr": PUBLIC_SUBNET_CIDR_BLOCKS[i],
250
+ "az": PUBLIC_SUBNET_AVAILABILITY_ZONES[i],
251
+ "is_public": True,
252
+ }
253
+ )
254
  context_data["private_subnets_to_create"] = []
255
  if private_ready_for_full_validation:
256
  for i, name in enumerate(PRIVATE_SUBNETS_TO_USE):
257
+ context_data["private_subnets_to_create"].append(
258
+ {
259
+ "name": name,
260
+ "cidr": PRIVATE_SUBNET_CIDR_BLOCKS[i],
261
+ "az": PRIVATE_SUBNET_AVAILABILITY_ZONES[i],
262
+ "is_public": False,
263
+ }
264
+ )
265
 
266
  except (ValueError, Exception) as e:
267
  print(f"\nFATAL ERROR: Subnet parameter validation failed: {e}\n")
268
+ raise SystemExit(1) # Exit if validation fails
269
 
270
  # Example checks and setting context values
271
  # IAM Roles
272
  role_name = CODEBUILD_ROLE_NAME
273
  exists, _, _ = check_for_existing_role(role_name)
274
+ context_data[f"exists:{role_name}"] = exists # Use boolean
275
  if exists:
276
+ _, role_arn, _ = check_for_existing_role(role_name) # Get ARN if needed
277
+ context_data[f"arn:{role_name}"] = role_arn
278
 
279
  role_name = ECS_TASK_ROLE_NAME
280
  exists, _, _ = check_for_existing_role(role_name)
281
  context_data[f"exists:{role_name}"] = exists
282
  if exists:
283
+ _, role_arn, _ = check_for_existing_role(role_name)
284
+ context_data[f"arn:{role_name}"] = role_arn
285
 
286
  role_name = ECS_TASK_EXECUTION_ROLE_NAME
287
  exists, _, _ = check_for_existing_role(role_name)
288
  context_data[f"exists:{role_name}"] = exists
289
  if exists:
290
+ _, role_arn, _ = check_for_existing_role(role_name)
291
+ context_data[f"arn:{role_name}"] = role_arn
292
 
293
  # S3 Buckets
294
  bucket_name = S3_LOG_CONFIG_BUCKET_NAME
 
302
  exists, _ = check_s3_bucket_exists(output_bucket_name)
303
  context_data[f"exists:{output_bucket_name}"] = exists
304
  if exists:
305
+ pass
306
 
307
  # ECR Repository
308
  repo_name = ECR_CDK_REPO_NAME
309
  exists, _ = check_ecr_repo_exists(repo_name)
310
  context_data[f"exists:{repo_name}"] = exists
311
  if exists:
312
+ pass # from_repository_name is sufficient
313
 
314
  # CodeBuild Project
315
  project_name = CODEBUILD_PROJECT_NAME
316
  exists, _ = check_codebuild_project_exists(project_name)
317
  context_data[f"exists:{project_name}"] = exists
318
  if exists:
319
+ # Need a way to get the ARN from the check function
320
+ _, project_arn = check_codebuild_project_exists(
321
+ project_name
322
+ ) # Assuming it returns ARN
323
+ context_data[f"arn:{project_name}"] = project_arn
324
 
325
  # ALB (by name lookup)
326
  alb_name = ALB_NAME
327
  exists, _ = check_alb_exists(alb_name, region_name=AWS_REGION)
328
  context_data[f"exists:{alb_name}"] = exists
329
  if exists:
330
+ _, alb_object = check_alb_exists(
331
+ alb_name, region_name=AWS_REGION
332
+ ) # Assuming check returns object
333
  print("alb_object:", alb_object)
334
+ context_data[f"arn:{alb_name}"] = alb_object["LoadBalancerArn"]
 
335
 
336
  # Cognito User Pool (by name)
337
  user_pool_name = COGNITO_USER_POOL_NAME
 
342
 
343
  # Cognito User Pool Client (by name and pool ID) - requires User Pool ID from check
344
  if user_pool_id:
345
+ user_pool_id_for_client_check = user_pool_id # context_data.get(f"id:{user_pool_name}") # Use ID from context
346
  user_pool_client_name = COGNITO_USER_POOL_CLIENT_NAME
347
  if user_pool_id_for_client_check:
348
+ exists, client_id, _ = check_for_existing_user_pool_client(
349
+ user_pool_id_for_client_check, user_pool_client_name  # user_pool_id first, to match the function signature
350
+ )
351
  context_data[f"exists:{user_pool_client_name}"] = exists
352
  if exists:
353
  context_data[f"id:{user_pool_client_name}"] = client_id
 
358
  context_data[f"exists:{secret_name}"] = exists
359
  # You might not need the ARN if using from_secret_name_v2
360
 
 
361
  # WAF Web ACL (by name and scope)
362
  web_acl_name = WEB_ACL_NAME
363
+ exists, _ = check_web_acl_exists(
364
+ web_acl_name, scope="CLOUDFRONT"
365
+ ) # Assuming check returns object
366
  context_data[f"exists:{web_acl_name}"] = exists
367
  if exists:
368
  _, existing_web_acl = check_web_acl_exists(web_acl_name, scope="CLOUDFRONT")
 
373
  json.dump(context_data, f, indent=2)
374
 
375
  print(f"Context data written to {CONTEXT_FILE}")
 
cdk/post_cdk_build_quickstart.py CHANGED
@@ -1,6 +1,17 @@
1
  import time
2
- from cdk_config import CODEBUILD_PROJECT_NAME, S3_LOG_CONFIG_BUCKET_NAME, CLUSTER_NAME, ECS_SERVICE_NAME
3
- from cdk_functions import start_codebuild_build, upload_file_to_s3, start_ecs_task, create_basic_config_env
 
 
 
 
 
 
 
 
 
 
 
4
  from tqdm import tqdm
5
 
6
  # Create basic config.env file that user can use to run the app later. Input is the folder it is saved into.
@@ -11,10 +22,12 @@ print("Starting CodeBuild project.")
11
  start_codebuild_build(PROJECT_NAME=CODEBUILD_PROJECT_NAME)
12
 
13
  # Upload config.env file to S3 bucket
14
- upload_file_to_s3(local_file_paths="config/config.env", s3_key="", s3_bucket=S3_LOG_CONFIG_BUCKET_NAME)
 
 
15
 
16
- total_seconds = 660 # 11 minutes
17
- update_interval = 1 # Update every second
18
 
19
  print("Waiting 11 minutes for the CodeBuild container to build.")
20
 
@@ -24,4 +37,4 @@ for i in tqdm(range(total_seconds), desc="Building container"):
24
 
25
  # Start task on ECS
26
  print("Starting ECS task")
27
- start_ecs_task(cluster_name=CLUSTER_NAME, service_name=ECS_SERVICE_NAME)
 
1
  import time
2
+
3
+ from cdk_config import (
4
+ CLUSTER_NAME,
5
+ CODEBUILD_PROJECT_NAME,
6
+ ECS_SERVICE_NAME,
7
+ S3_LOG_CONFIG_BUCKET_NAME,
8
+ )
9
+ from cdk_functions import (
10
+ create_basic_config_env,
11
+ start_codebuild_build,
12
+ start_ecs_task,
13
+ upload_file_to_s3,
14
+ )
15
  from tqdm import tqdm
16
 
17
  # Create basic config.env file that user can use to run the app later. Input is the folder it is saved into.
 
22
  start_codebuild_build(PROJECT_NAME=CODEBUILD_PROJECT_NAME)
23
 
24
  # Upload config.env file to S3 bucket
25
+ upload_file_to_s3(
26
+ local_file_paths="config/config.env", s3_key="", s3_bucket=S3_LOG_CONFIG_BUCKET_NAME
27
+ )
28
 
29
+ total_seconds = 660 # 11 minutes
30
+ update_interval = 1 # Update every second
31
 
32
  print("Waiting 11 minutes for the CodeBuild container to build.")
33
 
 
37
 
38
  # Start task on ECS
39
  print("Starting ECS task")
40
+ start_ecs_task(cluster_name=CLUSTER_NAME, service_name=ECS_SERVICE_NAME)
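The quickstart relies on create_basic_config_env, whose body is not shown in this diff. A rough, purely illustrative sketch of what such a helper plausibly does (the keys and the real cdk_functions implementation may differ):

    import os

    def create_basic_config_env(folder: str = "config") -> str:
        # Write a minimal config.env into `folder` for the app to load later (illustrative only).
        os.makedirs(folder, exist_ok=True)
        path = os.path.join(folder, "config.env")
        # Assumed keys; not taken from the repository's actual config.
        settings = {"OUTPUT_FOLDER": "output/", "INPUT_FOLDER": "input/"}
        with open(path, "w") as f:
            for key, value in settings.items():
                f.write(f"{key}={value}\n")
        return path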
cli_redact.py CHANGED
@@ -1,70 +1,145 @@
1
  import argparse
2
  import os
3
- import pandas as pd
4
  import time
5
  import uuid
6
- from tools.config import LOCAL_PII_OPTION, AWS_PII_OPTION, OUTPUT_FOLDER, DEFAULT_LANGUAGE, CHOSEN_COMPREHEND_ENTITIES, FULL_COMPREHEND_ENTITY_LIST, CHOSEN_REDACT_ENTITIES, FULL_ENTITY_LIST, CUSTOM_ENTITIES, AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION, DOCUMENT_REDACTION_BUCKET, DEFAULT_COST_CODE, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, DISPLAY_FILE_NAMES_IN_LOGS, DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX, DO_INITIAL_TABULAR_DATA_CLEAN, ALLOW_LIST_PATH, DENY_LIST_PATH, WHOLE_PAGE_REDACTION_LIST_PATH, PREPROCESS_LOCAL_OCR_IMAGES, IMAGES_DPI, RETURN_PDF_END_OF_REDACTION, COMPRESS_REDACTED_PDF, CHOSEN_LOCAL_OCR_MODEL, DEFAULT_TABULAR_ANONYMISATION_STRATEGY, DEFAULT_FUZZY_SPELLING_MISTAKES_NUM, DEFAULT_DUPLICATE_DETECTION_THRESHOLD, DEFAULT_MIN_WORD_COUNT, DEFAULT_MIN_CONSECUTIVE_PAGES, USE_GREEDY_DUPLICATE_DETECTION, DEFAULT_COMBINE_PAGES, REMOVE_DUPLICATE_ROWS, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, INPUT_FOLDER, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, SESSION_OUTPUT_FOLDER, DIRECT_MODE_DEFAULT_USER, RUN_AWS_FUNCTIONS, S3_USAGE_LOGS_FOLDER
7
 
8
- from tools.helper_functions import ensure_output_folder_exists
 
 
 
 
 
 
 
 
9
 
10
 
11
  def _generate_session_hash() -> str:
12
  """Generate a unique session hash for logging purposes."""
13
  return str(uuid.uuid4())[:8]
14
 
15
- def get_username_and_folders(username:str = "",
16
- output_folder_textbox:str=OUTPUT_FOLDER,
17
- input_folder_textbox:str=INPUT_FOLDER,
18
- session_output_folder:str=SESSION_OUTPUT_FOLDER,
19
- textract_document_upload_input_folder:str=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER,
20
- textract_document_upload_output_folder:str=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER,
21
- s3_textract_document_logs_subfolder:str=TEXTRACT_JOBS_S3_LOC,
22
- local_textract_document_logs_subfolder:str=TEXTRACT_JOBS_LOCAL_LOC):
23
 
 
 
 
 
 
 
 
 
 
 
24
 
25
  # Generate session hash for logging. Either from input user name or generated
26
  if username:
27
  out_session_hash = username
28
  else:
29
- out_session_hash = _generate_session_hash()
30
-
31
 
32
- if session_output_folder == 'True' or session_output_folder == True:
33
  output_folder = output_folder_textbox + out_session_hash + "/"
34
  input_folder = input_folder_textbox + out_session_hash + "/"
35
 
36
- textract_document_upload_input_folder = textract_document_upload_input_folder + "/" + out_session_hash
37
- textract_document_upload_output_folder = textract_document_upload_output_folder + "/" + out_session_hash
 
 
 
 
38
 
39
- s3_textract_document_logs_subfolder = s3_textract_document_logs_subfolder + "/" + out_session_hash
40
- local_textract_document_logs_subfolder = local_textract_document_logs_subfolder + "/" + out_session_hash + "/"
 
 
 
 
41
 
42
  else:
43
  output_folder = output_folder_textbox
44
  input_folder = input_folder_textbox
45
 
46
- if not os.path.exists(output_folder): os.mkdir(output_folder)
47
- if not os.path.exists(input_folder): os.mkdir(input_folder)
48
-
49
- return out_session_hash, output_folder, out_session_hash, input_folder, textract_document_upload_input_folder, textract_document_upload_output_folder, s3_textract_document_logs_subfolder, local_textract_document_logs_subfolder
 
 
 
 
 
 
 
 
 
 
 
50
 
51
 
52
  def _get_env_list(env_var_name: str) -> list[str]:
53
  """Parses a comma-separated environment variable into a list of strings."""
54
- value = env_var_name[1:-1].strip().replace('\"', '').replace("\'","")
55
  if not value:
56
  return []
57
  # Split by comma and filter out any empty strings that might result from extra commas
58
- return [s.strip() for s in value.split(',') if s.strip()]
 
59
 
60
  # --- Constants and Configuration ---
61
 
62
- if CHOSEN_COMPREHEND_ENTITIES: CHOSEN_COMPREHEND_ENTITIES = _get_env_list(CHOSEN_COMPREHEND_ENTITIES)
63
- if FULL_COMPREHEND_ENTITY_LIST: FULL_COMPREHEND_ENTITY_LIST = _get_env_list(FULL_COMPREHEND_ENTITY_LIST)
64
- if CHOSEN_REDACT_ENTITIES: CHOSEN_REDACT_ENTITIES = _get_env_list(CHOSEN_REDACT_ENTITIES)
65
- if FULL_ENTITY_LIST: FULL_ENTITY_LIST = _get_env_list(FULL_ENTITY_LIST)
66
- if CUSTOM_ENTITIES: CUSTOM_ENTITIES = _get_env_list(CUSTOM_ENTITIES)
67
- if DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX: DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX = _get_env_list(DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX)
 
 
 
 
 
 
 
 
68
 
69
  # Add custom spacy recognisers to the Comprehend list, so that local Spacy model can be used to pick up e.g. titles, streetnames, UK postcodes that are sometimes missed by comprehend
70
  CHOSEN_COMPREHEND_ENTITIES.extend(CUSTOM_ENTITIES)
@@ -76,19 +151,20 @@ chosen_comprehend_entities = CHOSEN_COMPREHEND_ENTITIES
76
  full_comprehend_entity_list = FULL_COMPREHEND_ENTITY_LIST
77
  default_handwrite_signature_checkbox = DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX
78
 
 
79
  # --- Main CLI Function ---
80
  def main(direct_mode_args={}):
81
  """
82
  A unified command-line interface to prepare, redact, and anonymise various document types.
83
-
84
  Args:
85
  direct_mode_args (dict, optional): Dictionary of arguments for direct mode execution.
86
  If provided, uses these instead of parsing command line arguments.
87
  """
88
  parser = argparse.ArgumentParser(
89
- description='A versatile CLI for redacting PII from PDF/image files and anonymising Word/tabular data.',
90
  formatter_class=argparse.RawTextHelpFormatter,
91
- epilog='''
92
  Examples:
93
 
94
  To run these, you need to do the following:
@@ -171,95 +247,334 @@ python cli_redact.py --task textract --textract_action retrieve --job_id 1234567
171
  ## List recent Textract jobs:
172
  python cli_redact.py --task textract --textract_action list
173
 
174
- '''
175
  )
176
 
177
  # --- Task Selection ---
178
- task_group = parser.add_argument_group('Task Selection')
179
- task_group.add_argument('--task',
180
- choices=['redact', 'deduplicate', 'textract'],
181
- default='redact',
182
- help='Task to perform: redact (PII redaction/anonymisation), deduplicate (find duplicate content), or textract (AWS Textract batch operations).')
 
 
183
 
184
  # --- General Arguments (apply to all file types) ---
185
- general_group = parser.add_argument_group('General Options')
186
- general_group.add_argument('--input_file', nargs='+', help='Path to the input file(s) to process. Separate multiple files with a space, and use quotes if there are spaces in the file name.')
187
- general_group.add_argument('--output_dir', default=OUTPUT_FOLDER, help='Directory for all output files.')
188
- general_group.add_argument('--input_dir', default=INPUT_FOLDER, help='Directory for all input files.')
189
- general_group.add_argument('--language', default=DEFAULT_LANGUAGE, help='Language of the document content.')
190
- general_group.add_argument('--allow_list', default=ALLOW_LIST_PATH, help='Path to a CSV file with words to exclude from redaction.')
191
- general_group.add_argument('--pii_detector', choices=[LOCAL_PII_OPTION, AWS_PII_OPTION, "None"], default=LOCAL_PII_OPTION,
192
- help='Core PII detection method (Local or AWS Comprehend, or None).')
193
- general_group.add_argument('--username', default=DIRECT_MODE_DEFAULT_USER, help='Username for the session.')
194
- general_group.add_argument('--save_to_user_folders', default=SESSION_OUTPUT_FOLDER, help='Whether to save to user folders or not.')
195
-
196
- general_group.add_argument('--local_redact_entities', nargs='+', choices=full_entity_list, default=chosen_redact_entities,
197
- help=f'Local redaction entities to use. Default: {chosen_redact_entities}. Full list: {full_entity_list}.')
198
-
199
- general_group.add_argument('--aws_redact_entities', nargs='+', choices=full_comprehend_entity_list, default=chosen_comprehend_entities,
200
- help=f'AWS redaction entities to use. Default: {chosen_comprehend_entities}. Full list: {full_comprehend_entity_list}.')
201
-
202
- general_group.add_argument('--aws_access_key', default=AWS_ACCESS_KEY, help='Your AWS Access Key ID.')
203
- general_group.add_argument('--aws_secret_key', default=AWS_SECRET_KEY, help='Your AWS Secret Access Key.')
204
- general_group.add_argument('--cost_code', default=DEFAULT_COST_CODE, help='Cost code for tracking usage.')
205
- general_group.add_argument('--aws_region', default=AWS_REGION, help='AWS region for cloud services.')
206
- general_group.add_argument('--s3_bucket', default=DOCUMENT_REDACTION_BUCKET, help='S3 bucket name for cloud operations.')
207
- general_group.add_argument('--do_initial_clean', default=DO_INITIAL_TABULAR_DATA_CLEAN, help='Perform initial text cleaning for tabular data.')
208
- general_group.add_argument('--save_logs_to_csv', default=SAVE_LOGS_TO_CSV, help='Save processing logs to CSV files.')
209
- general_group.add_argument('--save_logs_to_dynamodb', default=SAVE_LOGS_TO_DYNAMODB, help='Save processing logs to DynamoDB.')
210
- general_group.add_argument('--display_file_names_in_logs', default=DISPLAY_FILE_NAMES_IN_LOGS, help='Include file names in log outputs.')
211
- general_group.add_argument('--upload_logs_to_s3', default=RUN_AWS_FUNCTIONS == "1", help='Upload log files to S3 after processing.')
212
- general_group.add_argument('--s3_logs_prefix', default=S3_USAGE_LOGS_FOLDER, help='S3 prefix for usage log files.')
 
 
 
 
 
 
 
 
 
 
 
 
 
213
 
214
  # --- PDF/Image Redaction Arguments ---
215
- pdf_group = parser.add_argument_group('PDF/Image Redaction Options (.pdf, .png, .jpg)')
216
- pdf_group.add_argument('--ocr_method', choices=["AWS Textract", "Local OCR", "Local text"], default="Local OCR", help='OCR method for text extraction from images.')
217
- pdf_group.add_argument('--page_min', type=int, default=0, help='First page to redact.')
218
- pdf_group.add_argument('--page_max', type=int, default=0, help='Last page to redact.')
219
- pdf_group.add_argument('--images_dpi', type=float, default=float(IMAGES_DPI), help='DPI for image processing.')
220
- pdf_group.add_argument('--chosen_local_ocr_model', choices=['tesseract', 'hybrid', 'paddle'], default=CHOSEN_LOCAL_OCR_MODEL, help='Local OCR model to use.')
221
- pdf_group.add_argument('--preprocess_local_ocr_images', default=PREPROCESS_LOCAL_OCR_IMAGES, help='Preprocess images before OCR.')
222
- pdf_group.add_argument('--compress_redacted_pdf', default=COMPRESS_REDACTED_PDF, help='Compress the final redacted PDF.')
223
- pdf_group.add_argument('--return_pdf_end_of_redaction', default=RETURN_PDF_END_OF_REDACTION, help='Return PDF at end of redaction process.')
224
- pdf_group.add_argument('--deny_list_file', default=DENY_LIST_PATH, help='Custom words file to recognize for redaction.')
225
- pdf_group.add_argument('--allow_list_file', default=ALLOW_LIST_PATH, help='Custom words file to recognize for redaction.')
226
- pdf_group.add_argument('--redact_whole_page_file', default=WHOLE_PAGE_REDACTION_LIST_PATH, help='File for pages to redact completely.')
227
- pdf_group.add_argument('--handwrite_signature_extraction', nargs='+', default=default_handwrite_signature_checkbox, help='Handwriting and signature extraction options. Choose from "Extract handwriting", "Extract signatures".')
228
- pdf_group.add_argument('--extract_forms', action='store_true', help='Extract forms during Textract analysis.')
229
- pdf_group.add_argument('--extract_tables', action='store_true', help='Extract tables during Textract analysis.')
230
- pdf_group.add_argument('--extract_layout', action='store_true', help='Extract layout during Textract analysis.')
 
 
 
 
 
 
 
 
 
 
 
 
 
231
 
232
  # --- Word/Tabular Anonymisation Arguments ---
233
- tabular_group = parser.add_argument_group('Word/Tabular Anonymisation Options (.docx, .csv, .xlsx)')
234
- tabular_group.add_argument('--anon_strategy', choices=['redact', 'redact completely', 'replace_redacted', 'entity_type', 'encrypt', 'hash', 'replace with \'REDACTED\'', 'replace with <ENTITY_NAME>', 'mask', 'fake_first_name'], default=DEFAULT_TABULAR_ANONYMISATION_STRATEGY, help='The anonymisation strategy to apply.')
235
- tabular_group.add_argument('--text_columns', nargs='+', default=list(), help='A list of column names to anonymise or deduplicate in tabular data.')
236
- tabular_group.add_argument('--excel_sheets', nargs='+', default=list(), help='Specific Excel sheet names to process.')
237
- tabular_group.add_argument('--fuzzy_mistakes', type=int, default=DEFAULT_FUZZY_SPELLING_MISTAKES_NUM, help='Number of allowed spelling mistakes for fuzzy matching.')
238
- tabular_group.add_argument('--match_fuzzy_whole_phrase_bool', default=True, help='Match fuzzy whole phrase boolean.')
 
 
 
 
 
 
 
 
 
239
  # --- Duplicate Detection Arguments ---
240
- duplicate_group = parser.add_argument_group('Duplicate Detection Options')
241
- duplicate_group.add_argument('--duplicate_type', choices=['pages', 'tabular'], default='pages', help='Type of duplicate detection: pages (for OCR files) or tabular (for CSV/Excel files).')
242
- duplicate_group.add_argument('--similarity_threshold', type=float, default=DEFAULT_DUPLICATE_DETECTION_THRESHOLD, help='Similarity threshold (0-1) to consider content as duplicates.')
243
- duplicate_group.add_argument('--min_word_count', type=int, default=DEFAULT_MIN_WORD_COUNT, help='Minimum word count for text to be considered in duplicate analysis.')
244
- duplicate_group.add_argument('--min_consecutive_pages', type=int, default=DEFAULT_MIN_CONSECUTIVE_PAGES, help='Minimum number of consecutive pages to consider as a match.')
245
- duplicate_group.add_argument('--greedy_match', default=USE_GREEDY_DUPLICATE_DETECTION, help='Use greedy matching strategy for consecutive pages.')
246
- duplicate_group.add_argument('--combine_pages', default=DEFAULT_COMBINE_PAGES, help='Combine text from the same page number within a file. Alternative will enable line-level duplicate detection.')
247
- duplicate_group.add_argument('--remove_duplicate_rows', default=REMOVE_DUPLICATE_ROWS, help='Remove duplicate rows from the output.')
 
 
 
 
 
 
 
 
248
 
249
  # --- Textract Batch Operations Arguments ---
250
- textract_group = parser.add_argument_group('Textract Batch Operations Options')
251
- textract_group.add_argument('--textract_action',
252
- choices=['submit', 'retrieve', 'list'],
253
- help='Textract action to perform: submit (submit document for analysis), retrieve (get results by job ID), or list (show recent jobs).')
254
- textract_group.add_argument('--job_id', help='Textract job ID for retrieve action.')
255
- textract_group.add_argument('--extract_signatures', action='store_true', help='Extract signatures during Textract analysis (for submit action).')
256
- textract_group.add_argument('--textract_bucket', default=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, help='S3 bucket name for Textract operations (overrides default).')
257
- textract_group.add_argument('--textract_input_prefix', default=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, help='S3 prefix for input files in Textract operations.')
258
- textract_group.add_argument('--textract_output_prefix', default=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, help='S3 prefix for output files in Textract operations.')
259
- textract_group.add_argument('--s3_textract_document_logs_subfolder', default=TEXTRACT_JOBS_S3_LOC, help='S3 prefix for logs in Textract operations.')
260
- textract_group.add_argument('--local_textract_document_logs_subfolder', default=TEXTRACT_JOBS_LOCAL_LOC, help='Local prefix for logs in Textract operations.')
261
- textract_group.add_argument('--poll_interval', type=int, default=30, help='Polling interval in seconds for Textract job status.')
262
- textract_group.add_argument('--max_poll_attempts', type=int, default=120, help='Maximum number of polling attempts for Textract job completion.')
 
 
 
 
 
 
 
 
263
  # Parse arguments - either from command line or direct mode
264
  if direct_mode_args:
265
  # Use direct mode arguments
@@ -270,42 +585,70 @@ python cli_redact.py --task textract --textract_action list
270
 
271
  # --- Initial Setup ---
272
  # Convert string boolean variables to boolean
273
- if args.preprocess_local_ocr_images == "True": args.preprocess_local_ocr_images = True
274
- else: args.preprocess_local_ocr_images = False
275
- if args.greedy_match == "True": args.greedy_match = True
276
- else: args.greedy_match = False
277
- if args.combine_pages == "True": args.combine_pages = True
278
- else: args.combine_pages = False
279
- if args.remove_duplicate_rows == "True": args.remove_duplicate_rows = True
280
- else: args.remove_duplicate_rows = False
281
- if args.return_pdf_end_of_redaction == "True": args.return_pdf_end_of_redaction = True
282
- else: args.return_pdf_end_of_redaction = False
283
- if args.compress_redacted_pdf == "True": args.compress_redacted_pdf = True
284
- else: args.compress_redacted_pdf = False
285
- if args.do_initial_clean == "True": args.do_initial_clean = True
286
- else: args.do_initial_clean = False
287
- if args.save_logs_to_csv == "True": args.save_logs_to_csv = True
288
- else: args.save_logs_to_csv = False
289
- if args.save_logs_to_dynamodb == "True": args.save_logs_to_dynamodb = True
290
- else: args.save_logs_to_dynamodb = False
291
- if args.display_file_names_in_logs == "True": args.display_file_names_in_logs = True
292
- else: args.display_file_names_in_logs = False
293
- if args.match_fuzzy_whole_phrase_bool == "True": args.match_fuzzy_whole_phrase_bool = True
294
- else: args.match_fuzzy_whole_phrase_bool = False
295
- if args.save_to_user_folders == "True": args.save_to_user_folders = True
296
- else: args.save_to_user_folders = False
 
 
 
 
 
297
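The block of string-to-boolean conversions above repeats the same pattern for a dozen flags; a compact equivalent, shown only as a sketch and not part of the commit:

    def _coerce_flag(value) -> bool:
        # Mirrors the inline checks: only the literal string "True" maps to True.
        # Note that, like the inline version, a genuine boolean True also ends up
        # False, which may or may not be intended.
        return value == "True"

    for flag in (
        "preprocess_local_ocr_images", "greedy_match", "combine_pages",
        "remove_duplicate_rows", "return_pdf_end_of_redaction",
        "compress_redacted_pdf", "do_initial_clean", "save_logs_to_csv",
        "save_logs_to_dynamodb", "display_file_names_in_logs",
        "match_fuzzy_whole_phrase_bool", "save_to_user_folders",
    ):
        setattr(args, flag, _coerce_flag(getattr(args, flag)))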
 
298
  # Combine extraction options
299
- extraction_options = list(args.handwrite_signature_extraction) if args.handwrite_signature_extraction else []
 
 
 
 
300
  if args.extract_forms:
301
- extraction_options.append('Extract forms')
302
  if args.extract_tables:
303
- extraction_options.append('Extract tables')
304
  if args.extract_layout:
305
- extraction_options.append('Extract layout')
306
  args.handwrite_signature_extraction = extraction_options
307
 
308
- if args.task in ['redact', 'deduplicate']:
309
  if args.input_file:
310
  if isinstance(args.input_file, str):
311
  args.input_file = [args.input_file]
@@ -314,25 +657,46 @@ python cli_redact.py --task textract --textract_action list
314
  file_extension = file_extension.lower()
315
  else:
316
  raise ValueError("Error: --input_file is required for 'redact' task.")
317
-
318
  # Initialise usage logger if logging is enabled
319
  usage_logger = None
320
  if args.save_logs_to_csv or args.save_logs_to_dynamodb:
321
  from tools.cli_usage_logger import create_cli_usage_logger
 
322
  try:
323
  usage_logger = create_cli_usage_logger()
324
  except Exception as e:
325
  print(f"Warning: Could not initialise usage logger: {e}")
326
 
327
  # Get username and folders
328
- session_hash, args.output_dir, _, args.input_dir, args.textract_input_prefix, args.textract_output_prefix, args.s3_textract_document_logs_subfolder, args.local_textract_document_logs_subfolder = get_username_and_folders(username=args.username, output_folder_textbox=args.output_dir, input_folder_textbox=args.input_dir, session_output_folder=args.save_to_user_folders, textract_document_upload_input_folder=args.textract_input_prefix, textract_document_upload_output_folder=args.textract_output_prefix, s3_textract_document_logs_subfolder=args.s3_textract_document_logs_subfolder, local_textract_document_logs_subfolder=args.local_textract_document_logs_subfolder)
 
 
 
 
 
329
 
330
- print(f"Conducting analyses with user {args.username}. Outputs will be saved to {args.output_dir}.")
 
 
331
 
332
  # --- Route to the Correct Workflow Based on Task and File Type ---
333
 
334
  # Validate input_file requirement for tasks that need it
335
- if args.task in ['redact', 'deduplicate'] and not args.input_file:
336
  print(f"Error: --input_file is required for '{args.task}' task.")
337
  return
338
 
@@ -342,70 +706,151 @@ python cli_redact.py --task textract --textract_action list
342
  args.prepare_images = False
343
 
344
  from tools.cli_usage_logger import create_cli_usage_logger, log_redaction_usage
 
345
  # Task 1: Redaction/Anonymisation
346
- if args.task == 'redact':
347
 
348
  # Workflow 1: PDF/Image Redaction
349
- if file_extension in ['.pdf', '.png', '.jpg', '.jpeg']:
350
  print("--- Detected PDF/Image file. Starting Redaction Workflow... ---")
351
  start_time = time.time()
352
  try:
353
  from tools.file_conversion import prepare_image_or_pdf
354
  from tools.file_redaction import choose_and_run_redactor
 
355
  # Step 1: Prepare the document
356
  print("\nStep 1: Preparing document...")
357
  (
358
- prep_summary, prepared_pdf_paths, image_file_paths, _, _, pdf_doc,
359
- image_annotations, _, original_cropboxes, page_sizes, _, _, _, _, _
 
 
 
 
 
 
 
 
 
 
 
 
 
360
  ) = prepare_image_or_pdf(
361
- file_paths=args.input_file, text_extract_method=args.ocr_method, all_line_level_ocr_results_df=pd.DataFrame(), all_page_line_level_ocr_results_with_words_df=pd.DataFrame(),
362
- first_loop_state=True, prepare_for_review=False,
363
- output_folder=args.output_dir, input_folder=args.input_dir, prepare_images=args.prepare_images
 
 
 
 
 
 
364
  )
365
  print(f"Preparation complete. {prep_summary}")
366
 
367
  # Step 2: Redact the prepared document
368
  print("\nStep 2: Running redaction...")
369
  (
370
- output_summary, output_files, _, _, log_files, _, _, _, _, _, _, _, _, _, comprehend_query_number, _, _, _, _, _, _, page_sizes, _, _, _, total_textract_query_number, _, _, _, _, _, _
 
 
 
 
 
 
371
  ) = choose_and_run_redactor(
372
- file_paths=args.input_file, prepared_pdf_file_paths=prepared_pdf_paths,
373
- pdf_image_file_paths=image_file_paths, chosen_redact_entities=args.local_redact_entities,
374
- chosen_redact_comprehend_entities=args.aws_redact_entities, text_extraction_method=args.ocr_method,
375
- in_allow_list=args.allow_list_file, in_deny_list=args.deny_list_file,
376
- redact_whole_page_list=args.redact_whole_page_file, first_loop_state=True,
377
- page_min=args.page_min, page_max=args.page_max, handwrite_signature_checkbox=args.handwrite_signature_extraction, max_fuzzy_spelling_mistakes_num=args.fuzzy_mistakes, match_fuzzy_whole_phrase_bool=args.match_fuzzy_whole_phrase_bool,
378
- pymupdf_doc=pdf_doc, annotations_all_pages=image_annotations, page_sizes=page_sizes,
379
- document_cropboxes=original_cropboxes, pii_identification_method=args.pii_detector,
380
- aws_access_key_textbox=args.aws_access_key, aws_secret_key_textbox=args.aws_secret_key,
381
- language=args.language, output_folder=args.output_dir, input_folder=args.input_dir
 
 
 
 
 
382
  )
383
 
384
  # Calculate processing time
385
  end_time = time.time()
386
  processing_time = end_time - start_time
387
-
388
  # Log usage data if logger is available
389
  if usage_logger:
390
  try:
391
  # Extract file name for logging
392
  print("Saving logs to CSV")
393
- doc_file_name = os.path.basename(args.input_file[0]) if args.display_file_names_in_logs else "document"
 
 
 
 
394
  data_file_name = "" # Not applicable for PDF/image redaction
395
-
396
  # Determine if this was a Textract API call
397
  is_textract_call = args.ocr_method == "AWS Textract"
398
-
399
  # Count pages (approximate from page_sizes if available)
400
  total_pages = len(page_sizes) if page_sizes else 1
401
-
402
  # Count API calls (approximate - would need to be tracked in the redaction function)
403
- textract_queries = int(total_textract_query_number) if is_textract_call else 0
404
- comprehend_queries = int(comprehend_query_number) if args.pii_detector == "AWS Comprehend" else 0
405
-
 
 
 
 
 
 
406
  # Format handwriting/signature options
407
- handwriting_signature = ", ".join(args.handwrite_signature_extraction) if args.handwrite_signature_extraction else ""
408
-
 
 
 
 
409
  log_redaction_usage(
410
  logger=usage_logger,
411
  session_hash=session_hash,
@@ -424,33 +869,47 @@ python cli_redact.py --task textract --textract_action list
424
  save_to_dynamodb=args.save_logs_to_dynamodb,
425
  save_to_s3=args.upload_logs_to_s3,
426
  s3_bucket=args.s3_bucket,
427
- s3_key_prefix=args.s3_logs_prefix
428
  )
429
  except Exception as e:
430
  print(f"Warning: Could not log usage data: {e}")
431
-
432
  print("\n--- Redaction Process Complete ---")
433
  print(f"Summary: {output_summary}")
434
  print(f"Processing time: {processing_time:.2f} seconds")
435
  print(f"\nOutput files saved to: {args.output_dir}")
436
  print("Generated Files:", sorted(output_files))
437
- if log_files: print("Log Files:", sorted(log_files))
 
438
 
439
  except Exception as e:
440
- print(f"\nAn error occurred during the PDF/Image redaction workflow: {e}")
 
 
441
 
442
  # Workflow 2: Word/Tabular Data Anonymisation
443
- elif file_extension in ['.docx', '.xlsx', '.xls', '.csv', '.parquet']:
444
- print("--- Detected Word/Tabular file. Starting Anonymisation Workflow... ---")
 
 
445
  start_time = time.time()
446
  try:
447
  from tools.data_anonymise import anonymise_files_with_open_text
448
-
449
  # Run the anonymisation function directly
450
 
451
- output_summary, output_files, _, _, log_files, _, processing_time, comprehend_query_number = anonymise_files_with_open_text(
 
 
 
 
 
 
 
 
 
452
  file_paths=args.input_file,
453
- in_text="", # Not used for file-based operations
454
  anon_strategy=args.anon_strategy,
455
  chosen_cols=args.text_columns,
456
  chosen_redact_entities=args.local_redact_entities,
@@ -465,34 +924,42 @@ python cli_redact.py --task textract --textract_action list
465
  aws_access_key_textbox=args.aws_access_key,
466
  aws_secret_key_textbox=args.aws_secret_key,
467
  language=args.language,
468
- do_initial_clean=args.do_initial_clean
469
  )
470
 
471
  # Calculate processing time
472
  end_time = time.time()
473
  processing_time = end_time - start_time
474
-
475
  # Log usage data if logger is available
476
  if usage_logger:
477
  try:
478
  print("Saving logs to CSV")
479
  # Extract file name for logging
480
  doc_file_name = "" # Not applicable for tabular data
481
- data_file_name = os.path.basename(args.input_file[0]) if args.display_file_names_in_logs else "data_file"
482
-
 
 
 
 
483
  # Determine if this was a Textract API call (not applicable for tabular)
484
  is_textract_call = False
485
-
486
  # Count pages (not applicable for tabular data)
487
  total_pages = 0
488
-
489
  # Count API calls (approximate - would need to be tracked in the anonymisation function)
490
  textract_queries = 0 # Not applicable for tabular data
491
- comprehend_queries = comprehend_query_number if args.pii_detector == "AWS Comprehend" else 0
492
-
 
 
 
 
493
  # Format handwriting/signature options (not applicable for tabular)
494
  handwriting_signature = ""
495
-
496
  log_redaction_usage(
497
  logger=usage_logger,
498
  session_hash=session_hash,
@@ -511,7 +978,7 @@ python cli_redact.py --task textract --textract_action list
511
  save_to_dynamodb=args.save_logs_to_dynamodb,
512
  save_to_s3=args.upload_logs_to_s3,
513
  s3_bucket=args.s3_bucket,
514
- s3_key_prefix=args.s3_logs_prefix
515
  )
516
  except Exception as e:
517
  print(f"Warning: Could not log usage data: {e}")
@@ -521,54 +988,71 @@ python cli_redact.py --task textract --textract_action list
521
  print(f"Processing time: {processing_time:.2f} seconds")
522
  print(f"\nOutput files saved to: {args.output_dir}")
523
  print("Generated Files:", sorted(output_files))
524
- if log_files: print("Log Files:", sorted(log_files))
 
525
 
526
  except Exception as e:
527
- print(f"\nAn error occurred during the Word/Tabular anonymisation workflow: {e}")
528
-
 
 
529
  else:
530
  print(f"Error: Unsupported file type '{file_extension}' for redaction.")
531
  print("Supported types for redaction: .pdf, .png, .jpg, .jpeg")
532
- print("Supported types for anonymisation: .docx, .xlsx, .xls, .csv, .parquet")
 
 
533
 
534
  # Task 2: Duplicate Detection
535
- elif args.task == 'deduplicate':
536
  print("--- Starting Duplicate Detection Workflow... ---")
537
  try:
538
  from tools.find_duplicate_pages import run_duplicate_analysis
539
- if args.duplicate_type == 'pages':
 
540
  # Page duplicate detection
541
- if file_extension == '.csv':
542
- print("--- Detected OCR CSV file. Starting Page Duplicate Detection... ---")
543
-
 
 
544
  start_time = time.time()
545
 
546
- if args.combine_pages == True:
547
  print("Combining pages...")
548
  else:
549
  print("Using line-level duplicate detection...")
550
 
551
  # Load the CSV file as a list for the duplicate analysis function
552
- results_df, output_paths, full_data_by_file, processing_time, task_textbox = run_duplicate_analysis(
 
 
 
 
 
 
553
  files=args.input_file,
554
  threshold=args.similarity_threshold,
555
  min_words=args.min_word_count,
556
  min_consecutive=args.min_consecutive_pages,
557
  greedy_match=args.greedy_match,
558
  combine_pages=args.combine_pages,
559
- output_folder=args.output_dir
560
  )
561
-
562
  end_time = time.time()
563
  processing_time = end_time - start_time
564
 
565
  print("\n--- Page Duplicate Detection Complete ---")
566
  print(f"Found {len(results_df)} duplicate matches")
567
  print(f"\nOutput files saved to: {args.output_dir}")
568
- if output_paths: print("Generated Files:", sorted(output_paths))
569
-
 
570
  else:
571
- print(f"Error: Page duplicate detection requires CSV files with OCR data.")
 
 
572
  print("Please provide a CSV file containing OCR output data.")
573
 
574
  # Log usage data if logger is available
@@ -576,22 +1060,28 @@ python cli_redact.py --task textract --textract_action list
576
  try:
577
  # Extract file name for logging
578
  print("Saving logs to CSV")
579
- doc_file_name = os.path.basename(args.input_file[0]) if args.display_file_names_in_logs else "document"
580
- data_file_name = "" # Not applicable for PDF/image redaction
581
-
 
 
 
 
 
 
582
  # Determine if this was a Textract API call
583
  is_textract_call = False
584
-
585
  # Count pages (approximate from page_sizes if available)
586
  total_pages = len(page_sizes) if page_sizes else 1
587
-
588
  # Count API calls (approximate - would need to be tracked in the redaction function)
589
  textract_queries = 0
590
  comprehend_queries = 0
591
-
592
  # Format handwriting/signature options
593
  handwriting_signature = ""
594
-
595
  log_redaction_usage(
596
  logger=usage_logger,
597
  session_hash=session_hash,
@@ -610,20 +1100,29 @@ python cli_redact.py --task textract --textract_action list
610
  save_to_dynamodb=args.save_logs_to_dynamodb,
611
  save_to_s3=args.upload_logs_to_s3,
612
  s3_bucket=args.s3_bucket,
613
- s3_key_prefix=args.s3_logs_prefix
614
  )
615
  except Exception as e:
616
  print(f"Warning: Could not log usage data: {e}")
617
-
618
- elif args.duplicate_type == 'tabular':
619
  # Tabular duplicate detection
620
  from tools.find_duplicate_tabular import run_tabular_duplicate_detection
621
- if file_extension in ['.csv', '.xlsx', '.xls', '.parquet']:
622
- print("--- Detected tabular file. Starting Tabular Duplicate Detection... ---")
 
 
 
623
 
624
  start_time = time.time()
625
-
626
- results_df, output_paths, full_data_by_file, processing_time, task_textbox = run_tabular_duplicate_detection(
 
 
 
 
 
 
627
  files=args.input_file,
628
  threshold=args.similarity_threshold,
629
  min_words=args.min_word_count,
@@ -631,7 +1130,7 @@ python cli_redact.py --task textract --textract_action list
631
  output_folder=args.output_dir,
632
  do_initial_clean_dup=args.do_initial_clean,
633
  in_excel_tabular_sheets=args.excel_sheets,
634
- remove_duplicate_rows=args.remove_duplicate_rows
635
  )
636
 
637
  end_time = time.time()
@@ -643,21 +1142,25 @@ python cli_redact.py --task textract --textract_action list
643
  # Extract file name for logging
644
  print("Saving logs to CSV")
645
  doc_file_name = ""
646
- data_file_name = os.path.basename(args.input_file[0]) if args.display_file_names_in_logs else "data_file"
647
-
 
 
 
 
648
  # Determine if this was a Textract API call
649
  is_textract_call = False
650
-
651
  # Count pages (approximate from page_sizes if available)
652
  total_pages = len(page_sizes) if page_sizes else 1
653
-
654
  # Count API calls (approximate - would need to be tracked in the redaction function)
655
  textract_queries = 0
656
  comprehend_queries = 0
657
-
658
  # Format handwriting/signature options
659
  handwriting_signature = ""
660
-
661
  log_redaction_usage(
662
  logger=usage_logger,
663
  session_hash=session_hash,
@@ -676,58 +1179,80 @@ python cli_redact.py --task textract --textract_action list
676
  save_to_dynamodb=args.save_logs_to_dynamodb,
677
  save_to_s3=args.upload_logs_to_s3,
678
  s3_bucket=args.s3_bucket,
679
- s3_key_prefix=args.s3_logs_prefix
680
  )
681
  except Exception as e:
682
  print(f"Warning: Could not log usage data: {e}")
683
-
684
  print("\n--- Tabular Duplicate Detection Complete ---")
685
  print(f"Found {len(results_df)} duplicate matches")
686
  print(f"\nOutput files saved to: {args.output_dir}")
687
- if output_paths: print("Generated Files:", sorted(output_paths))
688
-
 
689
  else:
690
- print(f"Error: Tabular duplicate detection requires CSV, Excel, or Parquet files.")
 
 
691
  print("Supported types: .csv, .xlsx, .xls, .parquet")
692
  else:
693
  print(f"Error: Invalid duplicate type '{args.duplicate_type}'.")
694
  print("Valid options: 'pages' or 'tabular'")
695
-
696
  except Exception as e:
697
  print(f"\nAn error occurred during the duplicate detection workflow: {e}")
698
 
699
  # Task 3: Textract Batch Operations
700
- elif args.task == 'textract':
701
  print("--- Starting Textract Batch Operations Workflow... ---")
702
-
703
  if not args.textract_action:
704
  print("Error: --textract_action is required for textract task.")
705
  print("Valid options: 'submit', 'retrieve', or 'list'")
706
  return
707
-
708
  try:
709
- if args.textract_action == 'submit':
710
- from tools.textract_batch_call import analyse_document_with_textract_api, load_in_textract_job_details
 
 
 
 
711
  # Submit document to Textract for analysis
712
  if not args.input_file:
713
  print("Error: --input_file is required for submit action.")
714
  return
715
-
716
  print(f"--- Submitting document to Textract: {args.input_file} ---")
717
 
718
  start_time = time.time()
719
-
720
  # Load existing job details
721
- job_df = load_in_textract_job_details(load_s3_jobs_loc=args.s3_textract_document_logs_subfolder, load_local_jobs_loc=args.local_textract_document_logs_subfolder)
722
-
 
 
 
723
  # Determine signature extraction options
724
- signature_options = ['Extract handwriting', 'Extract signatures'] if args.extract_signatures else ['Extract handwriting']
725
-
 
 
 
 
726
  # Use configured bucket or override
727
  textract_bucket = args.textract_bucket if args.textract_bucket else ""
728
-
729
  # Submit the job
730
- result_message, job_id, job_type, successful_job_number, is_textract_call, total_pages, task_textbox = analyse_document_with_textract_api(
 
 
 
 
 
 
 
 
731
  local_pdf_path=args.input_file,
732
  s3_input_prefix=args.textract_input_prefix,
733
  s3_output_prefix=args.textract_output_prefix,
@@ -736,13 +1261,13 @@ python cli_redact.py --task textract --textract_action list
736
  general_s3_bucket_name=args.s3_bucket,
737
  local_output_dir=args.output_dir,
738
  handwrite_signature_checkbox=signature_options,
739
- aws_region=args.aws_region
740
  )
741
 
742
  end_time = time.time()
743
  processing_time = end_time - start_time
744
-
745
- print(f"\n--- Textract Job Submitted Successfully ---")
746
  print(f"Job ID: {job_id}")
747
  print(f"Job Type: {job_type}")
748
  print(f"Message: {result_message}")
@@ -753,20 +1278,24 @@ python cli_redact.py --task textract --textract_action list
753
  try:
754
  # Extract file name for logging
755
  print("Saving logs to CSV")
756
- doc_file_name = os.path.basename(args.input_file[0]) if args.display_file_names_in_logs else "document"
 
 
 
 
757
  data_file_name = ""
758
-
759
  # Determine if this was a Textract API call
760
  is_textract_call = True
761
  args.ocr_method == "AWS Textract"
762
-
763
  # Count API calls (approximate - would need to be tracked in the redaction function)
764
  textract_queries = total_pages
765
  comprehend_queries = 0
766
-
767
  # Format handwriting/signature options
768
  handwriting_signature = ""
769
-
770
  log_redaction_usage(
771
  logger=usage_logger,
772
  session_hash=session_hash,
@@ -785,62 +1314,75 @@ python cli_redact.py --task textract --textract_action list
785
  save_to_dynamodb=args.save_logs_to_dynamodb,
786
  save_to_s3=args.upload_logs_to_s3,
787
  s3_bucket=args.s3_bucket,
788
- s3_key_prefix=args.s3_logs_prefix
789
  )
790
  except Exception as e:
791
  print(f"Warning: Could not log usage data: {e}")
792
-
793
- elif args.textract_action == 'retrieve':
794
  print(f"--- Retrieving Textract results for Job ID: {args.job_id} ---")
795
 
796
- from tools.textract_batch_call import poll_whole_document_textract_analysis_progress_and_download, load_in_textract_job_details
 
 
 
 
797
  # Retrieve results by job ID
798
  if not args.job_id:
799
  print("Error: --job_id is required for retrieve action.")
800
  return
801
-
802
  # Load existing job details to get job type
803
  print("Loading existing job details...")
804
- job_df = load_in_textract_job_details(load_s3_jobs_loc=args.s3_textract_document_logs_subfolder, load_local_jobs_loc=args.local_textract_document_logs_subfolder)
805
-
 
 
 
806
  # Find job type from the dataframe
807
  job_type = "document_text_detection" # default
808
  if not job_df.empty and "job_id" in job_df.columns:
809
  matching_jobs = job_df.loc[job_df["job_id"] == args.job_id]
810
  if not matching_jobs.empty and "job_type" in matching_jobs.columns:
811
  job_type = matching_jobs.iloc[0]["job_type"]
812
-
813
  # Use configured bucket or override
814
  textract_bucket = args.textract_bucket if args.textract_bucket else ""
815
-
816
  # Poll for completion and download results
817
  print("Polling for completion and downloading results...")
818
- downloaded_file_path, job_status, updated_job_df, output_filename = poll_whole_document_textract_analysis_progress_and_download(
819
- job_id=args.job_id,
820
- job_type_dropdown=job_type,
821
- s3_output_prefix=args.textract_output_prefix,
822
- pdf_filename="", # Will be determined from job details
823
- job_df=job_df,
824
- s3_bucket_name=textract_bucket,
825
- load_s3_jobs_loc=args.s3_textract_document_logs_subfolder,
826
- load_local_jobs_loc=args.local_textract_document_logs_subfolder,
827
- local_output_dir=args.output_dir,
828
- poll_interval_seconds=args.poll_interval,
829
- max_polling_attempts=args.max_poll_attempts
 
 
830
  )
831
-
832
- print(f"\n--- Textract Results Retrieved Successfully ---")
833
  print(f"Job Status: {job_status}")
834
  print(f"Downloaded File: {downloaded_file_path}")
835
- #print(f"Output Filename: {output_filename}")
836
-
837
- elif args.textract_action == 'list':
838
  from tools.textract_batch_call import load_in_textract_job_details
 
839
  # List recent Textract jobs
840
  print("--- Listing Recent Textract Jobs ---")
841
-
842
- job_df = load_in_textract_job_details(load_s3_jobs_loc=args.s3_textract_document_logs_subfolder, load_local_jobs_loc=args.local_textract_document_logs_subfolder)
843
-
 
 
 
844
  if job_df.empty:
845
  print("No recent Textract jobs found.")
846
  else:
@@ -853,17 +1395,18 @@ python cli_redact.py --task textract --textract_action list
853
  print(f"Signatures: {job.get('signature_extraction', 'N/A')}")
854
  print(f"Date: {job.get('job_date_time', 'N/A')}")
855
  print("-" * 80)
856
-
857
  else:
858
  print(f"Error: Invalid textract_action '{args.textract_action}'.")
859
  print("Valid options: 'submit', 'retrieve', or 'list'")
860
-
861
  except Exception as e:
862
  print(f"\nAn error occurred during the Textract workflow: {e}")
863
-
864
  else:
865
  print(f"Error: Invalid task '{args.task}'.")
866
  print("Valid options: 'redact', 'deduplicate', or 'textract'")
867
 
 
868
  if __name__ == "__main__":
869
- main()
 
1
  import argparse
2
  import os
 
3
  import time
4
  import uuid
 
5
 
6
+ import pandas as pd
7
+
8
+ from tools.config import (
9
+ ALLOW_LIST_PATH,
10
+ AWS_ACCESS_KEY,
11
+ AWS_PII_OPTION,
12
+ AWS_REGION,
13
+ AWS_SECRET_KEY,
14
+ CHOSEN_COMPREHEND_ENTITIES,
15
+ CHOSEN_LOCAL_OCR_MODEL,
16
+ CHOSEN_REDACT_ENTITIES,
17
+ COMPRESS_REDACTED_PDF,
18
+ CUSTOM_ENTITIES,
19
+ DEFAULT_COMBINE_PAGES,
20
+ DEFAULT_COST_CODE,
21
+ DEFAULT_DUPLICATE_DETECTION_THRESHOLD,
22
+ DEFAULT_FUZZY_SPELLING_MISTAKES_NUM,
23
+ DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX,
24
+ DEFAULT_LANGUAGE,
25
+ DEFAULT_MIN_CONSECUTIVE_PAGES,
26
+ DEFAULT_MIN_WORD_COUNT,
27
+ DEFAULT_TABULAR_ANONYMISATION_STRATEGY,
28
+ DENY_LIST_PATH,
29
+ DIRECT_MODE_DEFAULT_USER,
30
+ DISPLAY_FILE_NAMES_IN_LOGS,
31
+ DO_INITIAL_TABULAR_DATA_CLEAN,
32
+ DOCUMENT_REDACTION_BUCKET,
33
+ FULL_COMPREHEND_ENTITY_LIST,
34
+ FULL_ENTITY_LIST,
35
+ IMAGES_DPI,
36
+ INPUT_FOLDER,
37
+ LOCAL_PII_OPTION,
38
+ OUTPUT_FOLDER,
39
+ PREPROCESS_LOCAL_OCR_IMAGES,
40
+ REMOVE_DUPLICATE_ROWS,
41
+ RETURN_PDF_END_OF_REDACTION,
42
+ RUN_AWS_FUNCTIONS,
43
+ S3_USAGE_LOGS_FOLDER,
44
+ SAVE_LOGS_TO_CSV,
45
+ SAVE_LOGS_TO_DYNAMODB,
46
+ SESSION_OUTPUT_FOLDER,
47
+ TEXTRACT_JOBS_LOCAL_LOC,
48
+ TEXTRACT_JOBS_S3_LOC,
49
+ TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET,
50
+ TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER,
51
+ TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER,
52
+ USE_GREEDY_DUPLICATE_DETECTION,
53
+ WHOLE_PAGE_REDACTION_LIST_PATH,
54
+ )
55
 
56
 
57
  def _generate_session_hash() -> str:
58
  """Generate a unique session hash for logging purposes."""
59
  return str(uuid.uuid4())[:8]
60
 
 
 
 
 
 
 
 
 
61
 
62
+ def get_username_and_folders(
63
+ username: str = "",
64
+ output_folder_textbox: str = OUTPUT_FOLDER,
65
+ input_folder_textbox: str = INPUT_FOLDER,
66
+ session_output_folder: str = SESSION_OUTPUT_FOLDER,
67
+ textract_document_upload_input_folder: str = TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER,
68
+ textract_document_upload_output_folder: str = TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER,
69
+ s3_textract_document_logs_subfolder: str = TEXTRACT_JOBS_S3_LOC,
70
+ local_textract_document_logs_subfolder: str = TEXTRACT_JOBS_LOCAL_LOC,
71
+ ):
72
 
73
  # Generate session hash for logging. Either from input user name or generated
74
  if username:
75
  out_session_hash = username
76
  else:
77
+ out_session_hash = _generate_session_hash()
 
78
 
79
+ if session_output_folder == "True" or session_output_folder is True:
80
  output_folder = output_folder_textbox + out_session_hash + "/"
81
  input_folder = input_folder_textbox + out_session_hash + "/"
82
 
83
+ textract_document_upload_input_folder = (
84
+ textract_document_upload_input_folder + "/" + out_session_hash
85
+ )
86
+ textract_document_upload_output_folder = (
87
+ textract_document_upload_output_folder + "/" + out_session_hash
88
+ )
89
 
90
+ s3_textract_document_logs_subfolder = (
91
+ s3_textract_document_logs_subfolder + "/" + out_session_hash
92
+ )
93
+ local_textract_document_logs_subfolder = (
94
+ local_textract_document_logs_subfolder + "/" + out_session_hash + "/"
95
+ )
96
 
97
  else:
98
  output_folder = output_folder_textbox
99
  input_folder = input_folder_textbox
100
 
101
+ if not os.path.exists(output_folder):
102
+ os.mkdir(output_folder)
103
+ if not os.path.exists(input_folder):
104
+ os.mkdir(input_folder)
105
+
106
+ return (
107
+ out_session_hash,
108
+ output_folder,
109
+ out_session_hash,
110
+ input_folder,
111
+ textract_document_upload_input_folder,
112
+ textract_document_upload_output_folder,
113
+ s3_textract_document_logs_subfolder,
114
+ local_textract_document_logs_subfolder,
115
+ )
116
 
117
 
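A quick worked example of the behaviour of get_username_and_folders, assuming OUTPUT_FOLDER is "output/" and INPUT_FOLDER is "input/" (the real configured values may differ):

    (
        session_hash, output_folder, _, input_folder,
        textract_in, textract_out, s3_logs, local_logs,
    ) = get_username_and_folders(username="alice", session_output_folder="True")
    # session_hash  == "alice"          (used instead of a generated hash)
    # output_folder == "output/alice/"  (created on disk if missing)
    # input_folder  == "input/alice/"   (created on disk if missing)
    # textract_in   == "<textract input prefix>/alice", and similarly for the
    # output prefix and the S3/local Textract log locations.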
118
  def _get_env_list(env_var_name: str) -> list[str]:
119
  """Parses a comma-separated environment variable into a list of strings."""
120
+ value = env_var_name[1:-1].strip().replace('"', "").replace("'", "")
121
  if not value:
122
  return []
123
  # Split by comma and filter out any empty strings that might result from extra commas
124
+ return [s.strip() for s in value.split(",") if s.strip()]
125
+
126
 
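For reference, _get_env_list expects the bracketed, quoted list format used for these config values; a couple of illustrative calls (the input strings are examples, not actual configured values):

    _get_env_list("['TITLES', 'UKPOSTCODE', 'STREETNAME']")
    # -> ['TITLES', 'UKPOSTCODE', 'STREETNAME']
    _get_env_list("[]")
    # -> []
    # The [1:-1] slice strips the surrounding brackets, quote characters are
    # removed, and empty entries from stray commas are dropped.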
127
  # --- Constants and Configuration ---
128
 
129
+ if CHOSEN_COMPREHEND_ENTITIES:
130
+ CHOSEN_COMPREHEND_ENTITIES = _get_env_list(CHOSEN_COMPREHEND_ENTITIES)
131
+ if FULL_COMPREHEND_ENTITY_LIST:
132
+ FULL_COMPREHEND_ENTITY_LIST = _get_env_list(FULL_COMPREHEND_ENTITY_LIST)
133
+ if CHOSEN_REDACT_ENTITIES:
134
+ CHOSEN_REDACT_ENTITIES = _get_env_list(CHOSEN_REDACT_ENTITIES)
135
+ if FULL_ENTITY_LIST:
136
+ FULL_ENTITY_LIST = _get_env_list(FULL_ENTITY_LIST)
137
+ if CUSTOM_ENTITIES:
138
+ CUSTOM_ENTITIES = _get_env_list(CUSTOM_ENTITIES)
139
+ if DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX:
140
+ DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX = _get_env_list(
141
+ DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX
142
+ )
143
 
144
  # Add custom spacy recognisers to the Comprehend list, so that local Spacy model can be used to pick up e.g. titles, streetnames, UK postcodes that are sometimes missed by comprehend
145
  CHOSEN_COMPREHEND_ENTITIES.extend(CUSTOM_ENTITIES)
 
151
  full_comprehend_entity_list = FULL_COMPREHEND_ENTITY_LIST
152
  default_handwrite_signature_checkbox = DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX
153
 
154
+
155
  # --- Main CLI Function ---
156
  def main(direct_mode_args={}):
157
  """
158
  A unified command-line interface to prepare, redact, and anonymise various document types.
159
+
160
  Args:
161
  direct_mode_args (dict, optional): Dictionary of arguments for direct mode execution.
162
  If provided, uses these instead of parsing command line arguments.
163
  """
164
  parser = argparse.ArgumentParser(
165
+ description="A versatile CLI for redacting PII from PDF/image files and anonymising Word/tabular data.",
166
  formatter_class=argparse.RawTextHelpFormatter,
167
+ epilog="""
168
  Examples:
169
 
170
  To run these, you need to do the following:
 
247
  ## List recent Textract jobs:
248
  python cli_redact.py --task textract --textract_action list
249
 
250
+ """,
251
  )
252
 
253
  # --- Task Selection ---
254
+ task_group = parser.add_argument_group("Task Selection")
255
+ task_group.add_argument(
256
+ "--task",
257
+ choices=["redact", "deduplicate", "textract"],
258
+ default="redact",
259
+ help="Task to perform: redact (PII redaction/anonymisation), deduplicate (find duplicate content), or textract (AWS Textract batch operations).",
260
+ )
261
 
262
  # --- General Arguments (apply to all file types) ---
263
+ general_group = parser.add_argument_group("General Options")
264
+ general_group.add_argument(
265
+ "--input_file",
266
+ nargs="+",
267
+ help="Path to the input file(s) to process. Separate multiple files with a space, and use quotes if there are spaces in the file name.",
268
+ )
269
+ general_group.add_argument(
270
+ "--output_dir", default=OUTPUT_FOLDER, help="Directory for all output files."
271
+ )
272
+ general_group.add_argument(
273
+ "--input_dir", default=INPUT_FOLDER, help="Directory for all input files."
274
+ )
275
+ general_group.add_argument(
276
+ "--language", default=DEFAULT_LANGUAGE, help="Language of the document content."
277
+ )
278
+ general_group.add_argument(
279
+ "--allow_list",
280
+ default=ALLOW_LIST_PATH,
281
+ help="Path to a CSV file with words to exclude from redaction.",
282
+ )
283
+ general_group.add_argument(
284
+ "--pii_detector",
285
+ choices=[LOCAL_PII_OPTION, AWS_PII_OPTION, "None"],
286
+ default=LOCAL_PII_OPTION,
287
+ help="Core PII detection method (Local or AWS Comprehend, or None).",
288
+ )
289
+ general_group.add_argument(
290
+ "--username", default=DIRECT_MODE_DEFAULT_USER, help="Username for the session."
291
+ )
292
+ general_group.add_argument(
293
+ "--save_to_user_folders",
294
+ default=SESSION_OUTPUT_FOLDER,
295
+ help="Whether to save to user folders or not.",
296
+ )
297
+
298
+ general_group.add_argument(
299
+ "--local_redact_entities",
300
+ nargs="+",
301
+ choices=full_entity_list,
302
+ default=chosen_redact_entities,
303
+ help=f"Local redaction entities to use. Default: {chosen_redact_entities}. Full list: {full_entity_list}.",
304
+ )
305
+
306
+ general_group.add_argument(
307
+ "--aws_redact_entities",
308
+ nargs="+",
309
+ choices=full_comprehend_entity_list,
310
+ default=chosen_comprehend_entities,
311
+ help=f"AWS redaction entities to use. Default: {chosen_comprehend_entities}. Full list: {full_comprehend_entity_list}.",
312
+ )
313
+
314
+ general_group.add_argument(
315
+ "--aws_access_key", default=AWS_ACCESS_KEY, help="Your AWS Access Key ID."
316
+ )
317
+ general_group.add_argument(
318
+ "--aws_secret_key", default=AWS_SECRET_KEY, help="Your AWS Secret Access Key."
319
+ )
320
+ general_group.add_argument(
321
+ "--cost_code", default=DEFAULT_COST_CODE, help="Cost code for tracking usage."
322
+ )
323
+ general_group.add_argument(
324
+ "--aws_region", default=AWS_REGION, help="AWS region for cloud services."
325
+ )
326
+ general_group.add_argument(
327
+ "--s3_bucket",
328
+ default=DOCUMENT_REDACTION_BUCKET,
329
+ help="S3 bucket name for cloud operations.",
330
+ )
331
+ general_group.add_argument(
332
+ "--do_initial_clean",
333
+ default=DO_INITIAL_TABULAR_DATA_CLEAN,
334
+ help="Perform initial text cleaning for tabular data.",
335
+ )
336
+ general_group.add_argument(
337
+ "--save_logs_to_csv",
338
+ default=SAVE_LOGS_TO_CSV,
339
+ help="Save processing logs to CSV files.",
340
+ )
341
+ general_group.add_argument(
342
+ "--save_logs_to_dynamodb",
343
+ default=SAVE_LOGS_TO_DYNAMODB,
344
+ help="Save processing logs to DynamoDB.",
345
+ )
346
+ general_group.add_argument(
347
+ "--display_file_names_in_logs",
348
+ default=DISPLAY_FILE_NAMES_IN_LOGS,
349
+ help="Include file names in log outputs.",
350
+ )
351
+ general_group.add_argument(
352
+ "--upload_logs_to_s3",
353
+ default=RUN_AWS_FUNCTIONS == "1",
354
+ help="Upload log files to S3 after processing.",
355
+ )
356
+ general_group.add_argument(
357
+ "--s3_logs_prefix",
358
+ default=S3_USAGE_LOGS_FOLDER,
359
+ help="S3 prefix for usage log files.",
360
+ )
361
 
362
  # --- PDF/Image Redaction Arguments ---
363
+ pdf_group = parser.add_argument_group(
364
+ "PDF/Image Redaction Options (.pdf, .png, .jpg)"
365
+ )
366
+ pdf_group.add_argument(
367
+ "--ocr_method",
368
+ choices=["AWS Textract", "Local OCR", "Local text"],
369
+ default="Local OCR",
370
+ help="OCR method for text extraction from images.",
371
+ )
372
+ pdf_group.add_argument(
373
+ "--page_min", type=int, default=0, help="First page to redact."
374
+ )
375
+ pdf_group.add_argument(
376
+ "--page_max", type=int, default=0, help="Last page to redact."
377
+ )
378
+ pdf_group.add_argument(
379
+ "--images_dpi",
380
+ type=float,
381
+ default=float(IMAGES_DPI),
382
+ help="DPI for image processing.",
383
+ )
384
+ pdf_group.add_argument(
385
+ "--chosen_local_ocr_model",
386
+ choices=["tesseract", "hybrid", "paddle"],
387
+ default=CHOSEN_LOCAL_OCR_MODEL,
388
+ help="Local OCR model to use.",
389
+ )
390
+ pdf_group.add_argument(
391
+ "--preprocess_local_ocr_images",
392
+ default=PREPROCESS_LOCAL_OCR_IMAGES,
393
+ help="Preprocess images before OCR.",
394
+ )
395
+ pdf_group.add_argument(
396
+ "--compress_redacted_pdf",
397
+ default=COMPRESS_REDACTED_PDF,
398
+ help="Compress the final redacted PDF.",
399
+ )
400
+ pdf_group.add_argument(
401
+ "--return_pdf_end_of_redaction",
402
+ default=RETURN_PDF_END_OF_REDACTION,
403
+ help="Return PDF at end of redaction process.",
404
+ )
405
+ pdf_group.add_argument(
406
+ "--deny_list_file",
407
+ default=DENY_LIST_PATH,
408
+ help="Custom words file to recognize for redaction.",
409
+ )
410
+ pdf_group.add_argument(
411
+ "--allow_list_file",
412
+ default=ALLOW_LIST_PATH,
413
+ help="Custom words file to recognize for redaction.",
414
+ )
415
+ pdf_group.add_argument(
416
+ "--redact_whole_page_file",
417
+ default=WHOLE_PAGE_REDACTION_LIST_PATH,
418
+ help="File for pages to redact completely.",
419
+ )
420
+ pdf_group.add_argument(
421
+ "--handwrite_signature_extraction",
422
+ nargs="+",
423
+ default=default_handwrite_signature_checkbox,
424
+ help='Handwriting and signature extraction options. Choose from "Extract handwriting", "Extract signatures".',
425
+ )
426
+ pdf_group.add_argument(
427
+ "--extract_forms",
428
+ action="store_true",
429
+ help="Extract forms during Textract analysis.",
430
+ )
431
+ pdf_group.add_argument(
432
+ "--extract_tables",
433
+ action="store_true",
434
+ help="Extract tables during Textract analysis.",
435
+ )
436
+ pdf_group.add_argument(
437
+ "--extract_layout",
438
+ action="store_true",
439
+ help="Extract layout during Textract analysis.",
440
+ )
441
 
442
  # --- Word/Tabular Anonymisation Arguments ---
443
+ tabular_group = parser.add_argument_group(
444
+ "Word/Tabular Anonymisation Options (.docx, .csv, .xlsx)"
445
+ )
446
+ tabular_group.add_argument(
447
+ "--anon_strategy",
448
+ choices=[
449
+ "redact",
450
+ "redact completely",
451
+ "replace_redacted",
452
+ "entity_type",
453
+ "encrypt",
454
+ "hash",
455
+ "replace with 'REDACTED'",
456
+ "replace with <ENTITY_NAME>",
457
+ "mask",
458
+ "fake_first_name",
459
+ ],
460
+ default=DEFAULT_TABULAR_ANONYMISATION_STRATEGY,
461
+ help="The anonymisation strategy to apply.",
462
+ )
463
+ tabular_group.add_argument(
464
+ "--text_columns",
465
+ nargs="+",
466
+ default=list(),
467
+ help="A list of column names to anonymise or deduplicate in tabular data.",
468
+ )
469
+ tabular_group.add_argument(
470
+ "--excel_sheets",
471
+ nargs="+",
472
+ default=list(),
473
+ help="Specific Excel sheet names to process.",
474
+ )
475
+ tabular_group.add_argument(
476
+ "--fuzzy_mistakes",
477
+ type=int,
478
+ default=DEFAULT_FUZZY_SPELLING_MISTAKES_NUM,
479
+ help="Number of allowed spelling mistakes for fuzzy matching.",
480
+ )
481
+ tabular_group.add_argument(
482
+ "--match_fuzzy_whole_phrase_bool",
483
+ default=True,
484
+ help="Match fuzzy whole phrase boolean.",
485
+ )
486
  # --- Duplicate Detection Arguments ---
487
+ duplicate_group = parser.add_argument_group("Duplicate Detection Options")
488
+ duplicate_group.add_argument(
489
+ "--duplicate_type",
490
+ choices=["pages", "tabular"],
491
+ default="pages",
492
+ help="Type of duplicate detection: pages (for OCR files) or tabular (for CSV/Excel files).",
493
+ )
494
+ duplicate_group.add_argument(
495
+ "--similarity_threshold",
496
+ type=float,
497
+ default=DEFAULT_DUPLICATE_DETECTION_THRESHOLD,
498
+ help="Similarity threshold (0-1) to consider content as duplicates.",
499
+ )
500
+ duplicate_group.add_argument(
501
+ "--min_word_count",
502
+ type=int,
503
+ default=DEFAULT_MIN_WORD_COUNT,
504
+ help="Minimum word count for text to be considered in duplicate analysis.",
505
+ )
506
+ duplicate_group.add_argument(
507
+ "--min_consecutive_pages",
508
+ type=int,
509
+ default=DEFAULT_MIN_CONSECUTIVE_PAGES,
510
+ help="Minimum number of consecutive pages to consider as a match.",
511
+ )
512
+ duplicate_group.add_argument(
513
+ "--greedy_match",
514
+ default=USE_GREEDY_DUPLICATE_DETECTION,
515
+ help="Use greedy matching strategy for consecutive pages.",
516
+ )
517
+ duplicate_group.add_argument(
518
+ "--combine_pages",
519
+ default=DEFAULT_COMBINE_PAGES,
520
+ help="Combine text from the same page number within a file. Alternative will enable line-level duplicate detection.",
521
+ )
522
+ duplicate_group.add_argument(
523
+ "--remove_duplicate_rows",
524
+ default=REMOVE_DUPLICATE_ROWS,
525
+ help="Remove duplicate rows from the output.",
526
+ )
527
 
528
  # --- Textract Batch Operations Arguments ---
529
+ textract_group = parser.add_argument_group("Textract Batch Operations Options")
530
+ textract_group.add_argument(
531
+ "--textract_action",
532
+ choices=["submit", "retrieve", "list"],
533
+ help="Textract action to perform: submit (submit document for analysis), retrieve (get results by job ID), or list (show recent jobs).",
534
+ )
535
+ textract_group.add_argument("--job_id", help="Textract job ID for retrieve action.")
536
+ textract_group.add_argument(
537
+ "--extract_signatures",
538
+ action="store_true",
539
+ help="Extract signatures during Textract analysis (for submit action).",
540
+ )
541
+ textract_group.add_argument(
542
+ "--textract_bucket",
543
+ default=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET,
544
+ help="S3 bucket name for Textract operations (overrides default).",
545
+ )
546
+ textract_group.add_argument(
547
+ "--textract_input_prefix",
548
+ default=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER,
549
+ help="S3 prefix for input files in Textract operations.",
550
+ )
551
+ textract_group.add_argument(
552
+ "--textract_output_prefix",
553
+ default=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER,
554
+ help="S3 prefix for output files in Textract operations.",
555
+ )
556
+ textract_group.add_argument(
557
+ "--s3_textract_document_logs_subfolder",
558
+ default=TEXTRACT_JOBS_S3_LOC,
559
+ help="S3 prefix for logs in Textract operations.",
560
+ )
561
+ textract_group.add_argument(
562
+ "--local_textract_document_logs_subfolder",
563
+ default=TEXTRACT_JOBS_LOCAL_LOC,
564
+ help="Local prefix for logs in Textract operations.",
565
+ )
566
+ textract_group.add_argument(
567
+ "--poll_interval",
568
+ type=int,
569
+ default=30,
570
+ help="Polling interval in seconds for Textract job status.",
571
+ )
572
+ textract_group.add_argument(
573
+ "--max_poll_attempts",
574
+ type=int,
575
+ default=120,
576
+ help="Maximum number of polling attempts for Textract job completion.",
577
+ )
578
  # Parse arguments - either from command line or direct mode
579
  if direct_mode_args:
580
  # Use direct mode arguments
 
585
 
586
  # --- Initial Setup ---
587
  # Convert string boolean variables to boolean
588
+ if args.preprocess_local_ocr_images == "True":
589
+ args.preprocess_local_ocr_images = True
590
+ else:
591
+ args.preprocess_local_ocr_images = False
592
+ if args.greedy_match == "True":
593
+ args.greedy_match = True
594
+ else:
595
+ args.greedy_match = False
596
+ if args.combine_pages == "True":
597
+ args.combine_pages = True
598
+ else:
599
+ args.combine_pages = False
600
+ if args.remove_duplicate_rows == "True":
601
+ args.remove_duplicate_rows = True
602
+ else:
603
+ args.remove_duplicate_rows = False
604
+ if args.return_pdf_end_of_redaction == "True":
605
+ args.return_pdf_end_of_redaction = True
606
+ else:
607
+ args.return_pdf_end_of_redaction = False
608
+ if args.compress_redacted_pdf == "True":
609
+ args.compress_redacted_pdf = True
610
+ else:
611
+ args.compress_redacted_pdf = False
612
+ if args.do_initial_clean == "True":
613
+ args.do_initial_clean = True
614
+ else:
615
+ args.do_initial_clean = False
616
+ if args.save_logs_to_csv == "True":
617
+ args.save_logs_to_csv = True
618
+ else:
619
+ args.save_logs_to_csv = False
620
+ if args.save_logs_to_dynamodb == "True":
621
+ args.save_logs_to_dynamodb = True
622
+ else:
623
+ args.save_logs_to_dynamodb = False
624
+ if args.display_file_names_in_logs == "True":
625
+ args.display_file_names_in_logs = True
626
+ else:
627
+ args.display_file_names_in_logs = False
628
+ if args.match_fuzzy_whole_phrase_bool == "True":
629
+ args.match_fuzzy_whole_phrase_bool = True
630
+ else:
631
+ args.match_fuzzy_whole_phrase_bool = False
632
+ if args.save_to_user_folders == "True":
633
+ args.save_to_user_folders = True
634
+ else:
635
+ args.save_to_user_folders = False
636
 
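The repeated string-to-boolean conversions above could be collapsed into a single helper; a minimal sketch (the helper name is hypothetical and not part of the commit):

```python
def _str_to_bool(value) -> bool:
    """Treat the string "True" (or an actual boolean True) as True; everything else as False."""
    if isinstance(value, bool):
        return value
    return str(value).strip().lower() == "true"

# Example usage for one of the flags converted above:
# args.greedy_match = _str_to_bool(args.greedy_match)
```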
637
  # Combine extraction options
638
+ extraction_options = (
639
+ list(args.handwrite_signature_extraction)
640
+ if args.handwrite_signature_extraction
641
+ else []
642
+ )
643
  if args.extract_forms:
644
+ extraction_options.append("Extract forms")
645
  if args.extract_tables:
646
+ extraction_options.append("Extract tables")
647
  if args.extract_layout:
648
+ extraction_options.append("Extract layout")
649
  args.handwrite_signature_extraction = extraction_options
650
 
651
+ if args.task in ["redact", "deduplicate"]:
652
  if args.input_file:
653
  if isinstance(args.input_file, str):
654
  args.input_file = [args.input_file]
 
657
  file_extension = file_extension.lower()
658
  else:
659
  raise ValueError("Error: --input_file is required for 'redact' task.")
660
+
661
  # Initialise usage logger if logging is enabled
662
  usage_logger = None
663
  if args.save_logs_to_csv or args.save_logs_to_dynamodb:
664
  from tools.cli_usage_logger import create_cli_usage_logger
665
+
666
  try:
667
  usage_logger = create_cli_usage_logger()
668
  except Exception as e:
669
  print(f"Warning: Could not initialise usage logger: {e}")
670
 
671
  # Get username and folders
672
+ (
673
+ session_hash,
674
+ args.output_dir,
675
+ _,
676
+ args.input_dir,
677
+ args.textract_input_prefix,
678
+ args.textract_output_prefix,
679
+ args.s3_textract_document_logs_subfolder,
680
+ args.local_textract_document_logs_subfolder,
681
+ ) = get_username_and_folders(
682
+ username=args.username,
683
+ output_folder_textbox=args.output_dir,
684
+ input_folder_textbox=args.input_dir,
685
+ session_output_folder=args.save_to_user_folders,
686
+ textract_document_upload_input_folder=args.textract_input_prefix,
687
+ textract_document_upload_output_folder=args.textract_output_prefix,
688
+ s3_textract_document_logs_subfolder=args.s3_textract_document_logs_subfolder,
689
+ local_textract_document_logs_subfolder=args.local_textract_document_logs_subfolder,
690
+ )
691
 
692
+ print(
693
+ f"Conducting analyses with user {args.username}. Outputs will be saved to {args.output_dir}."
694
+ )
695
 
696
  # --- Route to the Correct Workflow Based on Task and File Type ---
697
 
698
  # Validate input_file requirement for tasks that need it
699
+ if args.task in ["redact", "deduplicate"] and not args.input_file:
700
  print(f"Error: --input_file is required for '{args.task}' task.")
701
  return
702
 
 
706
  args.prepare_images = False
707
 
708
  from tools.cli_usage_logger import create_cli_usage_logger, log_redaction_usage
709
+
710
  # Task 1: Redaction/Anonymisation
711
+ if args.task == "redact":
712
 
713
  # Workflow 1: PDF/Image Redaction
714
+ if file_extension in [".pdf", ".png", ".jpg", ".jpeg"]:
715
  print("--- Detected PDF/Image file. Starting Redaction Workflow... ---")
716
  start_time = time.time()
717
  try:
718
  from tools.file_conversion import prepare_image_or_pdf
719
  from tools.file_redaction import choose_and_run_redactor
720
+
721
  # Step 1: Prepare the document
722
  print("\nStep 1: Preparing document...")
723
  (
724
+ prep_summary,
725
+ prepared_pdf_paths,
726
+ image_file_paths,
727
+ _,
728
+ _,
729
+ pdf_doc,
730
+ image_annotations,
731
+ _,
732
+ original_cropboxes,
733
+ page_sizes,
734
+ _,
735
+ _,
736
+ _,
737
+ _,
738
+ _,
739
  ) = prepare_image_or_pdf(
740
+ file_paths=args.input_file,
741
+ text_extract_method=args.ocr_method,
742
+ all_line_level_ocr_results_df=pd.DataFrame(),
743
+ all_page_line_level_ocr_results_with_words_df=pd.DataFrame(),
744
+ first_loop_state=True,
745
+ prepare_for_review=False,
746
+ output_folder=args.output_dir,
747
+ input_folder=args.input_dir,
748
+ prepare_images=args.prepare_images,
749
  )
750
  print(f"Preparation complete. {prep_summary}")
751
 
752
  # Step 2: Redact the prepared document
753
  print("\nStep 2: Running redaction...")
754
  (
755
+ output_summary,
756
+ output_files,
757
+ _,
758
+ _,
759
+ log_files,
760
+ _,
761
+ _,
762
+ _,
763
+ _,
764
+ _,
765
+ _,
766
+ _,
767
+ _,
768
+ _,
769
+ comprehend_query_number,
770
+ _,
771
+ _,
772
+ _,
773
+ _,
774
+ _,
775
+ _,
776
+ page_sizes,
777
+ _,
778
+ _,
779
+ _,
780
+ total_textract_query_number,
781
+ _,
782
+ _,
783
+ _,
784
+ _,
785
+ _,
786
+ _,
787
  ) = choose_and_run_redactor(
788
+ file_paths=args.input_file,
789
+ prepared_pdf_file_paths=prepared_pdf_paths,
790
+ pdf_image_file_paths=image_file_paths,
791
+ chosen_redact_entities=args.local_redact_entities,
792
+ chosen_redact_comprehend_entities=args.aws_redact_entities,
793
+ text_extraction_method=args.ocr_method,
794
+ in_allow_list=args.allow_list_file,
795
+ in_deny_list=args.deny_list_file,
796
+ redact_whole_page_list=args.redact_whole_page_file,
797
+ first_loop_state=True,
798
+ page_min=args.page_min,
799
+ page_max=args.page_max,
800
+ handwrite_signature_checkbox=args.handwrite_signature_extraction,
801
+ max_fuzzy_spelling_mistakes_num=args.fuzzy_mistakes,
802
+ match_fuzzy_whole_phrase_bool=args.match_fuzzy_whole_phrase_bool,
803
+ pymupdf_doc=pdf_doc,
804
+ annotations_all_pages=image_annotations,
805
+ page_sizes=page_sizes,
806
+ document_cropboxes=original_cropboxes,
807
+ pii_identification_method=args.pii_detector,
808
+ aws_access_key_textbox=args.aws_access_key,
809
+ aws_secret_key_textbox=args.aws_secret_key,
810
+ language=args.language,
811
+ output_folder=args.output_dir,
812
+ input_folder=args.input_dir,
813
  )
814
 
815
  # Calculate processing time
816
  end_time = time.time()
817
  processing_time = end_time - start_time
818
+
819
  # Log usage data if logger is available
820
  if usage_logger:
821
  try:
822
  # Extract file name for logging
823
  print("Saving logs to CSV")
824
+ doc_file_name = (
825
+ os.path.basename(args.input_file[0])
826
+ if args.display_file_names_in_logs
827
+ else "document"
828
+ )
829
  data_file_name = "" # Not applicable for PDF/image redaction
830
+
831
  # Determine if this was a Textract API call
832
  is_textract_call = args.ocr_method == "AWS Textract"
833
+
834
  # Count pages (approximate from page_sizes if available)
835
  total_pages = len(page_sizes) if page_sizes else 1
836
+
837
  # Count API calls (approximate - would need to be tracked in the redaction function)
838
+ textract_queries = (
839
+ int(total_textract_query_number) if is_textract_call else 0
840
+ )
841
+ comprehend_queries = (
842
+ int(comprehend_query_number)
843
+ if args.pii_detector == "AWS Comprehend"
844
+ else 0
845
+ )
846
+
847
  # Format handwriting/signature options
848
+ handwriting_signature = (
849
+ ", ".join(args.handwrite_signature_extraction)
850
+ if args.handwrite_signature_extraction
851
+ else ""
852
+ )
853
+
854
  log_redaction_usage(
855
  logger=usage_logger,
856
  session_hash=session_hash,
 
869
  save_to_dynamodb=args.save_logs_to_dynamodb,
870
  save_to_s3=args.upload_logs_to_s3,
871
  s3_bucket=args.s3_bucket,
872
+ s3_key_prefix=args.s3_logs_prefix,
873
  )
874
  except Exception as e:
875
  print(f"Warning: Could not log usage data: {e}")
876
+
877
  print("\n--- Redaction Process Complete ---")
878
  print(f"Summary: {output_summary}")
879
  print(f"Processing time: {processing_time:.2f} seconds")
880
  print(f"\nOutput files saved to: {args.output_dir}")
881
  print("Generated Files:", sorted(output_files))
882
+ if log_files:
883
+ print("Log Files:", sorted(log_files))
884
 
885
  except Exception as e:
886
+ print(
887
+ f"\nAn error occurred during the PDF/Image redaction workflow: {e}"
888
+ )
889
 
890
  # Workflow 2: Word/Tabular Data Anonymisation
891
+ elif file_extension in [".docx", ".xlsx", ".xls", ".csv", ".parquet"]:
892
+ print(
893
+ "--- Detected Word/Tabular file. Starting Anonymisation Workflow... ---"
894
+ )
895
  start_time = time.time()
896
  try:
897
  from tools.data_anonymise import anonymise_files_with_open_text
898
+
899
  # Run the anonymisation function directly
900
 
901
+ (
902
+ output_summary,
903
+ output_files,
904
+ _,
905
+ _,
906
+ log_files,
907
+ _,
908
+ processing_time,
909
+ comprehend_query_number,
910
+ ) = anonymise_files_with_open_text(
911
  file_paths=args.input_file,
912
+ in_text="", # Not used for file-based operations
913
  anon_strategy=args.anon_strategy,
914
  chosen_cols=args.text_columns,
915
  chosen_redact_entities=args.local_redact_entities,
 
924
  aws_access_key_textbox=args.aws_access_key,
925
  aws_secret_key_textbox=args.aws_secret_key,
926
  language=args.language,
927
+ do_initial_clean=args.do_initial_clean,
928
  )
929
 
930
  # Calculate processing time
931
  end_time = time.time()
932
  processing_time = end_time - start_time
933
+
934
  # Log usage data if logger is available
935
  if usage_logger:
936
  try:
937
  print("Saving logs to CSV")
938
  # Extract file name for logging
939
  doc_file_name = "" # Not applicable for tabular data
940
+ data_file_name = (
941
+ os.path.basename(args.input_file[0])
942
+ if args.display_file_names_in_logs
943
+ else "data_file"
944
+ )
945
+
946
  # Determine if this was a Textract API call (not applicable for tabular)
947
  is_textract_call = False
948
+
949
  # Count pages (not applicable for tabular data)
950
  total_pages = 0
951
+
952
  # Count API calls (approximate - would need to be tracked in the anonymisation function)
953
  textract_queries = 0 # Not applicable for tabular data
954
+ comprehend_queries = (
955
+ comprehend_query_number
956
+ if args.pii_detector == "AWS Comprehend"
957
+ else 0
958
+ )
959
+
960
  # Format handwriting/signature options (not applicable for tabular)
961
  handwriting_signature = ""
962
+
963
  log_redaction_usage(
964
  logger=usage_logger,
965
  session_hash=session_hash,
 
978
  save_to_dynamodb=args.save_logs_to_dynamodb,
979
  save_to_s3=args.upload_logs_to_s3,
980
  s3_bucket=args.s3_bucket,
981
+ s3_key_prefix=args.s3_logs_prefix,
982
  )
983
  except Exception as e:
984
  print(f"Warning: Could not log usage data: {e}")
 
988
  print(f"Processing time: {processing_time:.2f} seconds")
989
  print(f"\nOutput files saved to: {args.output_dir}")
990
  print("Generated Files:", sorted(output_files))
991
+ if log_files:
992
+ print("Log Files:", sorted(log_files))
993
 
994
  except Exception as e:
995
+ print(
996
+ f"\nAn error occurred during the Word/Tabular anonymisation workflow: {e}"
997
+ )
998
+
999
  else:
1000
  print(f"Error: Unsupported file type '{file_extension}' for redaction.")
1001
  print("Supported types for redaction: .pdf, .png, .jpg, .jpeg")
1002
+ print(
1003
+ "Supported types for anonymisation: .docx, .xlsx, .xls, .csv, .parquet"
1004
+ )
1005
 
1006
  # Task 2: Duplicate Detection
1007
+ elif args.task == "deduplicate":
1008
  print("--- Starting Duplicate Detection Workflow... ---")
1009
  try:
1010
  from tools.find_duplicate_pages import run_duplicate_analysis
1011
+
1012
+ if args.duplicate_type == "pages":
1013
  # Page duplicate detection
1014
+ if file_extension == ".csv":
1015
+ print(
1016
+ "--- Detected OCR CSV file. Starting Page Duplicate Detection... ---"
1017
+ )
1018
+
1019
  start_time = time.time()
1020
 
1021
+ if args.combine_pages is True:
1022
  print("Combining pages...")
1023
  else:
1024
  print("Using line-level duplicate detection...")
1025
 
1026
  # Load the CSV file as a list for the duplicate analysis function
1027
+ (
1028
+ results_df,
1029
+ output_paths,
1030
+ full_data_by_file,
1031
+ processing_time,
1032
+ task_textbox,
1033
+ ) = run_duplicate_analysis(
1034
  files=args.input_file,
1035
  threshold=args.similarity_threshold,
1036
  min_words=args.min_word_count,
1037
  min_consecutive=args.min_consecutive_pages,
1038
  greedy_match=args.greedy_match,
1039
  combine_pages=args.combine_pages,
1040
+ output_folder=args.output_dir,
1041
  )
1042
+
1043
  end_time = time.time()
1044
  processing_time = end_time - start_time
1045
 
1046
  print("\n--- Page Duplicate Detection Complete ---")
1047
  print(f"Found {len(results_df)} duplicate matches")
1048
  print(f"\nOutput files saved to: {args.output_dir}")
1049
+ if output_paths:
1050
+ print("Generated Files:", sorted(output_paths))
1051
+
1052
  else:
1053
+ print(
1054
+ "Error: Page duplicate detection requires CSV files with OCR data."
1055
+ )
1056
  print("Please provide a CSV file containing OCR output data.")
1057
 
1058
  # Log usage data if logger is available
 
1060
  try:
1061
  # Extract file name for logging
1062
  print("Saving logs to CSV")
1063
+ doc_file_name = (
1064
+ os.path.basename(args.input_file[0])
1065
+ if args.display_file_names_in_logs
1066
+ else "document"
1067
+ )
1068
+ data_file_name = (
1069
+ "" # Not applicable for PDF/image redaction
1070
+ )
1071
+
1072
  # Determine if this was a Textract API call
1073
  is_textract_call = False
1074
+
1075
  # Count pages (approximate from page_sizes if available)
1076
  total_pages = len(page_sizes) if page_sizes else 1
1077
+
1078
  # Count API calls (approximate - would need to be tracked in the redaction function)
1079
  textract_queries = 0
1080
  comprehend_queries = 0
1081
+
1082
  # Format handwriting/signature options
1083
  handwriting_signature = ""
1084
+
1085
  log_redaction_usage(
1086
  logger=usage_logger,
1087
  session_hash=session_hash,
 
1100
  save_to_dynamodb=args.save_logs_to_dynamodb,
1101
  save_to_s3=args.upload_logs_to_s3,
1102
  s3_bucket=args.s3_bucket,
1103
+ s3_key_prefix=args.s3_logs_prefix,
1104
  )
1105
  except Exception as e:
1106
  print(f"Warning: Could not log usage data: {e}")
1107
+
1108
+ elif args.duplicate_type == "tabular":
1109
  # Tabular duplicate detection
1110
  from tools.find_duplicate_tabular import run_tabular_duplicate_detection
1111
+
1112
+ if file_extension in [".csv", ".xlsx", ".xls", ".parquet"]:
1113
+ print(
1114
+ "--- Detected tabular file. Starting Tabular Duplicate Detection... ---"
1115
+ )
1116
 
1117
  start_time = time.time()
1118
+
1119
+ (
1120
+ results_df,
1121
+ output_paths,
1122
+ full_data_by_file,
1123
+ processing_time,
1124
+ task_textbox,
1125
+ ) = run_tabular_duplicate_detection(
1126
  files=args.input_file,
1127
  threshold=args.similarity_threshold,
1128
  min_words=args.min_word_count,
 
1130
  output_folder=args.output_dir,
1131
  do_initial_clean_dup=args.do_initial_clean,
1132
  in_excel_tabular_sheets=args.excel_sheets,
1133
+ remove_duplicate_rows=args.remove_duplicate_rows,
1134
  )
1135
 
1136
  end_time = time.time()
 
1142
  # Extract file name for logging
1143
  print("Saving logs to CSV")
1144
  doc_file_name = ""
1145
+ data_file_name = (
1146
+ os.path.basename(args.input_file[0])
1147
+ if args.display_file_names_in_logs
1148
+ else "data_file"
1149
+ )
1150
+
1151
  # Determine if this was a Textract API call
1152
  is_textract_call = False
1153
+
1154
  # Count pages (approximate from page_sizes if available)
1155
  total_pages = len(page_sizes) if page_sizes else 1
1156
+
1157
  # Count API calls (approximate - would need to be tracked in the redaction function)
1158
  textract_queries = 0
1159
  comprehend_queries = 0
1160
+
1161
  # Format handwriting/signature options
1162
  handwriting_signature = ""
1163
+
1164
  log_redaction_usage(
1165
  logger=usage_logger,
1166
  session_hash=session_hash,
 
1179
  save_to_dynamodb=args.save_logs_to_dynamodb,
1180
  save_to_s3=args.upload_logs_to_s3,
1181
  s3_bucket=args.s3_bucket,
1182
+ s3_key_prefix=args.s3_logs_prefix,
1183
  )
1184
  except Exception as e:
1185
  print(f"Warning: Could not log usage data: {e}")
1186
+
1187
  print("\n--- Tabular Duplicate Detection Complete ---")
1188
  print(f"Found {len(results_df)} duplicate matches")
1189
  print(f"\nOutput files saved to: {args.output_dir}")
1190
+ if output_paths:
1191
+ print("Generated Files:", sorted(output_paths))
1192
+
1193
  else:
1194
+ print(
1195
+ "Error: Tabular duplicate detection requires CSV, Excel, or Parquet files."
1196
+ )
1197
  print("Supported types: .csv, .xlsx, .xls, .parquet")
1198
  else:
1199
  print(f"Error: Invalid duplicate type '{args.duplicate_type}'.")
1200
  print("Valid options: 'pages' or 'tabular'")
1201
+
1202
  except Exception as e:
1203
  print(f"\nAn error occurred during the duplicate detection workflow: {e}")
1204
 
1205
  # Task 3: Textract Batch Operations
1206
+ elif args.task == "textract":
1207
  print("--- Starting Textract Batch Operations Workflow... ---")
1208
+
1209
  if not args.textract_action:
1210
  print("Error: --textract_action is required for textract task.")
1211
  print("Valid options: 'submit', 'retrieve', or 'list'")
1212
  return
1213
+
1214
  try:
1215
+ if args.textract_action == "submit":
1216
+ from tools.textract_batch_call import (
1217
+ analyse_document_with_textract_api,
1218
+ load_in_textract_job_details,
1219
+ )
1220
+
1221
  # Submit document to Textract for analysis
1222
  if not args.input_file:
1223
  print("Error: --input_file is required for submit action.")
1224
  return
1225
+
1226
  print(f"--- Submitting document to Textract: {args.input_file} ---")
1227
 
1228
  start_time = time.time()
1229
+
1230
  # Load existing job details
1231
+ job_df = load_in_textract_job_details(
1232
+ load_s3_jobs_loc=args.s3_textract_document_logs_subfolder,
1233
+ load_local_jobs_loc=args.local_textract_document_logs_subfolder,
1234
+ )
1235
+
1236
  # Determine signature extraction options
1237
+ signature_options = (
1238
+ ["Extract handwriting", "Extract signatures"]
1239
+ if args.extract_signatures
1240
+ else ["Extract handwriting"]
1241
+ )
1242
+
1243
  # Use configured bucket or override
1244
  textract_bucket = args.textract_bucket if args.textract_bucket else ""
1245
+
1246
  # Submit the job
1247
+ (
1248
+ result_message,
1249
+ job_id,
1250
+ job_type,
1251
+ successful_job_number,
1252
+ is_textract_call,
1253
+ total_pages,
1254
+ task_textbox,
1255
+ ) = analyse_document_with_textract_api(
1256
  local_pdf_path=args.input_file,
1257
  s3_input_prefix=args.textract_input_prefix,
1258
  s3_output_prefix=args.textract_output_prefix,
 
1261
  general_s3_bucket_name=args.s3_bucket,
1262
  local_output_dir=args.output_dir,
1263
  handwrite_signature_checkbox=signature_options,
1264
+ aws_region=args.aws_region,
1265
  )
1266
 
1267
  end_time = time.time()
1268
  processing_time = end_time - start_time
1269
+
1270
+ print("\n--- Textract Job Submitted Successfully ---")
1271
  print(f"Job ID: {job_id}")
1272
  print(f"Job Type: {job_type}")
1273
  print(f"Message: {result_message}")
 
1278
  try:
1279
  # Extract file name for logging
1280
  print("Saving logs to CSV")
1281
+ doc_file_name = (
1282
+ os.path.basename(args.input_file[0])
1283
+ if args.display_file_names_in_logs
1284
+ else "document"
1285
+ )
1286
  data_file_name = ""
1287
+
1288
  # Determine if this was a Textract API call
1289
  is_textract_call = True
1290
1291
+
1292
  # Count API calls (approximate - would need to be tracked in the redaction function)
1293
  textract_queries = total_pages
1294
  comprehend_queries = 0
1295
+
1296
  # Format handwriting/signature options
1297
  handwriting_signature = ""
1298
+
1299
  log_redaction_usage(
1300
  logger=usage_logger,
1301
  session_hash=session_hash,
 
1314
  save_to_dynamodb=args.save_logs_to_dynamodb,
1315
  save_to_s3=args.upload_logs_to_s3,
1316
  s3_bucket=args.s3_bucket,
1317
+ s3_key_prefix=args.s3_logs_prefix,
1318
  )
1319
  except Exception as e:
1320
  print(f"Warning: Could not log usage data: {e}")
1321
+
1322
+ elif args.textract_action == "retrieve":
1323
  print(f"--- Retrieving Textract results for Job ID: {args.job_id} ---")
1324
 
1325
+ from tools.textract_batch_call import (
1326
+ load_in_textract_job_details,
1327
+ poll_whole_document_textract_analysis_progress_and_download,
1328
+ )
1329
+
1330
  # Retrieve results by job ID
1331
  if not args.job_id:
1332
  print("Error: --job_id is required for retrieve action.")
1333
  return
1334
+
1335
  # Load existing job details to get job type
1336
  print("Loading existing job details...")
1337
+ job_df = load_in_textract_job_details(
1338
+ load_s3_jobs_loc=args.s3_textract_document_logs_subfolder,
1339
+ load_local_jobs_loc=args.local_textract_document_logs_subfolder,
1340
+ )
1341
+
1342
  # Find job type from the dataframe
1343
  job_type = "document_text_detection" # default
1344
  if not job_df.empty and "job_id" in job_df.columns:
1345
  matching_jobs = job_df.loc[job_df["job_id"] == args.job_id]
1346
  if not matching_jobs.empty and "job_type" in matching_jobs.columns:
1347
  job_type = matching_jobs.iloc[0]["job_type"]
1348
+
1349
  # Use configured bucket or override
1350
  textract_bucket = args.textract_bucket if args.textract_bucket else ""
1351
+
1352
  # Poll for completion and download results
1353
  print("Polling for completion and downloading results...")
1354
+ downloaded_file_path, job_status, updated_job_df, output_filename = (
1355
+ poll_whole_document_textract_analysis_progress_and_download(
1356
+ job_id=args.job_id,
1357
+ job_type_dropdown=job_type,
1358
+ s3_output_prefix=args.textract_output_prefix,
1359
+ pdf_filename="", # Will be determined from job details
1360
+ job_df=job_df,
1361
+ s3_bucket_name=textract_bucket,
1362
+ load_s3_jobs_loc=args.s3_textract_document_logs_subfolder,
1363
+ load_local_jobs_loc=args.local_textract_document_logs_subfolder,
1364
+ local_output_dir=args.output_dir,
1365
+ poll_interval_seconds=args.poll_interval,
1366
+ max_polling_attempts=args.max_poll_attempts,
1367
+ )
1368
  )
1369
+
1370
+ print("\n--- Textract Results Retrieved Successfully ---")
1371
  print(f"Job Status: {job_status}")
1372
  print(f"Downloaded File: {downloaded_file_path}")
1373
+ # print(f"Output Filename: {output_filename}")
1374
+
1375
+ elif args.textract_action == "list":
1376
  from tools.textract_batch_call import load_in_textract_job_details
1377
+
1378
  # List recent Textract jobs
1379
  print("--- Listing Recent Textract Jobs ---")
1380
+
1381
+ job_df = load_in_textract_job_details(
1382
+ load_s3_jobs_loc=args.s3_textract_document_logs_subfolder,
1383
+ load_local_jobs_loc=args.local_textract_document_logs_subfolder,
1384
+ )
1385
+
1386
  if job_df.empty:
1387
  print("No recent Textract jobs found.")
1388
  else:
 
1395
  print(f"Signatures: {job.get('signature_extraction', 'N/A')}")
1396
  print(f"Date: {job.get('job_date_time', 'N/A')}")
1397
  print("-" * 80)
1398
+
1399
  else:
1400
  print(f"Error: Invalid textract_action '{args.textract_action}'.")
1401
  print("Valid options: 'submit', 'retrieve', or 'list'")
1402
+
1403
  except Exception as e:
1404
  print(f"\nAn error occurred during the Textract workflow: {e}")
1405
+
1406
  else:
1407
  print(f"Error: Invalid task '{args.task}'.")
1408
  print("Valid options: 'redact', 'deduplicate', or 'textract'")
1409
 
1410
+
1411
  if __name__ == "__main__":
1412
+ main()
lambda_entrypoint.py CHANGED
@@ -1,6 +1,7 @@
1
- import boto3
2
- import os
3
  import json
 
 
 
4
 
5
  # Import the main function from your CLI script
6
  from cli_redact import main as cli_main
@@ -16,6 +17,7 @@ TMP_DIR = "/tmp"
16
  INPUT_DIR = os.path.join(TMP_DIR, "input")
17
  OUTPUT_DIR = os.path.join(TMP_DIR, "output")
18
 
 
19
  def download_file_from_s3(bucket_name, key, download_path):
20
  """Download a file from S3 to the local filesystem."""
21
  try:
@@ -25,6 +27,7 @@ def download_file_from_s3(bucket_name, key, download_path):
25
  print(f"Error downloading from S3: {e}")
26
  raise
27
 
 
28
  def upload_directory_to_s3(local_directory, bucket_name, s3_prefix):
29
  """Upload all files from a local directory to an S3 prefix."""
30
  for root, _, files in os.walk(local_directory):
@@ -33,14 +36,17 @@ def upload_directory_to_s3(local_directory, bucket_name, s3_prefix):
33
  # Create a relative path to maintain directory structure if needed
34
  relative_path = os.path.relpath(local_file_path, local_directory)
35
  output_key = os.path.join(s3_prefix, relative_path)
36
-
37
  try:
38
  s3_client.upload_file(local_file_path, bucket_name, output_key)
39
- print(f"Successfully uploaded {local_file_path} to s3://{bucket_name}/{output_key}")
 
 
40
  except Exception as e:
41
  print(f"Error uploading to S3: {e}")
42
  raise
43
 
 
44
  def lambda_handler(event, context):
45
  print(f"Received event: {json.dumps(event)}")
46
 
@@ -51,29 +57,33 @@ def lambda_handler(event, context):
51
  # 2. Extract information from the event
52
  # Assumes the event is triggered by S3 and may contain an 'arguments' payload
53
  try:
54
- record = event['Records'][0]
55
- bucket_name = record['s3']['bucket']['name']
56
- input_key = record['s3']['object']['key']
57
-
58
  # The user metadata can be used to pass arguments
59
  # This is more robust than embedding them in the main event body
60
  response = s3_client.head_object(Bucket=bucket_name, Key=input_key)
61
- metadata = response.get('Metadata', {})
62
  # Arguments can be passed as a JSON string in metadata
63
- arguments = json.loads(metadata.get('arguments', '{}'))
64
 
65
  except (KeyError, IndexError) as e:
66
- print(f"Could not parse S3 event record: {e}. Checking for direct invocation payload.")
 
 
67
  # Fallback for direct invocation (e.g., from Step Functions or manual test)
68
- bucket_name = event.get('bucket_name')
69
- input_key = event.get('input_key')
70
- arguments = event.get('arguments', {})
71
  if not all([bucket_name, input_key]):
72
- raise ValueError("Missing 'bucket_name' or 'input_key' in direct invocation event.")
 
 
73
 
74
  print(f"Processing s3://{bucket_name}/{input_key}")
75
  print(f"With arguments: {arguments}")
76
-
77
  # 3. Download the main input file
78
  input_file_path = os.path.join(INPUT_DIR, os.path.basename(input_key))
79
  download_file_from_s3(bucket_name, input_key, input_file_path)
@@ -81,106 +91,117 @@ def lambda_handler(event, context):
81
  # 4. Prepare arguments for the CLI function
82
  # This dictionary should mirror the one in your app.py's "direct mode"
83
  cli_args = {
84
- 'task': arguments.get('task', 'redact'),
85
- 'input_file': input_file_path,
86
- 'output_dir': OUTPUT_DIR,
87
- 'input_dir': INPUT_DIR,
88
- 'language': arguments.get('language', 'en_core_web_lg'),
89
- 'pii_detector': arguments.get('pii_detector', 'Local'), # Default to local
90
- 'username': arguments.get('username', 'lambda_user'),
91
- 'save_to_user_folders': arguments.get('save_to_user_folders', 'False'),
92
- 'ocr_method': arguments.get('ocr_method', 'Tesseract OCR - all PDF types'),
93
- 'page_min': int(arguments.get('page_min', 0)),
94
- 'page_max': int(arguments.get('page_max', 0)),
95
- 'handwrite_signature_extraction': arguments.get('handwrite_signature_checkbox', ['Extract handwriting', 'Extract signatures']),
96
- 'extract_forms': arguments.get('extract_forms', False),
97
- 'extract_tables': arguments.get('extract_tables', False),
98
- 'extract_layout': arguments.get('extract_layout', False),
99
-
 
 
100
  # General arguments
101
- 'local_redact_entities': arguments.get('local_redact_entities', []),
102
- 'aws_redact_entities': arguments.get('aws_redact_entities', []),
103
- 'cost_code': arguments.get('cost_code', ''),
104
- 'save_logs_to_csv': arguments.get('save_logs_to_csv', 'False'),
105
- 'save_logs_to_dynamodb': arguments.get('save_logs_to_dynamodb', 'False'),
106
- 'display_file_names_in_logs': arguments.get('display_file_names_in_logs', 'True'),
107
- 'upload_logs_to_s3': arguments.get('upload_logs_to_s3', 'False'),
108
- 's3_logs_prefix': arguments.get('s3_logs_prefix', ''),
109
- 'do_initial_clean': arguments.get('do_initial_clean', 'False'),
110
-
 
111
  # PDF/Image specific arguments
112
- 'images_dpi': float(arguments.get('images_dpi', 300.0)),
113
- 'chosen_local_ocr_model': arguments.get('chosen_local_ocr_model', 'tesseract'),
114
- 'preprocess_local_ocr_images': arguments.get('preprocess_local_ocr_images', 'False'),
115
-
 
116
  # Handle optional files like allow/deny lists
117
- 'allow_list_file': arguments.get('allow_list_file', ""),
118
- 'deny_list_file': arguments.get('deny_list_file', ""),
119
- 'redact_whole_page_file': arguments.get('redact_whole_page_file', ""),
120
-
121
  # Tabular/Anonymisation arguments
122
- 'excel_sheets': arguments.get('excel_sheets', []),
123
- 'fuzzy_mistakes': int(arguments.get('fuzzy_mistakes', 0)),
124
- 'match_fuzzy_whole_phrase_bool': arguments.get('match_fuzzy_whole_phrase_bool', 'True'),
125
-
 
126
  # Deduplication specific arguments
127
- 'duplicate_type': arguments.get('duplicate_type', 'pages'),
128
- 'similarity_threshold': float(arguments.get('similarity_threshold', 0.95)),
129
- 'min_word_count': int(arguments.get('min_word_count', 3)),
130
- 'min_consecutive_pages': int(arguments.get('min_consecutive_pages', 1)),
131
- 'greedy_match': arguments.get('greedy_match', 'False'),
132
- 'combine_pages': arguments.get('combine_pages', 'True'),
133
- 'search_query': arguments.get('search_query', ""),
134
- 'text_columns': arguments.get('text_columns', []),
135
- 'remove_duplicate_rows': arguments.get('remove_duplicate_rows', 'True'),
136
- 'anon_strategy': arguments.get('anon_strategy', 'redact'),
137
-
138
  # Textract specific arguments
139
- 'textract_action': arguments.get('textract_action', ''),
140
- 'job_id': arguments.get('job_id', ''),
141
- 'extract_signatures': arguments.get('extract_signatures', False),
142
- 'textract_bucket': arguments.get('textract_bucket', ''),
143
- 'textract_input_prefix': arguments.get('textract_input_prefix', ''),
144
- 'textract_output_prefix': arguments.get('textract_output_prefix', ''),
145
- 's3_textract_document_logs_subfolder': arguments.get('s3_textract_document_logs_subfolder', ''),
146
- 'local_textract_document_logs_subfolder': arguments.get('local_textract_document_logs_subfolder', ''),
147
- 'poll_interval': int(arguments.get('poll_interval', 30)),
148
- 'max_poll_attempts': int(arguments.get('max_poll_attempts', 120)),
149
-
 
 
 
150
  # AWS credentials (use IAM Role instead of keys)
151
- 'aws_access_key': None,
152
- 'aws_secret_key': None,
153
- 'aws_region': os.getenv("AWS_REGION", ""),
154
- 's3_bucket': bucket_name,
155
-
156
  # Set defaults for boolean flags
157
- 'prepare_images': arguments.get('prepare_images', True),
158
- 'compress_redacted_pdf': arguments.get('compress_redacted_pdf', False),
159
- 'return_pdf_end_of_redaction': arguments.get('return_pdf_end_of_redaction', True)
 
 
160
  }
161
 
162
  # Combine extraction options
163
- extraction_options = list(cli_args['handwrite_signature_extraction']) if cli_args['handwrite_signature_extraction'] else []
164
- if cli_args['extract_forms']:
165
- extraction_options.append('Extract forms')
166
- if cli_args['extract_tables']:
167
- extraction_options.append('Extract tables')
168
- if cli_args['extract_layout']:
169
- extraction_options.append('Extract layout')
170
- cli_args['handwrite_signature_extraction'] = extraction_options
 
 
 
 
171
 
172
  # Download optional files if they are specified
173
- allow_list_key = arguments.get('allow_list_file')
174
  if allow_list_key:
175
- allow_list_path = os.path.join(INPUT_DIR, 'allow_list.csv')
176
  download_file_from_s3(bucket_name, allow_list_key, allow_list_path)
177
- cli_args['allow_list_file'] = allow_list_path
178
-
179
- deny_list_key = arguments.get('deny_list_file')
180
  if deny_list_key:
181
- deny_list_path = os.path.join(INPUT_DIR, 'deny_list.csv')
182
  download_file_from_s3(bucket_name, deny_list_key, deny_list_path)
183
- cli_args['deny_list_file'] = deny_list_path
184
 
185
  # 5. Execute the main application logic
186
  try:
@@ -195,10 +216,14 @@ def lambda_handler(event, context):
195
 
196
  # 6. Upload results back to S3
197
  output_s3_prefix = f"output/{os.path.splitext(os.path.basename(input_key))[0]}"
198
- print(f"Uploading contents of {OUTPUT_DIR} to s3://{bucket_name}/{output_s3_prefix}/")
 
 
199
  upload_directory_to_s3(OUTPUT_DIR, bucket_name, output_s3_prefix)
200
 
201
  return {
202
  "statusCode": 200,
203
- "body": json.dumps(f"Processing complete for {input_key}. Output saved to s3://{bucket_name}/{output_s3_prefix}/")
204
- }
 
 
 
 
 
1
  import json
2
+ import os
3
+
4
+ import boto3
5
 
6
  # Import the main function from your CLI script
7
  from cli_redact import main as cli_main
 
17
  INPUT_DIR = os.path.join(TMP_DIR, "input")
18
  OUTPUT_DIR = os.path.join(TMP_DIR, "output")
19
 
20
+
21
  def download_file_from_s3(bucket_name, key, download_path):
22
  """Download a file from S3 to the local filesystem."""
23
  try:
 
27
  print(f"Error downloading from S3: {e}")
28
  raise
29
 
30
+
31
  def upload_directory_to_s3(local_directory, bucket_name, s3_prefix):
32
  """Upload all files from a local directory to an S3 prefix."""
33
  for root, _, files in os.walk(local_directory):
 
36
  # Create a relative path to maintain directory structure if needed
37
  relative_path = os.path.relpath(local_file_path, local_directory)
38
  output_key = os.path.join(s3_prefix, relative_path)
39
+
40
  try:
41
  s3_client.upload_file(local_file_path, bucket_name, output_key)
42
+ print(
43
+ f"Successfully uploaded {local_file_path} to s3://{bucket_name}/{output_key}"
44
+ )
45
  except Exception as e:
46
  print(f"Error uploading to S3: {e}")
47
  raise
48
 
49
+
50
  def lambda_handler(event, context):
51
  print(f"Received event: {json.dumps(event)}")
52
 
 
57
  # 2. Extract information from the event
58
  # Assumes the event is triggered by S3 and may contain an 'arguments' payload
59
  try:
60
+ record = event["Records"][0]
61
+ bucket_name = record["s3"]["bucket"]["name"]
62
+ input_key = record["s3"]["object"]["key"]
63
+
64
  # The user metadata can be used to pass arguments
65
  # This is more robust than embedding them in the main event body
66
  response = s3_client.head_object(Bucket=bucket_name, Key=input_key)
67
+ metadata = response.get("Metadata", {})
68
  # Arguments can be passed as a JSON string in metadata
69
+ arguments = json.loads(metadata.get("arguments", "{}"))
70
 
71
  except (KeyError, IndexError) as e:
72
+ print(
73
+ f"Could not parse S3 event record: {e}. Checking for direct invocation payload."
74
+ )
75
  # Fallback for direct invocation (e.g., from Step Functions or manual test)
76
+ bucket_name = event.get("bucket_name")
77
+ input_key = event.get("input_key")
78
+ arguments = event.get("arguments", {})
79
  if not all([bucket_name, input_key]):
80
+ raise ValueError(
81
+ "Missing 'bucket_name' or 'input_key' in direct invocation event."
82
+ )
83
 
84
  print(f"Processing s3://{bucket_name}/{input_key}")
85
  print(f"With arguments: {arguments}")
86
+
87
  # 3. Download the main input file
88
  input_file_path = os.path.join(INPUT_DIR, os.path.basename(input_key))
89
  download_file_from_s3(bucket_name, input_key, input_file_path)
 
91
  # 4. Prepare arguments for the CLI function
92
  # This dictionary should mirror the one in your app.py's "direct mode"
93
  cli_args = {
94
+ "task": arguments.get("task", "redact"),
95
+ "input_file": input_file_path,
96
+ "output_dir": OUTPUT_DIR,
97
+ "input_dir": INPUT_DIR,
98
+ "language": arguments.get("language", "en_core_web_lg"),
99
+ "pii_detector": arguments.get("pii_detector", "Local"), # Default to local
100
+ "username": arguments.get("username", "lambda_user"),
101
+ "save_to_user_folders": arguments.get("save_to_user_folders", "False"),
102
+ "ocr_method": arguments.get("ocr_method", "Tesseract OCR - all PDF types"),
103
+ "page_min": int(arguments.get("page_min", 0)),
104
+ "page_max": int(arguments.get("page_max", 0)),
105
+ "handwrite_signature_extraction": arguments.get(
106
+ "handwrite_signature_checkbox",
107
+ ["Extract handwriting", "Extract signatures"],
108
+ ),
109
+ "extract_forms": arguments.get("extract_forms", False),
110
+ "extract_tables": arguments.get("extract_tables", False),
111
+ "extract_layout": arguments.get("extract_layout", False),
112
  # General arguments
113
+ "local_redact_entities": arguments.get("local_redact_entities", []),
114
+ "aws_redact_entities": arguments.get("aws_redact_entities", []),
115
+ "cost_code": arguments.get("cost_code", ""),
116
+ "save_logs_to_csv": arguments.get("save_logs_to_csv", "False"),
117
+ "save_logs_to_dynamodb": arguments.get("save_logs_to_dynamodb", "False"),
118
+ "display_file_names_in_logs": arguments.get(
119
+ "display_file_names_in_logs", "True"
120
+ ),
121
+ "upload_logs_to_s3": arguments.get("upload_logs_to_s3", "False"),
122
+ "s3_logs_prefix": arguments.get("s3_logs_prefix", ""),
123
+ "do_initial_clean": arguments.get("do_initial_clean", "False"),
124
  # PDF/Image specific arguments
125
+ "images_dpi": float(arguments.get("images_dpi", 300.0)),
126
+ "chosen_local_ocr_model": arguments.get("chosen_local_ocr_model", "tesseract"),
127
+ "preprocess_local_ocr_images": arguments.get(
128
+ "preprocess_local_ocr_images", "False"
129
+ ),
130
  # Handle optional files like allow/deny lists
131
+ "allow_list_file": arguments.get("allow_list_file", ""),
132
+ "deny_list_file": arguments.get("deny_list_file", ""),
133
+ "redact_whole_page_file": arguments.get("redact_whole_page_file", ""),
 
134
  # Tabular/Anonymisation arguments
135
+ "excel_sheets": arguments.get("excel_sheets", []),
136
+ "fuzzy_mistakes": int(arguments.get("fuzzy_mistakes", 0)),
137
+ "match_fuzzy_whole_phrase_bool": arguments.get(
138
+ "match_fuzzy_whole_phrase_bool", "True"
139
+ ),
140
  # Deduplication specific arguments
141
+ "duplicate_type": arguments.get("duplicate_type", "pages"),
142
+ "similarity_threshold": float(arguments.get("similarity_threshold", 0.95)),
143
+ "min_word_count": int(arguments.get("min_word_count", 3)),
144
+ "min_consecutive_pages": int(arguments.get("min_consecutive_pages", 1)),
145
+ "greedy_match": arguments.get("greedy_match", "False"),
146
+ "combine_pages": arguments.get("combine_pages", "True"),
147
+ "search_query": arguments.get("search_query", ""),
148
+ "text_columns": arguments.get("text_columns", []),
149
+ "remove_duplicate_rows": arguments.get("remove_duplicate_rows", "True"),
150
+ "anon_strategy": arguments.get("anon_strategy", "redact"),
 
151
  # Textract specific arguments
152
+ "textract_action": arguments.get("textract_action", ""),
153
+ "job_id": arguments.get("job_id", ""),
154
+ "extract_signatures": arguments.get("extract_signatures", False),
155
+ "textract_bucket": arguments.get("textract_bucket", ""),
156
+ "textract_input_prefix": arguments.get("textract_input_prefix", ""),
157
+ "textract_output_prefix": arguments.get("textract_output_prefix", ""),
158
+ "s3_textract_document_logs_subfolder": arguments.get(
159
+ "s3_textract_document_logs_subfolder", ""
160
+ ),
161
+ "local_textract_document_logs_subfolder": arguments.get(
162
+ "local_textract_document_logs_subfolder", ""
163
+ ),
164
+ "poll_interval": int(arguments.get("poll_interval", 30)),
165
+ "max_poll_attempts": int(arguments.get("max_poll_attempts", 120)),
166
  # AWS credentials (use IAM Role instead of keys)
167
+ "aws_access_key": None,
168
+ "aws_secret_key": None,
169
+ "aws_region": os.getenv("AWS_REGION", ""),
170
+ "s3_bucket": bucket_name,
 
171
  # Set defaults for boolean flags
172
+ "prepare_images": arguments.get("prepare_images", True),
173
+ "compress_redacted_pdf": arguments.get("compress_redacted_pdf", False),
174
+ "return_pdf_end_of_redaction": arguments.get(
175
+ "return_pdf_end_of_redaction", True
176
+ ),
177
  }
178
 
179
  # Combine extraction options
180
+ extraction_options = (
181
+ list(cli_args["handwrite_signature_extraction"])
182
+ if cli_args["handwrite_signature_extraction"]
183
+ else []
184
+ )
185
+ if cli_args["extract_forms"]:
186
+ extraction_options.append("Extract forms")
187
+ if cli_args["extract_tables"]:
188
+ extraction_options.append("Extract tables")
189
+ if cli_args["extract_layout"]:
190
+ extraction_options.append("Extract layout")
191
+ cli_args["handwrite_signature_extraction"] = extraction_options
192
 
193
  # Download optional files if they are specified
194
+ allow_list_key = arguments.get("allow_list_file")
195
  if allow_list_key:
196
+ allow_list_path = os.path.join(INPUT_DIR, "allow_list.csv")
197
  download_file_from_s3(bucket_name, allow_list_key, allow_list_path)
198
+ cli_args["allow_list_file"] = allow_list_path
199
+
200
+ deny_list_key = arguments.get("deny_list_file")
201
  if deny_list_key:
202
+ deny_list_path = os.path.join(INPUT_DIR, "deny_list.csv")
203
  download_file_from_s3(bucket_name, deny_list_key, deny_list_path)
204
+ cli_args["deny_list_file"] = deny_list_path
205
 
206
  # 5. Execute the main application logic
207
  try:
 
216
 
217
  # 6. Upload results back to S3
218
  output_s3_prefix = f"output/{os.path.splitext(os.path.basename(input_key))[0]}"
219
+ print(
220
+ f"Uploading contents of {OUTPUT_DIR} to s3://{bucket_name}/{output_s3_prefix}/"
221
+ )
222
  upload_directory_to_s3(OUTPUT_DIR, bucket_name, output_s3_prefix)
223
 
224
  return {
225
  "statusCode": 200,
226
+ "body": json.dumps(
227
+ f"Processing complete for {input_key}. Output saved to s3://{bucket_name}/{output_s3_prefix}/"
228
+ ),
229
+ }
load_dynamo_logs.py CHANGED
@@ -1,20 +1,25 @@
1
- import boto3
2
  import csv
3
- from decimal import Decimal
4
  import datetime
5
- from boto3.dynamodb.conditions import Key
6
 
7
- from tools.config import AWS_REGION, ACCESS_LOG_DYNAMODB_TABLE_NAME, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, USAGE_LOG_DYNAMODB_TABLE_NAME, OUTPUT_FOLDER
 
 
 
 
 
 
8
 
9
  # Replace with your actual table name and region
10
- TABLE_NAME = USAGE_LOG_DYNAMODB_TABLE_NAME # Choose as appropriate
11
  REGION = AWS_REGION
12
- CSV_OUTPUT = OUTPUT_FOLDER + 'dynamodb_logs_export.csv'
13
 
14
  # Create DynamoDB resource
15
- dynamodb = boto3.resource('dynamodb', region_name=REGION)
16
  table = dynamodb.Table(TABLE_NAME)
17
 
 
18
  # Helper function to convert Decimal to float or int
19
  def convert_types(item):
20
  new_item = {}
@@ -25,11 +30,11 @@ def convert_types(item):
25
  # Handle Strings that might be dates
26
  elif isinstance(value, str):
27
  try:
28
- # Attempt to parse a common ISO 8601 format.
29
  # The .replace() handles the 'Z' for Zulu/UTC time.
30
- dt_obj = datetime.datetime.fromisoformat(value.replace('Z', '+00:00'))
31
  # Now that we have a datetime object, format it as desired
32
- new_item[key] = dt_obj.strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]
33
  except (ValueError, TypeError):
34
  # If it fails to parse, it's just a regular string
35
  new_item[key] = value
@@ -38,18 +43,20 @@ def convert_types(item):
38
  new_item[key] = value
39
  return new_item
40
 
 
41
  # Paginated scan
42
  def scan_table():
43
  items = []
44
  response = table.scan()
45
- items.extend(response['Items'])
46
 
47
- while 'LastEvaluatedKey' in response:
48
- response = table.scan(ExclusiveStartKey=response['LastEvaluatedKey'])
49
- items.extend(response['Items'])
50
 
51
  return items
52
 
 
53
  # Export to CSV
54
  # Export to CSV
55
  def export_to_csv(items, output_path, fields_to_drop: list = None):
@@ -59,25 +66,22 @@ def export_to_csv(items, output_path, fields_to_drop: list = None):
59
 
60
  # Use a set for efficient lookup
61
  drop_set = set(fields_to_drop or [])
62
-
63
  # Get a comprehensive list of all possible headers from all items
64
  all_keys = set()
65
  for item in items:
66
  all_keys.update(item.keys())
67
-
68
  # Determine the final fieldnames by subtracting the ones to drop
69
  fieldnames = sorted(list(all_keys - drop_set))
70
-
71
  print("Final CSV columns will be:", fieldnames)
72
 
73
- with open(output_path, 'w', newline='', encoding='utf-8-sig') as csvfile:
74
  # The key fix is here: extrasaction='ignore'
75
  # restval='' is also good practice to handle rows that are missing a key
76
  writer = csv.DictWriter(
77
- csvfile,
78
- fieldnames=fieldnames,
79
- extrasaction='ignore',
80
- restval=''
81
  )
82
  writer.writeheader()
83
 
@@ -88,6 +92,7 @@ def export_to_csv(items, output_path, fields_to_drop: list = None):
88
 
89
  print(f"Exported {len(items)} items to {output_path}")
90
 
 
91
  # Run export
92
  items = scan_table()
93
- export_to_csv(items, CSV_OUTPUT, fields_to_drop=[])
 
 
1
  import csv
 
2
  import datetime
3
+ from decimal import Decimal
4
 
5
+ import boto3
6
+
7
+ from tools.config import (
8
+ AWS_REGION,
9
+ OUTPUT_FOLDER,
10
+ USAGE_LOG_DYNAMODB_TABLE_NAME,
11
+ )
12
 
13
  # Replace with your actual table name and region
14
+ TABLE_NAME = USAGE_LOG_DYNAMODB_TABLE_NAME # Choose as appropriate
15
  REGION = AWS_REGION
16
+ CSV_OUTPUT = OUTPUT_FOLDER + "dynamodb_logs_export.csv"
17
 
18
  # Create DynamoDB resource
19
+ dynamodb = boto3.resource("dynamodb", region_name=REGION)
20
  table = dynamodb.Table(TABLE_NAME)
21
 
22
+
23
  # Helper function to convert Decimal to float or int
24
  def convert_types(item):
25
  new_item = {}
 
30
  # Handle Strings that might be dates
31
  elif isinstance(value, str):
32
  try:
33
+ # Attempt to parse a common ISO 8601 format.
34
  # The .replace() handles the 'Z' for Zulu/UTC time.
35
+ dt_obj = datetime.datetime.fromisoformat(value.replace("Z", "+00:00"))
36
  # Now that we have a datetime object, format it as desired
37
+ new_item[key] = dt_obj.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
38
  except (ValueError, TypeError):
39
  # If it fails to parse, it's just a regular string
40
  new_item[key] = value
 
43
  new_item[key] = value
44
  return new_item
45
 
46
+
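As an illustration of the date handling above, an ISO 8601 string with a trailing 'Z' is reformatted to a millisecond-precision timestamp (the Decimal handling is defined elsewhere in the script and is not shown here):

```python
import datetime

value = "2025-04-09T12:34:56Z"  # example value only
dt_obj = datetime.datetime.fromisoformat(value.replace("Z", "+00:00"))
print(dt_obj.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3])  # -> 2025-04-09 12:34:56.000
```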
47
  # Paginated scan
48
  def scan_table():
49
  items = []
50
  response = table.scan()
51
+ items.extend(response["Items"])
52
 
53
+ while "LastEvaluatedKey" in response:
54
+ response = table.scan(ExclusiveStartKey=response["LastEvaluatedKey"])
55
+ items.extend(response["Items"])
56
 
57
  return items
58
 
59
+
60
  # Export to CSV
61
62
  def export_to_csv(items, output_path, fields_to_drop: list = None):
 
66
 
67
  # Use a set for efficient lookup
68
  drop_set = set(fields_to_drop or [])
69
+
70
  # Get a comprehensive list of all possible headers from all items
71
  all_keys = set()
72
  for item in items:
73
  all_keys.update(item.keys())
74
+
75
  # Determine the final fieldnames by subtracting the ones to drop
76
  fieldnames = sorted(list(all_keys - drop_set))
77
+
78
  print("Final CSV columns will be:", fieldnames)
79
 
80
+ with open(output_path, "w", newline="", encoding="utf-8-sig") as csvfile:
81
  # The key fix is here: extrasaction='ignore'
82
  # restval='' is also good practice to handle rows that are missing a key
83
  writer = csv.DictWriter(
84
+ csvfile, fieldnames=fieldnames, extrasaction="ignore", restval=""
 
 
 
85
  )
86
  writer.writeheader()
87
 
 
92
 
93
  print(f"Exported {len(items)} items to {output_path}")
94
 
95
+
96
  # Run export
97
  items = scan_table()
98
+ export_to_csv(items, CSV_OUTPUT, fields_to_drop=[])
load_s3_logs.py CHANGED
@@ -1,39 +1,53 @@
 
 
 
1
  import boto3
2
  import pandas as pd
3
- from io import StringIO
4
- from datetime import datetime
5
- from tools.config import DOCUMENT_REDACTION_BUCKET, AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION, OUTPUT_FOLDER
 
 
 
 
 
6
 
7
  # Combine together log files that can be then used for e.g. dashboarding and financial tracking.
8
 
9
  # S3 setup. Try to use provided keys (needs S3 permissions), otherwise assume AWS SSO connection
10
  if AWS_ACCESS_KEY and AWS_SECRET_KEY and AWS_REGION:
11
- s3 = boto3.client('s3',
12
- aws_access_key_id=AWS_ACCESS_KEY,
13
- aws_secret_access_key=AWS_SECRET_KEY,
14
- region_name=AWS_REGION)
15
- else: s3 = boto3.client('s3')
 
 
 
16
 
17
  bucket_name = DOCUMENT_REDACTION_BUCKET
18
- prefix = 'usage/' # 'feedback/' # 'logs/' # Change as needed - top-level folder where logs are stored
19
- earliest_date = '20250409' # Earliest date of logs folder retrieved
20
- latest_date = '20250423' # Latest date of logs folder retrieved
 
21
 
22
  # Function to list all files in a folder
23
  def list_files_in_s3(bucket, prefix):
24
  response = s3.list_objects_v2(Bucket=bucket, Prefix=prefix)
25
- if 'Contents' in response:
26
- return [content['Key'] for content in response['Contents']]
27
  return []
28
 
 
29
  # Function to filter date range
30
  def is_within_date_range(date_str, start_date, end_date):
31
- date_obj = datetime.strptime(date_str, '%Y%m%d')
32
  return start_date <= date_obj <= end_date
33
 
 
34
  # Define the date range
35
- start_date = datetime.strptime(earliest_date, '%Y%m%d') # Replace with your start date
36
- end_date = datetime.strptime(latest_date, '%Y%m%d') # Replace with your end date
37
 
38
  # List all subfolders under 'usage/'
39
  all_files = list_files_in_s3(bucket_name, prefix)
@@ -41,10 +55,13 @@ all_files = list_files_in_s3(bucket_name, prefix)
41
  # Filter based on date range
42
  log_files = []
43
  for file in all_files:
44
- parts = file.split('/')
45
  if len(parts) >= 3:
46
  date_str = parts[1]
47
- if is_within_date_range(date_str, start_date, end_date) and parts[-1] == 'log.csv':
 
 
 
48
  log_files.append(file)
49
 
50
  # Download, read and concatenate CSV files into a pandas DataFrame
@@ -53,9 +70,10 @@ for log_file in log_files:
53
  # Download the file
54
  obj = s3.get_object(Bucket=bucket_name, Key=log_file)
55
  try:
56
- csv_content = obj['Body'].read().decode('utf-8')
57
- except:
58
- csv_content = obj['Body'].read().decode('latin-1')
 
59
 
60
  # Read CSV content into pandas DataFrame
61
  try:
@@ -71,7 +89,7 @@ if df_list:
71
  concatenated_df = pd.concat(df_list, ignore_index=True)
72
 
73
  # Save the concatenated DataFrame to a CSV file
74
- concatenated_df.to_csv(OUTPUT_FOLDER + 'consolidated_s3_logs.csv', index=False)
75
  print("Consolidated CSV saved as 'consolidated_s3_logs.csv'")
76
  else:
77
  print("No log files found in the given date range.")
 
1
+ from datetime import datetime
2
+ from io import StringIO
3
+
4
  import boto3
5
  import pandas as pd
6
+
7
+ from tools.config import (
8
+ AWS_ACCESS_KEY,
9
+ AWS_REGION,
10
+ AWS_SECRET_KEY,
11
+ DOCUMENT_REDACTION_BUCKET,
12
+ OUTPUT_FOLDER,
13
+ )
14
 
15
  # Combine together log files that can then be used for e.g. dashboarding and financial tracking.
16
 
17
  # S3 setup. Try to use provided keys (needs S3 permissions), otherwise assume AWS SSO connection
18
  if AWS_ACCESS_KEY and AWS_SECRET_KEY and AWS_REGION:
19
+ s3 = boto3.client(
20
+ "s3",
21
+ aws_access_key_id=AWS_ACCESS_KEY,
22
+ aws_secret_access_key=AWS_SECRET_KEY,
23
+ region_name=AWS_REGION,
24
+ )
25
+ else:
26
+ s3 = boto3.client("s3")
27
 
28
  bucket_name = DOCUMENT_REDACTION_BUCKET
29
+ prefix = "usage/" # 'feedback/' # 'logs/' # Change as needed - top-level folder where logs are stored
30
+ earliest_date = "20250409" # Earliest date of logs folder retrieved
31
+ latest_date = "20250423" # Latest date of logs folder retrieved
32
+
33
 
34
  # Function to list all files in a folder
35
  def list_files_in_s3(bucket, prefix):
36
  response = s3.list_objects_v2(Bucket=bucket, Prefix=prefix)
37
+ if "Contents" in response:
38
+ return [content["Key"] for content in response["Contents"]]
39
  return []
40
 
41
+
42
  # Function to filter date range
43
  def is_within_date_range(date_str, start_date, end_date):
44
+ date_obj = datetime.strptime(date_str, "%Y%m%d")
45
  return start_date <= date_obj <= end_date
46
 
47
+
48
  # Define the date range
49
+ start_date = datetime.strptime(earliest_date, "%Y%m%d") # Replace with your start date
50
+ end_date = datetime.strptime(latest_date, "%Y%m%d") # Replace with your end date
51
 
52
  # List all subfolders under 'usage/'
53
  all_files = list_files_in_s3(bucket_name, prefix)
 
55
  # Filter based on date range
56
  log_files = []
57
  for file in all_files:
58
+ parts = file.split("/")
59
  if len(parts) >= 3:
60
  date_str = parts[1]
61
+ if (
62
+ is_within_date_range(date_str, start_date, end_date)
63
+ and parts[-1] == "log.csv"
64
+ ):
65
  log_files.append(file)
66
 
67
  # Download, read and concatenate CSV files into a pandas DataFrame
 
70
  # Download the file
71
  obj = s3.get_object(Bucket=bucket_name, Key=log_file)
72
  try:
73
+ csv_content = obj["Body"].read().decode("utf-8")
74
+ except Exception as e:
75
+ print("Could not load in log file:", log_file, "due to:", e)
76
+ csv_content = obj["Body"].read().decode("latin-1")
77
 
78
  # Read CSV content into pandas DataFrame
79
  try:
 
89
  concatenated_df = pd.concat(df_list, ignore_index=True)
90
 
91
  # Save the concatenated DataFrame to a CSV file
92
+ concatenated_df.to_csv(OUTPUT_FOLDER + "consolidated_s3_logs.csv", index=False)
93
  print("Consolidated CSV saved as 'consolidated_s3_logs.csv'")
94
  else:
95
  print("No log files found in the given date range.")
pyproject.toml CHANGED
@@ -52,7 +52,16 @@ test = ["pytest", "pytest-cov"]
52
  # Configuration for Ruff linter:
53
  [tool.ruff]
54
  line-length = 88
 
 
55
  select = ["E", "F", "I"]
 
 
 
 
 
 
 
56
 
57
  # Configuration for the Black formatter:
58
  [tool.black]
 
52
  # Configuration for Ruff linter:
53
  [tool.ruff]
54
  line-length = 88
55
+
56
+ [tool.ruff.lint]
57
  select = ["E", "F", "I"]
58
+ ignore = [
59
+ "E501", # line-too-long (handled with Black)
60
+ "E402", # module-import-not-at-top-of-file (sometimes needed for conditional imports)
61
+ ]
62
+
63
+ [tool.ruff.lint.per-file-ignores]
64
+ "__init__.py" = ["F401"] # Allow unused imports in __init__.py
65
 
66
  # Configuration for the Black formatter:
67
  [tool.black]
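The move of `select` under `[tool.ruff.lint]` matches the layout newer Ruff releases expect, where top-level lint settings in `[tool.ruff]` are deprecated. As a hedged illustration of the resulting table structure (not part of the commit), the configuration can be inspected with the standard-library TOML parser:

```python
# Minimal sketch: print the [tool.ruff.lint] settings defined above.
# tomllib ships with Python 3.11+; older interpreters can use the `tomli` package.
import tomllib

with open("pyproject.toml", "rb") as f:
    config = tomllib.load(f)

lint = config["tool"]["ruff"]["lint"]
print("selected rule groups:", lint["select"])        # ['E', 'F', 'I']
print("ignored rules:", lint["ignore"])               # ['E501', 'E402']
print("per-file ignores:", lint["per-file-ignores"])  # {'__init__.py': ['F401']}
```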
test/GUI_TEST_README.md ADDED
@@ -0,0 +1,111 @@
 
 
1
+ # GUI Testing for Document Redaction App
2
+
3
+ This directory contains tests specifically for verifying that the GUI application (`app.py`) loads correctly.
4
+
5
+ ## Test Files
6
+
7
+ ### `test_gui_only.py`
8
+ A standalone script that tests only the GUI functionality. This is useful for:
9
+ - Quick verification that the Gradio interface loads without errors
10
+ - CI/CD pipelines where you want to test GUI separately from CLI functionality
11
+ - Development testing when you only want to check GUI components
12
+
13
+ **Usage:**
14
+
15
+ Option 1 - Manual activation:
16
+ ```bash
17
+ conda activate redaction
18
+ cd test
19
+ python test_gui_only.py
20
+ ```
21
+
22
+ Option 2 - Using helper scripts (Windows):
23
+ ```bash
24
+ cd test
25
+ # For Command Prompt:
26
+ run_gui_test.bat
27
+
28
+ # For PowerShell:
29
+ .\run_gui_test.ps1
30
+ ```
31
+
32
+ ### `test.py` (Updated)
33
+ The main test suite now includes both CLI and GUI tests. The GUI tests are in the `TestGUIApp` class.
34
+
35
+ **Usage:**
36
+
37
+ Option 1 - Manual activation:
38
+ ```bash
39
+ conda activate redaction
40
+ cd test
41
+ python test.py
42
+ ```
43
+
44
+ Option 2 - Using helper scripts (Windows; note that these run only the GUI tests via `test_gui_only.py`, not the full suite):
45
+ ```bash
46
+ cd test
47
+ # For Command Prompt:
48
+ run_gui_test.bat
49
+
50
+ # For PowerShell:
51
+ .\run_gui_test.ps1
52
+ ```
53
+
54
+ ## What the GUI Tests Check
55
+
56
+ 1. **App Import and Initialization** (`test_app_import_and_initialization`)
57
+ - Verifies that `app.py` can be imported without errors
58
+ - Checks that the Gradio `app` object is created successfully
59
+ - Ensures the app is a proper Gradio Blocks instance
60
+
61
+ 2. **App Launch in Headless Mode** (`test_app_launch_headless`)
62
+ - Tests that the app can be launched without opening a browser
63
+ - Verifies the Gradio server starts successfully
64
+ - Uses threading to prevent blocking the test execution
65
+
66
+ 3. **Configuration Loading** (`test_app_configuration_loading`)
67
+ - Verifies that configuration variables are loaded correctly
68
+ - Checks key settings like server port, file size limits, language settings
69
+ - Ensures the app has access to all required configuration
70
+
71
+ ## Test Requirements
72
+
73
+ - **Conda environment 'redaction' must be activated** before running tests
74
+ - Python environment with all dependencies installed
75
+ - Access to the `tools.config` module
76
+ - Gradio and related GUI dependencies (including `gradio_image_annotation`)
77
+ - The `app.py` file in the parent directory
78
+
79
+ ### Prerequisites
80
+
81
+ Before running the GUI tests, ensure you have activated the conda environment:
82
+
83
+ ```bash
84
+ conda activate redaction
85
+ ```
86
+
87
+ The `gradio_image_annotation` package is already installed in the 'redaction' environment.
88
+
89
+ ## Expected Behavior
90
+
91
+ - All tests should pass if the GUI loads correctly
92
+ - Tests will fail if there are import errors, missing dependencies, or configuration issues
93
+ - The headless launch test may take up to 10 seconds to complete
94
+
95
+ ## Troubleshooting
96
+
97
+ If tests fail:
98
+ 1. Check that all dependencies are installed (`pip install -r requirements.txt`)
99
+ 2. Verify that `app.py` exists in the parent directory
100
+ 3. Ensure configuration files are properly set up
101
+ 4. Check for any missing environment variables or configuration issues
102
+
103
+ ## Integration with CI/CD
104
+
105
+ These tests are designed to run in headless environments and are suitable for:
106
+ - GitHub Actions
107
+ - Jenkins pipelines
108
+ - Docker containers
109
+ - Any automated testing environment
110
+
111
+ The tests do not require a display or browser to be available.
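To make the README concrete, here is a hedged, minimal sketch of the kind of headless load check it describes. It is not the repository's `test_gui_only.py`; it assumes only what the README states, namely that `app.py` exposes a Gradio Blocks object named `app`. The repository tests launch the server in a background thread; the sketch uses `prevent_thread_lock=True` instead, as a simpler way to launch without blocking.

```python
# Hedged sketch of a headless GUI load test (assumes app.py exposes `app`).
import unittest

import gradio as gr


class TestGUILoads(unittest.TestCase):
    def test_app_imports_as_blocks(self):
        from app import app  # the import itself should not raise

        self.assertIsInstance(app, gr.Blocks)

    def test_app_launches_headless(self):
        from app import app

        # prevent_thread_lock returns control to the test instead of blocking,
        # and inbrowser=False keeps the run fully headless.
        app.launch(prevent_thread_lock=True, inbrowser=False, quiet=True)
        try:
            self.assertTrue(app.local_url, "server URL should be set after launch")
        finally:
            app.close()


if __name__ == "__main__":
    unittest.main()
```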
test/demo_single_test.py CHANGED
@@ -7,84 +7,100 @@ to test a specific CLI example.
7
  """
8
 
9
  import os
 
10
  import sys
11
  import tempfile
12
- import shutil
13
 
14
  # Add the parent directory to the path
15
  sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
16
 
17
  from test.test import run_cli_redact
18
 
 
19
  def demo_pdf_redaction():
20
  """Demonstrate how to run a single PDF redaction test."""
21
  print("=== Demo: PDF Redaction with Default Settings ===")
22
-
23
  # Set up paths
24
- script_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), "cli_redact.py")
25
- input_file = os.path.join(os.path.dirname(os.path.dirname(__file__)), "example_data", "example_of_emails_sent_to_a_professor_before_applying.pdf")
 
 
 
 
 
 
26
  output_dir = tempfile.mkdtemp(prefix="demo_output_")
27
-
28
  print(f"Script: {script_path}")
29
  print(f"Input: {input_file}")
30
  print(f"Output: {output_dir}")
31
-
32
  # Check if files exist
33
  if not os.path.isfile(script_path):
34
  print(f"❌ Script not found: {script_path}")
35
  return False
36
-
37
  if not os.path.isfile(input_file):
38
  print(f"❌ Input file not found: {input_file}")
39
- print("Make sure you have the example data files in the example_data/ directory")
 
 
40
  return False
41
-
42
  try:
43
  # Run the test
44
  print("\nRunning PDF redaction with default settings...")
45
  result = run_cli_redact(
46
- script_path=script_path,
47
- input_file=input_file,
48
- output_dir=output_dir
49
  )
50
-
51
  if result:
52
  print("✅ Test completed successfully!")
53
  print(f"Check the output directory for results: {output_dir}")
54
  else:
55
  print("❌ Test failed!")
56
-
57
  return result
58
-
59
  finally:
60
  # Clean up
61
  if os.path.exists(output_dir):
62
  shutil.rmtree(output_dir)
63
  print(f"Cleaned up: {output_dir}")
64
 
 
65
  def demo_csv_anonymisation():
66
  """Demonstrate how to run a CSV anonymisation test."""
67
  print("\n=== Demo: CSV Anonymisation ===")
68
-
69
  # Set up paths
70
- script_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), "cli_redact.py")
71
- input_file = os.path.join(os.path.dirname(os.path.dirname(__file__)), "example_data", "combined_case_notes.csv")
 
 
 
 
 
 
72
  output_dir = tempfile.mkdtemp(prefix="demo_output_")
73
-
74
  print(f"Script: {script_path}")
75
  print(f"Input: {input_file}")
76
  print(f"Output: {output_dir}")
77
-
78
  # Check if files exist
79
  if not os.path.isfile(script_path):
80
  print(f"❌ Script not found: {script_path}")
81
  return False
82
-
83
  if not os.path.isfile(input_file):
84
  print(f"❌ Input file not found: {input_file}")
85
- print("Make sure you have the example data files in the example_data/ directory")
 
 
86
  return False
87
-
88
  try:
89
  # Run the test
90
  print("\nRunning CSV anonymisation...")
@@ -93,40 +109,41 @@ def demo_csv_anonymisation():
93
  input_file=input_file,
94
  output_dir=output_dir,
95
  text_columns=["Case Note", "Client"],
96
- anon_strategy="replace_redacted"
97
  )
98
-
99
  if result:
100
  print("✅ Test completed successfully!")
101
  print(f"Check the output directory for results: {output_dir}")
102
  else:
103
  print("❌ Test failed!")
104
-
105
  return result
106
-
107
  finally:
108
  # Clean up
109
  if os.path.exists(output_dir):
110
  shutil.rmtree(output_dir)
111
  print(f"Cleaned up: {output_dir}")
112
 
 
113
  if __name__ == "__main__":
114
  print("CLI Redaction Test Demo")
115
  print("=" * 50)
116
  print("This script demonstrates how to run individual tests.")
117
  print("=" * 50)
118
-
119
  # Run the demos
120
  success1 = demo_pdf_redaction()
121
  success2 = demo_csv_anonymisation()
122
-
123
  print("\n" + "=" * 50)
124
  print("Demo Summary")
125
  print("=" * 50)
126
  print(f"PDF Redaction: {'✅ PASSED' if success1 else '❌ FAILED'}")
127
  print(f"CSV Anonymisation: {'✅ PASSED' if success2 else '❌ FAILED'}")
128
-
129
  overall_success = success1 and success2
130
  print(f"\nOverall: {'✅ PASSED' if overall_success else '❌ FAILED'}")
131
-
132
  sys.exit(0 if overall_success else 1)
 
7
  """
8
 
9
  import os
10
+ import shutil
11
  import sys
12
  import tempfile
 
13
 
14
  # Add the parent directory to the path
15
  sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
16
 
17
  from test.test import run_cli_redact
18
 
19
+
20
  def demo_pdf_redaction():
21
  """Demonstrate how to run a single PDF redaction test."""
22
  print("=== Demo: PDF Redaction with Default Settings ===")
23
+
24
  # Set up paths
25
+ script_path = os.path.join(
26
+ os.path.dirname(os.path.dirname(__file__)), "cli_redact.py"
27
+ )
28
+ input_file = os.path.join(
29
+ os.path.dirname(os.path.dirname(__file__)),
30
+ "example_data",
31
+ "example_of_emails_sent_to_a_professor_before_applying.pdf",
32
+ )
33
  output_dir = tempfile.mkdtemp(prefix="demo_output_")
34
+
35
  print(f"Script: {script_path}")
36
  print(f"Input: {input_file}")
37
  print(f"Output: {output_dir}")
38
+
39
  # Check if files exist
40
  if not os.path.isfile(script_path):
41
  print(f"❌ Script not found: {script_path}")
42
  return False
43
+
44
  if not os.path.isfile(input_file):
45
  print(f"❌ Input file not found: {input_file}")
46
+ print(
47
+ "Make sure you have the example data files in the example_data/ directory"
48
+ )
49
  return False
50
+
51
  try:
52
  # Run the test
53
  print("\nRunning PDF redaction with default settings...")
54
  result = run_cli_redact(
55
+ script_path=script_path, input_file=input_file, output_dir=output_dir
 
 
56
  )
57
+
58
  if result:
59
  print("✅ Test completed successfully!")
60
  print(f"Check the output directory for results: {output_dir}")
61
  else:
62
  print("❌ Test failed!")
63
+
64
  return result
65
+
66
  finally:
67
  # Clean up
68
  if os.path.exists(output_dir):
69
  shutil.rmtree(output_dir)
70
  print(f"Cleaned up: {output_dir}")
71
 
72
+
73
  def demo_csv_anonymisation():
74
  """Demonstrate how to run a CSV anonymisation test."""
75
  print("\n=== Demo: CSV Anonymisation ===")
76
+
77
  # Set up paths
78
+ script_path = os.path.join(
79
+ os.path.dirname(os.path.dirname(__file__)), "cli_redact.py"
80
+ )
81
+ input_file = os.path.join(
82
+ os.path.dirname(os.path.dirname(__file__)),
83
+ "example_data",
84
+ "combined_case_notes.csv",
85
+ )
86
  output_dir = tempfile.mkdtemp(prefix="demo_output_")
87
+
88
  print(f"Script: {script_path}")
89
  print(f"Input: {input_file}")
90
  print(f"Output: {output_dir}")
91
+
92
  # Check if files exist
93
  if not os.path.isfile(script_path):
94
  print(f"❌ Script not found: {script_path}")
95
  return False
96
+
97
  if not os.path.isfile(input_file):
98
  print(f"❌ Input file not found: {input_file}")
99
+ print(
100
+ "Make sure you have the example data files in the example_data/ directory"
101
+ )
102
  return False
103
+
104
  try:
105
  # Run the test
106
  print("\nRunning CSV anonymisation...")
 
109
  input_file=input_file,
110
  output_dir=output_dir,
111
  text_columns=["Case Note", "Client"],
112
+ anon_strategy="replace_redacted",
113
  )
114
+
115
  if result:
116
  print("✅ Test completed successfully!")
117
  print(f"Check the output directory for results: {output_dir}")
118
  else:
119
  print("❌ Test failed!")
120
+
121
  return result
122
+
123
  finally:
124
  # Clean up
125
  if os.path.exists(output_dir):
126
  shutil.rmtree(output_dir)
127
  print(f"Cleaned up: {output_dir}")
128
 
129
+
130
  if __name__ == "__main__":
131
  print("CLI Redaction Test Demo")
132
  print("=" * 50)
133
  print("This script demonstrates how to run individual tests.")
134
  print("=" * 50)
135
+
136
  # Run the demos
137
  success1 = demo_pdf_redaction()
138
  success2 = demo_csv_anonymisation()
139
+
140
  print("\n" + "=" * 50)
141
  print("Demo Summary")
142
  print("=" * 50)
143
  print(f"PDF Redaction: {'✅ PASSED' if success1 else '❌ FAILED'}")
144
  print(f"CSV Anonymisation: {'✅ PASSED' if success2 else '❌ FAILED'}")
145
+
146
  overall_success = success1 and success2
147
  print(f"\nOverall: {'✅ PASSED' if overall_success else '❌ FAILED'}")
148
+
149
  sys.exit(0 if overall_success else 1)
test/run_gui_test.bat ADDED
@@ -0,0 +1,26 @@
 
 
1
+ @echo off
2
+ REM Batch script to run GUI tests with conda environment activated
3
+ REM This script activates the 'redaction' conda environment and runs the GUI tests
4
+
5
+ echo Activating conda environment 'redaction'...
6
+ call conda activate redaction
7
+
8
+ if %errorlevel% neq 0 (
9
+ echo Failed to activate conda environment 'redaction'
10
+ echo Please ensure conda is installed and the 'redaction' environment exists
11
+ pause
12
+ exit /b 1
13
+ )
14
+
15
+ echo Running GUI tests...
16
+ python test_gui_only.py
17
+
18
+ if %errorlevel% neq 0 (
19
+ echo GUI tests failed
20
+ pause
21
+ exit /b 1
22
+ ) else (
23
+ echo GUI tests passed successfully
24
+ )
25
+
26
+ pause
test/run_gui_test.ps1 ADDED
@@ -0,0 +1,34 @@
 
 
 
 
1
+ # PowerShell script to run GUI tests with conda environment activated
2
+ # This script activates the 'redaction' conda environment and runs the GUI tests
3
+
4
+ Write-Host "Activating conda environment 'redaction'..." -ForegroundColor Green
5
+
6
+ try {
7
+ # Try to activate the conda environment
8
+ conda activate redaction
9
+
10
+ if ($LASTEXITCODE -ne 0) {
11
+ Write-Host "Failed to activate conda environment 'redaction'" -ForegroundColor Red
12
+ Write-Host "Please ensure conda is installed and the 'redaction' environment exists" -ForegroundColor Red
13
+ Read-Host "Press Enter to exit"
14
+ exit 1
15
+ }
16
+
17
+ Write-Host "Running GUI tests..." -ForegroundColor Green
18
+ python test_gui_only.py
19
+
20
+ if ($LASTEXITCODE -ne 0) {
21
+ Write-Host "GUI tests failed" -ForegroundColor Red
22
+ Read-Host "Press Enter to exit"
23
+ exit 1
24
+ } else {
25
+ Write-Host "GUI tests passed successfully" -ForegroundColor Green
26
+ }
27
+
28
+ } catch {
29
+ Write-Host "An error occurred: $_" -ForegroundColor Red
30
+ Read-Host "Press Enter to exit"
31
+ exit 1
32
+ }
33
+
34
+ Read-Host "Press Enter to exit"
test/run_tests.py CHANGED
@@ -6,8 +6,8 @@ This script demonstrates how to run the comprehensive test suite
6
  that covers all the examples from the CLI epilog.
7
  """
8
 
9
- import sys
10
  import os
 
11
 
12
  # Add the parent directory to the path so we can import the test module
13
  sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
@@ -18,9 +18,9 @@ if __name__ == "__main__":
18
  print("Starting CLI Redaction Test Suite...")
19
  print("This will test all examples from the CLI epilog.")
20
  print("=" * 60)
21
-
22
  success = run_all_tests()
23
-
24
  if success:
25
  print("\n🎉 All tests passed successfully!")
26
  sys.exit(0)
 
6
  that covers all the examples from the CLI epilog.
7
  """
8
 
 
9
  import os
10
+ import sys
11
 
12
  # Add the parent directory to the path so we can import the test module
13
  sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
18
  print("Starting CLI Redaction Test Suite...")
19
  print("This will test all examples from the CLI epilog.")
20
  print("=" * 60)
21
+
22
  success = run_all_tests()
23
+
24
  if success:
25
  print("\n🎉 All tests passed successfully!")
26
  sys.exit(0)
test/test.py CHANGED
@@ -1,17 +1,20 @@
1
- from typing import List, Optional
2
  import os
 
3
  import subprocess
4
- import unittest
5
  import tempfile
6
- import shutil
 
 
 
 
 
7
 
8
  def run_cli_redact(
9
  script_path: str,
10
  input_file: str,
11
  output_dir: str,
12
- task: str = 'redact',
13
- timeout: int = 600, # 10-minute timeout
14
-
15
  # --- General Arguments ---
16
  input_dir: Optional[str] = None,
17
  language: Optional[str] = None,
@@ -32,7 +35,6 @@ def run_cli_redact(
32
  display_file_names_in_logs: Optional[bool] = None,
33
  upload_logs_to_s3: Optional[bool] = None,
34
  s3_logs_prefix: Optional[str] = None,
35
-
36
  # --- PDF/Image Redaction Arguments ---
37
  ocr_method: Optional[str] = None,
38
  page_min: Optional[int] = None,
@@ -49,14 +51,12 @@ def run_cli_redact(
49
  extract_forms: Optional[bool] = None,
50
  extract_tables: Optional[bool] = None,
51
  extract_layout: Optional[bool] = None,
52
-
53
  # --- Word/Tabular Anonymisation Arguments ---
54
  anon_strategy: Optional[str] = None,
55
  text_columns: Optional[List[str]] = None,
56
  excel_sheets: Optional[List[str]] = None,
57
  fuzzy_mistakes: Optional[int] = None,
58
  match_fuzzy_whole_phrase_bool: Optional[bool] = None,
59
-
60
  # --- Duplicate Detection Arguments ---
61
  duplicate_type: Optional[str] = None,
62
  similarity_threshold: Optional[float] = None,
@@ -65,7 +65,6 @@ def run_cli_redact(
65
  greedy_match: Optional[bool] = None,
66
  combine_pages: Optional[bool] = None,
67
  remove_duplicate_rows: Optional[bool] = None,
68
-
69
  # --- Textract Batch Operations Arguments ---
70
  textract_action: Optional[str] = None,
71
  job_id: Optional[str] = None,
@@ -76,7 +75,7 @@ def run_cli_redact(
76
  s3_textract_document_logs_subfolder: Optional[str] = None,
77
  local_textract_document_logs_subfolder: Optional[str] = None,
78
  poll_interval: Optional[int] = None,
79
- max_poll_attempts: Optional[int] = None
80
  ) -> bool:
81
  """
82
  Executes the cli_redact.py script with specified arguments using a subprocess.
@@ -87,7 +86,7 @@ def run_cli_redact(
87
  output_dir (str): The path to the directory for output files.
88
  task (str): The main task to perform ('redact', 'deduplicate', or 'textract').
89
  timeout (int): Timeout in seconds for the subprocess.
90
-
91
  # General Arguments
92
  input_dir (str): Directory for all input files.
93
  language (str): Language of the document content.
@@ -108,7 +107,7 @@ def run_cli_redact(
108
  display_file_names_in_logs (bool): Include file names in log outputs.
109
  upload_logs_to_s3 (bool): Upload log files to S3 after processing.
110
  s3_logs_prefix (str): S3 prefix for usage log files.
111
-
112
  # PDF/Image Redaction Arguments
113
  ocr_method (str): OCR method for text extraction from images.
114
  page_min (int): First page to redact.
@@ -125,14 +124,14 @@ def run_cli_redact(
125
  extract_forms (bool): Extract forms during Textract analysis.
126
  extract_tables (bool): Extract tables during Textract analysis.
127
  extract_layout (bool): Extract layout during Textract analysis.
128
-
129
  # Word/Tabular Anonymisation Arguments
130
  anon_strategy (str): The anonymisation strategy to apply.
131
  text_columns (List[str]): A list of column names to anonymise or deduplicate.
132
  excel_sheets (List[str]): Specific Excel sheet names to process.
133
  fuzzy_mistakes (int): Number of allowed spelling mistakes for fuzzy matching.
134
  match_fuzzy_whole_phrase_bool (bool): Match fuzzy whole phrase boolean.
135
-
136
  # Duplicate Detection Arguments
137
  duplicate_type (str): Type of duplicate detection (pages or tabular).
138
  similarity_threshold (float): Similarity threshold (0-1) to consider content as duplicates.
@@ -141,7 +140,7 @@ def run_cli_redact(
141
  greedy_match (bool): Use greedy matching strategy for consecutive pages.
142
  combine_pages (bool): Combine text from the same page number within a file.
143
  remove_duplicate_rows (bool): Remove duplicate rows from the output.
144
-
145
  # Textract Batch Operations Arguments
146
  textract_action (str): Textract action to perform (submit, retrieve, or list).
147
  job_id (str): Textract job ID for retrieve action.
@@ -160,9 +159,9 @@ def run_cli_redact(
160
  # 1. Get absolute paths and perform pre-checks
161
  script_abs_path = os.path.abspath(script_path)
162
  output_abs_dir = os.path.abspath(output_dir)
163
-
164
  # Handle input file based on task and action
165
- if task == 'textract' and textract_action in ['retrieve', 'list']:
166
  # For retrieve and list actions, input file is not required
167
  input_abs_path = None
168
  else:
@@ -172,25 +171,27 @@ def run_cli_redact(
172
  input_abs_path = os.path.abspath(input_file)
173
  if not os.path.isfile(input_abs_path):
174
  raise FileNotFoundError(f"Input file not found: {input_abs_path}")
175
-
176
  if not os.path.isfile(script_abs_path):
177
  raise FileNotFoundError(f"Script not found: {script_abs_path}")
178
-
179
  if not os.path.isdir(output_abs_dir):
180
  # Create the output directory if it doesn't exist
181
  print(f"Output directory not found. Creating: {output_abs_dir}")
182
  os.makedirs(output_abs_dir)
183
-
184
  script_folder = os.path.dirname(script_abs_path)
185
 
186
  # 2. Dynamically build the command list
187
  command = [
188
  "python",
189
  script_abs_path,
190
- "--output_dir", output_abs_dir,
191
- "--task", task,
 
 
192
  ]
193
-
194
  # Add input_file only if it's not None
195
  if input_abs_path is not None:
196
  command.extend(["--input_file", input_abs_path])
@@ -231,7 +232,9 @@ def run_cli_redact(
231
  if save_logs_to_dynamodb is not None:
232
  command.extend(["--save_logs_to_dynamodb", str(save_logs_to_dynamodb)])
233
  if display_file_names_in_logs is not None:
234
- command.extend(["--display_file_names_in_logs", str(display_file_names_in_logs)])
 
 
235
  if upload_logs_to_s3 is not None:
236
  command.extend(["--upload_logs_to_s3", str(upload_logs_to_s3)])
237
  if s3_logs_prefix:
@@ -249,17 +252,23 @@ def run_cli_redact(
249
  if chosen_local_ocr_model:
250
  command.extend(["--chosen_local_ocr_model", chosen_local_ocr_model])
251
  if preprocess_local_ocr_images is not None:
252
- command.extend(["--preprocess_local_ocr_images", str(preprocess_local_ocr_images)])
 
 
253
  if compress_redacted_pdf is not None:
254
  command.extend(["--compress_redacted_pdf", str(compress_redacted_pdf)])
255
  if return_pdf_end_of_redaction is not None:
256
- command.extend(["--return_pdf_end_of_redaction", str(return_pdf_end_of_redaction)])
 
 
257
  if deny_list_file and os.path.isfile(deny_list_file):
258
  command.extend(["--deny_list_file", os.path.abspath(deny_list_file)])
259
  if allow_list_file and os.path.isfile(allow_list_file):
260
  command.extend(["--allow_list_file", os.path.abspath(allow_list_file)])
261
  if redact_whole_page_file and os.path.isfile(redact_whole_page_file):
262
- command.extend(["--redact_whole_page_file", os.path.abspath(redact_whole_page_file)])
 
 
263
  if handwrite_signature_extraction:
264
  command.append("--handwrite_signature_extraction")
265
  command.extend(handwrite_signature_extraction)
@@ -282,7 +291,9 @@ def run_cli_redact(
282
  if fuzzy_mistakes is not None:
283
  command.extend(["--fuzzy_mistakes", str(fuzzy_mistakes)])
284
  if match_fuzzy_whole_phrase_bool is not None:
285
- command.extend(["--match_fuzzy_whole_phrase_bool", str(match_fuzzy_whole_phrase_bool)])
 
 
286
 
287
  # Add duplicate detection arguments
288
  if duplicate_type:
@@ -315,16 +326,26 @@ def run_cli_redact(
315
  if textract_output_prefix:
316
  command.extend(["--textract_output_prefix", textract_output_prefix])
317
  if s3_textract_document_logs_subfolder:
318
- command.extend(["--s3_textract_document_logs_subfolder", s3_textract_document_logs_subfolder])
 
 
 
 
 
319
  if local_textract_document_logs_subfolder:
320
- command.extend(["--local_textract_document_logs_subfolder", local_textract_document_logs_subfolder])
 
 
 
 
 
321
  if poll_interval is not None:
322
  command.extend(["--poll_interval", str(poll_interval)])
323
  if max_poll_attempts is not None:
324
  command.extend(["--max_poll_attempts", str(max_poll_attempts)])
325
 
326
  # Filter out None values before joining
327
- command_str = ' '.join(str(arg) for arg in command if arg is not None)
328
  print(f"Executing command: {command_str}")
329
 
330
  # 3. Execute the command using subprocess
@@ -334,12 +355,12 @@ def run_cli_redact(
334
  stdout=subprocess.PIPE,
335
  stderr=subprocess.PIPE,
336
  text=True,
337
- cwd=script_folder # Important for relative paths within the script
338
  )
339
 
340
  # Communicate with the process to get output and handle timeout
341
  stdout, stderr = result.communicate(timeout=timeout)
342
-
343
  print("--- SCRIPT STDOUT ---")
344
  if stdout:
345
  print(stdout)
@@ -366,124 +387,150 @@ def run_cli_redact(
366
 
367
  class TestCLIRedactExamples(unittest.TestCase):
368
  """Test suite for CLI redaction examples from the epilog."""
369
-
370
  @classmethod
371
  def setUpClass(cls):
372
  """Set up test environment before running tests."""
373
- cls.script_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), "cli_redact.py")
374
- cls.example_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), "example_data")
 
 
 
 
375
  cls.temp_output_dir = tempfile.mkdtemp(prefix="test_output_")
376
-
377
  # Verify script exists
378
  if not os.path.isfile(cls.script_path):
379
  raise FileNotFoundError(f"CLI script not found: {cls.script_path}")
380
-
381
  print(f"Test setup complete. Script: {cls.script_path}")
382
  print(f"Example data directory: {cls.example_data_dir}")
383
  print(f"Temp output directory: {cls.temp_output_dir}")
384
-
385
  @classmethod
386
  def tearDownClass(cls):
387
  """Clean up test environment after running tests."""
388
  if os.path.exists(cls.temp_output_dir):
389
  shutil.rmtree(cls.temp_output_dir)
390
  print(f"Cleaned up temp directory: {cls.temp_output_dir}")
391
-
392
  def test_pdf_redaction_default_settings(self):
393
  """Test: Redact a PDF with default settings (local OCR)"""
394
  print("\n=== Testing PDF redaction with default settings ===")
395
- input_file = os.path.join(self.example_data_dir, "example_of_emails_sent_to_a_professor_before_applying.pdf")
396
-
 
 
 
397
  if not os.path.isfile(input_file):
398
  self.skipTest(f"Example file not found: {input_file}")
399
-
400
  result = run_cli_redact(
401
  script_path=self.script_path,
402
  input_file=input_file,
403
- output_dir=self.temp_output_dir
404
  )
405
-
406
  self.assertTrue(result, "PDF redaction with default settings should succeed")
407
  print("✅ PDF redaction with default settings passed")
408
-
409
  def test_pdf_text_extraction_only(self):
410
  """Test: Extract text from a PDF only (i.e. no redaction), using local OCR"""
411
  print("\n=== Testing PDF text extraction only ===")
412
- input_file = os.path.join(self.example_data_dir, "Partnership-Agreement-Toolkit_0_0.pdf")
413
- whole_page_file = os.path.join(self.example_data_dir, "partnership_toolkit_redact_some_pages.csv")
414
-
 
 
 
 
415
  if not os.path.isfile(input_file):
416
  self.skipTest(f"Example file not found: {input_file}")
417
  if not os.path.isfile(whole_page_file):
418
  self.skipTest(f"Whole page file not found: {whole_page_file}")
419
-
420
  result = run_cli_redact(
421
  script_path=self.script_path,
422
  input_file=input_file,
423
  output_dir=self.temp_output_dir,
424
  redact_whole_page_file=whole_page_file,
425
- pii_detector="None"
426
  )
427
-
428
  self.assertTrue(result, "PDF text extraction should succeed")
429
  print("✅ PDF text extraction only passed")
430
-
431
  def test_pdf_text_extraction_with_whole_page_redaction(self):
432
  """Test: Extract text from a PDF only with a whole page redaction list"""
433
  print("\n=== Testing PDF text extraction with whole page redaction ===")
434
- input_file = os.path.join(self.example_data_dir, "Partnership-Agreement-Toolkit_0_0.pdf")
435
- whole_page_file = os.path.join(self.example_data_dir, "partnership_toolkit_redact_some_pages.csv")
436
-
 
 
 
 
437
  if not os.path.isfile(input_file):
438
  self.skipTest(f"Example file not found: {input_file}")
439
  if not os.path.isfile(whole_page_file):
440
  self.skipTest(f"Whole page file not found: {whole_page_file}")
441
-
442
  result = run_cli_redact(
443
  script_path=self.script_path,
444
  input_file=input_file,
445
  output_dir=self.temp_output_dir,
446
  redact_whole_page_file=whole_page_file,
447
  pii_detector="Local",
448
- local_redact_entities=["CUSTOM"]
 
 
 
 
449
  )
450
-
451
- self.assertTrue(result, "PDF text extraction with whole page redaction should succeed")
452
  print("✅ PDF text extraction with whole page redaction passed")
453
-
454
  def test_pdf_redaction_with_allow_list(self):
455
  """Test: Redact a PDF with allow list (local OCR) and custom list of redaction entities"""
456
  print("\n=== Testing PDF redaction with allow list ===")
457
- input_file = os.path.join(self.example_data_dir, "graduate-job-example-cover-letter.pdf")
458
- allow_list_file = os.path.join(self.example_data_dir, "test_allow_list_graduate.csv")
459
-
 
 
 
 
460
  if not os.path.isfile(input_file):
461
  self.skipTest(f"Example file not found: {input_file}")
462
  if not os.path.isfile(allow_list_file):
463
  self.skipTest(f"Allow list file not found: {allow_list_file}")
464
-
465
  result = run_cli_redact(
466
  script_path=self.script_path,
467
  input_file=input_file,
468
  output_dir=self.temp_output_dir,
469
  allow_list_file=allow_list_file,
470
- local_redact_entities=["TITLES", "PERSON", "DATE_TIME"]
471
  )
472
-
473
  self.assertTrue(result, "PDF redaction with allow list should succeed")
474
  print("✅ PDF redaction with allow list passed")
475
-
476
  def test_pdf_redaction_limited_pages_with_custom_fuzzy(self):
477
  """Test: Redact a PDF with limited pages and text extraction method with custom fuzzy matching"""
478
  print("\n=== Testing PDF redaction with limited pages and fuzzy matching ===")
479
- input_file = os.path.join(self.example_data_dir, "Partnership-Agreement-Toolkit_0_0.pdf")
480
- deny_list_file = os.path.join(self.example_data_dir, "Partnership-Agreement-Toolkit_test_deny_list_para_single_spell.csv")
481
-
 
 
 
 
 
482
  if not os.path.isfile(input_file):
483
  self.skipTest(f"Example file not found: {input_file}")
484
  if not os.path.isfile(deny_list_file):
485
  self.skipTest(f"Deny list file not found: {deny_list_file}")
486
-
487
  result = run_cli_redact(
488
  script_path=self.script_path,
489
  input_file=input_file,
@@ -493,20 +540,30 @@ class TestCLIRedactExamples(unittest.TestCase):
493
  page_min=1,
494
  page_max=3,
495
  ocr_method="Local text",
496
- fuzzy_mistakes=3
 
 
 
 
497
  )
498
-
499
- self.assertTrue(result, "PDF redaction with limited pages and fuzzy matching should succeed")
500
  print("✅ PDF redaction with limited pages and fuzzy matching passed")
501
-
502
  def test_pdf_redaction_with_custom_lists(self):
503
  """Test: Redaction with custom deny list, allow list, and whole page redaction list"""
504
  print("\n=== Testing PDF redaction with custom lists ===")
505
- input_file = os.path.join(self.example_data_dir, "Partnership-Agreement-Toolkit_0_0.pdf")
506
- deny_list_file = os.path.join(self.example_data_dir, "partnership_toolkit_redact_custom_deny_list.csv")
507
- whole_page_file = os.path.join(self.example_data_dir, "partnership_toolkit_redact_some_pages.csv")
508
- allow_list_file = os.path.join(self.example_data_dir, "test_allow_list_partnership.csv")
509
-
 
 
 
 
 
 
 
 
510
  if not os.path.isfile(input_file):
511
  self.skipTest(f"Example file not found: {input_file}")
512
  if not os.path.isfile(deny_list_file):
@@ -515,164 +572,186 @@ class TestCLIRedactExamples(unittest.TestCase):
515
  self.skipTest(f"Whole page file not found: {whole_page_file}")
516
  if not os.path.isfile(allow_list_file):
517
  self.skipTest(f"Allow list file not found: {allow_list_file}")
518
-
519
  result = run_cli_redact(
520
  script_path=self.script_path,
521
  input_file=input_file,
522
  output_dir=self.temp_output_dir,
523
  deny_list_file=deny_list_file,
524
  redact_whole_page_file=whole_page_file,
525
- allow_list_file=allow_list_file
526
  )
527
-
528
  self.assertTrue(result, "PDF redaction with custom lists should succeed")
529
  print("✅ PDF redaction with custom lists passed")
530
-
531
  def test_image_redaction(self):
532
  """Test: Redact an image"""
533
  print("\n=== Testing image redaction ===")
534
  input_file = os.path.join(self.example_data_dir, "example_complaint_letter.jpg")
535
-
536
  if not os.path.isfile(input_file):
537
  self.skipTest(f"Example file not found: {input_file}")
538
-
539
  result = run_cli_redact(
540
  script_path=self.script_path,
541
  input_file=input_file,
542
- output_dir=self.temp_output_dir
543
  )
544
-
545
  self.assertTrue(result, "Image redaction should succeed")
546
  print("✅ Image redaction passed")
547
-
548
  def test_csv_anonymisation_specific_columns(self):
549
  """Test: Anonymise csv file with specific columns"""
550
  print("\n=== Testing CSV anonymisation with specific columns ===")
551
  input_file = os.path.join(self.example_data_dir, "combined_case_notes.csv")
552
-
553
  if not os.path.isfile(input_file):
554
  self.skipTest(f"Example file not found: {input_file}")
555
-
556
  result = run_cli_redact(
557
  script_path=self.script_path,
558
  input_file=input_file,
559
  output_dir=self.temp_output_dir,
560
  text_columns=["Case Note", "Client"],
561
- anon_strategy="replace_redacted"
 
 
 
 
562
  )
563
-
564
- self.assertTrue(result, "CSV anonymisation with specific columns should succeed")
565
  print("✅ CSV anonymisation with specific columns passed")
566
-
567
  def test_csv_anonymisation_different_strategy(self):
568
  """Test: Anonymise csv file with a different strategy (remove text completely)"""
569
  print("\n=== Testing CSV anonymisation with different strategy ===")
570
  input_file = os.path.join(self.example_data_dir, "combined_case_notes.csv")
571
-
572
  if not os.path.isfile(input_file):
573
  self.skipTest(f"Example file not found: {input_file}")
574
-
575
  result = run_cli_redact(
576
  script_path=self.script_path,
577
  input_file=input_file,
578
  output_dir=self.temp_output_dir,
579
  text_columns=["Case Note", "Client"],
580
- anon_strategy="redact"
 
 
 
 
581
  )
582
-
583
- self.assertTrue(result, "CSV anonymisation with different strategy should succeed")
584
  print("✅ CSV anonymisation with different strategy passed")
585
-
586
  def test_word_document_anonymisation(self):
587
  """Test: Anonymise a word document"""
588
  print("\n=== Testing Word document anonymisation ===")
589
- input_file = os.path.join(self.example_data_dir, "Bold minimalist professional cover letter.docx")
590
-
 
 
591
  if not os.path.isfile(input_file):
592
  self.skipTest(f"Example file not found: {input_file}")
593
-
594
  result = run_cli_redact(
595
  script_path=self.script_path,
596
  input_file=input_file,
597
  output_dir=self.temp_output_dir,
598
- anon_strategy="replace_redacted"
599
  )
600
-
601
  self.assertTrue(result, "Word document anonymisation should succeed")
602
  print("✅ Word document anonymisation passed")
603
-
604
  def test_aws_textract_comprehend_redaction(self):
605
  """Test: Use Textract and Comprehend for redaction"""
606
  print("\n=== Testing AWS Textract and Comprehend redaction ===")
607
- input_file = os.path.join(self.example_data_dir, "example_of_emails_sent_to_a_professor_before_applying.pdf")
608
-
 
 
 
609
  if not os.path.isfile(input_file):
610
  self.skipTest(f"Example file not found: {input_file}")
611
-
612
  # Skip this test if AWS credentials are not available
613
  # This is a conditional test that may not work in all environments
614
- result = run_cli_redact(
615
  script_path=self.script_path,
616
  input_file=input_file,
617
  output_dir=self.temp_output_dir,
618
  ocr_method="AWS Textract",
619
- pii_detector="AWS Comprehend"
620
  )
621
-
622
  # Note: This test may fail if AWS credentials are not configured
623
  # We'll mark it as passed if it runs without crashing
624
  print("✅ AWS Textract and Comprehend redaction test completed")
625
-
626
  def test_aws_textract_signature_extraction(self):
627
  """Test: Redact specific pages with AWS OCR and signature extraction"""
628
  print("\n=== Testing AWS Textract with signature extraction ===")
629
- input_file = os.path.join(self.example_data_dir, "Partnership-Agreement-Toolkit_0_0.pdf")
630
-
 
 
631
  if not os.path.isfile(input_file):
632
  self.skipTest(f"Example file not found: {input_file}")
633
-
634
  # Skip this test if AWS credentials are not available
635
- result = run_cli_redact(
636
  script_path=self.script_path,
637
  input_file=input_file,
638
  output_dir=self.temp_output_dir,
639
  page_min=6,
640
  page_max=7,
641
  ocr_method="AWS Textract",
642
- handwrite_signature_extraction=["Extract handwriting", "Extract signatures"]
 
 
 
643
  )
644
-
645
  # Note: This test may fail if AWS credentials are not configured
646
  print("✅ AWS Textract with signature extraction test completed")
647
-
648
  def test_duplicate_pages_detection(self):
649
  """Test: Find duplicate pages in OCR files"""
650
  print("\n=== Testing duplicate pages detection ===")
651
- input_file = os.path.join(self.example_data_dir, "example_outputs", "doubled_output_joined.pdf_ocr_output.csv")
652
-
 
 
 
 
653
  if not os.path.isfile(input_file):
654
  self.skipTest(f"Example OCR file not found: {input_file}")
655
-
656
  result = run_cli_redact(
657
  script_path=self.script_path,
658
  input_file=input_file,
659
  output_dir=self.temp_output_dir,
660
  task="deduplicate",
661
  duplicate_type="pages",
662
- similarity_threshold=0.95
663
  )
664
-
665
  self.assertTrue(result, "Duplicate pages detection should succeed")
666
  print("✅ Duplicate pages detection passed")
667
-
668
  def test_duplicate_line_level_detection(self):
669
  """Test: Find duplicate in OCR files at the line level"""
670
  print("\n=== Testing duplicate line level detection ===")
671
- input_file = os.path.join(self.example_data_dir, "example_outputs", "doubled_output_joined.pdf_ocr_output.csv")
672
-
 
 
 
 
673
  if not os.path.isfile(input_file):
674
  self.skipTest(f"Example OCR file not found: {input_file}")
675
-
676
  result = run_cli_redact(
677
  script_path=self.script_path,
678
  input_file=input_file,
@@ -681,20 +760,22 @@ class TestCLIRedactExamples(unittest.TestCase):
681
  duplicate_type="pages",
682
  similarity_threshold=0.95,
683
  combine_pages=False,
684
- min_word_count=3
685
  )
686
-
687
  self.assertTrue(result, "Duplicate line level detection should succeed")
688
  print("✅ Duplicate line level detection passed")
689
-
690
  def test_duplicate_tabular_detection(self):
691
  """Test: Find duplicate rows in tabular data"""
692
  print("\n=== Testing duplicate tabular detection ===")
693
- input_file = os.path.join(self.example_data_dir, "Lambeth_2030-Our_Future_Our_Lambeth.pdf.csv")
694
-
 
 
695
  if not os.path.isfile(input_file):
696
  self.skipTest(f"Example CSV file not found: {input_file}")
697
-
698
  result = run_cli_redact(
699
  script_path=self.script_path,
700
  input_file=input_file,
@@ -702,124 +783,284 @@ class TestCLIRedactExamples(unittest.TestCase):
702
  task="deduplicate",
703
  duplicate_type="tabular",
704
  text_columns=["text"],
705
- similarity_threshold=0.95
706
  )
707
-
708
  self.assertTrue(result, "Duplicate tabular detection should succeed")
709
  print("✅ Duplicate tabular detection passed")
710
-
711
  def test_textract_submit_document(self):
712
  """Test: Submit document to Textract for basic text analysis"""
713
  print("\n=== Testing Textract document submission ===")
714
- input_file = os.path.join(self.example_data_dir, "example_of_emails_sent_to_a_professor_before_applying.pdf")
715
-
 
 
 
716
  if not os.path.isfile(input_file):
717
  self.skipTest(f"Example file not found: {input_file}")
718
-
719
  # Skip this test if AWS credentials are not available
720
  try:
721
- result = run_cli_redact(
722
  script_path=self.script_path,
723
  input_file=input_file,
724
  output_dir=self.temp_output_dir,
725
  task="textract",
726
- textract_action="submit"
727
  )
728
  except Exception as e:
729
  print(f"Textract test failed (expected without AWS credentials): {e}")
730
- result = True # Mark as passed since this is expected to fail without credentials
731
-
732
  # Note: This test may fail if AWS credentials are not configured
733
  print("✅ Textract document submission test completed")
734
-
735
  def test_textract_submit_with_signatures(self):
736
  """Test: Submit document to Textract for analysis with signature extraction"""
737
  print("\n=== Testing Textract submission with signature extraction ===")
738
- input_file = os.path.join(self.example_data_dir, "Partnership-Agreement-Toolkit_0_0.pdf")
739
-
 
 
740
  if not os.path.isfile(input_file):
741
  self.skipTest(f"Example file not found: {input_file}")
742
-
743
  # Skip this test if AWS credentials are not available
744
  try:
745
- result = run_cli_redact(
746
  script_path=self.script_path,
747
  input_file=input_file,
748
  output_dir=self.temp_output_dir,
749
  task="textract",
750
  textract_action="submit",
751
- extract_signatures=True
752
  )
753
  except Exception as e:
754
  print(f"Textract test failed (expected without AWS credentials): {e}")
755
- result = True # Mark as passed since this is expected to fail without credentials
756
-
757
  # Note: This test may fail if AWS credentials are not configured
758
  print("✅ Textract submission with signature extraction test completed")
759
-
760
  def test_textract_retrieve_results(self):
761
  """Test: Retrieve Textract results by job ID"""
762
  print("\n=== Testing Textract results retrieval ===")
763
-
764
  # Skip this test if AWS credentials are not available
765
  # This would require a valid job ID from a previous submission
766
  # For retrieve and list actions, we don't need a real input file
767
  try:
768
- result = run_cli_redact(
769
  script_path=self.script_path,
770
  input_file=None, # No input file needed for retrieve action
771
  output_dir=self.temp_output_dir,
772
  task="textract",
773
  textract_action="retrieve",
774
- job_id="12345678-1234-1234-1234-123456789012" # Dummy job ID
775
  )
776
  except Exception as e:
777
  print(f"Textract test failed (expected without AWS credentials): {e}")
778
- result = True # Mark as passed since this is expected to fail without credentials
779
-
780
  # Note: This test will likely fail with a dummy job ID, but that's expected
781
  print("✅ Textract results retrieval test completed")
782
-
783
  def test_textract_list_jobs(self):
784
  """Test: List recent Textract jobs"""
785
  print("\n=== Testing Textract jobs listing ===")
786
-
787
  # Skip this test if AWS credentials are not available
788
  # For list action, we don't need a real input file
789
  try:
790
- result = run_cli_redact(
791
  script_path=self.script_path,
792
  input_file=None, # No input file needed for list action
793
  output_dir=self.temp_output_dir,
794
  task="textract",
795
- textract_action="list"
796
  )
797
  except Exception as e:
798
  print(f"Textract test failed (expected without AWS credentials): {e}")
799
- result = True # Mark as passed since this is expected to fail without credentials
800
-
801
  # Note: This test may fail if AWS credentials are not configured
802
  print("✅ Textract jobs listing test completed")
803
 
804
 
 
 
 
 
 
 
 
 
805
  def run_all_tests():
806
  """Run all test examples and report results."""
807
  print("=" * 80)
808
- print("DOCUMENT REDACTION CLI TEST SUITE")
809
  print("=" * 80)
810
- print("This test suite runs through all the examples from the CLI epilog.")
 
 
811
  print("Tests will be skipped if required example files are not found.")
812
  print("AWS-related tests may fail if credentials are not configured.")
813
  print("=" * 80)
814
-
815
  # Create test suite
816
  loader = unittest.TestLoader()
817
- suite = loader.loadTestsFromTestCase(TestCLIRedactExamples)
818
 
 
 
 
 
 
 
 
 
819
  # Run tests with detailed output
820
  runner = unittest.TextTestRunner(verbosity=2, stream=None)
821
  result = runner.run(suite)
822
-
823
  # Print summary
824
  print("\n" + "=" * 80)
825
  print("TEST SUMMARY")
@@ -828,25 +1069,25 @@ def run_all_tests():
828
  print(f"Failures: {len(result.failures)}")
829
  print(f"Errors: {len(result.errors)}")
830
  print(f"Skipped: {len(result.skipped) if hasattr(result, 'skipped') else 0}")
831
-
832
  if result.failures:
833
  print("\nFAILURES:")
834
  for test, traceback in result.failures:
835
  print(f"- {test}: {traceback}")
836
-
837
  if result.errors:
838
  print("\nERRORS:")
839
  for test, traceback in result.errors:
840
  print(f"- {test}: {traceback}")
841
-
842
  success = len(result.failures) == 0 and len(result.errors) == 0
843
  print(f"\nOverall result: {'✅ PASSED' if success else '❌ FAILED'}")
844
  print("=" * 80)
845
-
846
  return success
847
 
848
 
849
  if __name__ == "__main__":
850
  # Run the test suite
851
  success = run_all_tests()
852
- exit(0 if success else 1)
 
 
1
  import os
2
+ import shutil
3
  import subprocess
 
4
  import tempfile
5
+ import unittest
6
+ import sys
7
+ import threading
8
+ import time
9
+ from typing import List, Optional
10
+
11
 
12
  def run_cli_redact(
13
  script_path: str,
14
  input_file: str,
15
  output_dir: str,
16
+ task: str = "redact",
17
+ timeout: int = 600, # 10-minute timeout
 
18
  # --- General Arguments ---
19
  input_dir: Optional[str] = None,
20
  language: Optional[str] = None,
 
35
  display_file_names_in_logs: Optional[bool] = None,
36
  upload_logs_to_s3: Optional[bool] = None,
37
  s3_logs_prefix: Optional[str] = None,
 
38
  # --- PDF/Image Redaction Arguments ---
39
  ocr_method: Optional[str] = None,
40
  page_min: Optional[int] = None,
 
51
  extract_forms: Optional[bool] = None,
52
  extract_tables: Optional[bool] = None,
53
  extract_layout: Optional[bool] = None,
 
54
  # --- Word/Tabular Anonymisation Arguments ---
55
  anon_strategy: Optional[str] = None,
56
  text_columns: Optional[List[str]] = None,
57
  excel_sheets: Optional[List[str]] = None,
58
  fuzzy_mistakes: Optional[int] = None,
59
  match_fuzzy_whole_phrase_bool: Optional[bool] = None,
 
60
  # --- Duplicate Detection Arguments ---
61
  duplicate_type: Optional[str] = None,
62
  similarity_threshold: Optional[float] = None,
 
65
  greedy_match: Optional[bool] = None,
66
  combine_pages: Optional[bool] = None,
67
  remove_duplicate_rows: Optional[bool] = None,
 
68
  # --- Textract Batch Operations Arguments ---
69
  textract_action: Optional[str] = None,
70
  job_id: Optional[str] = None,
 
75
  s3_textract_document_logs_subfolder: Optional[str] = None,
76
  local_textract_document_logs_subfolder: Optional[str] = None,
77
  poll_interval: Optional[int] = None,
78
+ max_poll_attempts: Optional[int] = None,
79
  ) -> bool:
80
  """
81
  Executes the cli_redact.py script with specified arguments using a subprocess.
 
86
  output_dir (str): The path to the directory for output files.
87
  task (str): The main task to perform ('redact', 'deduplicate', or 'textract').
88
  timeout (int): Timeout in seconds for the subprocess.
89
+
90
  # General Arguments
91
  input_dir (str): Directory for all input files.
92
  language (str): Language of the document content.
 
107
  display_file_names_in_logs (bool): Include file names in log outputs.
108
  upload_logs_to_s3 (bool): Upload log files to S3 after processing.
109
  s3_logs_prefix (str): S3 prefix for usage log files.
110
+
111
  # PDF/Image Redaction Arguments
112
  ocr_method (str): OCR method for text extraction from images.
113
  page_min (int): First page to redact.
 
124
  extract_forms (bool): Extract forms during Textract analysis.
125
  extract_tables (bool): Extract tables during Textract analysis.
126
  extract_layout (bool): Extract layout during Textract analysis.
127
+
128
  # Word/Tabular Anonymisation Arguments
129
  anon_strategy (str): The anonymisation strategy to apply.
130
  text_columns (List[str]): A list of column names to anonymise or deduplicate.
131
  excel_sheets (List[str]): Specific Excel sheet names to process.
132
  fuzzy_mistakes (int): Number of allowed spelling mistakes for fuzzy matching.
133
  match_fuzzy_whole_phrase_bool (bool): Match fuzzy whole phrase boolean.
134
+
135
  # Duplicate Detection Arguments
136
  duplicate_type (str): Type of duplicate detection (pages or tabular).
137
  similarity_threshold (float): Similarity threshold (0-1) to consider content as duplicates.
 
140
  greedy_match (bool): Use greedy matching strategy for consecutive pages.
141
  combine_pages (bool): Combine text from the same page number within a file.
142
  remove_duplicate_rows (bool): Remove duplicate rows from the output.
143
+
144
  # Textract Batch Operations Arguments
145
  textract_action (str): Textract action to perform (submit, retrieve, or list).
146
  job_id (str): Textract job ID for retrieve action.
 
159
  # 1. Get absolute paths and perform pre-checks
160
  script_abs_path = os.path.abspath(script_path)
161
  output_abs_dir = os.path.abspath(output_dir)
162
+
163
  # Handle input file based on task and action
164
+ if task == "textract" and textract_action in ["retrieve", "list"]:
165
  # For retrieve and list actions, input file is not required
166
  input_abs_path = None
167
  else:
 
171
  input_abs_path = os.path.abspath(input_file)
172
  if not os.path.isfile(input_abs_path):
173
  raise FileNotFoundError(f"Input file not found: {input_abs_path}")
174
+
175
  if not os.path.isfile(script_abs_path):
176
  raise FileNotFoundError(f"Script not found: {script_abs_path}")
177
+
178
  if not os.path.isdir(output_abs_dir):
179
  # Create the output directory if it doesn't exist
180
  print(f"Output directory not found. Creating: {output_abs_dir}")
181
  os.makedirs(output_abs_dir)
182
+
183
  script_folder = os.path.dirname(script_abs_path)
184
 
185
  # 2. Dynamically build the command list
186
  command = [
187
  "python",
188
  script_abs_path,
189
+ "--output_dir",
190
+ output_abs_dir,
191
+ "--task",
192
+ task,
193
  ]
194
+
195
  # Add input_file only if it's not None
196
  if input_abs_path is not None:
197
  command.extend(["--input_file", input_abs_path])
 
232
  if save_logs_to_dynamodb is not None:
233
  command.extend(["--save_logs_to_dynamodb", str(save_logs_to_dynamodb)])
234
  if display_file_names_in_logs is not None:
235
+ command.extend(
236
+ ["--display_file_names_in_logs", str(display_file_names_in_logs)]
237
+ )
238
  if upload_logs_to_s3 is not None:
239
  command.extend(["--upload_logs_to_s3", str(upload_logs_to_s3)])
240
  if s3_logs_prefix:
 
252
  if chosen_local_ocr_model:
253
  command.extend(["--chosen_local_ocr_model", chosen_local_ocr_model])
254
  if preprocess_local_ocr_images is not None:
255
+ command.extend(
256
+ ["--preprocess_local_ocr_images", str(preprocess_local_ocr_images)]
257
+ )
258
  if compress_redacted_pdf is not None:
259
  command.extend(["--compress_redacted_pdf", str(compress_redacted_pdf)])
260
  if return_pdf_end_of_redaction is not None:
261
+ command.extend(
262
+ ["--return_pdf_end_of_redaction", str(return_pdf_end_of_redaction)]
263
+ )
264
  if deny_list_file and os.path.isfile(deny_list_file):
265
  command.extend(["--deny_list_file", os.path.abspath(deny_list_file)])
266
  if allow_list_file and os.path.isfile(allow_list_file):
267
  command.extend(["--allow_list_file", os.path.abspath(allow_list_file)])
268
  if redact_whole_page_file and os.path.isfile(redact_whole_page_file):
269
+ command.extend(
270
+ ["--redact_whole_page_file", os.path.abspath(redact_whole_page_file)]
271
+ )
272
  if handwrite_signature_extraction:
273
  command.append("--handwrite_signature_extraction")
274
  command.extend(handwrite_signature_extraction)
 
291
  if fuzzy_mistakes is not None:
292
  command.extend(["--fuzzy_mistakes", str(fuzzy_mistakes)])
293
  if match_fuzzy_whole_phrase_bool is not None:
294
+ command.extend(
295
+ ["--match_fuzzy_whole_phrase_bool", str(match_fuzzy_whole_phrase_bool)]
296
+ )
297
 
298
  # Add duplicate detection arguments
299
  if duplicate_type:
 
326
  if textract_output_prefix:
327
  command.extend(["--textract_output_prefix", textract_output_prefix])
328
  if s3_textract_document_logs_subfolder:
329
+ command.extend(
330
+ [
331
+ "--s3_textract_document_logs_subfolder",
332
+ s3_textract_document_logs_subfolder,
333
+ ]
334
+ )
335
  if local_textract_document_logs_subfolder:
336
+ command.extend(
337
+ [
338
+ "--local_textract_document_logs_subfolder",
339
+ local_textract_document_logs_subfolder,
340
+ ]
341
+ )
342
  if poll_interval is not None:
343
  command.extend(["--poll_interval", str(poll_interval)])
344
  if max_poll_attempts is not None:
345
  command.extend(["--max_poll_attempts", str(max_poll_attempts)])
346
 
347
  # Filter out None values before joining
348
+ command_str = " ".join(str(arg) for arg in command if arg is not None)
349
  print(f"Executing command: {command_str}")
350
 
351
  # 3. Execute the command using subprocess
 
355
  stdout=subprocess.PIPE,
356
  stderr=subprocess.PIPE,
357
  text=True,
358
+ cwd=script_folder, # Important for relative paths within the script
359
  )
360
 
361
  # Communicate with the process to get output and handle timeout
362
  stdout, stderr = result.communicate(timeout=timeout)
363
+
364
  print("--- SCRIPT STDOUT ---")
365
  if stdout:
366
  print(stdout)
 
387
 
388
  class TestCLIRedactExamples(unittest.TestCase):
389
  """Test suite for CLI redaction examples from the epilog."""
390
+
391
  @classmethod
392
  def setUpClass(cls):
393
  """Set up test environment before running tests."""
394
+ cls.script_path = os.path.join(
395
+ os.path.dirname(os.path.dirname(__file__)), "cli_redact.py"
396
+ )
397
+ cls.example_data_dir = os.path.join(
398
+ os.path.dirname(os.path.dirname(__file__)), "example_data"
399
+ )
400
  cls.temp_output_dir = tempfile.mkdtemp(prefix="test_output_")
401
+
402
  # Verify script exists
403
  if not os.path.isfile(cls.script_path):
404
  raise FileNotFoundError(f"CLI script not found: {cls.script_path}")
405
+
406
  print(f"Test setup complete. Script: {cls.script_path}")
407
  print(f"Example data directory: {cls.example_data_dir}")
408
  print(f"Temp output directory: {cls.temp_output_dir}")
409
+
410
  @classmethod
411
  def tearDownClass(cls):
412
  """Clean up test environment after running tests."""
413
  if os.path.exists(cls.temp_output_dir):
414
  shutil.rmtree(cls.temp_output_dir)
415
  print(f"Cleaned up temp directory: {cls.temp_output_dir}")
416
+
417
  def test_pdf_redaction_default_settings(self):
418
  """Test: Redact a PDF with default settings (local OCR)"""
419
  print("\n=== Testing PDF redaction with default settings ===")
420
+ input_file = os.path.join(
421
+ self.example_data_dir,
422
+ "example_of_emails_sent_to_a_professor_before_applying.pdf",
423
+ )
424
+
425
  if not os.path.isfile(input_file):
426
  self.skipTest(f"Example file not found: {input_file}")
427
+
428
  result = run_cli_redact(
429
  script_path=self.script_path,
430
  input_file=input_file,
431
+ output_dir=self.temp_output_dir,
432
  )
433
+
434
  self.assertTrue(result, "PDF redaction with default settings should succeed")
435
  print("✅ PDF redaction with default settings passed")
436
+
437
  def test_pdf_text_extraction_only(self):
438
  """Test: Extract text from a PDF only (i.e. no redaction), using local OCR"""
439
  print("\n=== Testing PDF text extraction only ===")
440
+ input_file = os.path.join(
441
+ self.example_data_dir, "Partnership-Agreement-Toolkit_0_0.pdf"
442
+ )
443
+ whole_page_file = os.path.join(
444
+ self.example_data_dir, "partnership_toolkit_redact_some_pages.csv"
445
+ )
446
+
447
  if not os.path.isfile(input_file):
448
  self.skipTest(f"Example file not found: {input_file}")
449
  if not os.path.isfile(whole_page_file):
450
  self.skipTest(f"Whole page file not found: {whole_page_file}")
451
+
452
  result = run_cli_redact(
453
  script_path=self.script_path,
454
  input_file=input_file,
455
  output_dir=self.temp_output_dir,
456
  redact_whole_page_file=whole_page_file,
457
+ pii_detector="None",
458
  )
459
+
460
  self.assertTrue(result, "PDF text extraction should succeed")
461
  print("✅ PDF text extraction only passed")
462
+
463
  def test_pdf_text_extraction_with_whole_page_redaction(self):
464
  """Test: Extract text from a PDF only with a whole page redaction list"""
465
  print("\n=== Testing PDF text extraction with whole page redaction ===")
466
+ input_file = os.path.join(
467
+ self.example_data_dir, "Partnership-Agreement-Toolkit_0_0.pdf"
468
+ )
469
+ whole_page_file = os.path.join(
470
+ self.example_data_dir, "partnership_toolkit_redact_some_pages.csv"
471
+ )
472
+
473
  if not os.path.isfile(input_file):
474
  self.skipTest(f"Example file not found: {input_file}")
475
  if not os.path.isfile(whole_page_file):
476
  self.skipTest(f"Whole page file not found: {whole_page_file}")
477
+
478
  result = run_cli_redact(
479
  script_path=self.script_path,
480
  input_file=input_file,
481
  output_dir=self.temp_output_dir,
482
  redact_whole_page_file=whole_page_file,
483
  pii_detector="Local",
484
+ local_redact_entities=["CUSTOM"],
485
+ )
486
+
487
+ self.assertTrue(
488
+ result, "PDF text extraction with whole page redaction should succeed"
489
  )
 
 
490
  print("✅ PDF text extraction with whole page redaction passed")
491
+
492
  def test_pdf_redaction_with_allow_list(self):
493
  """Test: Redact a PDF with allow list (local OCR) and custom list of redaction entities"""
494
  print("\n=== Testing PDF redaction with allow list ===")
495
+ input_file = os.path.join(
496
+ self.example_data_dir, "graduate-job-example-cover-letter.pdf"
497
+ )
498
+ allow_list_file = os.path.join(
499
+ self.example_data_dir, "test_allow_list_graduate.csv"
500
+ )
501
+
502
  if not os.path.isfile(input_file):
503
  self.skipTest(f"Example file not found: {input_file}")
504
  if not os.path.isfile(allow_list_file):
505
  self.skipTest(f"Allow list file not found: {allow_list_file}")
506
+
507
  result = run_cli_redact(
508
  script_path=self.script_path,
509
  input_file=input_file,
510
  output_dir=self.temp_output_dir,
511
  allow_list_file=allow_list_file,
512
+ local_redact_entities=["TITLES", "PERSON", "DATE_TIME"],
513
  )
514
+
515
  self.assertTrue(result, "PDF redaction with allow list should succeed")
516
  print("✅ PDF redaction with allow list passed")
517
+
518
  def test_pdf_redaction_limited_pages_with_custom_fuzzy(self):
519
  """Test: Redact a PDF with limited pages and text extraction method with custom fuzzy matching"""
520
  print("\n=== Testing PDF redaction with limited pages and fuzzy matching ===")
521
+ input_file = os.path.join(
522
+ self.example_data_dir, "Partnership-Agreement-Toolkit_0_0.pdf"
523
+ )
524
+ deny_list_file = os.path.join(
525
+ self.example_data_dir,
526
+ "Partnership-Agreement-Toolkit_test_deny_list_para_single_spell.csv",
527
+ )
528
+
529
  if not os.path.isfile(input_file):
530
  self.skipTest(f"Example file not found: {input_file}")
531
  if not os.path.isfile(deny_list_file):
532
  self.skipTest(f"Deny list file not found: {deny_list_file}")
533
+
534
  result = run_cli_redact(
535
  script_path=self.script_path,
536
  input_file=input_file,
 
540
  page_min=1,
541
  page_max=3,
542
  ocr_method="Local text",
543
+ fuzzy_mistakes=3,
544
+ )
545
+
546
+ self.assertTrue(
547
+ result, "PDF redaction with limited pages and fuzzy matching should succeed"
548
  )
 
 
549
  print("✅ PDF redaction with limited pages and fuzzy matching passed")
550
+
551
  def test_pdf_redaction_with_custom_lists(self):
552
  """Test: Redaction with custom deny list, allow list, and whole page redaction list"""
553
  print("\n=== Testing PDF redaction with custom lists ===")
554
+ input_file = os.path.join(
555
+ self.example_data_dir, "Partnership-Agreement-Toolkit_0_0.pdf"
556
+ )
557
+ deny_list_file = os.path.join(
558
+ self.example_data_dir, "partnership_toolkit_redact_custom_deny_list.csv"
559
+ )
560
+ whole_page_file = os.path.join(
561
+ self.example_data_dir, "partnership_toolkit_redact_some_pages.csv"
562
+ )
563
+ allow_list_file = os.path.join(
564
+ self.example_data_dir, "test_allow_list_partnership.csv"
565
+ )
566
+
567
  if not os.path.isfile(input_file):
568
  self.skipTest(f"Example file not found: {input_file}")
569
  if not os.path.isfile(deny_list_file):
 
572
  self.skipTest(f"Whole page file not found: {whole_page_file}")
573
  if not os.path.isfile(allow_list_file):
574
  self.skipTest(f"Allow list file not found: {allow_list_file}")
575
+
576
  result = run_cli_redact(
577
  script_path=self.script_path,
578
  input_file=input_file,
579
  output_dir=self.temp_output_dir,
580
  deny_list_file=deny_list_file,
581
  redact_whole_page_file=whole_page_file,
582
+ allow_list_file=allow_list_file,
583
  )
584
+
585
  self.assertTrue(result, "PDF redaction with custom lists should succeed")
586
  print("✅ PDF redaction with custom lists passed")
587
+
588
  def test_image_redaction(self):
589
  """Test: Redact an image"""
590
  print("\n=== Testing image redaction ===")
591
  input_file = os.path.join(self.example_data_dir, "example_complaint_letter.jpg")
592
+
593
  if not os.path.isfile(input_file):
594
  self.skipTest(f"Example file not found: {input_file}")
595
+
596
  result = run_cli_redact(
597
  script_path=self.script_path,
598
  input_file=input_file,
599
+ output_dir=self.temp_output_dir,
600
  )
601
+
602
  self.assertTrue(result, "Image redaction should succeed")
603
  print("✅ Image redaction passed")
604
+
605
  def test_csv_anonymisation_specific_columns(self):
606
  """Test: Anonymise csv file with specific columns"""
607
  print("\n=== Testing CSV anonymisation with specific columns ===")
608
  input_file = os.path.join(self.example_data_dir, "combined_case_notes.csv")
609
+
610
  if not os.path.isfile(input_file):
611
  self.skipTest(f"Example file not found: {input_file}")
612
+
613
  result = run_cli_redact(
614
  script_path=self.script_path,
615
  input_file=input_file,
616
  output_dir=self.temp_output_dir,
617
  text_columns=["Case Note", "Client"],
618
+ anon_strategy="replace_redacted",
619
+ )
620
+
621
+ self.assertTrue(
622
+ result, "CSV anonymisation with specific columns should succeed"
623
  )
 
 
624
  print("✅ CSV anonymisation with specific columns passed")
625
+
626
  def test_csv_anonymisation_different_strategy(self):
627
  """Test: Anonymise csv file with a different strategy (remove text completely)"""
628
  print("\n=== Testing CSV anonymisation with different strategy ===")
629
  input_file = os.path.join(self.example_data_dir, "combined_case_notes.csv")
630
+
631
  if not os.path.isfile(input_file):
632
  self.skipTest(f"Example file not found: {input_file}")
633
+
634
  result = run_cli_redact(
635
  script_path=self.script_path,
636
  input_file=input_file,
637
  output_dir=self.temp_output_dir,
638
  text_columns=["Case Note", "Client"],
639
+ anon_strategy="redact",
640
+ )
641
+
642
+ self.assertTrue(
643
+ result, "CSV anonymisation with different strategy should succeed"
644
  )
 
 
645
  print("✅ CSV anonymisation with different strategy passed")
646
+
647
  def test_word_document_anonymisation(self):
648
  """Test: Anonymise a word document"""
649
  print("\n=== Testing Word document anonymisation ===")
650
+ input_file = os.path.join(
651
+ self.example_data_dir, "Bold minimalist professional cover letter.docx"
652
+ )
653
+
654
  if not os.path.isfile(input_file):
655
  self.skipTest(f"Example file not found: {input_file}")
656
+
657
  result = run_cli_redact(
658
  script_path=self.script_path,
659
  input_file=input_file,
660
  output_dir=self.temp_output_dir,
661
+ anon_strategy="replace_redacted",
662
  )
663
+
664
  self.assertTrue(result, "Word document anonymisation should succeed")
665
  print("✅ Word document anonymisation passed")
666
+
667
  def test_aws_textract_comprehend_redaction(self):
668
  """Test: Use Textract and Comprehend for redaction"""
669
  print("\n=== Testing AWS Textract and Comprehend redaction ===")
670
+ input_file = os.path.join(
671
+ self.example_data_dir,
672
+ "example_of_emails_sent_to_a_professor_before_applying.pdf",
673
+ )
674
+
675
  if not os.path.isfile(input_file):
676
  self.skipTest(f"Example file not found: {input_file}")
677
+
678
  # Skip this test if AWS credentials are not available
679
  # This is a conditional test that may not work in all environments
680
+ run_cli_redact(
681
  script_path=self.script_path,
682
  input_file=input_file,
683
  output_dir=self.temp_output_dir,
684
  ocr_method="AWS Textract",
685
+ pii_detector="AWS Comprehend",
686
  )
687
+
688
  # Note: This test may fail if AWS credentials are not configured
689
  # We'll mark it as passed if it runs without crashing
690
  print("✅ AWS Textract and Comprehend redaction test completed")
691
+
692
  def test_aws_textract_signature_extraction(self):
693
  """Test: Redact specific pages with AWS OCR and signature extraction"""
694
  print("\n=== Testing AWS Textract with signature extraction ===")
695
+ input_file = os.path.join(
696
+ self.example_data_dir, "Partnership-Agreement-Toolkit_0_0.pdf"
697
+ )
698
+
699
  if not os.path.isfile(input_file):
700
  self.skipTest(f"Example file not found: {input_file}")
701
+
702
  # Skip this test if AWS credentials are not available
703
+ run_cli_redact(
704
  script_path=self.script_path,
705
  input_file=input_file,
706
  output_dir=self.temp_output_dir,
707
  page_min=6,
708
  page_max=7,
709
  ocr_method="AWS Textract",
710
+ handwrite_signature_extraction=[
711
+ "Extract handwriting",
712
+ "Extract signatures",
713
+ ],
714
  )
715
+
716
  # Note: This test may fail if AWS credentials are not configured
717
  print("✅ AWS Textract with signature extraction test completed")
718
+
719
  def test_duplicate_pages_detection(self):
720
  """Test: Find duplicate pages in OCR files"""
721
  print("\n=== Testing duplicate pages detection ===")
722
+ input_file = os.path.join(
723
+ self.example_data_dir,
724
+ "example_outputs",
725
+ "doubled_output_joined.pdf_ocr_output.csv",
726
+ )
727
+
728
  if not os.path.isfile(input_file):
729
  self.skipTest(f"Example OCR file not found: {input_file}")
730
+
731
  result = run_cli_redact(
732
  script_path=self.script_path,
733
  input_file=input_file,
734
  output_dir=self.temp_output_dir,
735
  task="deduplicate",
736
  duplicate_type="pages",
737
+ similarity_threshold=0.95,
738
  )
739
+
740
  self.assertTrue(result, "Duplicate pages detection should succeed")
741
  print("✅ Duplicate pages detection passed")
742
+
743
  def test_duplicate_line_level_detection(self):
744
  """Test: Find duplicate in OCR files at the line level"""
745
  print("\n=== Testing duplicate line level detection ===")
746
+ input_file = os.path.join(
747
+ self.example_data_dir,
748
+ "example_outputs",
749
+ "doubled_output_joined.pdf_ocr_output.csv",
750
+ )
751
+
752
  if not os.path.isfile(input_file):
753
  self.skipTest(f"Example OCR file not found: {input_file}")
754
+
755
  result = run_cli_redact(
756
  script_path=self.script_path,
757
  input_file=input_file,
 
760
  duplicate_type="pages",
761
  similarity_threshold=0.95,
762
  combine_pages=False,
763
+ min_word_count=3,
764
  )
765
+
766
  self.assertTrue(result, "Duplicate line level detection should succeed")
767
  print("✅ Duplicate line level detection passed")
768
+
769
  def test_duplicate_tabular_detection(self):
770
  """Test: Find duplicate rows in tabular data"""
771
  print("\n=== Testing duplicate tabular detection ===")
772
+ input_file = os.path.join(
773
+ self.example_data_dir, "Lambeth_2030-Our_Future_Our_Lambeth.pdf.csv"
774
+ )
775
+
776
  if not os.path.isfile(input_file):
777
  self.skipTest(f"Example CSV file not found: {input_file}")
778
+
779
  result = run_cli_redact(
780
  script_path=self.script_path,
781
  input_file=input_file,
 
783
  task="deduplicate",
784
  duplicate_type="tabular",
785
  text_columns=["text"],
786
+ similarity_threshold=0.95,
787
  )
788
+
789
  self.assertTrue(result, "Duplicate tabular detection should succeed")
790
  print("✅ Duplicate tabular detection passed")
791
+
792
  def test_textract_submit_document(self):
793
  """Test: Submit document to Textract for basic text analysis"""
794
  print("\n=== Testing Textract document submission ===")
795
+ input_file = os.path.join(
796
+ self.example_data_dir,
797
+ "example_of_emails_sent_to_a_professor_before_applying.pdf",
798
+ )
799
+
800
  if not os.path.isfile(input_file):
801
  self.skipTest(f"Example file not found: {input_file}")
802
+
803
  # Skip this test if AWS credentials are not available
804
  try:
805
+ run_cli_redact(
806
  script_path=self.script_path,
807
  input_file=input_file,
808
  output_dir=self.temp_output_dir,
809
  task="textract",
810
+ textract_action="submit",
811
  )
812
  except Exception as e:
813
  print(f"Textract test failed (expected without AWS credentials): {e}")
814
+
 
815
  # Note: This test may fail if AWS credentials are not configured
816
  print("✅ Textract document submission test completed")
817
+
818
  def test_textract_submit_with_signatures(self):
819
  """Test: Submit document to Textract for analysis with signature extraction"""
820
  print("\n=== Testing Textract submission with signature extraction ===")
821
+ input_file = os.path.join(
822
+ self.example_data_dir, "Partnership-Agreement-Toolkit_0_0.pdf"
823
+ )
824
+
825
  if not os.path.isfile(input_file):
826
  self.skipTest(f"Example file not found: {input_file}")
827
+
828
  # Skip this test if AWS credentials are not available
829
  try:
830
+ run_cli_redact(
831
  script_path=self.script_path,
832
  input_file=input_file,
833
  output_dir=self.temp_output_dir,
834
  task="textract",
835
  textract_action="submit",
836
+ extract_signatures=True,
837
  )
838
  except Exception as e:
839
  print(f"Textract test failed (expected without AWS credentials): {e}")
840
+
 
841
  # Note: This test may fail if AWS credentials are not configured
842
  print("✅ Textract submission with signature extraction test completed")
843
+
844
  def test_textract_retrieve_results(self):
845
  """Test: Retrieve Textract results by job ID"""
846
  print("\n=== Testing Textract results retrieval ===")
847
+
848
  # Skip this test if AWS credentials are not available
849
  # This would require a valid job ID from a previous submission
850
  # For retrieve and list actions, we don't need a real input file
851
  try:
852
+ run_cli_redact(
853
  script_path=self.script_path,
854
  input_file=None, # No input file needed for retrieve action
855
  output_dir=self.temp_output_dir,
856
  task="textract",
857
  textract_action="retrieve",
858
+ job_id="12345678-1234-1234-1234-123456789012", # Dummy job ID
859
  )
860
  except Exception as e:
861
  print(f"Textract test failed (expected without AWS credentials): {e}")
862
+
 
863
  # Note: This test will likely fail with a dummy job ID, but that's expected
864
  print("✅ Textract results retrieval test completed")
865
+
866
  def test_textract_list_jobs(self):
867
  """Test: List recent Textract jobs"""
868
  print("\n=== Testing Textract jobs listing ===")
869
+
870
  # Skip this test if AWS credentials are not available
871
  # For list action, we don't need a real input file
872
  try:
873
+ run_cli_redact(
874
  script_path=self.script_path,
875
  input_file=None, # No input file needed for list action
876
  output_dir=self.temp_output_dir,
877
  task="textract",
878
+ textract_action="list",
879
  )
880
  except Exception as e:
881
  print(f"Textract test failed (expected without AWS credentials): {e}")
882
+
 
883
  # Note: This test may fail if AWS credentials are not configured
884
  print("✅ Textract jobs listing test completed")
885
 
886
 
887
+ class TestGUIApp(unittest.TestCase):
888
+ """Test suite for GUI application loading and basic functionality."""
889
+
890
+ @classmethod
891
+ def setUpClass(cls):
892
+ """Set up test environment for GUI tests."""
893
+ cls.app_path = os.path.join(
894
+ os.path.dirname(os.path.dirname(__file__)), "app.py"
895
+ )
896
+
897
+ # Verify app.py exists
898
+ if not os.path.isfile(cls.app_path):
899
+ raise FileNotFoundError(f"App file not found: {cls.app_path}")
900
+
901
+ print(f"GUI test setup complete. App: {cls.app_path}")
902
+
903
+ def test_app_import_and_initialization(self):
904
+ """Test: Import app.py and check if the Gradio app object is created successfully."""
905
+ print("\n=== Testing GUI app import and initialization ===")
906
+
907
+ try:
908
+ # Add the parent directory to the path so we can import app
909
+ parent_dir = os.path.dirname(os.path.dirname(__file__))
910
+ if parent_dir not in sys.path:
911
+ sys.path.insert(0, parent_dir)
912
+
913
+ # Import the app module
914
+ import app
915
+
916
+ # Check if the app object exists and is a Gradio Blocks object
917
+ self.assertTrue(hasattr(app, 'app'), "App object should exist in the module")
918
+
919
+ # Check if it's a Gradio Blocks instance
920
+ import gradio as gr
921
+ self.assertIsInstance(app.app, gr.Blocks, "App should be a Gradio Blocks instance")
922
+
923
+ print("✅ GUI app import and initialization passed")
924
+
925
+ except ImportError as e:
926
+ error_msg = f"Failed to import app module: {e}"
927
+ if "gradio_image_annotation" in str(e):
928
+ error_msg += "\n\nNOTE: This test requires the 'redaction' conda environment to be activated."
929
+ error_msg += "\nPlease run: conda activate redaction"
930
+ error_msg += "\nThen run this test again."
931
+ self.fail(error_msg)
932
+ except Exception as e:
933
+ self.fail(f"Unexpected error during app initialization: {e}")
934
+
935
+ def test_app_launch_headless(self):
936
+ """Test: Launch the app in headless mode to verify it starts without errors."""
937
+ print("\n=== Testing GUI app launch in headless mode ===")
938
+
939
+ try:
940
+ # Add the parent directory to the path
941
+ parent_dir = os.path.dirname(os.path.dirname(__file__))
942
+ if parent_dir not in sys.path:
943
+ sys.path.insert(0, parent_dir)
944
+
945
+ # Import the app module
946
+ import app
947
+ import gradio as gr
948
+
949
+ # Set up a flag to track if the app launched successfully
950
+ app_launched = threading.Event()
951
+ launch_error = None
952
+
953
+ def launch_app():
+ nonlocal launch_error  # write to the enclosing test method's variable on failure
954
+ try:
955
+ # Launch the app in headless mode with a short timeout
956
+ app.app.launch(
957
+ show_error=True,
958
+ inbrowser=False, # Don't open browser
959
+ server_port=0, # Use any available port
960
+ quiet=True, # Suppress output
961
+ prevent_thread_lock=True # Don't block the main thread
962
+ )
963
+ app_launched.set()
964
+ except Exception as e:
965
+ launch_error = e
966
+ app_launched.set()
967
+
968
+ # Start the app in a separate thread
969
+ launch_thread = threading.Thread(target=launch_app)
970
+ launch_thread.daemon = True
971
+ launch_thread.start()
972
+
973
+ # Wait for the app to launch (with timeout)
974
+ if app_launched.wait(timeout=10): # 10 second timeout
975
+ if launch_error:
976
+ self.fail(f"App launch failed: {launch_error}")
977
+ else:
978
+ print("✅ GUI app launch in headless mode passed")
979
+ else:
980
+ self.fail("App launch timed out after 10 seconds")
981
+
982
+ except Exception as e:
983
+ error_msg = f"Unexpected error during app launch test: {e}"
984
+ if "gradio_image_annotation" in str(e):
985
+ error_msg += "\n\nNOTE: This test requires the 'redaction' conda environment to be activated."
986
+ error_msg += "\nPlease run: conda activate redaction"
987
+ error_msg += "\nThen run this test again."
988
+ self.fail(error_msg)
989
+
990
+ def test_app_configuration_loading(self):
991
+ """Test: Verify that the app can load its configuration without errors."""
992
+ print("\n=== Testing GUI app configuration loading ===")
993
+
994
+ try:
995
+ # Add the parent directory to the path
996
+ parent_dir = os.path.dirname(os.path.dirname(__file__))
997
+ if parent_dir not in sys.path:
998
+ sys.path.insert(0, parent_dir)
999
+
1000
+ # Import the app module
1001
+ import app
1002
+
1003
+ # Check if key configuration variables are accessible
1004
+ # These should be imported from tools.config
1005
+ from tools.config import (
1006
+ GRADIO_SERVER_PORT,
1007
+ MAX_FILE_SIZE,
1008
+ DEFAULT_LANGUAGE,
1009
+ PII_DETECTION_MODELS
1010
+ )
1011
+
1012
+ # Verify these are not None/empty
1013
+ self.assertIsNotNone(GRADIO_SERVER_PORT, "GRADIO_SERVER_PORT should be configured")
1014
+ self.assertIsNotNone(MAX_FILE_SIZE, "MAX_FILE_SIZE should be configured")
1015
+ self.assertIsNotNone(DEFAULT_LANGUAGE, "DEFAULT_LANGUAGE should be configured")
1016
+ self.assertIsNotNone(PII_DETECTION_MODELS, "PII_DETECTION_MODELS should be configured")
1017
+
1018
+ print("✅ GUI app configuration loading passed")
1019
+
1020
+ except ImportError as e:
1021
+ error_msg = f"Failed to import configuration: {e}"
1022
+ if "gradio_image_annotation" in str(e):
1023
+ error_msg += "\n\nNOTE: This test requires the 'redaction' conda environment to be activated."
1024
+ error_msg += "\nPlease run: conda activate redaction"
1025
+ error_msg += "\nThen run this test again."
1026
+ self.fail(error_msg)
1027
+ except Exception as e:
1028
+ error_msg = f"Unexpected error during configuration test: {e}"
1029
+ if "gradio_image_annotation" in str(e):
1030
+ error_msg += "\n\nNOTE: This test requires the 'redaction' conda environment to be activated."
1031
+ error_msg += "\nPlease run: conda activate redaction"
1032
+ error_msg += "\nThen run this test again."
1033
+ self.fail(error_msg)
1034
+
1035
+
1036
  def run_all_tests():
1037
  """Run all test examples and report results."""
1038
  print("=" * 80)
1039
+ print("DOCUMENT REDACTION TEST SUITE")
1040
  print("=" * 80)
1041
+ print("This test suite includes:")
1042
+ print("- CLI examples from the epilog")
1043
+ print("- GUI application loading and initialization tests")
1044
  print("Tests will be skipped if required example files are not found.")
1045
  print("AWS-related tests may fail if credentials are not configured.")
1046
  print("=" * 80)
1047
+
1048
  # Create test suite
1049
  loader = unittest.TestLoader()
1050
+ suite = unittest.TestSuite()
1051
 
1052
+ # Add CLI tests
1053
+ cli_suite = loader.loadTestsFromTestCase(TestCLIRedactExamples)
1054
+ suite.addTests(cli_suite)
1055
+
1056
+ # Add GUI tests
1057
+ gui_suite = loader.loadTestsFromTestCase(TestGUIApp)
1058
+ suite.addTests(gui_suite)
1059
+
1060
  # Run tests with detailed output
1061
  runner = unittest.TextTestRunner(verbosity=2, stream=None)
1062
  result = runner.run(suite)
1063
+
1064
  # Print summary
1065
  print("\n" + "=" * 80)
1066
  print("TEST SUMMARY")
 
1069
  print(f"Failures: {len(result.failures)}")
1070
  print(f"Errors: {len(result.errors)}")
1071
  print(f"Skipped: {len(result.skipped) if hasattr(result, 'skipped') else 0}")
1072
+
1073
  if result.failures:
1074
  print("\nFAILURES:")
1075
  for test, traceback in result.failures:
1076
  print(f"- {test}: {traceback}")
1077
+
1078
  if result.errors:
1079
  print("\nERRORS:")
1080
  for test, traceback in result.errors:
1081
  print(f"- {test}: {traceback}")
1082
+
1083
  success = len(result.failures) == 0 and len(result.errors) == 0
1084
  print(f"\nOverall result: {'✅ PASSED' if success else '❌ FAILED'}")
1085
  print("=" * 80)
1086
+
1087
  return success
1088
 
1089
 
1090
  if __name__ == "__main__":
1091
  # Run the test suite
1092
  success = run_all_tests()
1093
+ exit(0 if success else 1)
test/test_gui_only.py ADDED
@@ -0,0 +1,198 @@
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Standalone GUI test script for the document redaction application.
4
+
5
+ This script tests only the GUI functionality of app.py to ensure it loads correctly.
6
+ Run this script to verify that the Gradio interface can be imported and initialized.
7
+ """
8
+
9
+ import os
10
+ import sys
11
+ import unittest
12
+ import threading
13
+ import time
14
+
15
+ # Add the parent directory to the path so we can import the app
16
+ parent_dir = os.path.dirname(os.path.dirname(__file__))
17
+ if parent_dir not in sys.path:
18
+ sys.path.insert(0, parent_dir)
19
+
20
+
21
+ class TestGUIAppOnly(unittest.TestCase):
22
+ """Test suite for GUI application loading and basic functionality."""
23
+
24
+ @classmethod
25
+ def setUpClass(cls):
26
+ """Set up test environment for GUI tests."""
27
+ cls.app_path = os.path.join(parent_dir, "app.py")
28
+
29
+ # Verify app.py exists
30
+ if not os.path.isfile(cls.app_path):
31
+ raise FileNotFoundError(f"App file not found: {cls.app_path}")
32
+
33
+ print(f"GUI test setup complete. App: {cls.app_path}")
34
+
35
+ def test_app_import_and_initialization(self):
36
+ """Test: Import app.py and check if the Gradio app object is created successfully."""
37
+ print("\n=== Testing GUI app import and initialization ===")
38
+
39
+ try:
40
+ # Import the app module
41
+ import app
42
+
43
+ # Check if the app object exists and is a Gradio Blocks object
44
+ self.assertTrue(hasattr(app, 'app'), "App object should exist in the module")
45
+
46
+ # Check if it's a Gradio Blocks instance
47
+ import gradio as gr
48
+ self.assertIsInstance(app.app, gr.Blocks, "App should be a Gradio Blocks instance")
49
+
50
+ print("✅ GUI app import and initialization passed")
51
+
52
+ except ImportError as e:
53
+ error_msg = f"Failed to import app module: {e}"
54
+ if "gradio_image_annotation" in str(e):
55
+ error_msg += "\n\nNOTE: This test requires the 'redaction' conda environment to be activated."
56
+ error_msg += "\nPlease run: conda activate redaction"
57
+ error_msg += "\nThen run this test again."
58
+ self.fail(error_msg)
59
+ except Exception as e:
60
+ self.fail(f"Unexpected error during app initialization: {e}")
61
+
62
+ def test_app_launch_headless(self):
63
+ """Test: Launch the app in headless mode to verify it starts without errors."""
64
+ print("\n=== Testing GUI app launch in headless mode ===")
65
+
66
+ try:
67
+ # Import the app module
68
+ import app
69
+ import gradio as gr
70
+
71
+ # Set up a flag to track if the app launched successfully
72
+ app_launched = threading.Event()
73
+ launch_error = None
74
+
75
+ def launch_app():
+ nonlocal launch_error  # write to the enclosing test method's variable on failure
76
+ try:
77
+ # Launch the app in headless mode with a short timeout
78
+ app.app.launch(
79
+ show_error=True,
80
+ inbrowser=False, # Don't open browser
81
+ server_port=0, # Use any available port
82
+ quiet=True, # Suppress output
83
+ prevent_thread_lock=True # Don't block the main thread
84
+ )
85
+ app_launched.set()
86
+ except Exception as e:
87
+ launch_error = e
88
+ app_launched.set()
89
+
90
+ # Start the app in a separate thread
91
+ launch_thread = threading.Thread(target=launch_app)
92
+ launch_thread.daemon = True
93
+ launch_thread.start()
94
+
95
+ # Wait for the app to launch (with timeout)
96
+ if app_launched.wait(timeout=10): # 10 second timeout
97
+ if launch_error:
98
+ self.fail(f"App launch failed: {launch_error}")
99
+ else:
100
+ print("✅ GUI app launch in headless mode passed")
101
+ else:
102
+ self.fail("App launch timed out after 10 seconds")
103
+
104
+ except Exception as e:
105
+ error_msg = f"Unexpected error during app launch test: {e}"
106
+ if "gradio_image_annotation" in str(e):
107
+ error_msg += "\n\nNOTE: This test requires the 'redaction' conda environment to be activated."
108
+ error_msg += "\nPlease run: conda activate redaction"
109
+ error_msg += "\nThen run this test again."
110
+ self.fail(error_msg)
111
+
112
+ def test_app_configuration_loading(self):
113
+ """Test: Verify that the app can load its configuration without errors."""
114
+ print("\n=== Testing GUI app configuration loading ===")
115
+
116
+ try:
117
+ # Import the app module
118
+ import app
119
+
120
+ # Check if key configuration variables are accessible
121
+ # These should be imported from tools.config
122
+ from tools.config import (
123
+ GRADIO_SERVER_PORT,
124
+ MAX_FILE_SIZE,
125
+ DEFAULT_LANGUAGE,
126
+ PII_DETECTION_MODELS
127
+ )
128
+
129
+ # Verify these are not None/empty
130
+ self.assertIsNotNone(GRADIO_SERVER_PORT, "GRADIO_SERVER_PORT should be configured")
131
+ self.assertIsNotNone(MAX_FILE_SIZE, "MAX_FILE_SIZE should be configured")
132
+ self.assertIsNotNone(DEFAULT_LANGUAGE, "DEFAULT_LANGUAGE should be configured")
133
+ self.assertIsNotNone(PII_DETECTION_MODELS, "PII_DETECTION_MODELS should be configured")
134
+
135
+ print("✅ GUI app configuration loading passed")
136
+
137
+ except ImportError as e:
138
+ error_msg = f"Failed to import configuration: {e}"
139
+ if "gradio_image_annotation" in str(e):
140
+ error_msg += "\n\nNOTE: This test requires the 'redaction' conda environment to be activated."
141
+ error_msg += "\nPlease run: conda activate redaction"
142
+ error_msg += "\nThen run this test again."
143
+ self.fail(error_msg)
144
+ except Exception as e:
145
+ error_msg = f"Unexpected error during configuration test: {e}"
146
+ if "gradio_image_annotation" in str(e):
147
+ error_msg += "\n\nNOTE: This test requires the 'redaction' conda environment to be activated."
148
+ error_msg += "\nPlease run: conda activate redaction"
149
+ error_msg += "\nThen run this test again."
150
+ self.fail(error_msg)
151
+
152
+
153
+ def run_gui_tests():
154
+ """Run GUI tests and report results."""
155
+ print("=" * 80)
156
+ print("DOCUMENT REDACTION GUI TEST SUITE")
157
+ print("=" * 80)
158
+ print("This test suite verifies that the GUI application loads correctly.")
159
+ print("=" * 80)
160
+
161
+ # Create test suite
162
+ loader = unittest.TestLoader()
163
+ suite = loader.loadTestsFromTestCase(TestGUIAppOnly)
164
+
165
+ # Run tests with detailed output
166
+ runner = unittest.TextTestRunner(verbosity=2, stream=None)
167
+ result = runner.run(suite)
168
+
169
+ # Print summary
170
+ print("\n" + "=" * 80)
171
+ print("GUI TEST SUMMARY")
172
+ print("=" * 80)
173
+ print(f"Tests run: {result.testsRun}")
174
+ print(f"Failures: {len(result.failures)}")
175
+ print(f"Errors: {len(result.errors)}")
176
+ print(f"Skipped: {len(result.skipped) if hasattr(result, 'skipped') else 0}")
177
+
178
+ if result.failures:
179
+ print("\nFAILURES:")
180
+ for test, traceback in result.failures:
181
+ print(f"- {test}: {traceback}")
182
+
183
+ if result.errors:
184
+ print("\nERRORS:")
185
+ for test, traceback in result.errors:
186
+ print(f"- {test}: {traceback}")
187
+
188
+ success = len(result.failures) == 0 and len(result.errors) == 0
189
+ print(f"\nOverall result: {'✅ PASSED' if success else '❌ FAILED'}")
190
+ print("=" * 80)
191
+
192
+ return success
193
+
194
+
195
+ if __name__ == "__main__":
196
+ # Run the GUI test suite
197
+ success = run_gui_tests()
198
+ exit(0 if success else 1)
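Note: the new standalone script adds the repository root to sys.path itself and propagates the suite result as the process exit code, so it can be run directly with `python test/test_gui_only.py` from the repository root. A minimal sketch of driving the same checks programmatically (assuming the repository root as the working directory and the project environment active) might look like this:

import os
import sys
import unittest

# Make test/ importable without packaging it; assumes this snippet is run
# from the repository root.
sys.path.insert(0, os.path.join(os.getcwd(), "test"))

from test_gui_only import TestGUIAppOnly  # defined in test/test_gui_only.py above

suite = unittest.TestLoader().loadTestsFromTestCase(TestGUIAppOnly)
result = unittest.TextTestRunner(verbosity=2).run(suite)
sys.exit(0 if result.wasSuccessful() else 1)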
tools/auth.py CHANGED
@@ -1,22 +1,33 @@
1
- #import os
2
- import boto3
3
- #import gradio as gr
4
- import hmac
5
- import hashlib
6
  import base64
7
- from tools.config import AWS_CLIENT_ID, AWS_CLIENT_SECRET, AWS_USER_POOL_ID, AWS_REGION
 
 
 
 
 
 
 
8
 
9
- def calculate_secret_hash(client_id:str, client_secret:str, username:str):
 
10
  message = username + client_id
11
  dig = hmac.new(
12
- str(client_secret).encode('utf-8'),
13
- msg=str(message).encode('utf-8'),
14
- digestmod=hashlib.sha256
15
  ).digest()
16
  secret_hash = base64.b64encode(dig).decode()
17
  return secret_hash
18
 
19
- def authenticate_user(username:str, password:str, user_pool_id:str=AWS_USER_POOL_ID, client_id:str=AWS_CLIENT_ID, client_secret:str=AWS_CLIENT_SECRET):
 
 
 
 
 
 
 
20
  """Authenticates a user against an AWS Cognito user pool.
21
 
22
  Args:
@@ -30,36 +41,38 @@ def authenticate_user(username:str, password:str, user_pool_id:str=AWS_USER_POOL
30
  bool: True if the user is authenticated, False otherwise.
31
  """
32
 
33
- client = boto3.client('cognito-idp', region_name=AWS_REGION) # Cognito Identity Provider client
 
 
34
 
35
  # Compute the secret hash
36
  secret_hash = calculate_secret_hash(client_id, client_secret, username)
37
 
38
  try:
39
 
40
- if client_secret == '':
41
  response = client.initiate_auth(
42
- AuthFlow='USER_PASSWORD_AUTH',
43
  AuthParameters={
44
- 'USERNAME': username,
45
- 'PASSWORD': password,
46
  },
47
- ClientId=client_id
48
  )
49
 
50
  else:
51
  response = client.initiate_auth(
52
- AuthFlow='USER_PASSWORD_AUTH',
53
- AuthParameters={
54
- 'USERNAME': username,
55
- 'PASSWORD': password,
56
- 'SECRET_HASH': secret_hash
57
- },
58
- ClientId=client_id
59
  )
60
 
61
  # If successful, you'll receive an AuthenticationResult in the response
62
- if response.get('AuthenticationResult'):
63
  return True
64
  else:
65
  return False
@@ -72,4 +85,4 @@ def authenticate_user(username:str, password:str, user_pool_id:str=AWS_USER_POOL
72
  out_message = f"An error occurred: {e}"
73
  print(out_message)
74
  raise Exception(out_message)
75
- return False
 
1
+ # import os
 
 
 
 
2
  import base64
3
+ import hashlib
4
+
5
+ # import gradio as gr
6
+ import hmac
7
+
8
+ import boto3
9
+
10
+ from tools.config import AWS_CLIENT_ID, AWS_CLIENT_SECRET, AWS_REGION, AWS_USER_POOL_ID
11
 
12
+
13
+ def calculate_secret_hash(client_id: str, client_secret: str, username: str):
14
  message = username + client_id
15
  dig = hmac.new(
16
+ str(client_secret).encode("utf-8"),
17
+ msg=str(message).encode("utf-8"),
18
+ digestmod=hashlib.sha256,
19
  ).digest()
20
  secret_hash = base64.b64encode(dig).decode()
21
  return secret_hash
22
 
23
+
24
+ def authenticate_user(
25
+ username: str,
26
+ password: str,
27
+ user_pool_id: str = AWS_USER_POOL_ID,
28
+ client_id: str = AWS_CLIENT_ID,
29
+ client_secret: str = AWS_CLIENT_SECRET,
30
+ ):
31
  """Authenticates a user against an AWS Cognito user pool.
32
 
33
  Args:
 
41
  bool: True if the user is authenticated, False otherwise.
42
  """
43
 
44
+ client = boto3.client(
45
+ "cognito-idp", region_name=AWS_REGION
46
+ ) # Cognito Identity Provider client
47
 
48
  # Compute the secret hash
49
  secret_hash = calculate_secret_hash(client_id, client_secret, username)
50
 
51
  try:
52
 
53
+ if client_secret == "":
54
  response = client.initiate_auth(
55
+ AuthFlow="USER_PASSWORD_AUTH",
56
  AuthParameters={
57
+ "USERNAME": username,
58
+ "PASSWORD": password,
59
  },
60
+ ClientId=client_id,
61
  )
62
 
63
  else:
64
  response = client.initiate_auth(
65
+ AuthFlow="USER_PASSWORD_AUTH",
66
+ AuthParameters={
67
+ "USERNAME": username,
68
+ "PASSWORD": password,
69
+ "SECRET_HASH": secret_hash,
70
+ },
71
+ ClientId=client_id,
72
  )
73
 
74
  # If successful, you'll receive an AuthenticationResult in the response
75
+ if response.get("AuthenticationResult"):
76
  return True
77
  else:
78
  return False
 
85
  out_message = f"An error occurred: {e}"
86
  print(out_message)
87
  raise Exception(out_message)
88
+ return False
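Note: for reference, a minimal sketch of how the reformatted helpers in tools/auth.py are called; the credential values below are placeholders, not real configuration:

from tools.auth import authenticate_user, calculate_secret_hash

# SECRET_HASH is the base64-encoded HMAC-SHA256 of username + client_id,
# keyed with the app client secret, which is what calculate_secret_hash computes.
secret_hash = calculate_secret_hash(
    client_id="example-app-client-id",
    client_secret="example-app-client-secret",
    username="example.user",
)
print(secret_hash)

# authenticate_user falls back to AWS_USER_POOL_ID / AWS_CLIENT_ID /
# AWS_CLIENT_SECRET from tools.config when the pool arguments are omitted,
# and returns True only if Cognito responds with an AuthenticationResult.
authenticated = authenticate_user("example.user", "example-password")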
tools/aws_functions.py CHANGED
@@ -1,28 +1,37 @@
1
- from typing import Type, List
2
- import pandas as pd
3
- import boto3
4
- import tempfile
5
  import os
6
- from tools.config import AWS_REGION, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SAVE_LOGS_TO_CSV
 
 
 
 
 
 
 
 
 
 
 
7
  PandasDataFrame = Type[pd.DataFrame]
8
 
 
9
  def get_assumed_role_info():
10
- sts_endpoint = 'https://sts.' + AWS_REGION + '.amazonaws.com'
11
- sts = boto3.client('sts', region_name=AWS_REGION, endpoint_url=sts_endpoint)
12
  response = sts.get_caller_identity()
13
 
14
  # Extract ARN of the assumed role
15
- assumed_role_arn = response['Arn']
16
-
17
  # Extract the name of the assumed role from the ARN
18
- assumed_role_name = assumed_role_arn.split('/')[-1]
19
-
20
  return assumed_role_arn, assumed_role_name
21
 
 
22
  if RUN_AWS_FUNCTIONS == "1":
23
- try:
24
- session = boto3.Session(region_name=AWS_REGION)
25
-
26
  except Exception as e:
27
  print("Could not start boto3 session:", e)
28
 
@@ -30,14 +39,20 @@ if RUN_AWS_FUNCTIONS == "1":
30
  assumed_role_arn, assumed_role_name = get_assumed_role_info()
31
 
32
  print("Successfully assumed ARN role")
33
- #print("Assumed Role ARN:", assumed_role_arn)
34
- #print("Assumed Role Name:", assumed_role_name)
35
 
36
  except Exception as e:
37
  print("Could not get assumed role from STS:", e)
38
 
 
39
  # Download direct from S3 - requires login credentials
40
- def download_file_from_s3(bucket_name:str, key:str, local_file_path_and_name:str, RUN_AWS_FUNCTIONS:str = RUN_AWS_FUNCTIONS):
 
 
 
 
 
41
 
42
  if RUN_AWS_FUNCTIONS == "1":
43
 
@@ -45,30 +60,39 @@ def download_file_from_s3(bucket_name:str, key:str, local_file_path_and_name:str
45
  # Ensure the local directory exists
46
  os.makedirs(os.path.dirname(local_file_path_and_name), exist_ok=True)
47
 
48
- s3 = boto3.client('s3', region_name=AWS_REGION)
49
  s3.download_file(bucket_name, key, local_file_path_and_name)
50
- print(f"File downloaded from s3://{bucket_name}/{key} to {local_file_path_and_name}")
 
 
51
  except Exception as e:
52
  print("Could not download file:", key, "from s3 due to", e)
53
 
54
-
55
- def download_folder_from_s3(bucket_name:str, s3_folder:str, local_folder:str, RUN_AWS_FUNCTIONS:str = RUN_AWS_FUNCTIONS):
 
 
 
 
 
56
  """
57
  Download all files from an S3 folder to a local folder.
58
  """
59
  if RUN_AWS_FUNCTIONS == "1":
60
  if bucket_name and s3_folder and local_folder:
61
 
62
- s3 = boto3.client('s3', region_name=AWS_REGION)
63
 
64
  # List objects in the specified S3 folder
65
  response = s3.list_objects_v2(Bucket=bucket_name, Prefix=s3_folder)
66
 
67
  # Download each object
68
- for obj in response.get('Contents', []):
69
  # Extract object key and construct local file path
70
- object_key = obj['Key']
71
- local_file_path = os.path.join(local_folder, os.path.relpath(object_key, s3_folder))
 
 
72
 
73
  # Create directories if necessary
74
  os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
@@ -76,12 +100,24 @@ def download_folder_from_s3(bucket_name:str, s3_folder:str, local_folder:str, RU
76
  # Download the object
77
  try:
78
  s3.download_file(bucket_name, object_key, local_file_path)
79
- print(f"Downloaded 's3://{bucket_name}/{object_key}' to '{local_file_path}'")
 
 
80
  except Exception as e:
81
  print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)
82
- else: print("One or more required variables are empty, could not download from S3")
83
-
84
- def download_files_from_s3(bucket_name:str, s3_folder:str, local_folder:str, filenames:List[str], RUN_AWS_FUNCTIONS:str = RUN_AWS_FUNCTIONS):
 
 
 
 
 
 
 
 
 
 
85
  """
86
  Download specific files from an S3 folder to a local folder.
87
  """
@@ -89,18 +125,20 @@ def download_files_from_s3(bucket_name:str, s3_folder:str, local_folder:str, fil
89
  if RUN_AWS_FUNCTIONS == "1":
90
  if bucket_name and s3_folder and local_folder and filenames:
91
 
92
- s3 = boto3.client('s3', region_name=AWS_REGION)
93
 
94
  print("Trying to download file: ", filenames)
95
 
96
- if filenames == '*':
97
  # List all objects in the S3 folder
98
  print("Trying to download all files in AWS folder: ", s3_folder)
99
  response = s3.list_objects_v2(Bucket=bucket_name, Prefix=s3_folder)
100
 
101
- print("Found files in AWS folder: ", response.get('Contents', []))
102
 
103
- filenames = [obj['Key'].split('/')[-1] for obj in response.get('Contents', [])]
 
 
104
 
105
  print("Found filenames in AWS folder: ", filenames)
106
 
@@ -114,13 +152,24 @@ def download_files_from_s3(bucket_name:str, s3_folder:str, local_folder:str, fil
114
  # Download the object
115
  try:
116
  s3.download_file(bucket_name, object_key, local_file_path)
117
- print(f"Downloaded 's3://{bucket_name}/{object_key}' to '{local_file_path}'")
 
 
118
  except Exception as e:
119
  print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)
120
 
121
- else: print("One or more required variables are empty, could not download from S3")
 
 
 
122
 
123
- def upload_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str=DOCUMENT_REDACTION_BUCKET, RUN_AWS_FUNCTIONS:str = RUN_AWS_FUNCTIONS):
 
 
 
 
 
 
124
  """
125
  Uploads a file from local machine to Amazon S3.
126
 
@@ -139,14 +188,14 @@ def upload_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str=DOCU
139
  try:
140
  if s3_bucket and s3_key and local_file_paths:
141
 
142
- s3_client = boto3.client('s3', region_name=AWS_REGION)
143
 
144
  if isinstance(local_file_paths, str):
145
  local_file_paths = [local_file_paths]
146
 
147
  for file in local_file_paths:
148
  if s3_client:
149
- #print(s3_client)
150
  try:
151
  # Get file name off file path
152
  file_name = os.path.basename(file)
@@ -155,28 +204,40 @@ def upload_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str=DOCU
155
  print("S3 key: ", s3_key_full)
156
 
157
  s3_client.upload_file(file, s3_bucket, s3_key_full)
158
- out_message = "File " + file_name + " uploaded successfully!"
 
 
159
  print(out_message)
160
-
161
  except Exception as e:
162
  out_message = f"Error uploading file(s): {e}"
163
  print(out_message)
164
 
165
  final_out_message.append(out_message)
166
- final_out_message_str = '\n'.join(final_out_message)
167
-
168
- else: final_out_message_str = "Could not connect to AWS."
169
- else: final_out_message_str = "At least one essential variable is empty, could not upload to S3"
 
 
 
 
170
  except Exception as e:
171
  final_out_message_str = "Could not upload files to S3 due to: " + str(e)
172
  print(final_out_message_str)
173
  else:
174
- final_out_message_str = "App config will not AWS functions"
175
 
176
  return final_out_message_str
177
 
178
 
179
- def upload_log_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str=DOCUMENT_REDACTION_BUCKET, RUN_AWS_FUNCTIONS:str = RUN_AWS_FUNCTIONS, SAVE_LOGS_TO_CSV:str=SAVE_LOGS_TO_CSV):
 
 
 
 
 
 
180
  """
181
  Uploads a log file from local machine to Amazon S3.
182
 
@@ -195,14 +256,14 @@ def upload_log_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str=
195
  try:
196
  if s3_bucket and s3_key and local_file_paths:
197
 
198
- s3_client = boto3.client('s3', region_name=AWS_REGION)
199
 
200
  if isinstance(local_file_paths, str):
201
  local_file_paths = [local_file_paths]
202
 
203
  for file in local_file_paths:
204
  if s3_client:
205
- #print(s3_client)
206
  try:
207
  # Get file name off file path
208
  file_name = os.path.basename(file)
@@ -211,23 +272,29 @@ def upload_log_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str=
211
  print("S3 key: ", s3_key_full)
212
 
213
  s3_client.upload_file(file, s3_bucket, s3_key_full)
214
- out_message = "File " + file_name + " uploaded successfully!"
 
 
215
  print(out_message)
216
-
217
  except Exception as e:
218
  out_message = f"Error uploading file(s): {e}"
219
  print(out_message)
220
 
221
  final_out_message.append(out_message)
222
- final_out_message_str = '\n'.join(final_out_message)
223
-
224
- else: final_out_message_str = "Could not connect to AWS."
225
- else: final_out_message_str = "At least one essential variable is empty, could not upload to S3"
 
 
 
 
226
  except Exception as e:
227
  final_out_message_str = "Could not upload files to S3 due to: " + str(e)
228
  print(final_out_message_str)
229
  else:
230
- final_out_message_str = "App config will not AWS functions"
231
  print(final_out_message_str)
232
 
233
  return final_out_message_str
 
 
 
 
 
1
  import os
2
+ from typing import List, Type
3
+
4
+ import boto3
5
+ import pandas as pd
6
+
7
+ from tools.config import (
8
+ AWS_REGION,
9
+ DOCUMENT_REDACTION_BUCKET,
10
+ RUN_AWS_FUNCTIONS,
11
+ SAVE_LOGS_TO_CSV,
12
+ )
13
+
14
  PandasDataFrame = Type[pd.DataFrame]
15
 
16
+
17
  def get_assumed_role_info():
18
+ sts_endpoint = "https://sts." + AWS_REGION + ".amazonaws.com"
19
+ sts = boto3.client("sts", region_name=AWS_REGION, endpoint_url=sts_endpoint)
20
  response = sts.get_caller_identity()
21
 
22
  # Extract ARN of the assumed role
23
+ assumed_role_arn = response["Arn"]
24
+
25
  # Extract the name of the assumed role from the ARN
26
+ assumed_role_name = assumed_role_arn.split("/")[-1]
27
+
28
  return assumed_role_arn, assumed_role_name
29
 
30
+
31
  if RUN_AWS_FUNCTIONS == "1":
32
+ try:
33
+ session = boto3.Session(region_name=AWS_REGION)
34
+
35
  except Exception as e:
36
  print("Could not start boto3 session:", e)
37
 
 
39
  assumed_role_arn, assumed_role_name = get_assumed_role_info()
40
 
41
  print("Successfully assumed ARN role")
42
+ # print("Assumed Role ARN:", assumed_role_arn)
43
+ # print("Assumed Role Name:", assumed_role_name)
44
 
45
  except Exception as e:
46
  print("Could not get assumed role from STS:", e)
47
 
48
+
49
  # Download direct from S3 - requires login credentials
50
+ def download_file_from_s3(
51
+ bucket_name: str,
52
+ key: str,
53
+ local_file_path_and_name: str,
54
+ RUN_AWS_FUNCTIONS: str = RUN_AWS_FUNCTIONS,
55
+ ):
56
 
57
  if RUN_AWS_FUNCTIONS == "1":
58
 
 
60
  # Ensure the local directory exists
61
  os.makedirs(os.path.dirname(local_file_path_and_name), exist_ok=True)
62
 
63
+ s3 = boto3.client("s3", region_name=AWS_REGION)
64
  s3.download_file(bucket_name, key, local_file_path_and_name)
65
+ print(
66
+ f"File downloaded from s3://{bucket_name}/{key} to {local_file_path_and_name}"
67
+ )
68
  except Exception as e:
69
  print("Could not download file:", key, "from s3 due to", e)
70
 
71
+
72
+ def download_folder_from_s3(
73
+ bucket_name: str,
74
+ s3_folder: str,
75
+ local_folder: str,
76
+ RUN_AWS_FUNCTIONS: str = RUN_AWS_FUNCTIONS,
77
+ ):
78
  """
79
  Download all files from an S3 folder to a local folder.
80
  """
81
  if RUN_AWS_FUNCTIONS == "1":
82
  if bucket_name and s3_folder and local_folder:
83
 
84
+ s3 = boto3.client("s3", region_name=AWS_REGION)
85
 
86
  # List objects in the specified S3 folder
87
  response = s3.list_objects_v2(Bucket=bucket_name, Prefix=s3_folder)
88
 
89
  # Download each object
90
+ for obj in response.get("Contents", []):
91
  # Extract object key and construct local file path
92
+ object_key = obj["Key"]
93
+ local_file_path = os.path.join(
94
+ local_folder, os.path.relpath(object_key, s3_folder)
95
+ )
96
 
97
  # Create directories if necessary
98
  os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
 
100
  # Download the object
101
  try:
102
  s3.download_file(bucket_name, object_key, local_file_path)
103
+ print(
104
+ f"Downloaded 's3://{bucket_name}/{object_key}' to '{local_file_path}'"
105
+ )
106
  except Exception as e:
107
  print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)
108
+ else:
109
+ print(
110
+ "One or more required variables are empty, could not download from S3"
111
+ )
112
+
113
+
114
+ def download_files_from_s3(
115
+ bucket_name: str,
116
+ s3_folder: str,
117
+ local_folder: str,
118
+ filenames: List[str],
119
+ RUN_AWS_FUNCTIONS: str = RUN_AWS_FUNCTIONS,
120
+ ):
121
  """
122
  Download specific files from an S3 folder to a local folder.
123
  """
 
125
  if RUN_AWS_FUNCTIONS == "1":
126
  if bucket_name and s3_folder and local_folder and filenames:
127
 
128
+ s3 = boto3.client("s3", region_name=AWS_REGION)
129
 
130
  print("Trying to download file: ", filenames)
131
 
132
+ if filenames == "*":
133
  # List all objects in the S3 folder
134
  print("Trying to download all files in AWS folder: ", s3_folder)
135
  response = s3.list_objects_v2(Bucket=bucket_name, Prefix=s3_folder)
136
 
137
+ print("Found files in AWS folder: ", response.get("Contents", []))
138
 
139
+ filenames = [
140
+ obj["Key"].split("/")[-1] for obj in response.get("Contents", [])
141
+ ]
142
 
143
  print("Found filenames in AWS folder: ", filenames)
144
 
 
152
  # Download the object
153
  try:
154
  s3.download_file(bucket_name, object_key, local_file_path)
155
+ print(
156
+ f"Downloaded 's3://{bucket_name}/{object_key}' to '{local_file_path}'"
157
+ )
158
  except Exception as e:
159
  print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)
160
 
161
+ else:
162
+ print(
163
+ "One or more required variables are empty, could not download from S3"
164
+ )
165
 
166
+
167
+ def upload_file_to_s3(
168
+ local_file_paths: List[str],
169
+ s3_key: str,
170
+ s3_bucket: str = DOCUMENT_REDACTION_BUCKET,
171
+ RUN_AWS_FUNCTIONS: str = RUN_AWS_FUNCTIONS,
172
+ ):
173
  """
174
  Uploads a file from local machine to Amazon S3.
175
 
 
188
  try:
189
  if s3_bucket and s3_key and local_file_paths:
190
 
191
+ s3_client = boto3.client("s3", region_name=AWS_REGION)
192
 
193
  if isinstance(local_file_paths, str):
194
  local_file_paths = [local_file_paths]
195
 
196
  for file in local_file_paths:
197
  if s3_client:
198
+ # print(s3_client)
199
  try:
200
  # Get file name off file path
201
  file_name = os.path.basename(file)
 
204
  print("S3 key: ", s3_key_full)
205
 
206
  s3_client.upload_file(file, s3_bucket, s3_key_full)
207
+ out_message = (
208
+ "File " + file_name + " uploaded successfully!"
209
+ )
210
  print(out_message)
211
+
212
  except Exception as e:
213
  out_message = f"Error uploading file(s): {e}"
214
  print(out_message)
215
 
216
  final_out_message.append(out_message)
217
+ final_out_message_str = "\n".join(final_out_message)
218
+
219
+ else:
220
+ final_out_message_str = "Could not connect to AWS."
221
+ else:
222
+ final_out_message_str = (
223
+ "At least one essential variable is empty, could not upload to S3"
224
+ )
225
  except Exception as e:
226
  final_out_message_str = "Could not upload files to S3 due to: " + str(e)
227
  print(final_out_message_str)
228
  else:
229
+ final_out_message_str = "App config will not run AWS functions"
230
 
231
  return final_out_message_str
232
 
233
 
234
+ def upload_log_file_to_s3(
235
+ local_file_paths: List[str],
236
+ s3_key: str,
237
+ s3_bucket: str = DOCUMENT_REDACTION_BUCKET,
238
+ RUN_AWS_FUNCTIONS: str = RUN_AWS_FUNCTIONS,
239
+ SAVE_LOGS_TO_CSV: str = SAVE_LOGS_TO_CSV,
240
+ ):
241
  """
242
  Uploads a log file from local machine to Amazon S3.
243
 
 
256
  try:
257
  if s3_bucket and s3_key and local_file_paths:
258
 
259
+ s3_client = boto3.client("s3", region_name=AWS_REGION)
260
 
261
  if isinstance(local_file_paths, str):
262
  local_file_paths = [local_file_paths]
263
 
264
  for file in local_file_paths:
265
  if s3_client:
266
+ # print(s3_client)
267
  try:
268
  # Get file name off file path
269
  file_name = os.path.basename(file)
 
272
  print("S3 key: ", s3_key_full)
273
 
274
  s3_client.upload_file(file, s3_bucket, s3_key_full)
275
+ out_message = (
276
+ "File " + file_name + " uploaded successfully!"
277
+ )
278
  print(out_message)
279
+
280
  except Exception as e:
281
  out_message = f"Error uploading file(s): {e}"
282
  print(out_message)
283
 
284
  final_out_message.append(out_message)
285
+ final_out_message_str = "\n".join(final_out_message)
286
+
287
+ else:
288
+ final_out_message_str = "Could not connect to AWS."
289
+ else:
290
+ final_out_message_str = (
291
+ "At least one essential variable is empty, could not upload to S3"
292
+ )
293
  except Exception as e:
294
  final_out_message_str = "Could not upload files to S3 due to: " + str(e)
295
  print(final_out_message_str)
296
  else:
297
+ final_out_message_str = "App config will not run AWS functions"
298
  print(final_out_message_str)
299
 
300
  return final_out_message_str
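Note: a minimal usage sketch for the reformatted S3 helpers; the bucket, key and local paths below are placeholders, and each helper skips its S3 call unless RUN_AWS_FUNCTIONS is set to "1" in tools.config:

from tools.aws_functions import download_file_from_s3, upload_file_to_s3

# Pull a single object down to a local path (parent directories are created if needed).
download_file_from_s3(
    bucket_name="example-bucket",
    key="input/example_document.pdf",
    local_file_path_and_name="input/example_document.pdf",
)

# Push one or more local files up under an S3 key prefix; the helper returns a
# human-readable status message rather than raising on most failures.
status = upload_file_to_s3(
    local_file_paths=["output/example_document_redacted.pdf"],
    s3_key="output/",
)
print(status)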
tools/aws_textract.py CHANGED
@@ -1,27 +1,44 @@
1
- import boto3
2
- from typing import List
3
  import io
4
- import os
5
  import json
6
- import pikepdf
7
  import time
 
 
 
8
  import pandas as pd
9
- from tools.custom_image_analyser_engine import OCRResult, CustomImageRecognizerResult
10
- from tools.config import AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION, RUN_AWS_FUNCTIONS, PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS
 
 
 
 
 
 
 
 
 
11
 
12
- def extract_textract_metadata(response:object):
13
  """Extracts metadata from an AWS Textract response."""
14
 
15
- request_id = response['ResponseMetadata']['RequestId']
16
- pages = response['DocumentMetadata']['Pages']
17
 
18
- return str({
19
- 'RequestId': request_id,
20
- 'Pages': pages
21
- })
22
 
23
- def analyse_page_with_textract(pdf_page_bytes:object, page_no:int, client:str="", handwrite_signature_checkbox:List[str]=["Extract handwriting"], textract_output_found:bool=False, aws_access_key_textbox:str=AWS_ACCESS_KEY, aws_secret_key_textbox:str=AWS_SECRET_KEY, RUN_AWS_FUNCTIONS:str=RUN_AWS_FUNCTIONS, PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS:str=PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS):
24
- '''
 
 
 
 
 
 
 
 
 
 
 
25
  Analyzes a single page of a document using AWS Textract to extract text and other features.
26
 
27
  Args:
@@ -53,45 +70,63 @@ def analyse_page_with_textract(pdf_page_bytes:object, page_no:int, client:str=""
53
  Tuple[List[Dict], str]: A tuple containing:
54
  - A list of dictionaries, where each dictionary represents a Textract block (e.g., LINE, WORD, FORM, TABLE).
55
  - A string containing metadata about the Textract request.
56
- '''
57
 
58
- #print("handwrite_signature_checkbox in analyse_page_with_textract:", handwrite_signature_checkbox)
59
  if client == "":
60
- try:
61
  # Try to connect to AWS Textract Client if using that text extraction method
62
- if RUN_AWS_FUNCTIONS == "1" and PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS == "1":
 
 
 
63
  print("Connecting to Textract via existing SSO connection")
64
- client = boto3.client('textract', region_name=AWS_REGION)
65
  elif aws_access_key_textbox and aws_secret_key_textbox:
66
- print("Connecting to Textract using AWS access key and secret keys from user input.")
67
- client = boto3.client('textract',
68
- aws_access_key_id=aws_access_key_textbox,
69
- aws_secret_access_key=aws_secret_key_textbox, region_name=AWS_REGION)
 
 
 
 
 
70
  elif RUN_AWS_FUNCTIONS == "1":
71
  print("Connecting to Textract via existing SSO connection")
72
- client = boto3.client('textract', region_name=AWS_REGION)
73
  elif AWS_ACCESS_KEY and AWS_SECRET_KEY:
74
  print("Getting Textract credentials from environment variables.")
75
- client = boto3.client('textract',
76
- aws_access_key_id=AWS_ACCESS_KEY,
77
- aws_secret_access_key=AWS_SECRET_KEY, region_name=AWS_REGION)
78
- elif textract_output_found==True:
79
- print("Existing Textract data found for file, no need to connect to AWS Textract")
80
- client = boto3.client('textract', region_name=AWS_REGION)
 
 
 
 
 
81
  else:
82
  client = ""
83
  out_message = "Cannot connect to AWS Textract service."
84
  print(out_message)
85
  raise Exception(out_message)
86
- except:
87
  out_message = "Cannot connect to AWS Textract"
88
- print(out_message)
89
  raise Exception(out_message)
90
  return [], "" # Return an empty list and an empty string
91
-
92
  # Redact signatures if specified
93
  feature_types = list()
94
- if "Extract signatures" in handwrite_signature_checkbox or "Extract forms" in handwrite_signature_checkbox or "Extract layout" in handwrite_signature_checkbox or "Extract tables" in handwrite_signature_checkbox:
 
 
 
 
 
95
  if "Extract signatures" in handwrite_signature_checkbox:
96
  feature_types.append("SIGNATURES")
97
  if "Extract forms" in handwrite_signature_checkbox:
@@ -101,39 +136,50 @@ def analyse_page_with_textract(pdf_page_bytes:object, page_no:int, client:str=""
101
  if "Extract tables" in handwrite_signature_checkbox:
102
  feature_types.append("TABLES")
103
  try:
104
- response = client.analyze_document(Document={'Bytes': pdf_page_bytes}, FeatureTypes=feature_types)
 
 
105
  except Exception as e:
106
  print("Textract call failed due to:", e, "trying again in 3 seconds.")
107
  time.sleep(3)
108
- response = client.analyze_document(Document={'Bytes': pdf_page_bytes}, FeatureTypes=feature_types)
109
-
110
-
111
- if not "Extract signatures" in handwrite_signature_checkbox and not "Extract forms" in handwrite_signature_checkbox and not "Extract layout" in handwrite_signature_checkbox and not "Extract tables" in handwrite_signature_checkbox:
112
  # Call detect_document_text to extract plain text
113
  try:
114
- response = client.detect_document_text(Document={'Bytes': pdf_page_bytes})
115
  except Exception as e:
116
  print("Textract call failed due to:", e, "trying again in 5 seconds.")
117
  time.sleep(5)
118
- response = client.detect_document_text(Document={'Bytes': pdf_page_bytes})
119
 
120
- # Add the 'Page' attribute to each block
121
  if "Blocks" in response:
122
  for block in response["Blocks"]:
123
  block["Page"] = page_no # Inject the page number into each block
124
 
125
  # Wrap the response with the page number in the desired format
126
- wrapped_response = {
127
- 'page_no': page_no,
128
- 'data': response
129
- }
130
 
131
- request_metadata = extract_textract_metadata(response) # Metadata comes out as a string
 
 
132
 
133
  # Return a list containing the wrapped response and the metadata
134
- return wrapped_response, request_metadata # Return as a list to match the desired structure
 
136
- def convert_pike_pdf_page_to_bytes(pdf:object, page_num:int):
137
  # Create a new empty PDF
138
  new_pdf = pikepdf.Pdf.new()
139
 
@@ -155,8 +201,11 @@ def convert_pike_pdf_page_to_bytes(pdf:object, page_num:int):
155
 
156
  return pdf_bytes
157
 
158
- def json_to_ocrresult(json_data:dict, page_width:float, page_height:float, page_no:int):
159
- '''
160
  Convert the json response from Textract to the OCRResult format used elsewhere in the code.
161
  Looks for lines, words, and signatures. Handwriting and signatures are set aside especially
162
  for later in case the user wants to override the default behaviour and redact all
@@ -175,7 +224,7 @@ def json_to_ocrresult(json_data:dict, page_width:float, page_height:float, page_
175
  - list: Bounding boxes identified specifically as signatures.
176
  - list: Bounding boxes identified specifically as handwriting.
177
  - dict: OCR results with word-level detail, structured for further processing.
178
- '''
179
  all_ocr_results = list()
180
  signature_or_handwriting_recogniser_results = list()
181
  signature_recogniser_results = list()
@@ -183,14 +232,14 @@ def json_to_ocrresult(json_data:dict, page_width:float, page_height:float, page_
183
  signatures = list()
184
  handwriting = list()
185
  ocr_results_with_words = dict()
186
- text_block=dict()
187
 
188
  text_line_number = 1
189
 
190
  # Assuming json_data is structured as a dictionary with a "pages" key
191
 
192
  # Find the specific page data
193
- page_json_data = json_data #next((page for page in json_data["pages"] if page["page_no"] == page_no), None)
194
 
195
  if "Blocks" in page_json_data:
196
  # Access the data for the specific page
@@ -198,14 +247,17 @@ def json_to_ocrresult(json_data:dict, page_width:float, page_height:float, page_
198
  # This is a new page
199
  elif "page_no" in page_json_data:
200
  text_blocks = page_json_data["data"]["Blocks"]
201
- else: text_blocks = []
 
202
 
203
  is_signature = False
204
  is_handwriting = False
205
 
206
- for text_block in text_blocks:
207
-
208
- if (text_block['BlockType'] == 'LINE') | (text_block['BlockType'] == 'SIGNATURE'): # (text_block['BlockType'] == 'WORD') |
 
 
209
 
210
  # Extract text and bounding box for the line
211
  line_bbox = text_block["Geometry"]["BoundingBox"]
@@ -217,26 +269,41 @@ def json_to_ocrresult(json_data:dict, page_width:float, page_height:float, page_
217
  width_abs = int(line_bbox["Width"] * page_width)
218
  height_abs = int(line_bbox["Height"] * page_height)
219
 
220
- if text_block['BlockType'] == 'LINE':
221
-
222
  # Extract text and bounding box for the line
223
- line_text = text_block.get('Text', '')
224
  words = []
225
- current_line_handwriting_results = [] # Track handwriting results for this line
226
-
227
- if 'Relationships' in text_block:
228
- for relationship in text_block['Relationships']:
229
- if relationship['Type'] == 'CHILD':
230
- for child_id in relationship['Ids']:
231
- child_block = next((block for block in text_blocks if block['Id'] == child_id), None)
232
- if child_block and child_block['BlockType'] == 'WORD':
233
- word_text = child_block.get('Text', '')
234
  word_bbox = child_block["Geometry"]["BoundingBox"]
235
- confidence = child_block.get('Confidence','')
236
  word_left = int(word_bbox["Left"] * page_width)
237
  word_top = int(word_bbox["Top"] * page_height)
238
- word_right = int((word_bbox["Left"] + word_bbox["Width"]) * page_width)
239
- word_bottom = int((word_bbox["Top"] + word_bbox["Height"]) * page_height)
240
 
241
  # Extract BoundingBox details
242
  word_width = word_bbox["Width"]
@@ -245,13 +312,20 @@ def json_to_ocrresult(json_data:dict, page_width:float, page_height:float, page_
245
  # Convert proportional coordinates to absolute coordinates
246
  word_width_abs = int(word_width * page_width)
247
  word_height_abs = int(word_height * page_height)
248
-
249
- words.append({
250
- 'text': word_text,
251
- 'bounding_box': (word_left, word_top, word_right, word_bottom)
252
- })
253
  # Check for handwriting
254
- text_type = child_block.get("TextType", '')
255
 
256
  if text_type == "HANDWRITING":
257
  is_handwriting = True
@@ -267,22 +341,28 @@ def json_to_ocrresult(json_data:dict, page_width:float, page_height:float, page_
267
  left=word_left,
268
  top=word_top,
269
  width=word_width_abs,
270
- height=word_height_abs
271
  )
272
 
273
  # Add to handwriting collections immediately
274
  handwriting.append(recogniser_result)
275
- handwriting_recogniser_results.append(recogniser_result)
276
- signature_or_handwriting_recogniser_results.append(recogniser_result)
277
- current_line_handwriting_results.append(recogniser_result)
278
 
279
- # If handwriting or signature, add to bounding box
280
 
281
- elif (text_block['BlockType'] == 'SIGNATURE'):
282
  line_text = "SIGNATURE"
283
  is_signature = True
284
  entity_name = "SIGNATURE"
285
- confidence = text_block.get('Confidence', 0)
286
  word_end = len(line_text)
287
 
288
  recogniser_result = CustomImageRecognizerResult(
@@ -294,7 +374,7 @@ def json_to_ocrresult(json_data:dict, page_width:float, page_height:float, page_
294
  left=line_left,
295
  top=line_top,
296
  width=width_abs,
297
- height=height_abs
298
  )
299
 
300
  # Add to signature collections immediately
@@ -302,13 +382,15 @@ def json_to_ocrresult(json_data:dict, page_width:float, page_height:float, page_
302
  signature_recogniser_results.append(recogniser_result)
303
  signature_or_handwriting_recogniser_results.append(recogniser_result)
304
 
305
- words = [{
306
- 'text': line_text,
307
- 'bounding_box': (line_left, line_top, line_right, line_bottom)
308
- }]
 
 
309
  else:
310
  line_text = ""
311
- words=[]
312
  line_left = 0
313
  line_top = 0
314
  line_right = 0
@@ -320,14 +402,22 @@ def json_to_ocrresult(json_data:dict, page_width:float, page_height:float, page_
320
 
321
  ocr_results_with_words["text_line_" + str(text_line_number)] = {
322
  "line": text_line_number,
323
- 'text': line_text,
324
- 'bounding_box': (line_left, line_top, line_right, line_bottom),
325
- 'words': words,
326
- 'page': page_no
327
  }
328
 
329
  # Create OCRResult with absolute coordinates
330
- ocr_result = OCRResult(line_text, line_left, line_top, width_abs, height_abs, conf=confidence, line=text_line_number)
331
  all_ocr_results.append(ocr_result)
332
 
333
  # Increase line number
@@ -337,35 +427,50 @@ def json_to_ocrresult(json_data:dict, page_width:float, page_height:float, page_
337
 
338
  # If it is signature or handwriting, will overwrite the default behaviour of the PII analyser
339
  if is_signature_or_handwriting:
340
- if recogniser_result not in signature_or_handwriting_recogniser_results:
341
  signature_or_handwriting_recogniser_results.append(recogniser_result)
342
 
343
  if is_signature:
344
- if recogniser_result not in signature_recogniser_results:
345
  signature_recogniser_results.append(recogniser_result)
346
 
347
- if is_handwriting:
348
- if recogniser_result not in handwriting_recogniser_results:
349
  handwriting_recogniser_results.append(recogniser_result)
350
 
351
-
352
-
353
  # Add page key to the line level results
354
  all_ocr_results_with_page = {"page": page_no, "results": all_ocr_results}
355
- ocr_results_with_words_with_page = {"page": page_no, "results": ocr_results_with_words}
356
 
357
- return all_ocr_results_with_page, signature_or_handwriting_recogniser_results, signature_recogniser_results, handwriting_recogniser_results, ocr_results_with_words_with_page
358
 
359
- def load_and_convert_textract_json(textract_json_file_path:str, log_files_output_paths:str, page_sizes_df:pd.DataFrame):
360
  """
361
  Loads Textract JSON from a file, detects if conversion is needed, and converts if necessary.
362
  """
363
-
364
  if not os.path.exists(textract_json_file_path):
365
  print("No existing Textract results file found.")
366
- return {}, True, log_files_output_paths # Return empty dict and flag indicating missing file
367
-
368
- no_textract_file = False
369
  print("Found existing Textract json results file.")
370
 
371
  # Track log files
@@ -373,7 +478,7 @@ def load_and_convert_textract_json(textract_json_file_path:str, log_files_output
373
  log_files_output_paths.append(textract_json_file_path)
374
 
375
  try:
376
- with open(textract_json_file_path, 'r', encoding='utf-8') as json_file:
377
  textract_data = json.load(json_file)
378
  except json.JSONDecodeError:
379
  print("Error: Failed to parse Textract JSON file. Returning empty data.")
@@ -387,21 +492,30 @@ def load_and_convert_textract_json(textract_json_file_path:str, log_files_output
387
  if "Blocks" in textract_data:
388
  print("Need to convert Textract JSON to app format.")
389
  try:
390
-
391
  textract_data = restructure_textract_output(textract_data, page_sizes_df)
392
- return textract_data, False, log_files_output_paths # Successfully converted
393
-
394
  except Exception as e:
395
  print("Failed to convert JSON data to app format due to:", e)
396
  return {}, True, log_files_output_paths # Conversion failed
397
  else:
398
  print("Invalid Textract JSON format: 'Blocks' missing.")
399
- #print("textract data:", textract_data)
400
- return {}, True, log_files_output_paths # Return empty data if JSON is not recognized
401
 
402
- def restructure_textract_output(textract_output: dict, page_sizes_df:pd.DataFrame):
 
403
  """
404
- Reorganise Textract output from the bulk Textract analysis option on AWS
405
  into a format that works in this redaction app, reducing size.
406
  """
407
  pages_dict = {}
@@ -410,8 +524,8 @@ def restructure_textract_output(textract_output: dict, page_sizes_df:pd.DataFram
410
  document_metadata = textract_output.get("DocumentMetadata", {})
411
 
412
  # For efficient lookup, set 'page' as index if it's not already
413
- if 'page' in page_sizes_df.columns:
414
- page_sizes_df = page_sizes_df.set_index('page')
415
 
416
  for block in textract_output.get("Blocks", []):
417
  page_no = block.get("Page", 1) # Default to 1 if missing
@@ -419,29 +533,30 @@ def restructure_textract_output(textract_output: dict, page_sizes_df:pd.DataFram
419
  # --- Geometry Conversion Logic ---
420
  try:
421
  page_info = page_sizes_df.loc[page_no]
422
- cb_width = page_info['cropbox_width']
423
- cb_height = page_info['cropbox_height']
424
- mb_width = page_info['mediabox_width']
425
- mb_height = page_info['mediabox_height']
426
- cb_x_offset = page_info['cropbox_x_offset']
427
- cb_y_offset_top = page_info['cropbox_y_offset_from_top']
428
 
429
  # Check if conversion is needed (and avoid division by zero)
430
  needs_conversion = (
431
- abs(cb_width - mb_width) > 1e-6 or \
432
- abs(cb_height - mb_height) > 1e-6
433
- ) and mb_width > 1e-6 and mb_height > 1e-6 # Avoid division by zero
 
434
 
435
- if needs_conversion and 'Geometry' in block:
436
- geometry = block['Geometry'] # Work directly on the block's geometry
437
 
438
  # --- Convert BoundingBox ---
439
- if 'BoundingBox' in geometry:
440
- bbox = geometry['BoundingBox']
441
- old_left = bbox['Left']
442
- old_top = bbox['Top']
443
- old_width = bbox['Width']
444
- old_height = bbox['Height']
445
 
446
  # Calculate absolute coordinates within CropBox
447
  abs_cb_x = old_left * cb_width
@@ -454,15 +569,19 @@ def restructure_textract_output(textract_output: dict, page_sizes_df:pd.DataFram
454
  abs_mb_y = cb_y_offset_top + abs_cb_y
455
 
456
  # Convert back to normalized coordinates relative to MediaBox
457
- bbox['Left'] = abs_mb_x / mb_width
458
- bbox['Top'] = abs_mb_y / mb_height
459
- bbox['Width'] = abs_cb_width / mb_width
460
- bbox['Height'] = abs_cb_height / mb_height
461
  except KeyError:
462
- print(f"Warning: Page number {page_no} not found in page_sizes_df. Skipping coordinate conversion for this block.")
 
 
463
  # Decide how to handle missing page info: skip conversion, raise error, etc.
464
  except ZeroDivisionError:
465
- print(f"Warning: MediaBox width or height is zero for page {page_no}. Skipping coordinate conversion for this block.")
 
 
466
 
467
  # Initialise page structure if not already present
468
  if page_no not in pages_dict:
@@ -470,16 +589,25 @@ def restructure_textract_output(textract_output: dict, page_sizes_df:pd.DataFram
470
 
471
  # Keep only essential fields to reduce size
472
  filtered_block = {
473
- key: block[key] for key in ["BlockType", "Confidence", "Text", "Geometry", "Page", "Id", "Relationships"]
474
  if key in block
475
  }
476
-
477
  pages_dict[page_no]["data"]["Blocks"].append(filtered_block)
478
 
479
  # Convert pages dictionary to a sorted list
480
  structured_output = {
481
  "DocumentMetadata": document_metadata, # Store metadata separately
482
- "pages": [pages_dict[page] for page in sorted(pages_dict.keys())]
483
  }
484
 
485
  return structured_output
 
 
 
1
  import io
 
2
  import json
3
+ import os
4
  import time
5
+ from typing import List
6
+
7
+ import boto3
8
  import pandas as pd
9
+ import pikepdf
10
+
11
+ from tools.config import (
12
+ AWS_ACCESS_KEY,
13
+ AWS_REGION,
14
+ AWS_SECRET_KEY,
15
+ PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS,
16
+ RUN_AWS_FUNCTIONS,
17
+ )
18
+ from tools.custom_image_analyser_engine import CustomImageRecognizerResult, OCRResult
19
+
20
 
21
+ def extract_textract_metadata(response: object):
22
  """Extracts metadata from an AWS Textract response."""
23
 
24
+ request_id = response["ResponseMetadata"]["RequestId"]
25
+ pages = response["DocumentMetadata"]["Pages"]
26
 
27
+ return str({"RequestId": request_id, "Pages": pages})
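For reference, a minimal sketch of the metadata string this helper produces; the response dict below is a made-up stand-in for a real Textract response:

fake_response = {
    "ResponseMetadata": {"RequestId": "example-request-id"},
    "DocumentMetadata": {"Pages": 1},
}
print(extract_textract_metadata(fake_response))
# prints "{'RequestId': 'example-request-id', 'Pages': 1}"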
28
 
29
+
30
+ def analyse_page_with_textract(
31
+ pdf_page_bytes: object,
32
+ page_no: int,
33
+ client: str = "",
34
+ handwrite_signature_checkbox: List[str] = ["Extract handwriting"],
35
+ textract_output_found: bool = False,
36
+ aws_access_key_textbox: str = AWS_ACCESS_KEY,
37
+ aws_secret_key_textbox: str = AWS_SECRET_KEY,
38
+ RUN_AWS_FUNCTIONS: str = RUN_AWS_FUNCTIONS,
39
+ PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS: str = PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS,
40
+ ):
41
+ """
42
  Analyzes a single page of a document using AWS Textract to extract text and other features.
43
 
44
  Args:
 
70
  Tuple[List[Dict], str]: A tuple containing:
71
  - A list of dictionaries, where each dictionary represents a Textract block (e.g., LINE, WORD, FORM, TABLE).
72
  - A string containing metadata about the Textract request.
73
+ """
74
 
75
+ # print("handwrite_signature_checkbox in analyse_page_with_textract:", handwrite_signature_checkbox)
76
  if client == "":
77
+ try:
78
  # Try to connect to AWS Textract Client if using that text extraction method
79
+ if (
80
+ RUN_AWS_FUNCTIONS == "1"
81
+ and PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS == "1"
82
+ ):
83
  print("Connecting to Textract via existing SSO connection")
84
+ client = boto3.client("textract", region_name=AWS_REGION)
85
  elif aws_access_key_textbox and aws_secret_key_textbox:
86
+ print(
87
+ "Connecting to Textract using AWS access key and secret keys from user input."
88
+ )
89
+ client = boto3.client(
90
+ "textract",
91
+ aws_access_key_id=aws_access_key_textbox,
92
+ aws_secret_access_key=aws_secret_key_textbox,
93
+ region_name=AWS_REGION,
94
+ )
95
  elif RUN_AWS_FUNCTIONS == "1":
96
  print("Connecting to Textract via existing SSO connection")
97
+ client = boto3.client("textract", region_name=AWS_REGION)
98
  elif AWS_ACCESS_KEY and AWS_SECRET_KEY:
99
  print("Getting Textract credentials from environment variables.")
100
+ client = boto3.client(
101
+ "textract",
102
+ aws_access_key_id=AWS_ACCESS_KEY,
103
+ aws_secret_access_key=AWS_SECRET_KEY,
104
+ region_name=AWS_REGION,
105
+ )
106
+ elif textract_output_found is True:
107
+ print(
108
+ "Existing Textract data found for file, no need to connect to AWS Textract"
109
+ )
110
+ client = boto3.client("textract", region_name=AWS_REGION)
111
  else:
112
  client = ""
113
  out_message = "Cannot connect to AWS Textract service."
114
  print(out_message)
115
  raise Exception(out_message)
116
+ except Exception as e:
117
  out_message = "Cannot connect to AWS Textract"
118
+ print(out_message, "due to:", e)
119
  raise Exception(out_message)
120
  return [], "" # Return an empty list and an empty string
121
+
122
  # Redact signatures if specified
123
  feature_types = list()
124
+ if (
125
+ "Extract signatures" in handwrite_signature_checkbox
126
+ or "Extract forms" in handwrite_signature_checkbox
127
+ or "Extract layout" in handwrite_signature_checkbox
128
+ or "Extract tables" in handwrite_signature_checkbox
129
+ ):
130
  if "Extract signatures" in handwrite_signature_checkbox:
131
  feature_types.append("SIGNATURES")
132
  if "Extract forms" in handwrite_signature_checkbox:
 
136
  if "Extract tables" in handwrite_signature_checkbox:
137
  feature_types.append("TABLES")
138
  try:
139
+ response = client.analyze_document(
140
+ Document={"Bytes": pdf_page_bytes}, FeatureTypes=feature_types
141
+ )
142
  except Exception as e:
143
  print("Textract call failed due to:", e, "trying again in 3 seconds.")
144
  time.sleep(3)
145
+ response = client.analyze_document(
146
+ Document={"Bytes": pdf_page_bytes}, FeatureTypes=feature_types
147
+ )
148
+
149
+ if (
150
+ "Extract signatures" not in handwrite_signature_checkbox
151
+ and "Extract forms" not in handwrite_signature_checkbox
152
+ and "Extract layout" not in handwrite_signature_checkbox
153
+ and "Extract tables" not in handwrite_signature_checkbox
154
+ ):
155
  # Call detect_document_text to extract plain text
156
  try:
157
+ response = client.detect_document_text(Document={"Bytes": pdf_page_bytes})
158
  except Exception as e:
159
  print("Textract call failed due to:", e, "trying again in 5 seconds.")
160
  time.sleep(5)
161
+ response = client.detect_document_text(Document={"Bytes": pdf_page_bytes})
162
 
163
+ # Add the 'Page' attribute to each block
164
  if "Blocks" in response:
165
  for block in response["Blocks"]:
166
  block["Page"] = page_no # Inject the page number into each block
167
 
168
  # Wrap the response with the page number in the desired format
169
+ wrapped_response = {"page_no": page_no, "data": response}
170
 
171
+ request_metadata = extract_textract_metadata(
172
+ response
173
+ ) # Metadata comes out as a string
174
 
175
  # Return a list containing the wrapped response and the metadata
176
+ return (
177
+ wrapped_response,
178
+ request_metadata,
179
+ ) # Return as a list to match the desired structure
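A hedged usage sketch for the function above; `page_bytes` and the checkbox values are illustrative, and a real call needs working AWS Textract credentials:

# page_bytes would typically come from convert_pike_pdf_page_to_bytes(pdf, 0)
wrapped_response, request_metadata = analyse_page_with_textract(
    page_bytes,
    page_no=1,
    handwrite_signature_checkbox=["Extract handwriting", "Extract signatures"],
)
blocks = wrapped_response["data"]["Blocks"]  # LINE/WORD/SIGNATURE blocks, each tagged with "Page"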
180
+
181
 
182
+ def convert_pike_pdf_page_to_bytes(pdf: object, page_num: int):
183
  # Create a new empty PDF
184
  new_pdf = pikepdf.Pdf.new()
185
 
 
201
 
202
  return pdf_bytes
203
 
204
+
205
+ def json_to_ocrresult(
206
+ json_data: dict, page_width: float, page_height: float, page_no: int
207
+ ):
208
+ """
209
  Convert the json response from Textract to the OCRResult format used elsewhere in the code.
210
  Looks for lines, words, and signatures. Handwriting and signatures are set aside especially
211
  for later in case the user wants to override the default behaviour and redact all
 
224
  - list: Bounding boxes identified specifically as signatures.
225
  - list: Bounding boxes identified specifically as handwriting.
226
  - dict: OCR results with word-level detail, structured for further processing.
227
+ """
228
  all_ocr_results = list()
229
  signature_or_handwriting_recogniser_results = list()
230
  signature_recogniser_results = list()
 
232
  signatures = list()
233
  handwriting = list()
234
  ocr_results_with_words = dict()
235
+ text_block = dict()
236
 
237
  text_line_number = 1
238
 
239
  # Assuming json_data is structured as a dictionary with a "pages" key
240
 
241
  # Find the specific page data
242
+ page_json_data = json_data # next((page for page in json_data["pages"] if page["page_no"] == page_no), None)
243
 
244
  if "Blocks" in page_json_data:
245
  # Access the data for the specific page
 
247
  # This is a new page
248
  elif "page_no" in page_json_data:
249
  text_blocks = page_json_data["data"]["Blocks"]
250
+ else:
251
+ text_blocks = []
252
 
253
  is_signature = False
254
  is_handwriting = False
255
 
256
+ for text_block in text_blocks:
257
+
258
+ if (text_block["BlockType"] == "LINE") | (
259
+ text_block["BlockType"] == "SIGNATURE"
260
+ ): # (text_block['BlockType'] == 'WORD') |
261
 
262
  # Extract text and bounding box for the line
263
  line_bbox = text_block["Geometry"]["BoundingBox"]
 
269
  width_abs = int(line_bbox["Width"] * page_width)
270
  height_abs = int(line_bbox["Height"] * page_height)
271
 
272
+ if text_block["BlockType"] == "LINE":
273
+
274
  # Extract text and bounding box for the line
275
+ line_text = text_block.get("Text", "")
276
  words = []
277
+ current_line_handwriting_results = (
278
+ []
279
+ ) # Track handwriting results for this line
280
+
281
+ if "Relationships" in text_block:
282
+ for relationship in text_block["Relationships"]:
283
+ if relationship["Type"] == "CHILD":
284
+ for child_id in relationship["Ids"]:
285
+ child_block = next(
286
+ (
287
+ block
288
+ for block in text_blocks
289
+ if block["Id"] == child_id
290
+ ),
291
+ None,
292
+ )
293
+ if child_block and child_block["BlockType"] == "WORD":
294
+ word_text = child_block.get("Text", "")
295
  word_bbox = child_block["Geometry"]["BoundingBox"]
296
+ confidence = child_block.get("Confidence", "")
297
  word_left = int(word_bbox["Left"] * page_width)
298
  word_top = int(word_bbox["Top"] * page_height)
299
+ word_right = int(
300
+ (word_bbox["Left"] + word_bbox["Width"])
301
+ * page_width
302
+ )
303
+ word_bottom = int(
304
+ (word_bbox["Top"] + word_bbox["Height"])
305
+ * page_height
306
+ )
307
 
308
  # Extract BoundingBox details
309
  word_width = word_bbox["Width"]
 
312
  # Convert proportional coordinates to absolute coordinates
313
  word_width_abs = int(word_width * page_width)
314
  word_height_abs = int(word_height * page_height)
315
+
316
+ words.append(
317
+ {
318
+ "text": word_text,
319
+ "bounding_box": (
320
+ word_left,
321
+ word_top,
322
+ word_right,
323
+ word_bottom,
324
+ ),
325
+ }
326
+ )
327
  # Check for handwriting
328
+ text_type = child_block.get("TextType", "")
329
 
330
  if text_type == "HANDWRITING":
331
  is_handwriting = True
 
341
  left=word_left,
342
  top=word_top,
343
  width=word_width_abs,
344
+ height=word_height_abs,
345
  )
346
 
347
  # Add to handwriting collections immediately
348
  handwriting.append(recogniser_result)
349
+ handwriting_recogniser_results.append(
350
+ recogniser_result
351
+ )
352
+ signature_or_handwriting_recogniser_results.append(
353
+ recogniser_result
354
+ )
355
+ current_line_handwriting_results.append(
356
+ recogniser_result
357
+ )
358
 
359
+ # If handwriting or signature, add to bounding box
360
 
361
+ elif text_block["BlockType"] == "SIGNATURE":
362
  line_text = "SIGNATURE"
363
  is_signature = True
364
  entity_name = "SIGNATURE"
365
+ confidence = text_block.get("Confidence", 0)
366
  word_end = len(line_text)
367
 
368
  recogniser_result = CustomImageRecognizerResult(
 
374
  left=line_left,
375
  top=line_top,
376
  width=width_abs,
377
+ height=height_abs,
378
  )
379
 
380
  # Add to signature collections immediately
 
382
  signature_recogniser_results.append(recogniser_result)
383
  signature_or_handwriting_recogniser_results.append(recogniser_result)
384
 
385
+ words = [
386
+ {
387
+ "text": line_text,
388
+ "bounding_box": (line_left, line_top, line_right, line_bottom),
389
+ }
390
+ ]
391
  else:
392
  line_text = ""
393
+ words = []
394
  line_left = 0
395
  line_top = 0
396
  line_right = 0
 
402
 
403
  ocr_results_with_words["text_line_" + str(text_line_number)] = {
404
  "line": text_line_number,
405
+ "text": line_text,
406
+ "bounding_box": (line_left, line_top, line_right, line_bottom),
407
+ "words": words,
408
+ "page": page_no,
409
  }
410
 
411
  # Create OCRResult with absolute coordinates
412
+ ocr_result = OCRResult(
413
+ line_text,
414
+ line_left,
415
+ line_top,
416
+ width_abs,
417
+ height_abs,
418
+ conf=confidence,
419
+ line=text_line_number,
420
+ )
421
  all_ocr_results.append(ocr_result)
422
 
423
  # Increase line number
 
427
 
428
  # If it is signature or handwriting, will overwrite the default behaviour of the PII analyser
429
  if is_signature_or_handwriting:
430
+ if recogniser_result not in signature_or_handwriting_recogniser_results:
431
  signature_or_handwriting_recogniser_results.append(recogniser_result)
432
 
433
  if is_signature:
434
+ if recogniser_result not in signature_recogniser_results:
435
  signature_recogniser_results.append(recogniser_result)
436
 
437
+ if is_handwriting:
438
+ if recogniser_result not in handwriting_recogniser_results:
439
  handwriting_recogniser_results.append(recogniser_result)
440
 
 
 
441
  # Add page key to the line level results
442
  all_ocr_results_with_page = {"page": page_no, "results": all_ocr_results}
443
+ ocr_results_with_words_with_page = {
444
+ "page": page_no,
445
+ "results": ocr_results_with_words,
446
+ }
447
+
448
+ return (
449
+ all_ocr_results_with_page,
450
+ signature_or_handwriting_recogniser_results,
451
+ signature_recogniser_results,
452
+ handwriting_recogniser_results,
453
+ ocr_results_with_words_with_page,
454
+ )
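Following on from the call sketch above, the five return values are usually unpacked like this (the variable names and the A4-point page size are illustrative):

(
    line_level_ocr,
    signature_or_handwriting_boxes,
    signature_boxes,
    handwriting_boxes,
    word_level_ocr,
) = json_to_ocrresult(wrapped_response["data"], page_width=595, page_height=842, page_no=1)
# word_level_ocr["results"]["text_line_1"] holds the line text, its bounding box,
# the per-word boxes, and the page number.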
455
 
 
456
 
457
+ def load_and_convert_textract_json(
458
+ textract_json_file_path: str,
459
+ log_files_output_paths: str,
460
+ page_sizes_df: pd.DataFrame,
461
+ ):
462
  """
463
  Loads Textract JSON from a file, detects if conversion is needed, and converts if necessary.
464
  """
465
+
466
  if not os.path.exists(textract_json_file_path):
467
  print("No existing Textract results file found.")
468
+ return (
469
+ {},
470
+ True,
471
+ log_files_output_paths,
472
+ ) # Return empty dict and flag indicating missing file
473
+
474
  print("Found existing Textract json results file.")
475
 
476
  # Track log files
 
478
  log_files_output_paths.append(textract_json_file_path)
479
 
480
  try:
481
+ with open(textract_json_file_path, "r", encoding="utf-8") as json_file:
482
  textract_data = json.load(json_file)
483
  except json.JSONDecodeError:
484
  print("Error: Failed to parse Textract JSON file. Returning empty data.")
 
492
  if "Blocks" in textract_data:
493
  print("Need to convert Textract JSON to app format.")
494
  try:
495
+
496
  textract_data = restructure_textract_output(textract_data, page_sizes_df)
497
+ return (
498
+ textract_data,
499
+ False,
500
+ log_files_output_paths,
501
+ ) # Successfully converted
502
+
503
  except Exception as e:
504
  print("Failed to convert JSON data to app format due to:", e)
505
  return {}, True, log_files_output_paths # Conversion failed
506
  else:
507
  print("Invalid Textract JSON format: 'Blocks' missing.")
508
+ # print("textract data:", textract_data)
509
+ return (
510
+ {},
511
+ True,
512
+ log_files_output_paths,
513
+ ) # Return empty data if JSON is not recognized
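An illustrative call for the loader above; the JSON path and `page_sizes_df` are assumptions for the sketch:

textract_data, file_missing_or_invalid, log_files_output_paths = load_and_convert_textract_json(
    "output/example_textract.json", [], page_sizes_df
)
if not file_missing_or_invalid:
    # Data is now in the page-wise app format produced by restructure_textract_output
    first_page_blocks = textract_data["pages"][0]["data"]["Blocks"]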
514
 
515
+
516
+ def restructure_textract_output(textract_output: dict, page_sizes_df: pd.DataFrame):
517
  """
518
+ Reorganise Textract output from the bulk Textract analysis option on AWS
519
  into a format that works in this redaction app, reducing size.
520
  """
521
  pages_dict = {}
 
524
  document_metadata = textract_output.get("DocumentMetadata", {})
525
 
526
  # For efficient lookup, set 'page' as index if it's not already
527
+ if "page" in page_sizes_df.columns:
528
+ page_sizes_df = page_sizes_df.set_index("page")
529
 
530
  for block in textract_output.get("Blocks", []):
531
  page_no = block.get("Page", 1) # Default to 1 if missing
 
533
  # --- Geometry Conversion Logic ---
534
  try:
535
  page_info = page_sizes_df.loc[page_no]
536
+ cb_width = page_info["cropbox_width"]
537
+ cb_height = page_info["cropbox_height"]
538
+ mb_width = page_info["mediabox_width"]
539
+ mb_height = page_info["mediabox_height"]
540
+ cb_x_offset = page_info["cropbox_x_offset"]
541
+ cb_y_offset_top = page_info["cropbox_y_offset_from_top"]
542
 
543
  # Check if conversion is needed (and avoid division by zero)
544
  needs_conversion = (
545
+ (abs(cb_width - mb_width) > 1e-6 or abs(cb_height - mb_height) > 1e-6)
546
+ and mb_width > 1e-6
547
+ and mb_height > 1e-6
548
+ ) # Avoid division by zero
549
 
550
+ if needs_conversion and "Geometry" in block:
551
+ geometry = block["Geometry"] # Work directly on the block's geometry
552
 
553
  # --- Convert BoundingBox ---
554
+ if "BoundingBox" in geometry:
555
+ bbox = geometry["BoundingBox"]
556
+ old_left = bbox["Left"]
557
+ old_top = bbox["Top"]
558
+ old_width = bbox["Width"]
559
+ old_height = bbox["Height"]
560
 
561
  # Calculate absolute coordinates within CropBox
562
  abs_cb_x = old_left * cb_width
 
569
  abs_mb_y = cb_y_offset_top + abs_cb_y
570
 
571
  # Convert back to normalized coordinates relative to MediaBox
572
+ bbox["Left"] = abs_mb_x / mb_width
573
+ bbox["Top"] = abs_mb_y / mb_height
574
+ bbox["Width"] = abs_cb_width / mb_width
575
+ bbox["Height"] = abs_cb_height / mb_height
576
  except KeyError:
577
+ print(
578
+ f"Warning: Page number {page_no} not found in page_sizes_df. Skipping coordinate conversion for this block."
579
+ )
580
  # Decide how to handle missing page info: skip conversion, raise error, etc.
581
  except ZeroDivisionError:
582
+ print(
583
+ f"Warning: MediaBox width or height is zero for page {page_no}. Skipping coordinate conversion for this block."
584
+ )
585
 
586
  # Initialise page structure if not already present
587
  if page_no not in pages_dict:
 
589
 
590
  # Keep only essential fields to reduce size
591
  filtered_block = {
592
+ key: block[key]
593
+ for key in [
594
+ "BlockType",
595
+ "Confidence",
596
+ "Text",
597
+ "Geometry",
598
+ "Page",
599
+ "Id",
600
+ "Relationships",
601
+ ]
602
  if key in block
603
  }
604
+
605
  pages_dict[page_no]["data"]["Blocks"].append(filtered_block)
606
 
607
  # Convert pages dictionary to a sorted list
608
  structured_output = {
609
  "DocumentMetadata": document_metadata, # Store metadata separately
610
+ "pages": [pages_dict[page] for page in sorted(pages_dict.keys())],
611
  }
612
 
613
  return structured_output
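To make the CropBox-to-MediaBox geometry conversion above easier to follow, here is a small worked example with made-up page dimensions: a 500 x 700 point CropBox offset by (50, 70) inside a 595 x 842 point MediaBox.

cb_width, cb_height = 500.0, 700.0
mb_width, mb_height = 595.0, 842.0
cb_x_offset, cb_y_offset_top = 50.0, 70.0

old_left, old_top, old_width, old_height = 0.2, 0.1, 0.3, 0.05  # normalised to the CropBox

abs_cb_x, abs_cb_y = old_left * cb_width, old_top * cb_height                # 100.0, 70.0
abs_cb_width, abs_cb_height = old_width * cb_width, old_height * cb_height   # 150.0, 35.0

new_left = (cb_x_offset + abs_cb_x) / mb_width       # 150 / 595 ≈ 0.252
new_top = (cb_y_offset_top + abs_cb_y) / mb_height   # 140 / 842 ≈ 0.166
new_width = abs_cb_width / mb_width                  # 150 / 595 ≈ 0.252
new_height = abs_cb_height / mb_height               #  35 / 842 ≈ 0.042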
tools/cli_usage_logger.py CHANGED
@@ -5,29 +5,29 @@ This module provides functionality to log usage data from CLI operations to CSV
5
 
6
  import csv
7
  import os
8
- import time
9
  import uuid
10
  from datetime import datetime
11
  from pathlib import Path
12
- from typing import Any, List, Optional
 
13
  import boto3
14
- import botocore
15
  from tools.aws_functions import upload_log_file_to_s3
16
  from tools.config import (
17
- USAGE_LOGS_FOLDER,
18
- SAVE_LOGS_TO_CSV,
19
- SAVE_LOGS_TO_DYNAMODB,
20
- USAGE_LOG_DYNAMODB_TABLE_NAME,
21
- DYNAMODB_USAGE_LOG_HEADERS,
22
  CSV_USAGE_LOG_HEADERS,
23
  DISPLAY_FILE_NAMES_IN_LOGS,
 
 
24
  HOST_NAME,
25
- AWS_REGION,
26
- AWS_ACCESS_KEY,
27
- AWS_SECRET_KEY,
28
  RUN_AWS_FUNCTIONS,
29
  S3_USAGE_LOGS_FOLDER,
30
- DOCUMENT_REDACTION_BUCKET
31
  )
32
 
33
 
@@ -36,11 +36,11 @@ class CLIUsageLogger:
36
  A simplified usage logger for CLI operations that mimics the functionality
37
  of the Gradio CSVLogger_custom class.
38
  """
39
-
40
  def __init__(self, dataset_file_name: str = "usage_log.csv"):
41
  """
42
  Initialize the CLI usage logger.
43
-
44
  Args:
45
  dataset_file_name: Name of the CSV file to store logs
46
  """
@@ -48,34 +48,36 @@ class CLIUsageLogger:
48
  self.flagging_dir = Path(USAGE_LOGS_FOLDER)
49
  self.dataset_filepath = None
50
  self.headers = None
51
-
52
  def setup(self, headers: List[str]):
53
  """
54
  Setup the logger with the specified headers.
55
-
56
  Args:
57
  headers: List of column headers for the CSV file
58
  """
59
  self.headers = headers
60
  self._create_dataset_file()
61
-
62
  def _create_dataset_file(self):
63
  """Create the dataset CSV file with headers if it doesn't exist."""
64
  os.makedirs(self.flagging_dir, exist_ok=True)
65
-
66
  # Add ID and timestamp to headers (matching custom_csvlogger.py structure)
67
  full_headers = self.headers + ["id", "timestamp"]
68
-
69
  self.dataset_filepath = self.flagging_dir / self.dataset_file_name
70
-
71
  if not Path(self.dataset_filepath).exists():
72
- with open(self.dataset_filepath, "w", newline="", encoding="utf-8") as csvfile:
 
 
73
  writer = csv.writer(csvfile)
74
  writer.writerow(full_headers)
75
  print(f"Created usage log file at: {self.dataset_filepath}")
76
  else:
77
  print(f"Using existing usage log file at: {self.dataset_filepath}")
78
-
79
  def log_usage(
80
  self,
81
  data: List[Any],
@@ -86,11 +88,11 @@ class CLIUsageLogger:
86
  s3_key_prefix: str = None,
87
  dynamodb_table_name: str = None,
88
  dynamodb_headers: List[str] = None,
89
- replacement_headers: List[str] = None
90
  ) -> int:
91
  """
92
  Log usage data to CSV and optionally DynamoDB and S3.
93
-
94
  Args:
95
  data: List of data values to log
96
  save_to_csv: Whether to save to CSV (defaults to config setting)
@@ -101,17 +103,17 @@ class CLIUsageLogger:
101
  dynamodb_table_name: DynamoDB table name (defaults to config setting)
102
  dynamodb_headers: DynamoDB headers (defaults to config setting)
103
  replacement_headers: Replacement headers for CSV (defaults to config setting)
104
-
105
  Returns:
106
  Number of lines written
107
  """
108
  # Use config defaults if not specified
109
  if save_to_csv is None:
110
- save_to_csv = SAVE_LOGS_TO_CSV == 'True'
111
  if save_to_dynamodb is None:
112
- save_to_dynamodb = SAVE_LOGS_TO_DYNAMODB == 'True'
113
  if save_to_s3 is None:
114
- save_to_s3 = RUN_AWS_FUNCTIONS == "1" and SAVE_LOGS_TO_CSV == 'True'
115
  if s3_bucket is None:
116
  s3_bucket = DOCUMENT_REDACTION_BUCKET
117
  if s3_key_prefix is None:
@@ -122,18 +124,22 @@ class CLIUsageLogger:
122
  dynamodb_headers = DYNAMODB_USAGE_LOG_HEADERS
123
  if replacement_headers is None:
124
  replacement_headers = CSV_USAGE_LOG_HEADERS
125
-
126
  # Generate unique ID and add timestamp (matching custom_csvlogger.py structure)
127
  generated_id = str(uuid.uuid4())
128
- timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3] # Correct format for Amazon Athena
 
 
129
  csv_data = data + [generated_id, timestamp]
130
-
131
  line_count = 0
132
-
133
  # Save to CSV
134
  if save_to_csv and self.dataset_filepath:
135
  try:
136
137
  writer = csv.writer(csvfile)
138
  writer.writerow(csv_data)
139
  line_count = 1
@@ -150,70 +156,86 @@ class CLIUsageLogger:
150
  s3_key=s3_key_prefix,
151
  s3_bucket=s3_bucket,
152
  RUN_AWS_FUNCTIONS=RUN_AWS_FUNCTIONS,
153
- SAVE_LOGS_TO_CSV=SAVE_LOGS_TO_CSV
154
  )
155
  print(f"S3 upload result: {upload_result}")
156
  except Exception as e:
157
  print(f"Error uploading log file to S3: {e}")
158
-
159
  # Save to DynamoDB
160
  if save_to_dynamodb and dynamodb_table_name and dynamodb_headers:
161
  try:
162
  # Initialize DynamoDB client
163
  if AWS_ACCESS_KEY and AWS_SECRET_KEY:
164
  dynamodb = boto3.resource(
165
- 'dynamodb',
166
  region_name=AWS_REGION,
167
  aws_access_key_id=AWS_ACCESS_KEY,
168
- aws_secret_access_key=AWS_SECRET_KEY
169
  )
170
  else:
171
- dynamodb = boto3.resource('dynamodb', region_name=AWS_REGION)
172
-
173
  table = dynamodb.Table(dynamodb_table_name)
174
-
175
  # Generate unique ID
176
  generated_id = str(uuid.uuid4())
177
-
178
  # Prepare the DynamoDB item
179
  item = {
180
- 'id': generated_id,
181
- 'timestamp': timestamp,
182
  }
183
-
184
  # Map the headers to values
185
- item.update({header: str(value) for header, value in zip(dynamodb_headers, data)})
186
-
 
  table.put_item(Item=item)
188
  print("Successfully uploaded usage log to DynamoDB")
189
-
190
  except Exception as e:
191
  print(f"Could not upload usage log to DynamoDB: {e}")
192
-
193
  return line_count
194
 
195
 
196
  def create_cli_usage_logger() -> CLIUsageLogger:
197
  """
198
  Create and setup a CLI usage logger with the standard headers.
199
-
200
  Returns:
201
  Configured CLIUsageLogger instance
202
  """
203
  # Parse CSV headers from config
204
  import json
 
205
  try:
206
  headers = json.loads(CSV_USAGE_LOG_HEADERS)
207
- except:
 
208
  # Fallback headers if parsing fails
209
  headers = [
210
- "session_hash_textbox", "doc_full_file_name_textbox", "data_full_file_name_textbox",
211
- "actual_time_taken_number", "total_page_count", "textract_query_number",
212
- "pii_detection_method", "comprehend_query_number", "cost_code",
213
- "textract_handwriting_signature", "host_name_textbox", "text_extraction_method",
214
- "is_this_a_textract_api_call", "task"
215
  ]
216
-
217
  logger = CLIUsageLogger()
218
  logger.setup(headers)
219
  return logger
@@ -237,11 +259,11 @@ def log_redaction_usage(
237
  save_to_dynamodb: bool = None,
238
  save_to_s3: bool = None,
239
  s3_bucket: str = None,
240
- s3_key_prefix: str = None
241
  ):
242
  """
243
  Log redaction usage data using the provided logger.
244
-
245
  Args:
246
  logger: CLIUsageLogger instance
247
  session_hash: Session identifier
@@ -263,7 +285,7 @@ def log_redaction_usage(
263
  s3_key_prefix: S3 key prefix (overrides config default)
264
  """
265
  # Use placeholder names if not displaying file names in logs
266
- if DISPLAY_FILE_NAMES_IN_LOGS != 'True':
267
  if doc_file_name:
268
  doc_file_name = "document"
269
  data_file_name = ""
@@ -275,7 +297,7 @@ def log_redaction_usage(
275
  data_file_name = data_file_name
276
 
277
  rounded_time_taken = round(time_taken, 2)
278
-
279
  data = [
280
  session_hash,
281
  doc_file_name,
@@ -290,13 +312,13 @@ def log_redaction_usage(
290
  HOST_NAME,
291
  text_extraction_method,
292
  is_textract_call,
293
- task
294
  ]
295
-
296
  logger.log_usage(
297
- data,
298
  save_to_dynamodb=save_to_dynamodb,
299
  save_to_s3=save_to_s3,
300
  s3_bucket=s3_bucket,
301
- s3_key_prefix=s3_key_prefix
302
  )
 
5
 
6
  import csv
7
  import os
 
8
  import uuid
9
  from datetime import datetime
10
  from pathlib import Path
11
+ from typing import Any, List
12
+
13
  import boto3
14
+
15
  from tools.aws_functions import upload_log_file_to_s3
16
  from tools.config import (
17
+ AWS_ACCESS_KEY,
18
+ AWS_REGION,
19
+ AWS_SECRET_KEY,
 
 
20
  CSV_USAGE_LOG_HEADERS,
21
  DISPLAY_FILE_NAMES_IN_LOGS,
22
+ DOCUMENT_REDACTION_BUCKET,
23
+ DYNAMODB_USAGE_LOG_HEADERS,
24
  HOST_NAME,
 
 
 
25
  RUN_AWS_FUNCTIONS,
26
  S3_USAGE_LOGS_FOLDER,
27
+ SAVE_LOGS_TO_CSV,
28
+ SAVE_LOGS_TO_DYNAMODB,
29
+ USAGE_LOG_DYNAMODB_TABLE_NAME,
30
+ USAGE_LOGS_FOLDER,
31
  )
32
 
33
 
 
36
  A simplified usage logger for CLI operations that mimics the functionality
37
  of the Gradio CSVLogger_custom class.
38
  """
39
+
40
  def __init__(self, dataset_file_name: str = "usage_log.csv"):
41
  """
42
  Initialize the CLI usage logger.
43
+
44
  Args:
45
  dataset_file_name: Name of the CSV file to store logs
46
  """
 
48
  self.flagging_dir = Path(USAGE_LOGS_FOLDER)
49
  self.dataset_filepath = None
50
  self.headers = None
51
+
52
  def setup(self, headers: List[str]):
53
  """
54
  Setup the logger with the specified headers.
55
+
56
  Args:
57
  headers: List of column headers for the CSV file
58
  """
59
  self.headers = headers
60
  self._create_dataset_file()
61
+
62
  def _create_dataset_file(self):
63
  """Create the dataset CSV file with headers if it doesn't exist."""
64
  os.makedirs(self.flagging_dir, exist_ok=True)
65
+
66
  # Add ID and timestamp to headers (matching custom_csvlogger.py structure)
67
  full_headers = self.headers + ["id", "timestamp"]
68
+
69
  self.dataset_filepath = self.flagging_dir / self.dataset_file_name
70
+
71
  if not Path(self.dataset_filepath).exists():
72
+ with open(
73
+ self.dataset_filepath, "w", newline="", encoding="utf-8"
74
+ ) as csvfile:
75
  writer = csv.writer(csvfile)
76
  writer.writerow(full_headers)
77
  print(f"Created usage log file at: {self.dataset_filepath}")
78
  else:
79
  print(f"Using existing usage log file at: {self.dataset_filepath}")
80
+
81
  def log_usage(
82
  self,
83
  data: List[Any],
 
88
  s3_key_prefix: str = None,
89
  dynamodb_table_name: str = None,
90
  dynamodb_headers: List[str] = None,
91
+ replacement_headers: List[str] = None,
92
  ) -> int:
93
  """
94
  Log usage data to CSV and optionally DynamoDB and S3.
95
+
96
  Args:
97
  data: List of data values to log
98
  save_to_csv: Whether to save to CSV (defaults to config setting)
 
103
  dynamodb_table_name: DynamoDB table name (defaults to config setting)
104
  dynamodb_headers: DynamoDB headers (defaults to config setting)
105
  replacement_headers: Replacement headers for CSV (defaults to config setting)
106
+
107
  Returns:
108
  Number of lines written
109
  """
110
  # Use config defaults if not specified
111
  if save_to_csv is None:
112
+ save_to_csv = SAVE_LOGS_TO_CSV == "True"
113
  if save_to_dynamodb is None:
114
+ save_to_dynamodb = SAVE_LOGS_TO_DYNAMODB == "True"
115
  if save_to_s3 is None:
116
+ save_to_s3 = RUN_AWS_FUNCTIONS == "1" and SAVE_LOGS_TO_CSV == "True"
117
  if s3_bucket is None:
118
  s3_bucket = DOCUMENT_REDACTION_BUCKET
119
  if s3_key_prefix is None:
 
124
  dynamodb_headers = DYNAMODB_USAGE_LOG_HEADERS
125
  if replacement_headers is None:
126
  replacement_headers = CSV_USAGE_LOG_HEADERS
127
+
128
  # Generate unique ID and add timestamp (matching custom_csvlogger.py structure)
129
  generated_id = str(uuid.uuid4())
130
+ timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[
131
+ :-3
132
+ ] # Correct format for Amazon Athena
133
  csv_data = data + [generated_id, timestamp]
134
+
135
  line_count = 0
136
+
137
  # Save to CSV
138
  if save_to_csv and self.dataset_filepath:
139
  try:
140
+ with open(
141
+ self.dataset_filepath, "a", newline="", encoding="utf-8-sig"
142
+ ) as csvfile:
143
  writer = csv.writer(csvfile)
144
  writer.writerow(csv_data)
145
  line_count = 1
 
156
  s3_key=s3_key_prefix,
157
  s3_bucket=s3_bucket,
158
  RUN_AWS_FUNCTIONS=RUN_AWS_FUNCTIONS,
159
+ SAVE_LOGS_TO_CSV=SAVE_LOGS_TO_CSV,
160
  )
161
  print(f"S3 upload result: {upload_result}")
162
  except Exception as e:
163
  print(f"Error uploading log file to S3: {e}")
164
+
165
  # Save to DynamoDB
166
  if save_to_dynamodb and dynamodb_table_name and dynamodb_headers:
167
  try:
168
  # Initialize DynamoDB client
169
  if AWS_ACCESS_KEY and AWS_SECRET_KEY:
170
  dynamodb = boto3.resource(
171
+ "dynamodb",
172
  region_name=AWS_REGION,
173
  aws_access_key_id=AWS_ACCESS_KEY,
174
+ aws_secret_access_key=AWS_SECRET_KEY,
175
  )
176
  else:
177
+ dynamodb = boto3.resource("dynamodb", region_name=AWS_REGION)
178
+
179
  table = dynamodb.Table(dynamodb_table_name)
180
+
181
  # Generate unique ID
182
  generated_id = str(uuid.uuid4())
183
+
184
  # Prepare the DynamoDB item
185
  item = {
186
+ "id": generated_id,
187
+ "timestamp": timestamp,
188
  }
189
+
190
  # Map the headers to values
191
+ item.update(
192
+ {
193
+ header: str(value)
194
+ for header, value in zip(dynamodb_headers, data)
195
+ }
196
+ )
197
+
198
  table.put_item(Item=item)
199
  print("Successfully uploaded usage log to DynamoDB")
200
+
201
  except Exception as e:
202
  print(f"Could not upload usage log to DynamoDB: {e}")
203
+
204
  return line_count
205
 
206
 
207
  def create_cli_usage_logger() -> CLIUsageLogger:
208
  """
209
  Create and setup a CLI usage logger with the standard headers.
210
+
211
  Returns:
212
  Configured CLIUsageLogger instance
213
  """
214
  # Parse CSV headers from config
215
  import json
216
+
217
  try:
218
  headers = json.loads(CSV_USAGE_LOG_HEADERS)
219
+ except Exception as e:
220
+ print(f"Error parsing CSV usage log headers: {e}")
221
  # Fallback headers if parsing fails
222
  headers = [
223
+ "session_hash_textbox",
224
+ "doc_full_file_name_textbox",
225
+ "data_full_file_name_textbox",
226
+ "actual_time_taken_number",
227
+ "total_page_count",
228
+ "textract_query_number",
229
+ "pii_detection_method",
230
+ "comprehend_query_number",
231
+ "cost_code",
232
+ "textract_handwriting_signature",
233
+ "host_name_textbox",
234
+ "text_extraction_method",
235
+ "is_this_a_textract_api_call",
236
+ "task",
237
  ]
238
+
239
  logger = CLIUsageLogger()
240
  logger.setup(headers)
241
  return logger
 
259
  save_to_dynamodb: bool = None,
260
  save_to_s3: bool = None,
261
  s3_bucket: str = None,
262
+ s3_key_prefix: str = None,
263
  ):
264
  """
265
  Log redaction usage data using the provided logger.
266
+
267
  Args:
268
  logger: CLIUsageLogger instance
269
  session_hash: Session identifier
 
285
  s3_key_prefix: S3 key prefix (overrides config default)
286
  """
287
  # Use placeholder names if not displaying file names in logs
288
+ if DISPLAY_FILE_NAMES_IN_LOGS != "True":
289
  if doc_file_name:
290
  doc_file_name = "document"
291
  data_file_name = ""
 
297
  data_file_name = data_file_name
298
 
299
  rounded_time_taken = round(time_taken, 2)
300
+
301
  data = [
302
  session_hash,
303
  doc_file_name,
 
312
  HOST_NAME,
313
  text_extraction_method,
314
  is_textract_call,
315
+ task,
316
  ]
317
+
318
  logger.log_usage(
319
+ data,
320
  save_to_dynamodb=save_to_dynamodb,
321
  save_to_s3=save_to_s3,
322
  s3_bucket=s3_bucket,
323
+ s3_key_prefix=s3_key_prefix,
324
  )
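A hedged sketch of wiring the CLI logger together; the values in `data` are placeholders that follow the fallback CSV headers defined above, and S3/DynamoDB writes are switched off for the example:

logger = create_cli_usage_logger()
logger.log_usage(
    data=[
        "example-session",       # session_hash_textbox
        "example.pdf",           # doc_full_file_name_textbox
        "",                      # data_full_file_name_textbox
        12.34,                   # actual_time_taken_number
        3,                       # total_page_count
        3,                       # textract_query_number
        "Local",                 # pii_detection_method
        0,                       # comprehend_query_number
        "",                      # cost_code
        "Extract handwriting",   # textract_handwriting_signature
        "example-host",          # host_name_textbox
        "Local OCR model - PDFs without selectable text",  # text_extraction_method
        False,                   # is_this_a_textract_api_call
        "redact",                # task
    ],
    save_to_dynamodb=False,
    save_to_s3=False,
)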
tools/config.py CHANGED
@@ -1,147 +1,171 @@
 
1
  import os
2
- import tempfile
3
  import socket
4
- import logging
5
  from datetime import datetime
 
 
6
  from dotenv import load_dotenv
7
  from tldextract import TLDExtract
8
- from typing import List
9
 
10
  today_rev = datetime.now().strftime("%Y%m%d")
11
  HOST_NAME = socket.gethostname()
12
 
 
13
  def _get_env_list(env_var_name: str) -> List[str]:
14
  """Parses a comma-separated environment variable into a list of strings."""
15
- value = env_var_name[1:-1].strip().replace('\"', '').replace("\'","")
16
  if not value:
17
  return []
18
  # Split by comma and filter out any empty strings that might result from extra commas
19
- return [s.strip() for s in value.split(',') if s.strip()]
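As defined here, the helper parses the list-like string it is given (brackets and quotes stripped, comma-separated); an illustrative input/output pair:

# _get_env_list("['NAME', 'EMAIL_ADDRESS']") -> ['NAME', 'EMAIL_ADDRESS']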
 
20
 
21
  # Set or retrieve configuration variables for the redaction app
22
 
23
- def get_or_create_env_var(var_name:str, default_value:str, print_val:bool=False):
24
- '''
 
25
  Get an environmental variable, and set it to a default value if it doesn't exist
26
- '''
27
  # Get the environment variable if it exists
28
  value = os.environ.get(var_name)
29
-
30
  # If it doesn't exist, set the environment variable to the default value
31
  if value is None:
32
  os.environ[var_name] = default_value
33
  value = default_value
34
 
35
- if print_val == True:
36
- print(f'The value of {var_name} is {value}')
37
-
38
  return value
39
 
 
40
  def add_folder_to_path(folder_path: str):
41
- '''
42
  Check if a folder exists on your system. If so, get the absolute path and then add it to the system Path variable if it doesn't already exist. Function is only relevant for locally-created executable files based on this app (when using pyinstaller it creates a _internal folder that contains tesseract and poppler. These need to be added to the system path to enable the app to run)
43
- '''
44
 
45
  if os.path.exists(folder_path) and os.path.isdir(folder_path):
46
- #print(folder_path, "folder exists.")
47
 
48
  # Resolve relative path to absolute path
49
  absolute_path = os.path.abspath(folder_path)
50
 
51
- current_path = os.environ['PATH']
52
  if absolute_path not in current_path.split(os.pathsep):
53
  full_path_extension = absolute_path + os.pathsep + current_path
54
- os.environ['PATH'] = full_path_extension
55
- #print(f"Updated PATH with: ", full_path_extension)
56
  else:
57
  pass
58
- #print(f"Directory {folder_path} already exists in PATH.")
59
  else:
60
  print(f"Folder not found at {folder_path} - not added to PATH")
61
 
 
62
  ###
63
  # LOAD CONFIG FROM ENV FILE
64
  ###
65
 
66
- CONFIG_FOLDER = get_or_create_env_var('CONFIG_FOLDER', 'config/')
67
 
68
  # If you have an aws_config env file in the config folder, you can load in app variables this way, e.g. 'config/app_config.env'
69
- APP_CONFIG_PATH = get_or_create_env_var('APP_CONFIG_PATH', CONFIG_FOLDER + 'app_config.env') # e.g. config/app_config.env
 
 
70
 
71
  if APP_CONFIG_PATH:
72
  if os.path.exists(APP_CONFIG_PATH):
73
  print(f"Loading app variables from config file {APP_CONFIG_PATH}")
74
  load_dotenv(APP_CONFIG_PATH)
75
- else: print("App config file not found at location:", APP_CONFIG_PATH)
 
76
 
77
  ###
78
  # AWS OPTIONS
79
  ###
80
 
81
  # If you have an aws_config env file in the config folder, you can load in AWS keys this way, e.g. 'env/aws_config.env'
82
- AWS_CONFIG_PATH = get_or_create_env_var('AWS_CONFIG_PATH', '') # e.g. config/aws_config.env
 
 
83
 
84
  if AWS_CONFIG_PATH:
85
  if os.path.exists(AWS_CONFIG_PATH):
86
  print(f"Loading AWS variables from config file {AWS_CONFIG_PATH}")
87
  load_dotenv(AWS_CONFIG_PATH)
88
- else: print("AWS config file not found at location:", AWS_CONFIG_PATH)
 
89
 
90
  RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "0")
91
 
92
- AWS_REGION = get_or_create_env_var('AWS_REGION', '')
93
 
94
- AWS_CLIENT_ID = get_or_create_env_var('AWS_CLIENT_ID', '')
95
 
96
- AWS_CLIENT_SECRET = get_or_create_env_var('AWS_CLIENT_SECRET', '')
97
 
98
- AWS_USER_POOL_ID = get_or_create_env_var('AWS_USER_POOL_ID', '')
99
 
100
- AWS_ACCESS_KEY = get_or_create_env_var('AWS_ACCESS_KEY', '')
101
- #if AWS_ACCESS_KEY: print(f'AWS_ACCESS_KEY found in environment variables')
102
 
103
- AWS_SECRET_KEY = get_or_create_env_var('AWS_SECRET_KEY', '')
104
- #if AWS_SECRET_KEY: print(f'AWS_SECRET_KEY found in environment variables')
105
 
106
- DOCUMENT_REDACTION_BUCKET = get_or_create_env_var('DOCUMENT_REDACTION_BUCKET', '')
107
 
108
  # Should the app prioritise using AWS SSO over using API keys stored in environment variables/secrets (defaults to yes)
109
- PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS = get_or_create_env_var('PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS', '1')
 
 
110
 
111
  # Custom headers e.g. if routing traffic through Cloudfront
112
  # Retrieving or setting CUSTOM_HEADER
113
- CUSTOM_HEADER = get_or_create_env_var('CUSTOM_HEADER', '')
114
 
115
  # Retrieving or setting CUSTOM_HEADER_VALUE
116
- CUSTOM_HEADER_VALUE = get_or_create_env_var('CUSTOM_HEADER_VALUE', '')
117
 
118
  ###
119
  # Image options
120
  ###
121
- IMAGES_DPI = float(get_or_create_env_var('IMAGES_DPI', '300.0'))
122
- LOAD_TRUNCATED_IMAGES = get_or_create_env_var('LOAD_TRUNCATED_IMAGES', 'True')
123
- MAX_IMAGE_PIXELS = get_or_create_env_var('MAX_IMAGE_PIXELS', '') # Changed to None if blank in file_conversion.py
 
 
124
 
125
  ###
126
  # File I/O options
127
  ###
128
 
129
- SESSION_OUTPUT_FOLDER = get_or_create_env_var('SESSION_OUTPUT_FOLDER', 'False') # i.e. do you want your input and output folders saved within a subfolder based on session hash value within output/input folders
 
 
130
 
131
- OUTPUT_FOLDER = get_or_create_env_var('GRADIO_OUTPUT_FOLDER', 'output/') # 'output/'
132
- INPUT_FOLDER = get_or_create_env_var('GRADIO_INPUT_FOLDER', 'input/') # 'input/'
133
 
134
  # Allow for files to be saved in a temporary folder for increased security in some instances
135
- if OUTPUT_FOLDER == "TEMP" or INPUT_FOLDER == "TEMP":
136
  # Create a temporary directory
137
  with tempfile.TemporaryDirectory() as temp_dir:
138
- print(f'Temporary directory created at: {temp_dir}')
139
 
140
- if OUTPUT_FOLDER == "TEMP": OUTPUT_FOLDER = temp_dir + "/"
141
- if INPUT_FOLDER == "TEMP": INPUT_FOLDER = temp_dir + "/"
 
 
142
 
143
- GRADIO_TEMP_DIR = get_or_create_env_var('GRADIO_TEMP_DIR', 'tmp/gradio_tmp/') # Default Gradio temp folder
144
- MPLCONFIGDIR = get_or_create_env_var('MPLCONFIGDIR', 'tmp/matplotlib_cache/') # Matplotlib cache folder
 
146
  ###
147
  # LOGGING OPTIONS
@@ -150,57 +174,82 @@ MPLCONFIGDIR = get_or_create_env_var('MPLCONFIGDIR', 'tmp/matplotlib_cache/') #
150
  # By default, logs are put into a subfolder of today's date and the host name of the instance running the app. This is to avoid at all possible the possibility of log files from one instance overwriting the logs of another instance on S3. If running the app on one system always, or just locally, it is not necessary to make the log folders so specific.
151
  # Another way to address this issue would be to write logs to another type of storage, e.g. database such as dynamodb. I may look into this in future.
152
 
153
- SAVE_LOGS_TO_CSV = get_or_create_env_var('SAVE_LOGS_TO_CSV', 'True')
154
 
155
- USE_LOG_SUBFOLDERS = get_or_create_env_var('USE_LOG_SUBFOLDERS', 'True')
156
 
157
- FEEDBACK_LOGS_FOLDER = get_or_create_env_var('FEEDBACK_LOGS_FOLDER', 'feedback/')
158
- ACCESS_LOGS_FOLDER = get_or_create_env_var('ACCESS_LOGS_FOLDER', 'logs/')
159
- USAGE_LOGS_FOLDER = get_or_create_env_var('USAGE_LOGS_FOLDER', 'usage/')
160
 
161
  if USE_LOG_SUBFOLDERS == "True":
162
- day_log_subfolder = today_rev + '/'
163
- host_name_subfolder = HOST_NAME + '/'
164
  full_log_subfolder = day_log_subfolder + host_name_subfolder
165
 
166
  FEEDBACK_LOGS_FOLDER = FEEDBACK_LOGS_FOLDER + full_log_subfolder
167
  ACCESS_LOGS_FOLDER = ACCESS_LOGS_FOLDER + full_log_subfolder
168
  USAGE_LOGS_FOLDER = USAGE_LOGS_FOLDER + full_log_subfolder
169
 
170
- S3_FEEDBACK_LOGS_FOLDER = get_or_create_env_var('S3_FEEDBACK_LOGS_FOLDER', 'feedback/' + full_log_subfolder)
171
- S3_ACCESS_LOGS_FOLDER = get_or_create_env_var('S3_ACCESS_LOGS_FOLDER', 'logs/' + full_log_subfolder)
172
- S3_USAGE_LOGS_FOLDER = get_or_create_env_var('S3_USAGE_LOGS_FOLDER', 'usage/' + full_log_subfolder)
173
 
174
  # Should the redacted file name be included in the logs? In some instances, the names of the files themselves could be sensitive, and should not be disclosed beyond the app. So, by default this is false.
175
- DISPLAY_FILE_NAMES_IN_LOGS = get_or_create_env_var('DISPLAY_FILE_NAMES_IN_LOGS', 'False')
 
 
176
 
177
  # Further customisation options for CSV logs
178
- CSV_ACCESS_LOG_HEADERS = get_or_create_env_var('CSV_ACCESS_LOG_HEADERS', '') # If blank, uses component labels
179
- CSV_FEEDBACK_LOG_HEADERS = get_or_create_env_var('CSV_FEEDBACK_LOG_HEADERS', '') # If blank, uses component labels
180
- CSV_USAGE_LOG_HEADERS = get_or_create_env_var('CSV_USAGE_LOG_HEADERS', '["session_hash_textbox", "doc_full_file_name_textbox", "data_full_file_name_textbox", "actual_time_taken_number", "total_page_count", "textract_query_number", "pii_detection_method", "comprehend_query_number", "cost_code", "textract_handwriting_signature", "host_name_textbox", "text_extraction_method", "is_this_a_textract_api_call", "task"]') # If blank, uses component labels
181
 
182
  ### DYNAMODB logs. Whether to save to DynamoDB, and the headers of the table
183
- SAVE_LOGS_TO_DYNAMODB = get_or_create_env_var('SAVE_LOGS_TO_DYNAMODB', 'False')
184
 
185
- ACCESS_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var('ACCESS_LOG_DYNAMODB_TABLE_NAME', 'redaction_access_log')
186
- DYNAMODB_ACCESS_LOG_HEADERS = get_or_create_env_var('DYNAMODB_ACCESS_LOG_HEADERS', '')
 
 
187
 
188
- FEEDBACK_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var('FEEDBACK_LOG_DYNAMODB_TABLE_NAME', 'redaction_feedback')
189
- DYNAMODB_FEEDBACK_LOG_HEADERS = get_or_create_env_var('DYNAMODB_FEEDBACK_LOG_HEADERS', '')
190
 
191
- USAGE_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var('USAGE_LOG_DYNAMODB_TABLE_NAME', 'redaction_usage')
192
- DYNAMODB_USAGE_LOG_HEADERS = get_or_create_env_var('DYNAMODB_USAGE_LOG_HEADERS', '')
 
 
193
 
194
  # Report logging to console?
195
- LOGGING = get_or_create_env_var('LOGGING', 'False')
196
 
197
- if LOGGING == 'True':
198
  # Configure logging
199
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 
 
200
 
201
- LOG_FILE_NAME = get_or_create_env_var('LOG_FILE_NAME', 'log.csv')
202
- USAGE_LOG_FILE_NAME = get_or_create_env_var('USAGE_LOG_FILE_NAME', LOG_FILE_NAME)
203
- FEEDBACK_LOG_FILE_NAME = get_or_create_env_var('FEEDBACK_LOG_FILE_NAME', LOG_FILE_NAME)
204
 
205
 
206
  ###
@@ -208,164 +257,265 @@ FEEDBACK_LOG_FILE_NAME = get_or_create_env_var('FEEDBACK_LOG_FILE_NAME', LOG_FIL
208
  ###
209
 
210
  # Create Tesseract and Poppler folders if you have installed them locally
211
- TESSERACT_FOLDER = get_or_create_env_var('TESSERACT_FOLDER', "") # # If installing for Windows, install Tesseract 5.5.0 from here: https://github.com/UB-Mannheim/tesseract/wiki. Then this environment variable should point to the Tesseract folder e.g. tesseract/
212
- TESSERACT_DATA_FOLDER = get_or_create_env_var('TESSERACT_DATA_FOLDER', "/usr/share/tessdata")
213
- POPPLER_FOLDER = get_or_create_env_var('POPPLER_FOLDER', "") # If installing on Windows,install Poppler from here https://github.com/oschwartz10612/poppler-windows. This variable needs to point to the poppler bin folder e.g. poppler/poppler-24.02.0/Library/bin/
214
-
215
- if TESSERACT_FOLDER: add_folder_to_path(TESSERACT_FOLDER)
216
- if POPPLER_FOLDER: add_folder_to_path(POPPLER_FOLDER)
 
 
 
 
 
 
 
 
217
 
218
  # List of models to use for text extraction and PII detection
219
  # Text extraction models
220
- SELECTABLE_TEXT_EXTRACT_OPTION = get_or_create_env_var('SELECTABLE_TEXT_EXTRACT_OPTION', "Local model - selectable text")
221
- TESSERACT_TEXT_EXTRACT_OPTION = get_or_create_env_var('TESSERACT_TEXT_EXTRACT_OPTION', "Local OCR model - PDFs without selectable text")
222
- TEXTRACT_TEXT_EXTRACT_OPTION = get_or_create_env_var('TEXTRACT_TEXT_EXTRACT_OPTION', "AWS Textract service - all PDF types")
 
 
 
 
 
 
223
 
224
  # PII detection models
225
- NO_REDACTION_PII_OPTION = get_or_create_env_var('NO_REDACTION_PII_OPTION', "Only extract text (no redaction)")
226
- LOCAL_PII_OPTION = get_or_create_env_var('LOCAL_PII_OPTION', "Local")
227
- AWS_PII_OPTION = get_or_create_env_var('AWS_PII_OPTION', "AWS Comprehend")
228
-
229
- SHOW_LOCAL_TEXT_EXTRACTION_OPTIONS = get_or_create_env_var('SHOW_LOCAL_TEXT_EXTRACTION_OPTIONS', 'True')
230
- SHOW_AWS_TEXT_EXTRACTION_OPTIONS = get_or_create_env_var('SHOW_AWS_TEXT_EXTRACTION_OPTIONS', 'True')
 
 
 
 
 
 
231
 
232
  # Show at least local options if everything mistakenly removed
233
- if SHOW_LOCAL_TEXT_EXTRACTION_OPTIONS != "True" and SHOW_AWS_TEXT_EXTRACTION_OPTIONS != "True":
 
 
 
234
  SHOW_LOCAL_TEXT_EXTRACTION_OPTIONS = "True"
235
 
236
  local_model_options = []
237
  aws_model_options = []
238
  text_extraction_models = []
239
 
240
- if SHOW_LOCAL_TEXT_EXTRACTION_OPTIONS == 'True':
241
  local_model_options.append(SELECTABLE_TEXT_EXTRACT_OPTION)
242
  local_model_options.append(TESSERACT_TEXT_EXTRACT_OPTION)
243
 
244
- if SHOW_AWS_TEXT_EXTRACTION_OPTIONS == 'True':
245
  aws_model_options.append(TEXTRACT_TEXT_EXTRACT_OPTION)
246
 
247
  TEXT_EXTRACTION_MODELS = local_model_options + aws_model_options
248
- DO_INITIAL_TABULAR_DATA_CLEAN = get_or_create_env_var('DO_INITIAL_TABULAR_DATA_CLEAN', 'True')
249
-
250
- SHOW_LOCAL_PII_DETECTION_OPTIONS = get_or_create_env_var('SHOW_LOCAL_PII_DETECTION_OPTIONS', 'True')
251
- SHOW_AWS_PII_DETECTION_OPTIONS = get_or_create_env_var('SHOW_AWS_PII_DETECTION_OPTIONS', 'True')
252
-
253
- if SHOW_LOCAL_PII_DETECTION_OPTIONS != "True" and SHOW_AWS_PII_DETECTION_OPTIONS != "True":
 
 
 
 
 
 
 
 
 
254
  SHOW_LOCAL_PII_DETECTION_OPTIONS = "True"
255
 
256
  local_model_options = [NO_REDACTION_PII_OPTION]
257
  aws_model_options = []
258
  pii_detection_models = []
259
 
260
- if SHOW_LOCAL_PII_DETECTION_OPTIONS == 'True':
261
  local_model_options.append(LOCAL_PII_OPTION)
262
 
263
- if SHOW_AWS_PII_DETECTION_OPTIONS == 'True':
264
  aws_model_options.append(AWS_PII_OPTION)
265
 
266
  PII_DETECTION_MODELS = local_model_options + aws_model_options
267
 
268
  if SHOW_AWS_TEXT_EXTRACTION_OPTIONS == "True":
269
- DEFAULT_TEXT_EXTRACTION_MODEL = get_or_create_env_var('DEFAULT_TEXT_EXTRACTION_MODEL', TEXTRACT_TEXT_EXTRACT_OPTION)
 
 
270
  else:
271
- DEFAULT_TEXT_EXTRACTION_MODEL = get_or_create_env_var('DEFAULT_TEXT_EXTRACTION_MODEL', SELECTABLE_TEXT_EXTRACT_OPTION)
 
 
272
 
273
  if SHOW_AWS_PII_DETECTION_OPTIONS == "True":
274
- DEFAULT_PII_DETECTION_MODEL = get_or_create_env_var('DEFAULT_PII_DETECTION_MODEL', AWS_PII_OPTION)
 
 
275
  else:
276
- DEFAULT_PII_DETECTION_MODEL = get_or_create_env_var('DEFAULT_PII_DETECTION_MODEL', LOCAL_PII_OPTION)
 
 
277
 
278
  # Create list of PII detection models for tabular redaction
279
  TABULAR_PII_DETECTION_MODELS = PII_DETECTION_MODELS.copy()
280
  if NO_REDACTION_PII_OPTION in TABULAR_PII_DETECTION_MODELS:
281
  TABULAR_PII_DETECTION_MODELS.remove(NO_REDACTION_PII_OPTION)
282
 
283
- DEFAULT_TEXT_COLUMNS = get_or_create_env_var('DEFAULT_TEXT_COLUMNS', "[]")
284
- DEFAULT_EXCEL_SHEETS = get_or_create_env_var('DEFAULT_EXCEL_SHEETS', "[]")
285
 
286
- DEFAULT_TABULAR_ANONYMISATION_STRATEGY = get_or_create_env_var('DEFAULT_TABULAR_ANONYMISATION_STRATEGY', "redact completely")
 
 
287
 
288
  ### Local OCR model - Tesseract vs PaddleOCR
289
- CHOSEN_LOCAL_OCR_MODEL = get_or_create_env_var('CHOSEN_LOCAL_OCR_MODEL', "tesseract") # Choose between "tesseract", "hybrid", and "paddle". "paddle" will only return whole line text extraction, and so will only work for OCR, not redaction. "hybrid" is a combination of the two - first pass through the redactions will be done with Tesseract, and then a second pass will be done with PaddleOCR on words with low confidence.
 
 
290
 
291
- PREPROCESS_LOCAL_OCR_IMAGES = get_or_create_env_var('PREPROCESS_LOCAL_OCR_IMAGES', "True") # Whether to try and preprocess images before extracting text. NOTE: I have found in testing that this often results in WORSE results for scanned pages, so it is default False
 
 
292
 
293
  # Entities for redaction
294
- CHOSEN_COMPREHEND_ENTITIES = get_or_create_env_var('CHOSEN_COMPREHEND_ENTITIES', "['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE', 'PASSPORT_NUMBER','DRIVER_ID', 'USERNAME','PASSWORD', 'IP_ADDRESS','MAC_ADDRESS', 'LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER', 'INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER']")
 
 
 
295
 
296
- FULL_COMPREHEND_ENTITY_LIST = get_or_create_env_var('FULL_COMPREHEND_ENTITY_LIST', "['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE','SSN','DATE_TIME','PASSPORT_NUMBER','DRIVER_ID','URL','AGE','USERNAME','PASSWORD','AWS_ACCESS_KEY','AWS_SECRET_KEY','IP_ADDRESS','MAC_ADDRESS','ALL','LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER','CA_SOCIAL_INSURANCE_NUMBER','US_INDIVIDUAL_TAX_IDENTIFICATION_NUMBER','UK_UNIQUE_TAXPAYER_REFERENCE_NUMBER','IN_PERMANENT_ACCOUNT_NUMBER','IN_NREGA','INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER','CA_HEALTH_NUMBER','IN_AADHAAR','IN_VOTER_NUMBER', 'CUSTOM_FUZZY']")
 
 
 
297
 
298
  # Entities for local PII redaction option
299
- CHOSEN_REDACT_ENTITIES = get_or_create_env_var('CHOSEN_REDACT_ENTITIES', "['TITLES', 'PERSON', 'PHONE_NUMBER', 'EMAIL_ADDRESS', 'STREETNAME', 'UKPOSTCODE', 'CUSTOM']")
300
-
301
- FULL_ENTITY_LIST = get_or_create_env_var('FULL_ENTITY_LIST', "['TITLES', 'PERSON', 'PHONE_NUMBER', 'EMAIL_ADDRESS', 'STREETNAME', 'UKPOSTCODE', 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS', 'CUSTOM', 'CUSTOM_FUZZY']")
302
-
303
- CUSTOM_ENTITIES = get_or_create_env_var('CUSTOM_ENTITIES', "['TITLES', 'UKPOSTCODE', 'STREETNAME', 'CUSTOM']")
304
-
305
-
306
-
307
- DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX = get_or_create_env_var('DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX', "['Extract handwriting']")
308
-
309
- HANDWRITE_SIGNATURE_TEXTBOX_FULL_OPTIONS = get_or_create_env_var('HANDWRITE_SIGNATURE_TEXTBOX_FULL_OPTIONS', "['Extract handwriting', 'Extract signatures']")
310
-
311
- if HANDWRITE_SIGNATURE_TEXTBOX_FULL_OPTIONS: HANDWRITE_SIGNATURE_TEXTBOX_FULL_OPTIONS = _get_env_list(HANDWRITE_SIGNATURE_TEXTBOX_FULL_OPTIONS)
312
-
313
- INCLUDE_FORM_EXTRACTION_TEXTRACT_OPTION = get_or_create_env_var('INCLUDE_FORM_EXTRACTION_TEXTRACT_OPTION', "False")
314
- INCLUDE_LAYOUT_EXTRACTION_TEXTRACT_OPTION = get_or_create_env_var('INCLUDE_LAYOUT_EXTRACTION_TEXTRACT_OPTION', "False")
315
- INCLUDE_TABLE_EXTRACTION_TEXTRACT_OPTION = get_or_create_env_var('INCLUDE_TABLE_EXTRACTION_TEXTRACT_OPTION', "False")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
316
 
317
  if INCLUDE_FORM_EXTRACTION_TEXTRACT_OPTION == "True":
318
- HANDWRITE_SIGNATURE_TEXTBOX_FULL_OPTIONS.append('Extract forms')
319
  if INCLUDE_LAYOUT_EXTRACTION_TEXTRACT_OPTION == "True":
320
- HANDWRITE_SIGNATURE_TEXTBOX_FULL_OPTIONS.append('Extract layout')
321
  if INCLUDE_TABLE_EXTRACTION_TEXTRACT_OPTION == "True":
322
- HANDWRITE_SIGNATURE_TEXTBOX_FULL_OPTIONS.append('Extract tables')
323
 
324
 
325
- DEFAULT_SEARCH_QUERY = get_or_create_env_var('DEFAULT_SEARCH_QUERY', '')
326
- DEFAULT_FUZZY_SPELLING_MISTAKES_NUM = int(get_or_create_env_var('DEFAULT_FUZZY_SPELLING_MISTAKES_NUM', '1'))
 
 
327
 
328
- DEFAULT_PAGE_MIN = int(get_or_create_env_var('DEFAULT_PAGE_MIN', '0'))
329
 
330
- DEFAULT_PAGE_MAX = int(get_or_create_env_var('DEFAULT_PAGE_MAX', '999'))
331
 
332
 
333
  # Number of pages to loop through before breaking the function and restarting from the last finished page (not currently activated).
334
- PAGE_BREAK_VALUE = int(get_or_create_env_var('PAGE_BREAK_VALUE', '99999'))
335
 
336
- MAX_TIME_VALUE = int(get_or_create_env_var('MAX_TIME_VALUE', '999999'))
337
- MAX_SIMULTANEOUS_FILES = int(get_or_create_env_var('MAX_SIMULTANEOUS_FILES', '10'))
338
- MAX_DOC_PAGES = int(get_or_create_env_var('MAX_DOC_PAGES', '3000'))
339
- MAX_TABLE_ROWS = int(get_or_create_env_var('MAX_TABLE_ROWS', '250000'))
340
- MAX_TABLE_COLUMNS = int(get_or_create_env_var('MAX_TABLE_COLUMNS', '100'))
341
- MAX_OPEN_TEXT_CHARACTERS = int(get_or_create_env_var('MAX_OPEN_TEXT_CHARACTERS', '50000'))
 
 
342
 
343
- CUSTOM_BOX_COLOUR = get_or_create_env_var("CUSTOM_BOX_COLOUR", "") # only "grey" is currently supported as a custom box colour
 
 
344
 
345
  ### Language selection options
346
 
347
  SHOW_LANGUAGE_SELECTION = get_or_create_env_var("SHOW_LANGUAGE_SELECTION", "False")
348
 
349
- DEFAULT_LANGUAGE_FULL_NAME = get_or_create_env_var("DEFAULT_LANGUAGE_FULL_NAME", "english")
350
- DEFAULT_LANGUAGE = get_or_create_env_var("DEFAULT_LANGUAGE", "en") # For tesseract, ensure the Tesseract language data (e.g., fra.traineddata) is installed on your system. You can find the relevant language packs here: https://github.com/tesseract-ocr/tessdata.
 
 
 
 
351
  # For paddle, ensure the paddle language data (e.g., fra.traineddata) is installed on your system. You can find information on supported languages here: https://www.paddleocr.ai/main/en/version3.x/algorithm/PP-OCRv5/PP-OCRv5_multi_languages.html
352
  # For AWS Comprehend, only English and Spanish are supported https://docs.aws.amazon.com/comprehend/latest/dg/how-pii.html ['en', 'es']
353
  # AWS Textract automatically detects the language of the document and supports the following languages: https://aws.amazon.com/textract/faqs/#topic-0. 'English, Spanish, Italian, Portuguese, French, German. Handwriting, Invoices and Receipts, Identity documents and Queries processing are in English only'
354
 
355
- textract_language_choices = get_or_create_env_var("textract_language_choices", "['en', 'es', 'fr', 'de', 'it', 'pt']")
356
- aws_comprehend_language_choices = get_or_create_env_var("aws_comprehend_language_choices", "['en', 'es']")
 
 
 
 
357
 
358
  # The choices that the user sees
359
- MAPPED_LANGUAGE_CHOICES = get_or_create_env_var("MAPPED_LANGUAGE_CHOICES", "['english', 'french', 'german', 'spanish', 'italian', 'dutch', 'portuguese', 'chinese', 'japanese', 'korean', 'lithuanian', 'macedonian', 'norwegian_bokmaal', 'polish', 'romanian', 'russian', 'slovenian', 'swedish', 'catalan', 'ukrainian']")
360
- LANGUAGE_CHOICES = get_or_create_env_var("LANGUAGE_CHOICES", "['en', 'fr', 'de', 'es', 'it', 'nl', 'pt', 'zh', 'ja', 'ko', 'lt', 'mk', 'nb', 'pl', 'ro', 'ru', 'sl', 'sv', 'ca', 'uk']")
 
 
 
 
 
 
361
 
362
  ###
363
  # Duplicate detection settings
364
  ###
365
- DEFAULT_DUPLICATE_DETECTION_THRESHOLD = float(get_or_create_env_var("DEFAULT_DUPLICATE_DETECTION_THRESHOLD", "0.95"))
366
- DEFAULT_MIN_CONSECUTIVE_PAGES = int(get_or_create_env_var("DEFAULT_MIN_CONSECUTIVE_PAGES", "1"))
367
- USE_GREEDY_DUPLICATE_DETECTION = get_or_create_env_var("USE_GREEDY_DUPLICATE_DETECTION", "True")
368
- DEFAULT_COMBINE_PAGES = get_or_create_env_var("DEFAULT_COMBINE_PAGES", "True") # Combine text from the same page number within a file. Alternative will enable line-level duplicate detection.
 
 
 
 
 
 
 
 
369
  DEFAULT_MIN_WORD_COUNT = int(get_or_create_env_var("DEFAULT_MIN_WORD_COUNT", "10"))
370
  REMOVE_DUPLICATE_ROWS = get_or_create_env_var("REMOVE_DUPLICATE_ROWS", "False")
371
 
@@ -373,118 +523,182 @@ REMOVE_DUPLICATE_ROWS = get_or_create_env_var("REMOVE_DUPLICATE_ROWS", "False")
373
  ###
374
  # File output options
375
  ###
376
- RETURN_PDF_END_OF_REDACTION = get_or_create_env_var("RETURN_PDF_END_OF_REDACTION", "True") # Return a redacted PDF at the end of the redaction task. Could be useful to set this to "False" if you want to ensure that the user always goes to the 'Review Redactions' tab before getting the final redacted PDF product.
 
 
377
 
378
- COMPRESS_REDACTED_PDF = get_or_create_env_var("COMPRESS_REDACTED_PDF","False") # On low memory systems, the compression options in pymupdf can cause the app to crash if the PDF is longer than 500 pages or so. Setting this to False will save the PDF only with a basic cleaning option enabled
 
 
379
 
380
  ###
381
  # APP RUN OPTIONS
382
  ###
383
 
384
- TLDEXTRACT_CACHE = get_or_create_env_var('TLDEXTRACT_CACHE', 'tmp/tld/')
385
- try: extract = TLDExtract(cache_dir=TLDEXTRACT_CACHE)
386
- except: extract = TLDExtract(cache_dir=None)
 
 
 
387
 
388
  # Get some environment variables and Launch the Gradio app
389
- COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
390
 
391
- RUN_DIRECT_MODE = get_or_create_env_var('RUN_DIRECT_MODE', '0')
392
 
393
  # Direct mode configuration options
394
- DIRECT_MODE_DEFAULT_USER = get_or_create_env_var('DIRECT_MODE_DEFAULT_USER', '') # Default username for cli/direct mode requests
395
- DIRECT_MODE_TASK = get_or_create_env_var('DIRECT_MODE_TASK', 'redact') # 'redact' or 'deduplicate'
396
- DIRECT_MODE_INPUT_FILE = get_or_create_env_var('DIRECT_MODE_INPUT_FILE', '') # Path to input file
397
- DIRECT_MODE_OUTPUT_DIR = get_or_create_env_var('DIRECT_MODE_OUTPUT_DIR', OUTPUT_FOLDER) # Output directory
398
- DIRECT_MODE_DUPLICATE_TYPE = get_or_create_env_var('DIRECT_MODE_DUPLICATE_TYPE', 'pages') # 'pages' or 'tabular'
 
 
 
 
 
 
 
 
 
 
399
 
400
- MAX_QUEUE_SIZE = int(get_or_create_env_var('MAX_QUEUE_SIZE', '5'))
401
 
402
- MAX_FILE_SIZE = get_or_create_env_var('MAX_FILE_SIZE', '250mb').lower()
403
 
404
- GRADIO_SERVER_PORT = int(get_or_create_env_var('GRADIO_SERVER_PORT', '7860'))
405
 
406
- ROOT_PATH = get_or_create_env_var('ROOT_PATH', '')
407
 
408
- DEFAULT_CONCURRENCY_LIMIT = int(get_or_create_env_var('DEFAULT_CONCURRENCY_LIMIT', '3'))
409
 
410
- FILE_INPUT_HEIGHT = get_or_create_env_var('FILE_INPUT_HEIGHT', '200')
411
 
412
  ### ALLOW LIST
413
 
414
- GET_DEFAULT_ALLOW_LIST = get_or_create_env_var('GET_DEFAULT_ALLOW_LIST', 'False')
415
 
416
- ALLOW_LIST_PATH = get_or_create_env_var('ALLOW_LIST_PATH', '') # config/default_allow_list.csv
 
 
417
 
418
- S3_ALLOW_LIST_PATH = get_or_create_env_var('S3_ALLOW_LIST_PATH', '') # default_allow_list.csv # This is a path within the DOCUMENT_REDACTION_BUCKET
 
 
419
 
420
- if ALLOW_LIST_PATH: OUTPUT_ALLOW_LIST_PATH = ALLOW_LIST_PATH
421
- else: OUTPUT_ALLOW_LIST_PATH = 'config/default_allow_list.csv'
 
 
422
 
423
  ### DENY LIST
424
 
425
- GET_DEFAULT_DENY_LIST = get_or_create_env_var('GET_DEFAULT_DENY_LIST', 'False')
426
 
427
- S3_DENY_LIST_PATH = get_or_create_env_var('S3_DENY_LIST_PATH', '') # default_deny_list.csv # This is a path within the DOCUMENT_REDACTION_BUCKET
 
 
428
 
429
- DENY_LIST_PATH = get_or_create_env_var('DENY_LIST_PATH', '') # config/default_deny_list.csv
 
 
430
 
431
- if DENY_LIST_PATH: OUTPUT_DENY_LIST_PATH = DENY_LIST_PATH
432
- else: OUTPUT_DENY_LIST_PATH = 'config/default_deny_list.csv'
 
 
433
 
434
  ### WHOLE PAGE REDACTION LIST
435
 
436
- GET_DEFAULT_WHOLE_PAGE_REDACTION_LIST = get_or_create_env_var('GET_DEFAULT_WHOLE_PAGE_REDACTION_LIST', 'False')
 
 
437
 
438
- S3_WHOLE_PAGE_REDACTION_LIST_PATH = get_or_create_env_var('S3_WHOLE_PAGE_REDACTION_LIST_PATH', '') # default_whole_page_redaction_list.csv # This is a path within the DOCUMENT_REDACTION_BUCKET
 
 
439
 
440
- WHOLE_PAGE_REDACTION_LIST_PATH = get_or_create_env_var('WHOLE_PAGE_REDACTION_LIST_PATH', '') # config/default_whole_page_redaction_list.csv
 
 
441
 
442
- if WHOLE_PAGE_REDACTION_LIST_PATH: OUTPUT_WHOLE_PAGE_REDACTION_LIST_PATH = WHOLE_PAGE_REDACTION_LIST_PATH
443
- else: OUTPUT_WHOLE_PAGE_REDACTION_LIST_PATH = 'config/default_whole_page_redaction_list.csv'
 
 
 
 
444
 
445
  ###
446
  # COST CODE OPTIONS
447
  ###
448
 
449
- SHOW_COSTS = get_or_create_env_var('SHOW_COSTS', 'False')
450
 
451
- GET_COST_CODES = get_or_create_env_var('GET_COST_CODES', 'False')
452
 
453
- DEFAULT_COST_CODE = get_or_create_env_var('DEFAULT_COST_CODE', '')
454
 
455
- COST_CODES_PATH = get_or_create_env_var('COST_CODES_PATH', '') # 'config/COST_CENTRES.csv' # file should be a csv file with a single table in it that has two columns with a header. First column should contain cost codes, second column should contain a name or description for the cost code
 
 
456
 
457
- S3_COST_CODES_PATH = get_or_create_env_var('S3_COST_CODES_PATH', '') # COST_CENTRES.csv # This is a path within the DOCUMENT_REDACTION_BUCKET
458
-
459
- # A default path in case s3 cost code location is provided but no local cost code location given
460
- if COST_CODES_PATH: OUTPUT_COST_CODES_PATH = COST_CODES_PATH
461
- else: OUTPUT_COST_CODES_PATH = 'config/cost_codes.csv'
462
-
463
- ENFORCE_COST_CODES = get_or_create_env_var('ENFORCE_COST_CODES', 'False') # If you have cost codes listed, is it compulsory to choose one before redacting?
464
 
465
- if ENFORCE_COST_CODES == 'True': GET_COST_CODES = 'True'
 
 
 
 
466
 
 
 
 
467
 
 
 
468
 
469
 
470
  ###
471
  # WHOLE DOCUMENT API OPTIONS
472
  ###
473
 
474
- SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS = get_or_create_env_var('SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS', 'False') # This feature not currently implemented
 
 
475
 
476
- TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET = get_or_create_env_var('TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET', '')
 
 
477
 
478
- TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER = get_or_create_env_var('TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER', 'input')
 
 
479
 
480
- TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER = get_or_create_env_var('TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER', 'output')
 
 
481
 
482
- LOAD_PREVIOUS_TEXTRACT_JOBS_S3 = get_or_create_env_var('LOAD_PREVIOUS_TEXTRACT_JOBS_S3', 'False') # Whether or not to load previous Textract jobs from S3
 
 
483
 
484
- TEXTRACT_JOBS_S3_LOC = get_or_create_env_var('TEXTRACT_JOBS_S3_LOC', 'output') # Subfolder in the DOCUMENT_REDACTION_BUCKET where the Textract jobs are stored
 
 
485
 
486
- TEXTRACT_JOBS_S3_INPUT_LOC = get_or_create_env_var('TEXTRACT_JOBS_S3_INPUT_LOC', 'input') # Subfolder in the DOCUMENT_REDACTION_BUCKET where the Textract jobs are stored
 
 
487
 
488
- TEXTRACT_JOBS_LOCAL_LOC = get_or_create_env_var('TEXTRACT_JOBS_LOCAL_LOC', 'output') # Local subfolder where the Textract jobs are stored
 
 
489
 
490
- DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS = int(get_or_create_env_var('DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS', '7')) # How many days into the past should whole document Textract jobs be displayed? After that, the data is not deleted from the Textract jobs csv, but it is just filtered out. Included to align with S3 buckets where the file outputs will be automatically deleted after X days.
 
 
 
1
+ import logging
2
  import os
 
3
  import socket
4
+ import tempfile
5
  from datetime import datetime
6
+ from typing import List
7
+
8
  from dotenv import load_dotenv
9
  from tldextract import TLDExtract
 
10
 
11
  today_rev = datetime.now().strftime("%Y%m%d")
12
  HOST_NAME = socket.gethostname()
13
 
14
+
15
  def _get_env_list(env_var_name: str) -> List[str]:
16
  """Parses a comma-separated environment variable into a list of strings."""
17
+ value = env_var_name[1:-1].strip().replace('"', "").replace("'", "")
18
  if not value:
19
  return []
20
  # Split by comma and filter out any empty strings that might result from extra commas
21
+ return [s.strip() for s in value.split(",") if s.strip()]
22
+
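A minimal usage sketch (editor's illustration, not part of the commit; it assumes the helper is importable from tools.config): the surrounding brackets and quotes are stripped before splitting on commas.

from tools.config import _get_env_list

options = _get_env_list("['Extract handwriting', 'Extract signatures']")
# options == ["Extract handwriting", "Extract signatures"]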
23
 
24
  # Set or retrieve configuration variables for the redaction app
25
 
26
+
27
+ def get_or_create_env_var(var_name: str, default_value: str, print_val: bool = False):
28
+ """
29
  Get an environment variable, and set it to a default value if it doesn't exist
30
+ """
31
  # Get the environment variable if it exists
32
  value = os.environ.get(var_name)
33
+
34
  # If it doesn't exist, set the environment variable to the default value
35
  if value is None:
36
  os.environ[var_name] = default_value
37
  value = default_value
38
 
39
+ if print_val is True:
40
+ print(f"The value of {var_name} is {value}")
41
+
42
  return value
43
 
44
+
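For illustration only (editor's sketch, not from the original code): when the variable is unset, the helper returns the default and also writes it back into the process environment.

import os

from tools.config import get_or_create_env_var

os.environ.pop("GRADIO_SERVER_PORT", None)  # ensure the variable is unset for the demo
port = get_or_create_env_var("GRADIO_SERVER_PORT", "7860")
# port == "7860" and os.environ["GRADIO_SERVER_PORT"] == "7860"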
45
  def add_folder_to_path(folder_path: str):
46
+ """
47
  Check if a folder exists on your system. If so, get the absolute path and then add it to the system Path variable if it doesn't already exist. Function is only relevant for locally-created executable files based on this app (when using pyinstaller it creates a _internal folder that contains tesseract and poppler. These need to be added to the system path to enable the app to run)
48
+ """
49
 
50
  if os.path.exists(folder_path) and os.path.isdir(folder_path):
51
+ # print(folder_path, "folder exists.")
52
 
53
  # Resolve relative path to absolute path
54
  absolute_path = os.path.abspath(folder_path)
55
 
56
+ current_path = os.environ["PATH"]
57
  if absolute_path not in current_path.split(os.pathsep):
58
  full_path_extension = absolute_path + os.pathsep + current_path
59
+ os.environ["PATH"] = full_path_extension
60
+ # print(f"Updated PATH with: ", full_path_extension)
61
  else:
62
  pass
63
+ # print(f"Directory {folder_path} already exists in PATH.")
64
  else:
65
  print(f"Folder not found at {folder_path} - not added to PATH")
66
 
67
+
68
  ###
69
  # LOAD CONFIG FROM ENV FILE
70
  ###
71
 
72
+ CONFIG_FOLDER = get_or_create_env_var("CONFIG_FOLDER", "config/")
73
 
74
  # If you have an aws_config env file in the config folder, you can load in app variables this way, e.g. 'config/app_config.env'
75
+ APP_CONFIG_PATH = get_or_create_env_var(
76
+ "APP_CONFIG_PATH", CONFIG_FOLDER + "app_config.env"
77
+ ) # e.g. config/app_config.env
78
 
79
  if APP_CONFIG_PATH:
80
  if os.path.exists(APP_CONFIG_PATH):
81
  print(f"Loading app variables from config file {APP_CONFIG_PATH}")
82
  load_dotenv(APP_CONFIG_PATH)
83
+ else:
84
+ print("App config file not found at location:", APP_CONFIG_PATH)
85
 
86
  ###
87
  # AWS OPTIONS
88
  ###
89
 
90
  # If you have an aws_config env file in the config folder, you can load in AWS keys this way, e.g. 'env/aws_config.env'
91
+ AWS_CONFIG_PATH = get_or_create_env_var(
92
+ "AWS_CONFIG_PATH", ""
93
+ ) # e.g. config/aws_config.env
94
 
95
  if AWS_CONFIG_PATH:
96
  if os.path.exists(AWS_CONFIG_PATH):
97
  print(f"Loading AWS variables from config file {AWS_CONFIG_PATH}")
98
  load_dotenv(AWS_CONFIG_PATH)
99
+ else:
100
+ print("AWS config file not found at location:", AWS_CONFIG_PATH)
101
 
102
  RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "0")
103
 
104
+ AWS_REGION = get_or_create_env_var("AWS_REGION", "")
105
 
106
+ AWS_CLIENT_ID = get_or_create_env_var("AWS_CLIENT_ID", "")
107
 
108
+ AWS_CLIENT_SECRET = get_or_create_env_var("AWS_CLIENT_SECRET", "")
109
 
110
+ AWS_USER_POOL_ID = get_or_create_env_var("AWS_USER_POOL_ID", "")
111
 
112
+ AWS_ACCESS_KEY = get_or_create_env_var("AWS_ACCESS_KEY", "")
113
+ # if AWS_ACCESS_KEY: print(f'AWS_ACCESS_KEY found in environment variables')
114
 
115
+ AWS_SECRET_KEY = get_or_create_env_var("AWS_SECRET_KEY", "")
116
+ # if AWS_SECRET_KEY: print(f'AWS_SECRET_KEY found in environment variables')
117
 
118
+ DOCUMENT_REDACTION_BUCKET = get_or_create_env_var("DOCUMENT_REDACTION_BUCKET", "")
119
 
120
  # Should the app prioritise using AWS SSO over using API keys stored in environment variables/secrets (defaults to yes)
121
+ PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS = get_or_create_env_var(
122
+ "PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS", "1"
123
+ )
124
 
125
  # Custom headers e.g. if routing traffic through Cloudfront
126
  # Retrieving or setting CUSTOM_HEADER
127
+ CUSTOM_HEADER = get_or_create_env_var("CUSTOM_HEADER", "")
128
 
129
  # Retrieving or setting CUSTOM_HEADER_VALUE
130
+ CUSTOM_HEADER_VALUE = get_or_create_env_var("CUSTOM_HEADER_VALUE", "")
131
 
132
  ###
133
  # Image options
134
  ###
135
+ IMAGES_DPI = float(get_or_create_env_var("IMAGES_DPI", "300.0"))
136
+ LOAD_TRUNCATED_IMAGES = get_or_create_env_var("LOAD_TRUNCATED_IMAGES", "True")
137
+ MAX_IMAGE_PIXELS = get_or_create_env_var(
138
+ "MAX_IMAGE_PIXELS", ""
139
+ ) # Changed to None if blank in file_conversion.py
140
 
141
  ###
142
  # File I/O options
143
  ###
144
 
145
+ SESSION_OUTPUT_FOLDER = get_or_create_env_var(
146
+ "SESSION_OUTPUT_FOLDER", "False"
147
+ ) # i.e. should input and output files be saved within a subfolder (named after the session hash) inside the output/input folders
148
 
149
+ OUTPUT_FOLDER = get_or_create_env_var("GRADIO_OUTPUT_FOLDER", "output/") # 'output/'
150
+ INPUT_FOLDER = get_or_create_env_var("GRADIO_INPUT_FOLDER", "input/") # 'input/'
151
 
152
  # Allow for files to be saved in a temporary folder for increased security in some instances
153
+ if OUTPUT_FOLDER == "TEMP" or INPUT_FOLDER == "TEMP":
154
  # Create a temporary directory
155
  with tempfile.TemporaryDirectory() as temp_dir:
156
+ print(f"Temporary directory created at: {temp_dir}")
157
 
158
+ if OUTPUT_FOLDER == "TEMP":
159
+ OUTPUT_FOLDER = temp_dir + "/"
160
+ if INPUT_FOLDER == "TEMP":
161
+ INPUT_FOLDER = temp_dir + "/"
162
 
163
+ GRADIO_TEMP_DIR = get_or_create_env_var(
164
+ "GRADIO_TEMP_DIR", "tmp/gradio_tmp/"
165
+ ) # Default Gradio temp folder
166
+ MPLCONFIGDIR = get_or_create_env_var(
167
+ "MPLCONFIGDIR", "tmp/matplotlib_cache/"
168
+ ) # Matplotlib cache folder
169
 
170
  ###
171
  # LOGGING OPTIONS
 
174
  # By default, logs are put into a subfolder named after today's date and the host name of the instance running the app. This is to avoid, as far as possible, log files from one instance overwriting the logs of another instance on S3. If the app always runs on one system, or just locally, it is not necessary to make the log folders so specific.
175
  # Another way to address this issue would be to write logs to another type of storage, e.g. database such as dynamodb. I may look into this in future.
176
 
177
+ SAVE_LOGS_TO_CSV = get_or_create_env_var("SAVE_LOGS_TO_CSV", "True")
178
 
179
+ USE_LOG_SUBFOLDERS = get_or_create_env_var("USE_LOG_SUBFOLDERS", "True")
180
 
181
+ FEEDBACK_LOGS_FOLDER = get_or_create_env_var("FEEDBACK_LOGS_FOLDER", "feedback/")
182
+ ACCESS_LOGS_FOLDER = get_or_create_env_var("ACCESS_LOGS_FOLDER", "logs/")
183
+ USAGE_LOGS_FOLDER = get_or_create_env_var("USAGE_LOGS_FOLDER", "usage/")
184
 
185
  if USE_LOG_SUBFOLDERS == "True":
186
+ day_log_subfolder = today_rev + "/"
187
+ host_name_subfolder = HOST_NAME + "/"
188
  full_log_subfolder = day_log_subfolder + host_name_subfolder
189
 
190
  FEEDBACK_LOGS_FOLDER = FEEDBACK_LOGS_FOLDER + full_log_subfolder
191
  ACCESS_LOGS_FOLDER = ACCESS_LOGS_FOLDER + full_log_subfolder
192
  USAGE_LOGS_FOLDER = USAGE_LOGS_FOLDER + full_log_subfolder
193
 
194
+ S3_FEEDBACK_LOGS_FOLDER = get_or_create_env_var(
195
+ "S3_FEEDBACK_LOGS_FOLDER", "feedback/" + full_log_subfolder
196
+ )
197
+ S3_ACCESS_LOGS_FOLDER = get_or_create_env_var(
198
+ "S3_ACCESS_LOGS_FOLDER", "logs/" + full_log_subfolder
199
+ )
200
+ S3_USAGE_LOGS_FOLDER = get_or_create_env_var(
201
+ "S3_USAGE_LOGS_FOLDER", "usage/" + full_log_subfolder
202
+ )
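A concrete illustration of the folder composition above (editor's sketch; the date and host name are hypothetical):

# With USE_LOG_SUBFOLDERS == "True", today_rev == "20250101" and HOST_NAME == "redaction-host-1":
#   FEEDBACK_LOGS_FOLDER    == "feedback/20250101/redaction-host-1/"
#   S3_FEEDBACK_LOGS_FOLDER == "feedback/20250101/redaction-host-1/"
# Access and usage logs follow the same pattern under "logs/" and "usage/".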
203
 
204
  # Should the redacted file name be included in the logs? In some instances, the names of the files themselves could be sensitive, and should not be disclosed beyond the app. So, by default this is false.
205
+ DISPLAY_FILE_NAMES_IN_LOGS = get_or_create_env_var(
206
+ "DISPLAY_FILE_NAMES_IN_LOGS", "False"
207
+ )
208
 
209
  # Further customisation options for CSV logs
210
+ CSV_ACCESS_LOG_HEADERS = get_or_create_env_var(
211
+ "CSV_ACCESS_LOG_HEADERS", ""
212
+ ) # If blank, uses component labels
213
+ CSV_FEEDBACK_LOG_HEADERS = get_or_create_env_var(
214
+ "CSV_FEEDBACK_LOG_HEADERS", ""
215
+ ) # If blank, uses component labels
216
+ CSV_USAGE_LOG_HEADERS = get_or_create_env_var(
217
+ "CSV_USAGE_LOG_HEADERS",
218
+ '["session_hash_textbox", "doc_full_file_name_textbox", "data_full_file_name_textbox", "actual_time_taken_number", "total_page_count", "textract_query_number", "pii_detection_method", "comprehend_query_number", "cost_code", "textract_handwriting_signature", "host_name_textbox", "text_extraction_method", "is_this_a_textract_api_call", "task"]',
219
+ ) # If blank, uses component labels
220
 
221
  ### DYNAMODB logs. Whether to save to DynamoDB, and the headers of the table
222
+ SAVE_LOGS_TO_DYNAMODB = get_or_create_env_var("SAVE_LOGS_TO_DYNAMODB", "False")
223
 
224
+ ACCESS_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var(
225
+ "ACCESS_LOG_DYNAMODB_TABLE_NAME", "redaction_access_log"
226
+ )
227
+ DYNAMODB_ACCESS_LOG_HEADERS = get_or_create_env_var("DYNAMODB_ACCESS_LOG_HEADERS", "")
228
 
229
+ FEEDBACK_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var(
230
+ "FEEDBACK_LOG_DYNAMODB_TABLE_NAME", "redaction_feedback"
231
+ )
232
+ DYNAMODB_FEEDBACK_LOG_HEADERS = get_or_create_env_var(
233
+ "DYNAMODB_FEEDBACK_LOG_HEADERS", ""
234
+ )
235
 
236
+ USAGE_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var(
237
+ "USAGE_LOG_DYNAMODB_TABLE_NAME", "redaction_usage"
238
+ )
239
+ DYNAMODB_USAGE_LOG_HEADERS = get_or_create_env_var("DYNAMODB_USAGE_LOG_HEADERS", "")
240
 
241
  # Report logging to console?
242
+ LOGGING = get_or_create_env_var("LOGGING", "False")
243
 
244
+ if LOGGING == "True":
245
  # Configure logging
246
+ logging.basicConfig(
247
+ level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
248
+ )
249
 
250
+ LOG_FILE_NAME = get_or_create_env_var("LOG_FILE_NAME", "log.csv")
251
+ USAGE_LOG_FILE_NAME = get_or_create_env_var("USAGE_LOG_FILE_NAME", LOG_FILE_NAME)
252
+ FEEDBACK_LOG_FILE_NAME = get_or_create_env_var("FEEDBACK_LOG_FILE_NAME", LOG_FILE_NAME)
253
 
254
 
255
  ###
 
257
  ###
258
 
259
  # Create Tesseract and Poppler folders if you have installed them locally
260
+ TESSERACT_FOLDER = get_or_create_env_var(
261
+ "TESSERACT_FOLDER", ""
262
+ ) # If installing for Windows, install Tesseract 5.5.0 from here: https://github.com/UB-Mannheim/tesseract/wiki. Then this environment variable should point to the Tesseract folder e.g. tesseract/
263
+ TESSERACT_DATA_FOLDER = get_or_create_env_var(
264
+ "TESSERACT_DATA_FOLDER", "/usr/share/tessdata"
265
+ )
266
+ POPPLER_FOLDER = get_or_create_env_var(
267
+ "POPPLER_FOLDER", ""
268
+ ) # If installing on Windows, install Poppler from here: https://github.com/oschwartz10612/poppler-windows. This variable needs to point to the Poppler bin folder e.g. poppler/poppler-24.02.0/Library/bin/
269
+
270
+ if TESSERACT_FOLDER:
271
+ add_folder_to_path(TESSERACT_FOLDER)
272
+ if POPPLER_FOLDER:
273
+ add_folder_to_path(POPPLER_FOLDER)
274
 
275
  # List of models to use for text extraction and PII detection
276
  # Text extraction models
277
+ SELECTABLE_TEXT_EXTRACT_OPTION = get_or_create_env_var(
278
+ "SELECTABLE_TEXT_EXTRACT_OPTION", "Local model - selectable text"
279
+ )
280
+ TESSERACT_TEXT_EXTRACT_OPTION = get_or_create_env_var(
281
+ "TESSERACT_TEXT_EXTRACT_OPTION", "Local OCR model - PDFs without selectable text"
282
+ )
283
+ TEXTRACT_TEXT_EXTRACT_OPTION = get_or_create_env_var(
284
+ "TEXTRACT_TEXT_EXTRACT_OPTION", "AWS Textract service - all PDF types"
285
+ )
286
 
287
  # PII detection models
288
+ NO_REDACTION_PII_OPTION = get_or_create_env_var(
289
+ "NO_REDACTION_PII_OPTION", "Only extract text (no redaction)"
290
+ )
291
+ LOCAL_PII_OPTION = get_or_create_env_var("LOCAL_PII_OPTION", "Local")
292
+ AWS_PII_OPTION = get_or_create_env_var("AWS_PII_OPTION", "AWS Comprehend")
293
+
294
+ SHOW_LOCAL_TEXT_EXTRACTION_OPTIONS = get_or_create_env_var(
295
+ "SHOW_LOCAL_TEXT_EXTRACTION_OPTIONS", "True"
296
+ )
297
+ SHOW_AWS_TEXT_EXTRACTION_OPTIONS = get_or_create_env_var(
298
+ "SHOW_AWS_TEXT_EXTRACTION_OPTIONS", "True"
299
+ )
300
 
301
  # Show at least local options if everything mistakenly removed
302
+ if (
303
+ SHOW_LOCAL_TEXT_EXTRACTION_OPTIONS != "True"
304
+ and SHOW_AWS_TEXT_EXTRACTION_OPTIONS != "True"
305
+ ):
306
  SHOW_LOCAL_TEXT_EXTRACTION_OPTIONS = "True"
307
 
308
  local_model_options = []
309
  aws_model_options = []
310
  text_extraction_models = []
311
 
312
+ if SHOW_LOCAL_TEXT_EXTRACTION_OPTIONS == "True":
313
  local_model_options.append(SELECTABLE_TEXT_EXTRACT_OPTION)
314
  local_model_options.append(TESSERACT_TEXT_EXTRACT_OPTION)
315
 
316
+ if SHOW_AWS_TEXT_EXTRACTION_OPTIONS == "True":
317
  aws_model_options.append(TEXTRACT_TEXT_EXTRACT_OPTION)
318
 
319
  TEXT_EXTRACTION_MODELS = local_model_options + aws_model_options
320
+ DO_INITIAL_TABULAR_DATA_CLEAN = get_or_create_env_var(
321
+ "DO_INITIAL_TABULAR_DATA_CLEAN", "True"
322
+ )
323
+
324
+ SHOW_LOCAL_PII_DETECTION_OPTIONS = get_or_create_env_var(
325
+ "SHOW_LOCAL_PII_DETECTION_OPTIONS", "True"
326
+ )
327
+ SHOW_AWS_PII_DETECTION_OPTIONS = get_or_create_env_var(
328
+ "SHOW_AWS_PII_DETECTION_OPTIONS", "True"
329
+ )
330
+
331
+ if (
332
+ SHOW_LOCAL_PII_DETECTION_OPTIONS != "True"
333
+ and SHOW_AWS_PII_DETECTION_OPTIONS != "True"
334
+ ):
335
  SHOW_LOCAL_PII_DETECTION_OPTIONS = "True"
336
 
337
  local_model_options = [NO_REDACTION_PII_OPTION]
338
  aws_model_options = []
339
  pii_detection_models = []
340
 
341
+ if SHOW_LOCAL_PII_DETECTION_OPTIONS == "True":
342
  local_model_options.append(LOCAL_PII_OPTION)
343
 
344
+ if SHOW_AWS_PII_DETECTION_OPTIONS == "True":
345
  aws_model_options.append(AWS_PII_OPTION)
346
 
347
  PII_DETECTION_MODELS = local_model_options + aws_model_options
348
 
349
  if SHOW_AWS_TEXT_EXTRACTION_OPTIONS == "True":
350
+ DEFAULT_TEXT_EXTRACTION_MODEL = get_or_create_env_var(
351
+ "DEFAULT_TEXT_EXTRACTION_MODEL", TEXTRACT_TEXT_EXTRACT_OPTION
352
+ )
353
  else:
354
+ DEFAULT_TEXT_EXTRACTION_MODEL = get_or_create_env_var(
355
+ "DEFAULT_TEXT_EXTRACTION_MODEL", SELECTABLE_TEXT_EXTRACT_OPTION
356
+ )
357
 
358
  if SHOW_AWS_PII_DETECTION_OPTIONS == "True":
359
+ DEFAULT_PII_DETECTION_MODEL = get_or_create_env_var(
360
+ "DEFAULT_PII_DETECTION_MODEL", AWS_PII_OPTION
361
+ )
362
  else:
363
+ DEFAULT_PII_DETECTION_MODEL = get_or_create_env_var(
364
+ "DEFAULT_PII_DETECTION_MODEL", LOCAL_PII_OPTION
365
+ )
366
 
367
  # Create list of PII detection models for tabular redaction
368
  TABULAR_PII_DETECTION_MODELS = PII_DETECTION_MODELS.copy()
369
  if NO_REDACTION_PII_OPTION in TABULAR_PII_DETECTION_MODELS:
370
  TABULAR_PII_DETECTION_MODELS.remove(NO_REDACTION_PII_OPTION)
371
 
372
+ DEFAULT_TEXT_COLUMNS = get_or_create_env_var("DEFAULT_TEXT_COLUMNS", "[]")
373
+ DEFAULT_EXCEL_SHEETS = get_or_create_env_var("DEFAULT_EXCEL_SHEETS", "[]")
374
 
375
+ DEFAULT_TABULAR_ANONYMISATION_STRATEGY = get_or_create_env_var(
376
+ "DEFAULT_TABULAR_ANONYMISATION_STRATEGY", "redact completely"
377
+ )
378
 
379
  ### Local OCR model - Tesseract vs PaddleOCR
380
+ CHOSEN_LOCAL_OCR_MODEL = get_or_create_env_var(
381
+ "CHOSEN_LOCAL_OCR_MODEL", "tesseract"
382
+ ) # Choose between "tesseract", "hybrid", and "paddle". "paddle" will only return whole-line text extraction, and so will only work for OCR, not redaction. "hybrid" is a combination of the two: a first pass through the redactions is done with Tesseract, and then a second pass is done with PaddleOCR on words with low confidence.
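As a hedged illustration (not part of the commit), the hybrid Tesseract/PaddleOCR pass could be selected through the app config file loaded earlier (e.g. config/app_config.env):

CHOSEN_LOCAL_OCR_MODEL=hybrid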
383
 
384
+ PREPROCESS_LOCAL_OCR_IMAGES = get_or_create_env_var(
385
+ "PREPROCESS_LOCAL_OCR_IMAGES", "True"
386
+ ) # Whether to try to preprocess images before extracting text. NOTE: I have found in testing that this doesn't necessarily improve results, and it greatly slows down extraction.
387
 
388
  # Entities for redaction
389
+ CHOSEN_COMPREHEND_ENTITIES = get_or_create_env_var(
390
+ "CHOSEN_COMPREHEND_ENTITIES",
391
+ "['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE', 'PASSPORT_NUMBER','DRIVER_ID', 'USERNAME','PASSWORD', 'IP_ADDRESS','MAC_ADDRESS', 'LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER', 'INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER']",
392
+ )
393
 
394
+ FULL_COMPREHEND_ENTITY_LIST = get_or_create_env_var(
395
+ "FULL_COMPREHEND_ENTITY_LIST",
396
+ "['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE','SSN','DATE_TIME','PASSPORT_NUMBER','DRIVER_ID','URL','AGE','USERNAME','PASSWORD','AWS_ACCESS_KEY','AWS_SECRET_KEY','IP_ADDRESS','MAC_ADDRESS','ALL','LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER','CA_SOCIAL_INSURANCE_NUMBER','US_INDIVIDUAL_TAX_IDENTIFICATION_NUMBER','UK_UNIQUE_TAXPAYER_REFERENCE_NUMBER','IN_PERMANENT_ACCOUNT_NUMBER','IN_NREGA','INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER','CA_HEALTH_NUMBER','IN_AADHAAR','IN_VOTER_NUMBER', 'CUSTOM_FUZZY']",
397
+ )
398
 
399
  # Entities for local PII redaction option
400
+ CHOSEN_REDACT_ENTITIES = get_or_create_env_var(
401
+ "CHOSEN_REDACT_ENTITIES",
402
+ "['TITLES', 'PERSON', 'PHONE_NUMBER', 'EMAIL_ADDRESS', 'STREETNAME', 'UKPOSTCODE', 'CUSTOM']",
403
+ )
404
+
405
+ FULL_ENTITY_LIST = get_or_create_env_var(
406
+ "FULL_ENTITY_LIST",
407
+ "['TITLES', 'PERSON', 'PHONE_NUMBER', 'EMAIL_ADDRESS', 'STREETNAME', 'UKPOSTCODE', 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS', 'CUSTOM', 'CUSTOM_FUZZY']",
408
+ )
409
+
410
+ CUSTOM_ENTITIES = get_or_create_env_var(
411
+ "CUSTOM_ENTITIES", "['TITLES', 'UKPOSTCODE', 'STREETNAME', 'CUSTOM']"
412
+ )
413
+
414
+
415
+ DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX = get_or_create_env_var(
416
+ "DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX", "['Extract handwriting']"
417
+ )
418
+
419
+ HANDWRITE_SIGNATURE_TEXTBOX_FULL_OPTIONS = get_or_create_env_var(
420
+ "HANDWRITE_SIGNATURE_TEXTBOX_FULL_OPTIONS",
421
+ "['Extract handwriting', 'Extract signatures']",
422
+ )
423
+
424
+ if HANDWRITE_SIGNATURE_TEXTBOX_FULL_OPTIONS:
425
+ HANDWRITE_SIGNATURE_TEXTBOX_FULL_OPTIONS = _get_env_list(
426
+ HANDWRITE_SIGNATURE_TEXTBOX_FULL_OPTIONS
427
+ )
428
+
429
+ INCLUDE_FORM_EXTRACTION_TEXTRACT_OPTION = get_or_create_env_var(
430
+ "INCLUDE_FORM_EXTRACTION_TEXTRACT_OPTION", "False"
431
+ )
432
+ INCLUDE_LAYOUT_EXTRACTION_TEXTRACT_OPTION = get_or_create_env_var(
433
+ "INCLUDE_LAYOUT_EXTRACTION_TEXTRACT_OPTION", "False"
434
+ )
435
+ INCLUDE_TABLE_EXTRACTION_TEXTRACT_OPTION = get_or_create_env_var(
436
+ "INCLUDE_TABLE_EXTRACTION_TEXTRACT_OPTION", "False"
437
+ )
438
 
439
  if INCLUDE_FORM_EXTRACTION_TEXTRACT_OPTION == "True":
440
+ HANDWRITE_SIGNATURE_TEXTBOX_FULL_OPTIONS.append("Extract forms")
441
  if INCLUDE_LAYOUT_EXTRACTION_TEXTRACT_OPTION == "True":
442
+ HANDWRITE_SIGNATURE_TEXTBOX_FULL_OPTIONS.append("Extract layout")
443
  if INCLUDE_TABLE_EXTRACTION_TEXTRACT_OPTION == "True":
444
+ HANDWRITE_SIGNATURE_TEXTBOX_FULL_OPTIONS.append("Extract tables")
445
 
446
 
447
+ DEFAULT_SEARCH_QUERY = get_or_create_env_var("DEFAULT_SEARCH_QUERY", "")
448
+ DEFAULT_FUZZY_SPELLING_MISTAKES_NUM = int(
449
+ get_or_create_env_var("DEFAULT_FUZZY_SPELLING_MISTAKES_NUM", "1")
450
+ )
451
 
452
+ DEFAULT_PAGE_MIN = int(get_or_create_env_var("DEFAULT_PAGE_MIN", "0"))
453
 
454
+ DEFAULT_PAGE_MAX = int(get_or_create_env_var("DEFAULT_PAGE_MAX", "999"))
455
 
456
 
457
  # Number of pages to loop through before breaking the function and restarting from the last finished page (not currently activated).
458
+ PAGE_BREAK_VALUE = int(get_or_create_env_var("PAGE_BREAK_VALUE", "99999"))
459
 
460
+ MAX_TIME_VALUE = int(get_or_create_env_var("MAX_TIME_VALUE", "999999"))
461
+ MAX_SIMULTANEOUS_FILES = int(get_or_create_env_var("MAX_SIMULTANEOUS_FILES", "10"))
462
+ MAX_DOC_PAGES = int(get_or_create_env_var("MAX_DOC_PAGES", "3000"))
463
+ MAX_TABLE_ROWS = int(get_or_create_env_var("MAX_TABLE_ROWS", "250000"))
464
+ MAX_TABLE_COLUMNS = int(get_or_create_env_var("MAX_TABLE_COLUMNS", "100"))
465
+ MAX_OPEN_TEXT_CHARACTERS = int(
466
+ get_or_create_env_var("MAX_OPEN_TEXT_CHARACTERS", "50000")
467
+ )
468
 
469
+ CUSTOM_BOX_COLOUR = get_or_create_env_var(
470
+ "CUSTOM_BOX_COLOUR", ""
471
+ ) # only "grey" is currently supported as a custom box colour
472
 
473
  ### Language selection options
474
 
475
  SHOW_LANGUAGE_SELECTION = get_or_create_env_var("SHOW_LANGUAGE_SELECTION", "False")
476
 
477
+ DEFAULT_LANGUAGE_FULL_NAME = get_or_create_env_var(
478
+ "DEFAULT_LANGUAGE_FULL_NAME", "english"
479
+ )
480
+ DEFAULT_LANGUAGE = get_or_create_env_var(
481
+ "DEFAULT_LANGUAGE", "en"
482
+ ) # For tesseract, ensure the Tesseract language data (e.g., fra.traineddata) is installed on your system. You can find the relevant language packs here: https://github.com/tesseract-ocr/tessdata.
483
  # For paddle, ensure the paddle language data (e.g., fra.traineddata) is installed on your system. You can find information on supported languages here: https://www.paddleocr.ai/main/en/version3.x/algorithm/PP-OCRv5/PP-OCRv5_multi_languages.html
484
  # For AWS Comprehend, only English and Spanish are supported https://docs.aws.amazon.com/comprehend/latest/dg/how-pii.html ['en', 'es']
485
  # AWS Textract automatically detects the language of the document and supports the following languages: https://aws.amazon.com/textract/faqs/#topic-0. 'English, Spanish, Italian, Portuguese, French, German. Handwriting, Invoices and Receipts, Identity documents and Queries processing are in English only'
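For concreteness (editor's sketch; the choice of language is hypothetical): running the local models in French would mean setting the two defaults below, e.g. in config/app_config.env, and installing the matching Tesseract language pack (fra.traineddata).

DEFAULT_LANGUAGE_FULL_NAME=french
DEFAULT_LANGUAGE=fr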
486
 
487
+ textract_language_choices = get_or_create_env_var(
488
+ "textract_language_choices", "['en', 'es', 'fr', 'de', 'it', 'pt']"
489
+ )
490
+ aws_comprehend_language_choices = get_or_create_env_var(
491
+ "aws_comprehend_language_choices", "['en', 'es']"
492
+ )
493
 
494
  # The choices that the user sees
495
+ MAPPED_LANGUAGE_CHOICES = get_or_create_env_var(
496
+ "MAPPED_LANGUAGE_CHOICES",
497
+ "['english', 'french', 'german', 'spanish', 'italian', 'dutch', 'portuguese', 'chinese', 'japanese', 'korean', 'lithuanian', 'macedonian', 'norwegian_bokmaal', 'polish', 'romanian', 'russian', 'slovenian', 'swedish', 'catalan', 'ukrainian']",
498
+ )
499
+ LANGUAGE_CHOICES = get_or_create_env_var(
500
+ "LANGUAGE_CHOICES",
501
+ "['en', 'fr', 'de', 'es', 'it', 'nl', 'pt', 'zh', 'ja', 'ko', 'lt', 'mk', 'nb', 'pl', 'ro', 'ru', 'sl', 'sv', 'ca', 'uk']",
502
+ )
503
 
504
  ###
505
  # Duplicate detection settings
506
  ###
507
+ DEFAULT_DUPLICATE_DETECTION_THRESHOLD = float(
508
+ get_or_create_env_var("DEFAULT_DUPLICATE_DETECTION_THRESHOLD", "0.95")
509
+ )
510
+ DEFAULT_MIN_CONSECUTIVE_PAGES = int(
511
+ get_or_create_env_var("DEFAULT_MIN_CONSECUTIVE_PAGES", "1")
512
+ )
513
+ USE_GREEDY_DUPLICATE_DETECTION = get_or_create_env_var(
514
+ "USE_GREEDY_DUPLICATE_DETECTION", "True"
515
+ )
516
+ DEFAULT_COMBINE_PAGES = get_or_create_env_var(
517
+ "DEFAULT_COMBINE_PAGES", "True"
518
+ ) # Combine text from the same page number within a file. The alternative enables line-level duplicate detection.
519
  DEFAULT_MIN_WORD_COUNT = int(get_or_create_env_var("DEFAULT_MIN_WORD_COUNT", "10"))
520
  REMOVE_DUPLICATE_ROWS = get_or_create_env_var("REMOVE_DUPLICATE_ROWS", "False")
521
 
 
523
  ###
524
  # File output options
525
  ###
526
+ RETURN_PDF_END_OF_REDACTION = get_or_create_env_var(
527
+ "RETURN_PDF_END_OF_REDACTION", "True"
528
+ ) # Return a redacted PDF at the end of the redaction task. It can be useful to set this to "False" if you want to ensure that the user always goes to the 'Review Redactions' tab before getting the final redacted PDF.
529
 
530
+ COMPRESS_REDACTED_PDF = get_or_create_env_var(
531
+ "COMPRESS_REDACTED_PDF", "False"
532
+ ) # On low-memory systems, the compression options in pymupdf can cause the app to crash if the PDF is longer than 500 pages or so. Setting this to False will save the PDF with only a basic cleaning option enabled.
533
 
534
  ###
535
  # APP RUN OPTIONS
536
  ###
537
 
538
+ TLDEXTRACT_CACHE = get_or_create_env_var("TLDEXTRACT_CACHE", "tmp/tld/")
539
+ try:
540
+ extract = TLDExtract(cache_dir=TLDEXTRACT_CACHE)
541
+ except Exception as e:
542
+ print(f"Error initialising TLDExtract: {e}")
543
+ extract = TLDExtract(cache_dir=None)
544
 
545
  # Get some environment variables and Launch the Gradio app
546
+ COGNITO_AUTH = get_or_create_env_var("COGNITO_AUTH", "0")
547
 
548
+ RUN_DIRECT_MODE = get_or_create_env_var("RUN_DIRECT_MODE", "0")
549
 
550
  # Direct mode configuration options
551
+ DIRECT_MODE_DEFAULT_USER = get_or_create_env_var(
552
+ "DIRECT_MODE_DEFAULT_USER", ""
553
+ ) # Default username for cli/direct mode requests
554
+ DIRECT_MODE_TASK = get_or_create_env_var(
555
+ "DIRECT_MODE_TASK", "redact"
556
+ ) # 'redact' or 'deduplicate'
557
+ DIRECT_MODE_INPUT_FILE = get_or_create_env_var(
558
+ "DIRECT_MODE_INPUT_FILE", ""
559
+ ) # Path to input file
560
+ DIRECT_MODE_OUTPUT_DIR = get_or_create_env_var(
561
+ "DIRECT_MODE_OUTPUT_DIR", OUTPUT_FOLDER
562
+ ) # Output directory
563
+ DIRECT_MODE_DUPLICATE_TYPE = get_or_create_env_var(
564
+ "DIRECT_MODE_DUPLICATE_TYPE", "pages"
565
+ ) # 'pages' or 'tabular'
566
 
567
+ MAX_QUEUE_SIZE = int(get_or_create_env_var("MAX_QUEUE_SIZE", "5"))
568
 
569
+ MAX_FILE_SIZE = get_or_create_env_var("MAX_FILE_SIZE", "250mb").lower()
570
 
571
+ GRADIO_SERVER_PORT = int(get_or_create_env_var("GRADIO_SERVER_PORT", "7860"))
572
 
573
+ ROOT_PATH = get_or_create_env_var("ROOT_PATH", "")
574
 
575
+ DEFAULT_CONCURRENCY_LIMIT = int(get_or_create_env_var("DEFAULT_CONCURRENCY_LIMIT", "3"))
576
 
577
+ FILE_INPUT_HEIGHT = get_or_create_env_var("FILE_INPUT_HEIGHT", "200")
578
 
579
  ### ALLOW LIST
580
 
581
+ GET_DEFAULT_ALLOW_LIST = get_or_create_env_var("GET_DEFAULT_ALLOW_LIST", "False")
582
 
583
+ ALLOW_LIST_PATH = get_or_create_env_var(
584
+ "ALLOW_LIST_PATH", ""
585
+ ) # config/default_allow_list.csv
586
 
587
+ S3_ALLOW_LIST_PATH = get_or_create_env_var(
588
+ "S3_ALLOW_LIST_PATH", ""
589
+ ) # default_allow_list.csv # This is a path within the DOCUMENT_REDACTION_BUCKET
590
 
591
+ if ALLOW_LIST_PATH:
592
+ OUTPUT_ALLOW_LIST_PATH = ALLOW_LIST_PATH
593
+ else:
594
+ OUTPUT_ALLOW_LIST_PATH = "config/default_allow_list.csv"
595
 
596
  ### DENY LIST
597
 
598
+ GET_DEFAULT_DENY_LIST = get_or_create_env_var("GET_DEFAULT_DENY_LIST", "False")
599
 
600
+ S3_DENY_LIST_PATH = get_or_create_env_var(
601
+ "S3_DENY_LIST_PATH", ""
602
+ ) # default_deny_list.csv # This is a path within the DOCUMENT_REDACTION_BUCKET
603
 
604
+ DENY_LIST_PATH = get_or_create_env_var(
605
+ "DENY_LIST_PATH", ""
606
+ ) # config/default_deny_list.csv
607
 
608
+ if DENY_LIST_PATH:
609
+ OUTPUT_DENY_LIST_PATH = DENY_LIST_PATH
610
+ else:
611
+ OUTPUT_DENY_LIST_PATH = "config/default_deny_list.csv"
612
 
613
  ### WHOLE PAGE REDACTION LIST
614
 
615
+ GET_DEFAULT_WHOLE_PAGE_REDACTION_LIST = get_or_create_env_var(
616
+ "GET_DEFAULT_WHOLE_PAGE_REDACTION_LIST", "False"
617
+ )
618
 
619
+ S3_WHOLE_PAGE_REDACTION_LIST_PATH = get_or_create_env_var(
620
+ "S3_WHOLE_PAGE_REDACTION_LIST_PATH", ""
621
+ ) # default_whole_page_redaction_list.csv # This is a path within the DOCUMENT_REDACTION_BUCKET
622
 
623
+ WHOLE_PAGE_REDACTION_LIST_PATH = get_or_create_env_var(
624
+ "WHOLE_PAGE_REDACTION_LIST_PATH", ""
625
+ ) # config/default_whole_page_redaction_list.csv
626
 
627
+ if WHOLE_PAGE_REDACTION_LIST_PATH:
628
+ OUTPUT_WHOLE_PAGE_REDACTION_LIST_PATH = WHOLE_PAGE_REDACTION_LIST_PATH
629
+ else:
630
+ OUTPUT_WHOLE_PAGE_REDACTION_LIST_PATH = (
631
+ "config/default_whole_page_redaction_list.csv"
632
+ )
633
 
634
  ###
635
  # COST CODE OPTIONS
636
  ###
637
 
638
+ SHOW_COSTS = get_or_create_env_var("SHOW_COSTS", "False")
639
 
640
+ GET_COST_CODES = get_or_create_env_var("GET_COST_CODES", "False")
641
 
642
+ DEFAULT_COST_CODE = get_or_create_env_var("DEFAULT_COST_CODE", "")
643
 
644
+ COST_CODES_PATH = get_or_create_env_var(
645
+ "COST_CODES_PATH", ""
646
+ ) # 'config/COST_CENTRES.csv' # file should be a csv file with a single table in it that has two columns with a header. First column should contain cost codes, second column should contain a name or description for the cost code
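For illustration (editor's sketch; the header names, codes and descriptions are made up), a cost codes file matching the description above could look like:

cost_code,description
CC-1001,Housing casework
CC-2002,Adult social care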
647
 
648
+ S3_COST_CODES_PATH = get_or_create_env_var(
649
+ "S3_COST_CODES_PATH", ""
650
+ ) # COST_CENTRES.csv # This is a path within the DOCUMENT_REDACTION_BUCKET
 
 
 
 
651
 
652
+ # A default path in case s3 cost code location is provided but no local cost code location given
653
+ if COST_CODES_PATH:
654
+ OUTPUT_COST_CODES_PATH = COST_CODES_PATH
655
+ else:
656
+ OUTPUT_COST_CODES_PATH = "config/cost_codes.csv"
657
 
658
+ ENFORCE_COST_CODES = get_or_create_env_var(
659
+ "ENFORCE_COST_CODES", "False"
660
+ ) # If you have cost codes listed, is it compulsory to choose one before redacting?
661
 
662
+ if ENFORCE_COST_CODES == "True":
663
+ GET_COST_CODES = "True"
664
 
665
 
666
  ###
667
  # WHOLE DOCUMENT API OPTIONS
668
  ###
669
 
670
+ SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS = get_or_create_env_var(
671
+ "SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS", "False"
672
+ ) # This feature not currently implemented
673
 
674
+ TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET = get_or_create_env_var(
675
+ "TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET", ""
676
+ )
677
 
678
+ TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER = get_or_create_env_var(
679
+ "TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER", "input"
680
+ )
681
 
682
+ TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER = get_or_create_env_var(
683
+ "TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER", "output"
684
+ )
685
 
686
+ LOAD_PREVIOUS_TEXTRACT_JOBS_S3 = get_or_create_env_var(
687
+ "LOAD_PREVIOUS_TEXTRACT_JOBS_S3", "False"
688
+ ) # Whether or not to load previous Textract jobs from S3
689
 
690
+ TEXTRACT_JOBS_S3_LOC = get_or_create_env_var(
691
+ "TEXTRACT_JOBS_S3_LOC", "output"
692
+ ) # Subfolder in the DOCUMENT_REDACTION_BUCKET where the Textract jobs are stored
693
 
694
+ TEXTRACT_JOBS_S3_INPUT_LOC = get_or_create_env_var(
695
+ "TEXTRACT_JOBS_S3_INPUT_LOC", "input"
696
+ ) # Subfolder in the DOCUMENT_REDACTION_BUCKET where the Textract job input files are stored
697
 
698
+ TEXTRACT_JOBS_LOCAL_LOC = get_or_create_env_var(
699
+ "TEXTRACT_JOBS_LOCAL_LOC", "output"
700
+ ) # Local subfolder where the Textract jobs are stored
701
 
702
+ DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS = int(
703
+ get_or_create_env_var("DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS", "7")
704
+ ) # How many days into the past should whole document Textract jobs be displayed? After that, the data is not deleted from the Textract jobs csv, but it is just filtered out. Included to align with S3 buckets where the file outputs will be automatically deleted after X days.
tools/custom_csvlogger.py CHANGED
@@ -1,28 +1,30 @@
1
  from __future__ import annotations
2
- import contextlib
3
  import csv
4
- import datetime
5
- from datetime import datetime
6
  import os
7
  import re
8
- import boto3
9
- import botocore
10
- import uuid
11
  import time
 
12
  from collections.abc import Sequence
13
- from multiprocessing import Lock
14
  from pathlib import Path
 
 
 
15
  from typing import TYPE_CHECKING, Any
16
- from gradio_client import utils as client_utils
17
- import gradio as gr
 
18
  from gradio import utils
19
- from tools.config import AWS_REGION, AWS_ACCESS_KEY, AWS_SECRET_KEY, RUN_AWS_FUNCTIONS
20
 
 
21
 
22
  if TYPE_CHECKING:
23
  from gradio.components import Component
 
24
  from gradio.flagging import FlaggingCallback
25
- from threading import Lock
26
 
27
  class CSVLogger_custom(FlaggingCallback):
28
  """
@@ -68,15 +70,15 @@ class CSVLogger_custom(FlaggingCallback):
68
  self.first_time = True
69
 
70
  def _create_dataset_file(
71
- self,
72
- additional_headers: list[str] | None = None,
73
- replacement_headers: list[str] | None = None
74
- ):
75
  os.makedirs(self.flagging_dir, exist_ok=True)
76
 
77
  if replacement_headers:
78
  if additional_headers is None:
79
- additional_headers = list()
80
 
81
  if len(replacement_headers) != len(self.components):
82
  raise ValueError(
@@ -87,10 +89,14 @@ class CSVLogger_custom(FlaggingCallback):
87
  else:
88
  if additional_headers is None:
89
  additional_headers = []
90
- headers = [
91
- getattr(component, "label", None) or f"component {idx}"
92
- for idx, component in enumerate(self.components)
93
- ] + additional_headers + ["timestamp"]
 
 
 
 
94
 
95
  headers = utils.sanitize_list_for_csv(headers)
96
  dataset_files = list(Path(self.flagging_dir).glob("dataset*.csv"))
@@ -130,16 +136,16 @@ class CSVLogger_custom(FlaggingCallback):
130
  print("Using existing dataset file at:", self.dataset_filepath)
131
 
132
  def flag(
133
- self,
134
- flag_data: list[Any],
135
- flag_option: str | None = None,
136
- username: str | None = None,
137
- save_to_csv: bool = True,
138
- save_to_dynamodb: bool = False,
139
- dynamodb_table_name: str | None = None,
140
- dynamodb_headers: list[str] | None = None, # New: specify headers for DynamoDB
141
- replacement_headers: list[str] | None = None
142
- ) -> int:
143
  if self.first_time:
144
  additional_headers = list()
145
  if flag_option is not None:
@@ -147,7 +153,10 @@ class CSVLogger_custom(FlaggingCallback):
147
  if username is not None:
148
  additional_headers.append("username")
149
  additional_headers.append("id")
150
- self._create_dataset_file(additional_headers=additional_headers, replacement_headers=replacement_headers)
 
 
 
151
  self.first_time = False
152
 
153
  csv_data = list()
@@ -180,59 +189,77 @@ class CSVLogger_custom(FlaggingCallback):
180
  generated_id = str(uuid.uuid4())
181
  csv_data.append(generated_id)
182
 
183
- timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3] # Correct format for Amazon Athena
184
- csv_data.append(timestamp)
 
 
185
 
186
  # Build the headers
187
- headers = (
188
- [getattr(component, "label", None) or f"component {idx}" for idx, component in enumerate(self.components)]
189
- )
 
190
  if flag_option is not None:
191
  headers.append("flag")
192
  if username is not None:
193
  headers.append("username")
194
  headers.append("id")
195
- headers.append("timestamp")
196
 
197
  line_count = -1
198
 
199
  if save_to_csv:
200
  with self.lock:
201
- with open(self.dataset_filepath, "a", newline="", encoding="utf-8") as csvfile:
 
 
202
  writer = csv.writer(csvfile)
203
  writer.writerow(utils.sanitize_list_for_csv(csv_data))
204
  with open(self.dataset_filepath, encoding="utf-8") as csvfile:
205
  line_count = len(list(csv.reader(csvfile))) - 1
206
 
207
- if save_to_dynamodb == True:
208
 
209
  if RUN_AWS_FUNCTIONS == "1":
210
  try:
211
  print("Connecting to DynamoDB via existing SSO connection")
212
- dynamodb = boto3.resource('dynamodb', region_name=AWS_REGION)
213
-
214
- test_connection = dynamodb.meta.client.list_tables()
215
 
216
  except Exception as e:
217
- print("No SSO credentials found:", e)
218
  if AWS_ACCESS_KEY and AWS_SECRET_KEY:
219
  print("Trying DynamoDB credentials from environment variables")
220
- dynamodb = boto3.resource('dynamodb',aws_access_key_id=AWS_ACCESS_KEY,
221
- aws_secret_access_key=AWS_SECRET_KEY, region_name=AWS_REGION)
 
 
 
 
222
  else:
223
- raise Exception("AWS credentials for DynamoDB logging not found")
 
 
224
  else:
225
  raise Exception("AWS credentials for DynamoDB logging not found")
226
-
227
  if dynamodb_table_name is None:
228
- raise ValueError("You must provide a dynamodb_table_name if save_to_dynamodb is True")
229
-
230
- if dynamodb_headers: dynamodb_headers = dynamodb_headers
231
- if not dynamodb_headers and replacement_headers: dynamodb_headers = replacement_headers
232
- elif headers: dynamodb_headers = headers
 
 
 
 
 
233
  elif not dynamodb_headers:
234
- raise ValueError("Headers not found. You must provide dynamodb_headers or replacement_headers to create a new table.")
235
-
 
 
236
  if flag_option is not None:
237
  if "flag" not in dynamodb_headers:
238
  dynamodb_headers.append("flag")
@@ -249,22 +276,27 @@ class CSVLogger_custom(FlaggingCallback):
249
  table = dynamodb.Table(dynamodb_table_name)
250
  table.load()
251
  except botocore.exceptions.ClientError as e:
252
- if e.response['Error']['Code'] == 'ResourceNotFoundException':
253
-
254
  attribute_definitions = [
255
- {'AttributeName': 'id', 'AttributeType': 'S'} # Only define key attributes here
 
 
 
256
  ]
257
 
258
  table = dynamodb.create_table(
259
  TableName=dynamodb_table_name,
260
  KeySchema=[
261
- {'AttributeName': 'id', 'KeyType': 'HASH'} # Partition key
262
  ],
263
  AttributeDefinitions=attribute_definitions,
264
- BillingMode='PAY_PER_REQUEST'
265
- )
266
  # Wait until the table exists
267
- table.meta.client.get_waiter('table_exists').wait(TableName=dynamodb_table_name)
 
 
268
  time.sleep(5)
269
  print(f"Table '{dynamodb_table_name}' created successfully.")
270
  else:
@@ -274,12 +306,17 @@ class CSVLogger_custom(FlaggingCallback):
274
 
275
  try:
276
  item = {
277
- 'id': str(generated_id), # UUID primary key
278
- 'timestamp': timestamp,
279
  }
280
 
281
  # Map the headers to values
282
- item.update({header: str(value) for header, value in zip(dynamodb_headers, csv_data)})
 
 
 
 
 
283
 
284
  table.put_item(Item=item)
285
 
@@ -287,4 +324,4 @@ class CSVLogger_custom(FlaggingCallback):
287
  except Exception as e:
288
  print("Could not upload log to DynamobDB due to", e)
289
 
290
- return line_count
 
 from __future__ import annotations
+
 import csv
 import os
 import re
 import time
+import uuid
 from collections.abc import Sequence
+from datetime import datetime
 from pathlib import Path
+
+# from multiprocessing import Lock
+from threading import Lock
 from typing import TYPE_CHECKING, Any
+
+import boto3
+import botocore
 from gradio import utils
+from gradio_client import utils as client_utils
 
+from tools.config import AWS_ACCESS_KEY, AWS_REGION, AWS_SECRET_KEY, RUN_AWS_FUNCTIONS
 
 if TYPE_CHECKING:
     from gradio.components import Component
+
 from gradio.flagging import FlaggingCallback
+
  class CSVLogger_custom(FlaggingCallback):
30
  """
 
70
  self.first_time = True
71
 
72
  def _create_dataset_file(
73
+ self,
74
+ additional_headers: list[str] | None = None,
75
+ replacement_headers: list[str] | None = None,
76
+ ):
77
  os.makedirs(self.flagging_dir, exist_ok=True)
78
 
79
  if replacement_headers:
80
  if additional_headers is None:
81
+ additional_headers = list()
82
 
83
  if len(replacement_headers) != len(self.components):
84
  raise ValueError(
 
89
  else:
90
  if additional_headers is None:
91
  additional_headers = []
92
+ headers = (
93
+ [
94
+ getattr(component, "label", None) or f"component {idx}"
95
+ for idx, component in enumerate(self.components)
96
+ ]
97
+ + additional_headers
98
+ + ["timestamp"]
99
+ )
100
 
101
  headers = utils.sanitize_list_for_csv(headers)
102
  dataset_files = list(Path(self.flagging_dir).glob("dataset*.csv"))
 
136
  print("Using existing dataset file at:", self.dataset_filepath)
137
 
138
  def flag(
139
+ self,
140
+ flag_data: list[Any],
141
+ flag_option: str | None = None,
142
+ username: str | None = None,
143
+ save_to_csv: bool = True,
144
+ save_to_dynamodb: bool = False,
145
+ dynamodb_table_name: str | None = None,
146
+ dynamodb_headers: list[str] | None = None, # New: specify headers for DynamoDB
147
+ replacement_headers: list[str] | None = None,
148
+ ) -> int:
149
  if self.first_time:
150
  additional_headers = list()
151
  if flag_option is not None:
 
153
  if username is not None:
154
  additional_headers.append("username")
155
  additional_headers.append("id")
156
+ self._create_dataset_file(
157
+ additional_headers=additional_headers,
158
+ replacement_headers=replacement_headers,
159
+ )
160
  self.first_time = False
161
 
162
  csv_data = list()
 
189
  generated_id = str(uuid.uuid4())
190
  csv_data.append(generated_id)
191
 
192
+ timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[
193
+ :-3
194
+ ] # Correct format for Amazon Athena
195
+ csv_data.append(timestamp)
196
 
197
  # Build the headers
198
+ headers = [
199
+ getattr(component, "label", None) or f"component {idx}"
200
+ for idx, component in enumerate(self.components)
201
+ ]
202
  if flag_option is not None:
203
  headers.append("flag")
204
  if username is not None:
205
  headers.append("username")
206
  headers.append("id")
207
+ headers.append("timestamp")
208
 
209
  line_count = -1
210
 
211
  if save_to_csv:
212
  with self.lock:
213
+ with open(
214
+ self.dataset_filepath, "a", newline="", encoding="utf-8"
215
+ ) as csvfile:
216
  writer = csv.writer(csvfile)
217
  writer.writerow(utils.sanitize_list_for_csv(csv_data))
218
  with open(self.dataset_filepath, encoding="utf-8") as csvfile:
219
  line_count = len(list(csv.reader(csvfile))) - 1
220
 
221
+ if save_to_dynamodb is True:
222
 
223
  if RUN_AWS_FUNCTIONS == "1":
224
  try:
225
  print("Connecting to DynamoDB via existing SSO connection")
226
+ dynamodb = boto3.resource("dynamodb", region_name=AWS_REGION)
227
+
228
+ dynamodb.meta.client.list_tables()
229
 
230
  except Exception as e:
231
+ print("No SSO credentials found:", e)
232
  if AWS_ACCESS_KEY and AWS_SECRET_KEY:
233
  print("Trying DynamoDB credentials from environment variables")
234
+ dynamodb = boto3.resource(
235
+ "dynamodb",
236
+ aws_access_key_id=AWS_ACCESS_KEY,
237
+ aws_secret_access_key=AWS_SECRET_KEY,
238
+ region_name=AWS_REGION,
239
+ )
240
  else:
241
+ raise Exception(
242
+ "AWS credentials for DynamoDB logging not found"
243
+ )
244
  else:
245
  raise Exception("AWS credentials for DynamoDB logging not found")
246
+
247
  if dynamodb_table_name is None:
248
+ raise ValueError(
249
+ "You must provide a dynamodb_table_name if save_to_dynamodb is True"
250
+ )
251
+
252
+ if dynamodb_headers:
253
+ dynamodb_headers = dynamodb_headers
254
+ if not dynamodb_headers and replacement_headers:
255
+ dynamodb_headers = replacement_headers
256
+ elif headers:
257
+ dynamodb_headers = headers
258
  elif not dynamodb_headers:
259
+ raise ValueError(
260
+ "Headers not found. You must provide dynamodb_headers or replacement_headers to create a new table."
261
+ )
262
+
263
  if flag_option is not None:
264
  if "flag" not in dynamodb_headers:
265
  dynamodb_headers.append("flag")
 
276
  table = dynamodb.Table(dynamodb_table_name)
277
  table.load()
278
  except botocore.exceptions.ClientError as e:
279
+ if e.response["Error"]["Code"] == "ResourceNotFoundException":
280
+
281
  attribute_definitions = [
282
+ {
283
+ "AttributeName": "id",
284
+ "AttributeType": "S",
285
+ } # Only define key attributes here
286
  ]
287
 
288
  table = dynamodb.create_table(
289
  TableName=dynamodb_table_name,
290
  KeySchema=[
291
+ {"AttributeName": "id", "KeyType": "HASH"} # Partition key
292
  ],
293
  AttributeDefinitions=attribute_definitions,
294
+ BillingMode="PAY_PER_REQUEST",
295
+ )
296
  # Wait until the table exists
297
+ table.meta.client.get_waiter("table_exists").wait(
298
+ TableName=dynamodb_table_name
299
+ )
300
  time.sleep(5)
301
  print(f"Table '{dynamodb_table_name}' created successfully.")
302
  else:
 
306
 
307
  try:
308
  item = {
309
+ "id": str(generated_id), # UUID primary key
310
+ "timestamp": timestamp,
311
  }
312
 
313
  # Map the headers to values
314
+ item.update(
315
+ {
316
+ header: str(value)
317
+ for header, value in zip(dynamodb_headers, csv_data)
318
+ }
319
+ )
320
 
321
  table.put_item(Item=item)
322
 
 
324
  except Exception as e:
325
  print("Could not upload log to DynamobDB due to", e)
326
 
327
+ return line_count
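Below is a minimal, hypothetical usage sketch of the logger above. The constructor arguments, component labels, flagging directory and flag option are illustrative, and the setup() call is assumed to follow Gradio's FlaggingCallback interface.

import gradio as gr

logger = CSVLogger_custom()
components = [gr.Textbox(label="query"), gr.Textbox(label="response")]
logger.setup(components, flagging_dir="feedback_logs")  # assumed FlaggingCallback interface

line_count = logger.flag(
    flag_data=["What gets redacted?", "Names and emails"],
    flag_option="thumbs_up",
    username="demo_user",
    save_to_csv=True,
    save_to_dynamodb=False,  # set True only when AWS credentials and a table name are configured
)
print(f"{line_count} rows logged so far")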
tools/custom_image_analyser_engine.py CHANGED
@@ -1,32 +1,40 @@
-import pytesseract
-import numpy as np
-import pandas as pd
-import gradio as gr
-from presidio_analyzer import AnalyzerEngine, RecognizerResult
-from typing import List, Dict, Optional, Union, Tuple, Any
-from dataclasses import dataclass
-import time
-import cv2
-import re
 import copy
-import botocore
 from copy import deepcopy
 from pdfminer.layout import LTChar
 from PIL import Image
-from typing import Optional, Tuple, Union
 from tools.helper_functions import clean_unicode_text
-from tools.presidio_analyzer_custom import recognizer_result_from_dict
 from tools.load_spacy_model_custom_recognisers import custom_entities
-from tools.config import PREPROCESS_LOCAL_OCR_IMAGES, DEFAULT_LANGUAGE, LOCAL_PII_OPTION, AWS_PII_OPTION
 
-if PREPROCESS_LOCAL_OCR_IMAGES == "True": PREPROCESS_LOCAL_OCR_IMAGES = True
-else: PREPROCESS_LOCAL_OCR_IMAGES = False
 
 try:
     from paddleocr import PaddleOCR
 except ImportError:
     PaddleOCR = None
 
 # --- Language utilities ---
 def _normalize_lang(language: str) -> str:
     return language.strip().lower().replace("-", "_") if language else "en"
@@ -38,35 +46,75 @@ def _tesseract_lang_code(language: str) -> str:
 
     mapping = {
         # Common
-        "en": "eng", "eng": "eng",
-        "fr": "fra", "fre": "fra", "fra": "fra",
-        "de": "deu", "ger": "deu", "deu": "deu",
-        "es": "spa", "spa": "spa",
-        "it": "ita", "ita": "ita",
-        "nl": "nld", "dut": "nld", "nld": "nld",
-        "pt": "por", "por": "por",
-        "ru": "rus", "rus": "rus",
-        "ar": "ara", "ara": "ara",
         # Nordics
-        "sv": "swe", "swe": "swe",
-        "no": "nor", "nb": "nor", "nn": "nor", "nor": "nor",
-        "fi": "fin", "fin": "fin",
-        "da": "dan", "dan": "dan",
         # Eastern/Central
-        "pl": "pol", "pol": "pol",
-        "cs": "ces", "cz": "ces", "ces": "ces",
-        "hu": "hun", "hun": "hun",
-        "ro": "ron", "rum": "ron", "ron": "ron",
-        "bg": "bul", "bul": "bul",
-        "el": "ell", "gre": "ell", "ell": "ell",
         # Asian
-        "ja": "jpn", "jp": "jpn", "jpn": "jpn",
-        "zh": "chi_sim", "zh_cn": "chi_sim", "zh_hans": "chi_sim", "chi_sim": "chi_sim",
-        "zh_tw": "chi_tra", "zh_hk": "chi_tra", "zh_tr": "chi_tra", "chi_tra": "chi_tra",
-        "hi": "hin", "hin": "hin",
-        "bn": "ben", "ben": "ben",
-        "ur": "urd", "urd": "urd",
-        "fa": "fas", "per": "fas", "fas": "fas",
     }
71
 
72
  return mapping.get(lang, "eng")
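As a quick illustration of the mapping above (a sketch, assuming the table shown): normalised ISO-style codes resolve to Tesseract traineddata codes, and anything unknown falls back to English.

for requested in ["en", "FR", "zh-CN", "pt", "xx"]:
    print(requested, "->", _tesseract_lang_code(requested))
# en -> eng, FR -> fra, zh-CN -> chi_sim, pt -> por, xx -> eng (fallback)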
@@ -107,6 +155,7 @@ def _paddle_lang_code(language: str) -> str:
107
 
108
  return mapping.get(lang, "en")
109
 
 
110
  @dataclass
111
  class OCRResult:
112
  text: str
@@ -117,6 +166,7 @@ class OCRResult:
117
  conf: float = None
118
  line: int = None
119
 
 
120
  @dataclass
121
  class CustomImageRecognizerResult:
122
  entity_type: str
@@ -127,9 +177,12 @@ class CustomImageRecognizerResult:
127
  top: int
128
  width: int
129
  height: int
130
- text: str
 
 
131
  class ImagePreprocessor:
132
  """ImagePreprocessor class. Parent class for image preprocessing objects."""
 
133
  def __init__(self, use_greyscale: bool = True) -> None:
134
  self.use_greyscale = use_greyscale
135
 
@@ -146,11 +199,13 @@ class ImagePreprocessor:
146
  return img
147
 
148
  @staticmethod
149
- def _get_bg_color(image: np.ndarray, is_greyscale: bool, invert: bool = False) -> Union[int, Tuple[int, int, int]]:
 
 
150
  # Note: Modified to expect numpy array for bincount
151
  if invert:
152
- image = 255 - image # Simple inversion for greyscale numpy array
153
-
154
  if is_greyscale:
155
  bg_color = int(np.bincount(image.flatten()).argmax())
156
  else:
@@ -158,6 +213,7 @@ class ImagePreprocessor:
158
  # For this pipeline, we only use greyscale, so it's fine.
159
  # A simple alternative:
160
  from scipy import stats
 
161
  bg_color = tuple(stats.mode(image.reshape(-1, 3), axis=0)[0][0])
162
  return bg_color
163
 
@@ -166,10 +222,14 @@ class ImagePreprocessor:
166
  contrast = np.std(image)
167
  mean_intensity = np.mean(image)
168
  return contrast, mean_intensity
169
-
 
170
  class BilateralFilter(ImagePreprocessor):
171
  """Applies bilateral filtering."""
172
- def __init__(self, diameter: int = 9, sigma_color: int = 75, sigma_space: int = 75) -> None:
 
 
 
173
  super().__init__(use_greyscale=True)
174
  self.diameter = diameter
175
  self.sigma_color = sigma_color
@@ -177,16 +237,32 @@ class BilateralFilter(ImagePreprocessor):
177
 
178
  def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, dict]:
179
  # Modified to accept and return numpy array for consistency in the pipeline
180
- filtered_image = cv2.bilateralFilter(image, self.diameter, self.sigma_color, self.sigma_space)
181
- metadata = {"diameter": self.diameter, "sigma_color": self.sigma_color, "sigma_space": self.sigma_space}
 
 
 
 
 
 
182
  return filtered_image, metadata
183
-
 
184
  class SegmentedAdaptiveThreshold(ImagePreprocessor):
185
  """Applies adaptive thresholding."""
186
- def __init__(self, block_size: int = 21, contrast_threshold: int = 40, c_low_contrast: int = 5,
187
- c_high_contrast: int = 10, bg_threshold: int = 127) -> None:
 
 
 
 
 
 
 
188
  super().__init__(use_greyscale=True)
189
- self.block_size = block_size if block_size % 2 == 1 else block_size + 1 # Ensure odd
 
 
190
  self.c_low_contrast = c_low_contrast
191
  self.c_high_contrast = c_high_contrast
192
  self.bg_threshold = bg_threshold
@@ -196,20 +272,37 @@ class SegmentedAdaptiveThreshold(ImagePreprocessor):
196
  # Modified to accept and return numpy array
197
  background_color = self._get_bg_color(image, True)
198
  contrast, _ = self._get_image_contrast(image)
199
- c = self.c_low_contrast if contrast <= self.contrast_threshold else self.c_high_contrast
 
 
 
 
200
 
201
- if background_color < self.bg_threshold: # Dark background, light text
202
  adaptive_threshold_image = cv2.adaptiveThreshold(
203
- image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, self.block_size, -c
 
 
 
 
 
204
  )
205
- else: # Light background, dark text
206
  adaptive_threshold_image = cv2.adaptiveThreshold(
207
- image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, self.block_size, c
 
 
 
 
 
208
  )
209
  metadata = {"C": c, "background_color": background_color, "contrast": contrast}
210
  return adaptive_threshold_image, metadata
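A self-contained sketch of the same contrast-aware thresholding idea, assuming an 8-bit greyscale numpy image; the constants mirror the defaults above but are illustrative.

import cv2
import numpy as np

def adaptive_binarise(grey: np.ndarray, block_size: int = 21,
                      contrast_threshold: int = 40) -> np.ndarray:
    """Sketch: choose C from image contrast, invert when the background is dark."""
    block_size = block_size if block_size % 2 == 1 else block_size + 1  # block size must be odd
    bg = int(np.bincount(grey.flatten()).argmax())        # most common pixel value = background
    c = 5 if np.std(grey) <= contrast_threshold else 10   # gentler C on low-contrast scans
    mode = cv2.THRESH_BINARY_INV if bg < 127 else cv2.THRESH_BINARY
    sign = -1 if bg < 127 else 1
    return cv2.adaptiveThreshold(grey, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                 mode, block_size, sign * c)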
 
 
211
  class ImageRescaling(ImagePreprocessor):
212
  """Rescales images based on their size."""
 
213
  def __init__(self, target_dpi: int = 300, assumed_input_dpi: int = 96) -> None:
214
  super().__init__(use_greyscale=True)
215
  self.target_dpi = target_dpi
@@ -224,17 +317,19 @@ class ImageRescaling(ImagePreprocessor):
224
  width = int(image.shape[1] * scale_factor)
225
  height = int(image.shape[0] * scale_factor)
226
  dimensions = (width, height)
227
-
228
  # Use better interpolation for upscaling vs downscaling
229
  interpolation = cv2.INTER_CUBIC if scale_factor > 1.0 else cv2.INTER_AREA
230
  rescaled_image = cv2.resize(image, dimensions, interpolation=interpolation)
231
  metadata["scale_factor"] = scale_factor
232
  return rescaled_image, metadata
233
-
234
  return image, metadata
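A sketch of the DPI-based rescaling rule used above, assuming a 96 DPI input and a 300 DPI target as in the defaults.

import cv2
import numpy as np

def rescale_to_target_dpi(grey: np.ndarray, assumed_input_dpi: int = 96,
                          target_dpi: int = 300):
    """Sketch: upscale with cubic interpolation, downscale with area interpolation."""
    scale = target_dpi / assumed_input_dpi
    if scale == 1.0:
        return grey, 1.0
    interp = cv2.INTER_CUBIC if scale > 1.0 else cv2.INTER_AREA
    new_size = (int(grey.shape[1] * scale), int(grey.shape[0] * scale))
    return cv2.resize(grey, new_size, interpolation=interp), scale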
235
 
 
236
  class ContrastSegmentedImageEnhancer(ImagePreprocessor):
237
  """Class containing all logic to perform contrastive segmentation."""
 
238
  def __init__(
239
  self,
240
  bilateral_filter: Optional[BilateralFilter] = None,
@@ -260,7 +355,9 @@ class ContrastSegmentedImageEnhancer(ImagePreprocessor):
260
  adjusted_contrast = contrast
261
  return adjusted_image, contrast, adjusted_contrast
262
 
263
- def preprocess_image(self, image: Image.Image, perform_binarization: bool = False) -> Tuple[Image.Image, dict]:
 
 
264
  """
265
  A corrected, logical pipeline for OCR preprocessing.
266
  Order: Greyscale -> Rescale -> Denoise -> Enhance Contrast -> Binarize
@@ -271,7 +368,9 @@ class ContrastSegmentedImageEnhancer(ImagePreprocessor):
271
  image_np = self.convert_image_to_array(image)
272
 
273
  # 2. Rescale image to optimal DPI (while still greyscale)
274
- rescaled_image_np, scale_metadata = self.image_rescaling.preprocess_image(image_np)
 
 
275
 
276
  # 3. Apply bilateral filtering for noise reduction
277
  filtered_image_np, _ = self.bilateral_filter.preprocess_image(rescaled_image_np)
@@ -281,31 +380,32 @@ class ContrastSegmentedImageEnhancer(ImagePreprocessor):
281
 
282
  # 5. Adaptive Thresholding (Binarization) - This is the final step
283
  if perform_binarization:
284
- final_image_np, threshold_metadata = self.adaptive_threshold.preprocess_image(
285
- adjusted_image_np
286
  )
287
  else:
288
  final_image_np = adjusted_image_np
289
  threshold_metadata = {}
290
-
291
  # Combine metadata
292
  final_metadata = {**scale_metadata, **threshold_metadata}
293
-
294
  # Convert final numpy array back to PIL Image for return
295
  return Image.fromarray(final_image_np), final_metadata
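A compressed sketch of the pipeline order the docstring describes (greyscale -> rescale -> denoise -> enhance contrast -> optional binarise). The histogram equalisation here is a stand-in for the class's own contrast adjustment, which is not shown in this hunk.

import cv2
import numpy as np
from PIL import Image

def preprocess_for_ocr(image: Image.Image, binarise: bool = False) -> Image.Image:
    """Sketch of the order: greyscale -> rescale -> denoise -> contrast -> (binarise)."""
    grey = np.array(image.convert("L"))
    grey = cv2.resize(grey, None, fx=300 / 96, fy=300 / 96, interpolation=cv2.INTER_CUBIC)
    grey = cv2.bilateralFilter(grey, 9, 75, 75)   # denoise while preserving edges
    grey = cv2.equalizeHist(grey)                 # simple contrast enhancement (stand-in)
    if binarise:
        grey = cv2.adaptiveThreshold(grey, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                     cv2.THRESH_BINARY, 21, 10)
    return Image.fromarray(grey)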
296
 
297
- def rescale_ocr_data(ocr_data, scale_factor:float):
298
-
 
299
  # We loop from 0 to the number of detected words.
300
- num_boxes = len(ocr_data['text'])
301
  for i in range(num_boxes):
302
  # We only want to process actual words, not empty boxes Tesseract might find
303
- if int(ocr_data['conf'][i]) > -1: # -1 confidence is for structural elements
304
  # Get coordinates from the processed image using the index 'i'
305
- x_proc = ocr_data['left'][i]
306
- y_proc = ocr_data['top'][i]
307
- w_proc = ocr_data['width'][i]
308
- h_proc = ocr_data['height'][i]
309
 
310
  # Apply the inverse transformation (division)
311
  x_orig = int(x_proc / scale_factor)
@@ -315,49 +415,54 @@ def rescale_ocr_data(ocr_data, scale_factor:float):
315
 
316
  # --- THE MAPPING STEP ---
317
  # Update the dictionary values in-place using the same index 'i'
318
- ocr_data['left'][i] = x_orig
319
- ocr_data['top'][i] = y_orig
320
- ocr_data['width'][i] = w_orig
321
- ocr_data['height'][i] = h_orig
322
-
323
  return ocr_data
324
 
325
- def filter_entities_for_language(entities: List[str], valid_language_entities: List[str], language: str) -> List[str]:
 
 
 
326
 
327
  if not valid_language_entities:
328
  print(f"No valid entities supported for language: {language}")
329
- #raise Warning(f"No valid entities supported for language: {language}")
330
  if not entities:
331
  print(f"No entities provided for language: {language}")
332
- #raise Warning(f"No entities provided for language: {language}")
333
-
334
- #print("entities:", entities)
335
- #print("valid_language_entities:", valid_language_entities)
336
- # print("language:", language)
337
 
338
  filtered_entities = [
339
- entity for entity in entities
340
- if entity in valid_language_entities
341
  ]
342
 
343
  if not filtered_entities:
344
  print(f"No relevant entities supported for language: {language}")
345
- #raise Warning(f"No relevant entities supported for language: {language}")
346
 
347
  if language != "en":
348
- gr.Info(f"Using {str(filtered_entities)} entities for local model analysis for language: {language}")
 
 
349
 
350
  return filtered_entities
351
 
 
352
  class CustomImageAnalyzerEngine:
353
  def __init__(
354
  self,
355
  analyzer_engine: Optional[AnalyzerEngine] = None,
356
- ocr_engine: str = "tesseract",
357
  tesseract_config: Optional[str] = None,
358
  paddle_kwargs: Optional[Dict[str, Any]] = None,
359
  image_preprocessor: Optional[ImagePreprocessor] = None,
360
- language: Optional[str] = DEFAULT_LANGUAGE
361
  ):
362
  """
363
  Initializes the CustomImageAnalyzerEngine.
@@ -370,7 +475,9 @@ class CustomImageAnalyzerEngine:
370
  :param language: Preferred OCR language (e.g., "en", "fr", "de"). Defaults to DEFAULT_LANGUAGE.
371
  """
372
  if ocr_engine not in ["tesseract", "paddle", "hybrid"]:
373
- raise ValueError("ocr_engine must be either 'tesseract', 'hybrid', or 'paddle'")
 
 
374
 
375
  self.ocr_engine = ocr_engine
376
 
@@ -378,23 +485,28 @@ class CustomImageAnalyzerEngine:
378
  self.language = language or DEFAULT_LANGUAGE or "en"
379
  self.tesseract_lang = _tesseract_lang_code(self.language)
380
  self.paddle_lang = _paddle_lang_code(self.language)
381
-
382
  if self.ocr_engine == "paddle" or self.ocr_engine == "hybrid":
383
  if PaddleOCR is None:
384
- raise ImportError("paddleocr is not installed. Please run 'pip install paddleocr paddlepaddle'")
 
 
385
  # Default paddle configuration if none provided
386
  if paddle_kwargs is None:
387
- paddle_kwargs = {'use_textline_orientation': True, 'lang': self.paddle_lang}
 
 
 
388
  else:
389
  # Enforce language if not explicitly provided
390
- paddle_kwargs.setdefault('lang', self.paddle_lang)
391
  self.paddle_ocr = PaddleOCR(**paddle_kwargs)
392
 
393
  if not analyzer_engine:
394
  analyzer_engine = AnalyzerEngine()
395
  self.analyzer_engine = analyzer_engine
396
 
397
- self.tesseract_config = tesseract_config or '--oem 3 --psm 11'
398
 
399
  if not image_preprocessor:
400
  image_preprocessor = ContrastSegmentedImageEnhancer()
@@ -403,71 +515,82 @@ class CustomImageAnalyzerEngine:
403
  def _sanitize_filename(self, text: str, max_length: int = 20) -> str:
404
  """
405
  Sanitizes text for use in filenames by removing invalid characters and limiting length.
406
-
407
  :param text: The text to sanitize
408
  :param max_length: Maximum length of the sanitized text
409
  :return: Sanitized text safe for filenames
410
  """
411
-
412
  # Remove or replace invalid filename characters
413
  # Windows: < > : " | ? * \ /
414
  # Unix: / (forward slash)
415
  # Also remove control characters and other problematic chars
416
  invalid_chars = r'[<>:"|?*\\/\x00-\x1f\x7f-\x9f]'
417
- sanitized = re.sub(invalid_chars, '_', text)
418
-
419
  # Replace multiple consecutive underscores with a single one
420
- sanitized = re.sub(r'_+', '_', sanitized)
421
-
422
  # Remove leading/trailing underscores and spaces
423
- sanitized = sanitized.strip('_ ')
424
-
425
  # If empty after sanitization, use a default value
426
  if not sanitized:
427
- sanitized = 'text'
428
-
429
  # Limit to max_length characters
430
  if len(sanitized) > max_length:
431
  sanitized = sanitized[:max_length]
432
  # Ensure we don't end with an underscore if we cut in the middle
433
- sanitized = sanitized.rstrip('_')
434
-
435
  return sanitized
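A standalone sketch of the same sanitisation rules (strip invalid characters, collapse underscores, trim, cap the length):

import re

def sanitise_for_filename(text: str, max_length: int = 20) -> str:
    """Sketch: make arbitrary OCR text safe to embed in a filename."""
    text = re.sub(r'[<>:"|?*\\/\x00-\x1f\x7f-\x9f]', "_", text)  # drop invalid characters
    text = re.sub(r"_+", "_", text).strip("_ ")                  # collapse and trim underscores
    return (text or "text")[:max_length].rstrip("_")

print(sanitise_for_filename('John: Doe?'))  # -> John_ Doe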
436
 
437
- def _convert_paddle_to_tesseract_format(self, paddle_results: List[Any]) -> Dict[str, List]:
 
 
438
  """Converts PaddleOCR result format to Tesseract's dictionary format. NOTE: This attempts to create word-level bounding boxes by estimating the distance between characters in sentence-level text output. This is currently quite inaccurate, and word-level bounding boxes should not be relied upon."""
439
 
440
- output = {'text': [], 'left': [], 'top': [], 'width': [], 'height': [], 'conf': []}
 
 
 
 
 
 
 
441
 
442
  # paddle_results is now a list of dictionaries with detailed information
443
  if not paddle_results:
444
  return output
445
-
446
  for page_result in paddle_results:
447
  # Extract text recognition results from the new format
448
- rec_texts = page_result.get('rec_texts', [])
449
- rec_scores = page_result.get('rec_scores', [])
450
- rec_polys = page_result.get('rec_polys', [])
451
-
452
- for line_text, line_confidence, bounding_box in zip(rec_texts, rec_scores, rec_polys):
 
 
453
  # bounding_box is now a numpy array with shape (4, 2)
454
  # Convert to list of coordinates if it's a numpy array
455
- if hasattr(bounding_box, 'tolist'):
456
  box = bounding_box.tolist()
457
  else:
458
  box = bounding_box
459
-
460
  # box is [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
461
  x_coords = [p[0] for p in box]
462
  y_coords = [p[1] for p in box]
463
-
464
  line_left = int(min(x_coords))
465
  line_top = int(min(y_coords))
466
  line_width = int(max(x_coords) - line_left)
467
  line_height = int(max(y_coords) - line_top)
468
- #line_y_center = (max(y_coords) + min(y_coords)) / 2
469
 
470
- # 2. Split the line into words
471
  words = line_text.split()
472
  if not words:
473
  continue
@@ -482,69 +605,78 @@ class CustomImageAnalyzerEngine:
482
  for word in words:
483
  word_width = int(len(word) * avg_char_width)
484
  word_left = line_left + int(current_char_offset * avg_char_width)
485
-
486
- output['text'].append(word)
487
- output['left'].append(word_left)
488
- output['top'].append(line_top)
489
- output['width'].append(word_width)
490
- output['height'].append(line_height)
491
  # Use the line's confidence for each word derived from it
492
- output['conf'].append(int(line_confidence * 100))
493
 
494
  # Update offset for the next word (add word length + 1 for the space)
495
  current_char_offset += len(word) + 1
496
-
497
  return output
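A sketch of the geometry the converter above applies to each PaddleOCR line: collapse the quadrilateral to an axis-aligned box, then estimate word boxes from average character width. The polygon and text here are illustrative, and, as the docstring warns, the resulting word boxes are approximate.

# Sketch: reduce one PaddleOCR polygon plus its line text to estimated word boxes.
poly = [[100, 50], [420, 52], [421, 80], [99, 78]]            # illustrative quad (4 corners)
line_text, line_conf = "John Smith 07700 900123", 0.93

xs, ys = [p[0] for p in poly], [p[1] for p in poly]
left, top = int(min(xs)), int(min(ys))
width, height = int(max(xs)) - left, int(max(ys)) - top

avg_char_width = width / len(line_text)
offset = 0
for word in line_text.split():
    word_left = left + int(offset * avg_char_width)
    word_width = int(len(word) * avg_char_width)
    print(word, (word_left, top, word_width, height), int(line_conf * 100))
    offset += len(word) + 1                                    # +1 for the space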
498
-
499
  def _perform_hybrid_ocr(
500
- self,
501
- image: Image.Image,
502
- confidence_threshold: int = 65,
503
- padding: int = 5,
504
- ocr: Optional[Any] = None
505
- ) -> Dict[str, list]:
506
  """
507
  Performs OCR using Tesseract for bounding boxes and PaddleOCR for low-confidence text.
508
  Returns data in the same dictionary format as pytesseract.image_to_data.
509
  """
510
  if ocr is None:
511
- if hasattr(self, 'paddle_ocr') and self.paddle_ocr is not None:
512
  ocr = self.paddle_ocr
513
  else:
514
- raise ValueError("No OCR object provided and 'paddle_ocr' is not initialized.")
515
-
 
 
516
  print("Starting hybrid OCR process...")
517
-
518
  # 1. Get initial word-level results from Tesseract
519
  tesseract_data = pytesseract.image_to_data(
520
  image,
521
  output_type=pytesseract.Output.DICT,
522
  config=self.tesseract_config,
523
- lang=self.tesseract_lang
524
  )
525
 
526
- #tesseract_data['abs_line_id'] = tesseract_data.groupby(['block_num', 'par_num', 'line_num']).ngroup()
527
-
528
- final_data = {'text': [], 'left': [], 'top': [], 'width': [], 'height': [], 'conf': []}
529
-
530
- num_words = len(tesseract_data['text'])
 
 
 
 
 
 
 
531
 
532
  # This handles the "no text on page" case. If num_words is 0, the loop is skipped
533
  # and an empty dictionary with empty lists is returned, which is the correct behavior.
534
  for i in range(num_words):
535
- text = tesseract_data['text'][i]
536
- conf = int(tesseract_data['conf'][i])
537
-
538
  # Skip empty text boxes or non-word elements (like page/block markers)
539
  if not text.strip() or conf == -1:
540
  continue
541
 
542
- left = tesseract_data['left'][i]
543
- top = tesseract_data['top'][i]
544
- width = tesseract_data['width'][i]
545
- height = tesseract_data['height'][i]
546
- #line_number = tesseract_data['abs_line_id'][i]
547
-
548
  # If confidence is low, use PaddleOCR for a second opinion
549
  if conf < confidence_threshold:
550
  img_width, img_height = image.size
@@ -552,66 +684,76 @@ class CustomImageAnalyzerEngine:
552
  crop_top = max(0, top - padding)
553
  crop_right = min(img_width, left + width + padding + 15)
554
  crop_bottom = min(img_height, top + height + padding)
555
-
556
  # Ensure crop dimensions are valid
557
  if crop_right <= crop_left or crop_bottom <= crop_top:
558
- continue # Skip invalid crops
559
 
560
- cropped_image = image.crop((crop_left, crop_top, crop_right, crop_bottom))
 
 
561
  cropped_image_np = np.array(cropped_image)
562
-
563
  if len(cropped_image_np.shape) == 2:
564
  cropped_image_np = np.stack([cropped_image_np] * 3, axis=-1)
565
-
566
  paddle_results = ocr.predict(cropped_image_np)
567
-
568
  if paddle_results and paddle_results[0]:
569
- rec_texts = paddle_results[0].get('rec_texts', [])
570
- rec_scores = paddle_results[0].get('rec_scores', [])
571
-
572
  if rec_texts and rec_scores:
573
  new_text = " ".join(rec_texts)
574
- new_conf = int(round(np.median(rec_scores) * 100,0))
575
 
576
  # Only replace if Paddle's confidence is better
577
  if new_conf > conf:
578
- print(f" Re-OCR'd word: '{text}' (conf: {conf}) -> '{new_text}' (conf: {new_conf:.0f})")
 
 
579
 
580
  # For exporting example image comparisons, not used here
581
  safe_text = self._sanitize_filename(text, max_length=20)
582
- safe_new_text = self._sanitize_filename(new_text, max_length=20)
583
  output_image_path = f"examples/tess_vs_paddle_examples/{conf}_conf_{safe_text}_to_{new_text}_{new_conf}.png"
584
  cropped_image.save(output_image_path)
585
 
586
  text = new_text
587
  conf = new_conf
588
-
589
  else:
590
- print(f" '{text}' (conf: {conf}) -> Paddle result '{new_text}' (conf: {new_conf:.0f}) was not better. Keeping original.")
 
 
591
  else:
592
  # Paddle ran but found nothing, so discard the original low-confidence word
593
- print(f" '{text}' (conf: {conf}) -> No text found by Paddle. Discarding.")
594
- text = ''
 
 
595
  else:
596
  # Paddle found nothing, discard original word
597
- print(f" '{text}' (conf: {conf}) -> No text found by Paddle. Discarding.")
598
- text = ''
 
 
599
 
600
  # Append the final result (either original, replaced, or skipped if empty)
601
  if text.strip():
602
- final_data['text'].append(clean_unicode_text(text))
603
- final_data['left'].append(left)
604
- final_data['top'].append(top)
605
- final_data['width'].append(width)
606
- final_data['height'].append(height)
607
- final_data['conf'].append(int(conf))
608
- #final_data['line_number'].append(int(line_number))
609
-
610
  return final_data
611
-
612
- def perform_ocr(self,
613
- image: Union[str, Image.Image, np.ndarray],
614
- ocr: Optional[Any] = None) -> List[OCRResult]:
615
  """
616
  Performs OCR on the given image using the configured engine.
617
  """
@@ -619,10 +761,12 @@ class CustomImageAnalyzerEngine:
619
  image = Image.open(image)
620
  elif isinstance(image, np.ndarray):
621
  image = Image.fromarray(image)
622
-
623
  # Pre-process image - currently seems to give worse results!
624
- if str(PREPROCESS_LOCAL_OCR_IMAGES).lower() == 'true':
625
- image, preprocessing_metadata = self.image_preprocessor.preprocess_image(image)
 
 
626
  else:
627
  preprocessing_metadata = {}
628
 
@@ -637,68 +781,71 @@ class CustomImageAnalyzerEngine:
637
  image,
638
  output_type=pytesseract.Output.DICT,
639
  config=self.tesseract_config,
640
- lang=self.tesseract_lang # Ensure the Tesseract language data (e.g., fra.traineddata) is installed on your system.
641
  )
642
 
643
- #ocr_data['abs_line_id'] = ocr_data.groupby(['block_num', 'par_num', 'line_num']).ngroup()
644
 
645
  elif self.ocr_engine == "paddle":
646
 
647
- image_np = np.array(image) # image_processed
648
-
649
  # PaddleOCR may need an RGB image. Ensure it has 3 channels.
650
  if len(image_np.shape) == 2:
651
  image_np = np.stack([image_np] * 3, axis=-1)
652
 
653
  if ocr is None:
654
- if hasattr(self, 'paddle_ocr') and self.paddle_ocr is not None:
655
  ocr = self.paddle_ocr
656
  else:
657
- raise ValueError("No OCR object provided and 'paddle_ocr' is not initialised.")
 
 
658
 
659
- #ocr = PaddleOCR(use_textline_orientation=True, lang='en')
660
  paddle_results = ocr.predict(image_np)
661
  ocr_data = self._convert_paddle_to_tesseract_format(paddle_results)
662
 
663
  else:
664
  raise RuntimeError(f"Unsupported OCR engine: {self.ocr_engine}")
665
-
666
  if preprocessing_metadata:
667
- scale_factor = preprocessing_metadata.get('scale_factor', 1.0)
668
  ocr_data = rescale_ocr_data(ocr_data, scale_factor)
669
 
670
  # The rest of your processing pipeline now works for both engines
671
  ocr_result = ocr_data
672
-
673
  # Filter out empty strings and low confidence results
674
  valid_indices = [
675
- i for i, text in enumerate(ocr_result['text'])
676
- if text.strip() and int(ocr_result['conf'][i]) > 0
 
677
  ]
678
-
679
  return [
680
  OCRResult(
681
- text=clean_unicode_text(ocr_result['text'][i]),
682
- left=ocr_result['left'][i],
683
- top=ocr_result['top'][i],
684
- width=ocr_result['width'][i],
685
- height=ocr_result['height'][i]#,
686
- #line_number=ocr_result['abs_line_id'][i]
687
  )
688
  for i in valid_indices
689
  ]
690
 
691
  def analyze_text(
692
- self,
693
- line_level_ocr_results: List[OCRResult],
694
  ocr_results_with_words: Dict[str, Dict],
695
  chosen_redact_comprehend_entities: List[str],
696
  pii_identification_method: str = LOCAL_PII_OPTION,
697
- comprehend_client = "",
698
- custom_entities:List[str]=custom_entities,
699
  language: Optional[str] = DEFAULT_LANGUAGE,
700
  nlp_analyser: AnalyzerEngine = None,
701
- **text_analyzer_kwargs
702
  ) -> List[CustomImageRecognizerResult]:
703
 
704
  page_text = ""
@@ -719,9 +866,11 @@ class CustomImageAnalyzerEngine:
719
  page_text_mapping.append((start_pos, i, line_level_ocr_result, None))
720
 
721
  # Determine language for downstream services
722
- aws_language = language or getattr(self, 'language', None) or 'en'
723
 
724
- valid_language_entities = nlp_analyser.registry.get_supported_entities(languages=[language])
 
 
725
  if "CUSTOM" not in valid_language_entities:
726
  valid_language_entities.append("CUSTOM")
727
  if "CUSTOM_FUZZY" not in valid_language_entities:
@@ -730,53 +879,52 @@ class CustomImageAnalyzerEngine:
730
  # Process using either Local or AWS Comprehend
731
  if pii_identification_method == LOCAL_PII_OPTION:
732
 
733
- language_supported_entities = filter_entities_for_language(custom_entities, valid_language_entities, language)
734
-
 
 
735
  if language_supported_entities:
736
  text_analyzer_kwargs["entities"] = language_supported_entities
737
 
738
- #if language != "en":
739
  # gr.Info(f"Using {str(language_supported_entities)} entities for local model analysis for language: {language}")
740
  else:
741
  print(f"No relevant entities supported for language: {language}")
742
- raise Warning(f"No relevant entities supported for language: {language}")
 
 
743
 
744
  analyzer_result = nlp_analyser.analyze(
745
- text=page_text,
746
- language=language,
747
- **text_analyzer_kwargs
748
  )
749
  all_text_line_results = map_back_entity_results(
750
- analyzer_result,
751
- page_text_mapping,
752
- all_text_line_results
753
  )
754
 
755
- elif pii_identification_method == AWS_PII_OPTION:
756
 
757
  # Handle custom entities first
758
  if custom_entities:
759
  custom_redact_entities = [
760
- entity for entity in chosen_redact_comprehend_entities
 
761
  if entity in custom_entities
762
  ]
763
 
764
  if custom_redact_entities:
765
  # Filter entities to only include those supported by the language
766
- language_supported_entities = filter_entities_for_language(custom_redact_entities, valid_language_entities, language)
 
 
767
 
768
  if language_supported_entities:
769
  text_analyzer_kwargs["entities"] = language_supported_entities
770
 
771
  page_analyser_result = nlp_analyser.analyze(
772
- text=page_text,
773
- language=language,
774
- **text_analyzer_kwargs
775
  )
776
  all_text_line_results = map_back_entity_results(
777
- page_analyser_result,
778
- page_text_mapping,
779
- all_text_line_results
780
  )
781
 
782
  # Process text in batches for AWS Comprehend
@@ -789,14 +937,14 @@ class CustomImageAnalyzerEngine:
789
  words = text_line.text.split()
790
  word_start_positions = list()
791
  current_pos = 0
792
-
793
  for word in words:
794
  word_start_positions.append(current_pos)
795
  current_pos += len(word) + 1
796
 
797
  for word_idx, word in enumerate(words):
798
  new_batch_char_count = len(current_batch) + len(word) + 1
799
-
800
  if batch_word_count >= 50 or new_batch_char_count >= 200:
801
  # Process current batch
802
  all_text_line_results = do_aws_comprehend_call(
@@ -804,17 +952,19 @@ class CustomImageAnalyzerEngine:
804
  current_batch_mapping,
805
  comprehend_client,
806
  aws_language,
807
- text_analyzer_kwargs.get('allow_list', []),
808
  chosen_redact_comprehend_entities,
809
- all_text_line_results
810
  )
811
  comprehend_query_number += 1
812
-
813
  # Reset batch
814
  current_batch = word
815
  batch_word_count = 1
816
  batch_char_count = len(word)
817
- current_batch_mapping = [(0, i, text_line, None, word_start_positions[word_idx])]
 
 
818
  else:
819
  if current_batch:
820
  current_batch += " "
@@ -822,15 +972,20 @@ class CustomImageAnalyzerEngine:
822
  current_batch += word
823
  batch_char_count += len(word)
824
  batch_word_count += 1
825
-
826
- if not current_batch_mapping or current_batch_mapping[-1][1] != i:
827
- current_batch_mapping.append((
828
- batch_char_count - len(word),
829
- i,
830
- text_line,
831
- None,
832
- word_start_positions[word_idx]
833
- ))
 
 
 
 
 
834
 
835
  # Process final batch if any
836
  if current_batch:
@@ -839,33 +994,39 @@ class CustomImageAnalyzerEngine:
839
  current_batch_mapping,
840
  comprehend_client,
841
  aws_language,
842
- text_analyzer_kwargs.get('allow_list', []),
843
  chosen_redact_comprehend_entities,
844
- all_text_line_results
845
  )
846
- comprehend_query_number += 1
847
 
848
  # Process results and create bounding boxes
849
  combined_results = list()
850
  for i, text_line in enumerate(line_level_ocr_results):
851
- line_results = next((results for idx, results in all_text_line_results if idx == i), [])
 
 
852
  if line_results and i < len(ocr_results_with_words):
853
  child_level_key = list(ocr_results_with_words.keys())[i]
854
- ocr_results_with_words_line_level = ocr_results_with_words[child_level_key]
855
-
 
 
856
  for result in line_results:
857
  bbox_results = self.map_analyzer_results_to_bounding_boxes(
858
  [result],
859
- [OCRResult(
860
- text=text_line.text[result.start:result.end],
861
- left=text_line.left,
862
- top=text_line.top,
863
- width=text_line.width,
864
- height=text_line.height
865
- )],
 
 
866
  text_line.text,
867
- text_analyzer_kwargs.get('allow_list', []),
868
- ocr_results_with_words_line_level
869
  )
870
  combined_results.extend(bbox_results)
871
 
@@ -873,61 +1034,65 @@ class CustomImageAnalyzerEngine:
873
 
874
  @staticmethod
875
  def map_analyzer_results_to_bounding_boxes(
876
- text_analyzer_results: List[RecognizerResult],
877
- redaction_relevant_ocr_results: List[OCRResult],
878
- full_text: str,
879
- allow_list: List[str],
880
- ocr_results_with_words_child_info: Dict[str, Dict]
881
- ) -> List[CustomImageRecognizerResult]:
882
  redaction_bboxes = list()
883
 
884
  for redaction_relevant_ocr_result in redaction_relevant_ocr_results:
885
- #print("ocr_results_with_words_child_info:", ocr_results_with_words_child_info)
886
 
887
- line_text = ocr_results_with_words_child_info['text']
888
  line_length = len(line_text)
889
  redaction_text = redaction_relevant_ocr_result.text
890
-
891
  for redaction_result in text_analyzer_results:
892
  # Check if the redaction text is not in the allow list
893
-
894
  if redaction_text not in allow_list:
895
-
896
  # Adjust start and end to be within line bounds
897
  start_in_line = max(0, redaction_result.start)
898
  end_in_line = min(line_length, redaction_result.end)
899
-
900
  # Get the matched text from this line
901
  matched_text = line_text[start_in_line:end_in_line]
902
- matched_words = matched_text.split()
903
-
904
  # Find the corresponding words in the OCR results
905
  matching_word_boxes = list()
906
 
907
  current_position = 0
908
 
909
- for word_info in ocr_results_with_words_child_info.get('words', []):
910
- word_text = word_info['text']
911
  word_length = len(word_text)
912
 
913
  word_start = current_position
914
  word_end = current_position + word_length
915
 
916
  # Update current position for the next word
917
- current_position += word_length + 1 # +1 for the space after the word
918
-
 
 
919
  # Check if the word's bounding box is within the start and end bounds
920
- if word_start >= start_in_line and word_end <= (end_in_line + 1):
921
- matching_word_boxes.append(word_info['bounding_box'])
922
- #print(f"Matched word: {word_info['text']}")
923
-
 
 
924
  if matching_word_boxes:
925
  # Calculate the combined bounding box for all matching words
926
  left = min(box[0] for box in matching_word_boxes)
927
  top = min(box[1] for box in matching_word_boxes)
928
  right = max(box[2] for box in matching_word_boxes)
929
  bottom = max(box[3] for box in matching_word_boxes)
930
-
931
  redaction_bboxes.append(
932
  CustomImageRecognizerResult(
933
  entity_type=redaction_result.entity_type,
@@ -938,12 +1103,12 @@ class CustomImageAnalyzerEngine:
938
  top=top,
939
  width=right - left,
940
  height=bottom - top,
941
- text=matched_text
942
  )
943
  )
944
 
945
  return redaction_bboxes
946
-
947
  @staticmethod
948
  def remove_space_boxes(ocr_result: dict) -> dict:
949
  """Remove OCR bboxes that are for spaces.
@@ -963,7 +1128,7 @@ class CustomImageAnalyzerEngine:
963
  filtered_ocr_result[key] = [ocr_result[key][i] for i in idx]
964
 
965
  return filtered_ocr_result
966
-
967
  @staticmethod
968
  def _scale_bbox_results(
969
  ocr_result: Dict[str, List[Union[int, str]]], scale_factor: float
@@ -994,67 +1159,87 @@ class CustomImageAnalyzerEngine:
994
  # Estimate the x-offset based on character position
995
  # This is a simple estimation and might need refinement for variable-width fonts
996
  return int(start / len(full_text) * len(full_text))
997
-
998
  def estimate_width(self, ocr_result: OCRResult, start: int, end: int) -> int:
999
  # Extract the relevant text portion
1000
  relevant_text = ocr_result.text[start:end]
1001
-
1002
  # If the relevant text is the same as the full text, return the full width
1003
  if relevant_text == ocr_result.text:
1004
  return ocr_result.width
1005
-
1006
  # Estimate width based on the proportion of the relevant text length to the total text length
1007
  total_text_length = len(ocr_result.text)
1008
  relevant_text_length = len(relevant_text)
1009
-
1010
  if total_text_length == 0:
1011
  return 0 # Avoid division by zero
1012
-
1013
  # Proportion of the relevant text to the total text
1014
  proportion = relevant_text_length / total_text_length
1015
-
1016
  # Estimate the width based on the proportion
1017
  estimated_width = int(proportion * ocr_result.width)
1018
-
1019
  return estimated_width
1020
 
1021
 
1022
- def bounding_boxes_overlap(box1:List, box2:List):
1023
  """Check if two bounding boxes overlap."""
1024
- return (box1[0] < box2[2] and box2[0] < box1[2] and
1025
- box1[1] < box2[3] and box2[1] < box1[3])
1026
-
1027
- def map_back_entity_results(page_analyser_result:dict, page_text_mapping:dict, all_text_line_results:List[Tuple]):
 
 
 
 
 
 
 
 
 
1028
  for entity in page_analyser_result:
1029
  entity_start = entity.start
1030
  entity_end = entity.end
1031
-
1032
  # Track if the entity has been added to any line
1033
  added_to_line = False
1034
-
1035
  for batch_start, line_idx, original_line, chars in page_text_mapping:
1036
  batch_end = batch_start + len(original_line.text)
1037
-
1038
  # Check if the entity overlaps with the current line
1039
- if batch_start < entity_end and batch_end > entity_start: # Overlap condition
1040
- relative_start = max(0, entity_start - batch_start) # Adjust start relative to the line
1041
- relative_end = min(entity_end - batch_start, len(original_line.text)) # Adjust end relative to the line
1042
-
 
 
 
 
 
 
1043
  # Create a new adjusted entity
1044
  adjusted_entity = copy.deepcopy(entity)
1045
  adjusted_entity.start = relative_start
1046
  adjusted_entity.end = relative_end
1047
-
1048
  # Check if this line already has an entry
1049
- existing_entry = next((entry for idx, entry in all_text_line_results if idx == line_idx), None)
1050
-
 
 
 
1051
  if existing_entry is None:
1052
  all_text_line_results.append((line_idx, [adjusted_entity]))
1053
  else:
1054
- existing_entry.append(adjusted_entity) # Append to the existing list of entities
1055
-
 
 
1056
  added_to_line = True
1057
-
1058
  # If the entity spans multiple lines, you may want to handle that here
1059
  if not added_to_line:
1060
  # Handle cases where the entity does not fit in any line (optional)
@@ -1062,7 +1247,14 @@ def map_back_entity_results(page_analyser_result:dict, page_text_mapping:dict, a
1062
 
1063
  return all_text_line_results
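The remapping above converts entity offsets measured against the concatenated page text back into per-line offsets by intersecting each entity span with every line's character range. A small sketch with hypothetical lines:

# Sketch: global entity offsets -> per-line offsets via range overlap.
lines = ["Dear John Smith,", "your reference number is 485777."]
entity_start, entity_end = 5, 15                 # "John Smith" within the joined page text

offset = 0
for idx, line in enumerate(lines):
    line_start, line_end = offset, offset + len(line)
    if line_start < entity_end and line_end > entity_start:   # overlap condition
        rel_start = max(0, entity_start - line_start)
        rel_end = min(entity_end - line_start, len(line))
        print(idx, line[rel_start:rel_end])                    # -> 0 John Smith
    offset = line_end + 1                                      # +1 for the joining space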
1064
 
1065
- def map_back_comprehend_entity_results(response:object, current_batch_mapping:List[Tuple], allow_list:List[str], chosen_redact_comprehend_entities:List[str], all_text_line_results:List[Tuple]):
 
 
 
 
 
 
 
1066
  if not response or "Entities" not in response:
1067
  return all_text_line_results
1068
 
@@ -1077,29 +1269,50 @@ def map_back_comprehend_entity_results(response:object, current_batch_mapping:Li
1077
  added_to_line = False
1078
 
1079
  # Find the correct line and offset within that line
1080
- for batch_start, line_idx, original_line, chars, line_offset in current_batch_mapping:
 
 
 
 
 
 
1081
  batch_end = batch_start + len(original_line.text[line_offset:])
1082
 
1083
  # Check if the entity overlaps with the current line
1084
- if batch_start < entity_end and batch_end > entity_start: # Overlap condition
 
 
1085
  # Calculate the absolute position within the line
1086
  relative_start = max(0, entity_start - batch_start + line_offset)
1087
- relative_end = min(entity_end - batch_start + line_offset, len(original_line.text))
 
 
1088
 
1089
  result_text = original_line.text[relative_start:relative_end]
1090
 
1091
  if result_text not in allow_list:
1092
  adjusted_entity = entity.copy()
1093
- adjusted_entity["BeginOffset"] = relative_start # Now relative to the full line
 
 
1094
  adjusted_entity["EndOffset"] = relative_end
1095
 
1096
  recogniser_entity = recognizer_result_from_dict(adjusted_entity)
1097
 
1098
- existing_entry = next((entry for idx, entry in all_text_line_results if idx == line_idx), None)
 
 
 
 
 
 
 
1099
  if existing_entry is None:
1100
  all_text_line_results.append((line_idx, [recogniser_entity]))
1101
  else:
1102
- existing_entry.append(recogniser_entity) # Append to the existing list of entities
 
 
1103
 
1104
  added_to_line = True
1105
 
@@ -1109,7 +1322,16 @@ def map_back_comprehend_entity_results(response:object, current_batch_mapping:Li
1109
 
1110
  return all_text_line_results
1111
 
1112
- def do_aws_comprehend_call(current_batch:str, current_batch_mapping:List[Tuple], comprehend_client:botocore.client.BaseClient, language:str, allow_list:List[str], chosen_redact_comprehend_entities:List[str], all_text_line_results:List[Tuple]):
 
 
 
 
 
 
 
 
 
1113
  if not current_batch:
1114
  return all_text_line_results
1115
 
@@ -1119,26 +1341,26 @@ def do_aws_comprehend_call(current_batch:str, current_batch_mapping:List[Tuple],
1119
  for attempt in range(max_retries):
1120
  try:
1121
  response = comprehend_client.detect_pii_entities(
1122
- Text=current_batch.strip(),
1123
- LanguageCode=language
1124
  )
1125
 
1126
  all_text_line_results = map_back_comprehend_entity_results(
1127
- response,
1128
- current_batch_mapping,
1129
- allow_list,
1130
- chosen_redact_comprehend_entities,
1131
- all_text_line_results
1132
  )
1133
 
1134
  return all_text_line_results
1135
-
1136
  except Exception as e:
1137
  if attempt == max_retries - 1:
1138
  print("AWS Comprehend calls failed due to", e)
1139
  raise
1140
  time.sleep(retry_delay)
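A compact sketch of the batching and retry pattern used for the AWS Comprehend calls: batches are capped at roughly 50 words or 200 characters, and detect_pii_entities is retried with a fixed delay. The client construction, retry count and delay shown here are illustrative.

import time
import boto3

comprehend = boto3.client("comprehend", region_name="eu-west-2")  # illustrative region

def detect_pii_with_retries(text: str, language: str = "en",
                            max_retries: int = 3, retry_delay: int = 3) -> dict:
    """Sketch: call Comprehend detect_pii_entities with simple fixed-delay retries."""
    for attempt in range(max_retries):
        try:
            return comprehend.detect_pii_entities(Text=text.strip(), LanguageCode=language)
        except Exception:
            if attempt == max_retries - 1:
                raise
            time.sleep(retry_delay)

# batching rule used above: start a new batch once it would exceed ~50 words or ~200 characters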
1141
 
 
1142
  def run_page_text_redaction(
1143
  language: str,
1144
  chosen_redact_entities: List[str],
@@ -1147,13 +1369,13 @@ def run_page_text_redaction(
1147
  line_characters: List,
1148
  page_analyser_results: List = list(),
1149
  page_analysed_bounding_boxes: List = list(),
1150
- comprehend_client = None,
1151
  allow_list: List[str] = None,
1152
  pii_identification_method: str = LOCAL_PII_OPTION,
1153
  nlp_analyser: AnalyzerEngine = None,
1154
  score_threshold: float = 0.0,
1155
  custom_entities: List[str] = None,
1156
- comprehend_query_number:int = 0
1157
  ):
1158
  """
1159
  This function performs text redaction on a page based on the specified language and chosen entities.
@@ -1174,7 +1396,7 @@ def run_page_text_redaction(
1174
  custom_entities (List[str], optional): A list of custom entities for redaction. Defaults to None.
1175
  comprehend_query_number (int, optional): A counter for the number of Comprehend queries made. Defaults to 0.
1176
  """
1177
-
1178
  page_text = ""
1179
  page_text_mapping = list()
1180
  all_text_line_results = list()
@@ -1185,13 +1407,14 @@ def run_page_text_redaction(
1185
  if chosen_redact_entities:
1186
  if page_text:
1187
  page_text += " "
1188
-
1189
  start_pos = len(page_text)
1190
  page_text += text_line.text
1191
  page_text_mapping.append((start_pos, i, text_line, line_characters[i]))
1192
 
1193
-
1194
- valid_language_entities = nlp_analyser.registry.get_supported_entities(languages=[language])
 
1195
  if "CUSTOM" not in valid_language_entities:
1196
  valid_language_entities.append("CUSTOM")
1197
  if "CUSTOM_FUZZY" not in valid_language_entities:
@@ -1201,8 +1424,10 @@ def run_page_text_redaction(
1201
  if pii_identification_method == LOCAL_PII_OPTION:
1202
  if not nlp_analyser:
1203
  raise ValueError("nlp_analyser is required for Local identification method")
1204
-
1205
- language_supported_entities = filter_entities_for_language(chosen_redact_entities, valid_language_entities, language)
 
 
1206
 
1207
  page_analyser_result = nlp_analyser.analyze(
1208
  text=page_text,
@@ -1210,14 +1435,11 @@ def run_page_text_redaction(
1210
  entities=language_supported_entities,
1211
  score_threshold=score_threshold,
1212
  return_decision_process=True,
1213
- allow_list=allow_list
1214
  )
1215
 
1216
-
1217
  all_text_line_results = map_back_entity_results(
1218
- page_analyser_result,
1219
- page_text_mapping,
1220
- all_text_line_results
1221
  )
1222
 
1223
  elif pii_identification_method == AWS_PII_OPTION:
@@ -1225,11 +1447,14 @@ def run_page_text_redaction(
1225
  # Process custom entities if any
1226
  if custom_entities:
1227
  custom_redact_entities = [
1228
- entity for entity in chosen_redact_comprehend_entities
 
1229
  if entity in custom_entities
1230
  ]
1231
 
1232
- language_supported_entities = filter_entities_for_language(custom_redact_entities, valid_language_entities, language)
 
 
1233
 
1234
  if language_supported_entities:
1235
  page_analyser_result = nlp_analyser.analyze(
@@ -1238,13 +1463,11 @@ def run_page_text_redaction(
1238
  entities=language_supported_entities,
1239
  score_threshold=score_threshold,
1240
  return_decision_process=True,
1241
- allow_list=allow_list
1242
  )
1243
 
1244
  all_text_line_results = map_back_entity_results(
1245
- page_analyser_result,
1246
- page_text_mapping,
1247
- all_text_line_results
1248
  )
1249
 
1250
  current_batch = ""
@@ -1255,16 +1478,16 @@ def run_page_text_redaction(
1255
  for i, text_line in enumerate(line_level_text_results_list):
1256
  words = text_line.text.split()
1257
  word_start_positions = list()
1258
-
1259
  # Calculate word start positions within the line
1260
  current_pos = 0
1261
  for word in words:
1262
  word_start_positions.append(current_pos)
1263
  current_pos += len(word) + 1 # +1 for space
1264
-
1265
  for word_idx, word in enumerate(words):
1266
  new_batch_char_count = len(current_batch) + len(word) + 1
1267
-
1268
  if batch_word_count >= 50 or new_batch_char_count >= 200:
1269
  # Process current batch
1270
  all_text_line_results = do_aws_comprehend_call(
@@ -1274,15 +1497,23 @@ def run_page_text_redaction(
1274
  language,
1275
  allow_list,
1276
  chosen_redact_comprehend_entities,
1277
- all_text_line_results
1278
  )
1279
  comprehend_query_number += 1
1280
-
1281
  # Start new batch
1282
  current_batch = word
1283
  batch_word_count = 1
1284
  batch_char_count = len(word)
1285
- current_batch_mapping = [(0, i, text_line, line_characters[i], word_start_positions[word_idx])]
 
 
 
 
 
 
 
 
1286
  else:
1287
  if current_batch:
1288
  current_batch += " "
@@ -1290,15 +1521,19 @@ def run_page_text_redaction(
1290
  current_batch += word
1291
  batch_char_count += len(word)
1292
  batch_word_count += 1
1293
-
1294
  if not current_batch_mapping or current_batch_mapping[-1][1] != i:
1295
- current_batch_mapping.append((
1296
- batch_char_count - len(word),
1297
- i,
1298
- text_line,
1299
- line_characters[i],
1300
- word_start_positions[word_idx] # Add the word's start position within its line
1301
- ))
 
 
 
 
1302
 
1303
  # Process final batch
1304
  if current_batch:
@@ -1309,29 +1544,36 @@ def run_page_text_redaction(
1309
  language,
1310
  allow_list,
1311
  chosen_redact_comprehend_entities,
1312
- all_text_line_results
1313
  )
1314
  comprehend_query_number += 1
1315
 
1316
  # Process results for each line
1317
  for i, text_line in enumerate(line_level_text_results_list):
1318
- line_results = next((results for idx, results in all_text_line_results if idx == i), [])
1319
-
 
 
1320
  if line_results:
1321
  text_line_bounding_boxes = merge_text_bounding_boxes(
1322
- line_results,
1323
- line_characters[i]
1324
  )
1325
-
1326
  page_analyser_results.extend(line_results)
1327
  page_analysed_bounding_boxes.extend(text_line_bounding_boxes)
1328
 
1329
  return page_analysed_bounding_boxes
1330
 
1331
- def merge_text_bounding_boxes(analyser_results:dict, characters: List[LTChar], combine_pixel_dist: int = 20, vertical_padding: int = 0):
1332
- '''
 
 
 
 
 
 
1333
  Merge identified bounding boxes containing PII that are very close to one another
1334
- '''
1335
  analysed_bounding_boxes = list()
1336
  original_bounding_boxes = list() # List to hold original bounding boxes
1337
 
@@ -1339,9 +1581,17 @@ def merge_text_bounding_boxes(analyser_results:dict, characters: List[LTChar], c
1339
  # Extract bounding box coordinates for sorting
1340
  bounding_boxes = list()
1341
  for result in analyser_results:
1342
- #print("Result:", result)
1343
- char_boxes = [char.bbox for char in characters[result.start:result.end] if isinstance(char, LTChar)]
1344
- char_text = [char._text for char in characters[result.start:result.end] if isinstance(char, LTChar)]
 
 
 
 
 
 
 
 
1345
  if char_boxes:
1346
  # Calculate the bounding box that encompasses all characters
1347
  left = min(box[0] for box in char_boxes)
@@ -1349,11 +1599,19 @@ def merge_text_bounding_boxes(analyser_results:dict, characters: List[LTChar], c
1349
  right = max(box[2] for box in char_boxes)
1350
  top = max(box[3] for box in char_boxes) + vertical_padding
1351
  bbox = [left, bottom, right, top]
1352
- bounding_boxes.append((bottom, left, result, bbox, char_text)) # (y, x, result, bbox, text)
 
 
1353
 
1354
  # Store original bounding boxes
1355
- original_bounding_boxes.append({"text": "".join(char_text), "boundingBox": bbox, "result": copy.deepcopy(result)})
1356
- #print("Original bounding boxes:", original_bounding_boxes)
 
 
 
 
 
 
1357
 
1358
  # Sort the results by y-coordinate and then by x-coordinate
1359
  bounding_boxes.sort()
@@ -1375,19 +1633,26 @@ def merge_text_bounding_boxes(analyser_results:dict, characters: List[LTChar], c
1375
  vertical_diff_bboxes = abs(next_box[1] - current_y)
1376
  horizontal_diff_bboxes = abs(next_box[0] - current_box[2])
1377
 
1378
- if vertical_diff_bboxes <= 5 and horizontal_diff_bboxes <= combine_pixel_dist:
 
 
 
1379
  # Merge bounding boxes
1380
- #print("Merging boxes")
1381
  merged_box = current_box.copy()
1382
  merged_result = current_result
1383
  merged_text = current_text.copy()
1384
 
1385
  merged_box[2] = next_box[2] # Extend horizontally
1386
  merged_box[3] = max(current_box[3], next_box[3]) # Adjust the top
1387
- merged_result.end = max(current_result.end, result.end) # Extend text range
 
 
1388
  try:
1389
  if current_result.entity_type != result.entity_type:
1390
- merged_result.entity_type = current_result.entity_type + " - " + result.entity_type
 
 
1391
  else:
1392
  merged_result.entity_type = current_result.entity_type
1393
  except Exception as e:
@@ -1396,11 +1661,13 @@ def merge_text_bounding_boxes(analyser_results:dict, characters: List[LTChar], c
1396
  merged_text.append(" ") # Add space between texts
1397
  merged_text.extend(text)
1398
 
1399
- merged_bounding_boxes.append({
1400
- "text": "".join(merged_text),
1401
- "boundingBox": merged_box,
1402
- "result": merged_result
1403
- })
 
 
1404
 
1405
  else:
1406
  # Start a new bounding box
@@ -1413,18 +1680,21 @@ def merge_text_bounding_boxes(analyser_results:dict, characters: List[LTChar], c
1413
  analysed_bounding_boxes.extend(original_bounding_boxes)
1414
  analysed_bounding_boxes.extend(merged_bounding_boxes)
1415
 
1416
- #print("Analysed bounding boxes:", analysed_bounding_boxes)
1417
 
1418
  return analysed_bounding_boxes
1419
 
1420
- def recreate_page_line_level_ocr_results_with_page(page_line_level_ocr_results_with_words: dict):
 
 
 
1421
  reconstructed_results = list()
1422
-
1423
  # Assume all lines belong to the same page, so we can just read it from one item
1424
- #page = next(iter(page_line_level_ocr_results_with_words.values()))["page"]
1425
 
1426
  page = page_line_level_ocr_results_with_words["page"]
1427
-
1428
  for line_data in page_line_level_ocr_results_with_words["results"].values():
1429
  bbox = line_data["bounding_box"]
1430
  text = line_data["text"]
@@ -1438,15 +1708,21 @@ def recreate_page_line_level_ocr_results_with_page(page_line_level_ocr_results_w
1438
  top=bbox[1],
1439
  width=bbox[2] - bbox[0],
1440
  height=bbox[3] - bbox[1],
1441
- line=line_number
1442
  )
1443
  reconstructed_results.append(line_result)
1444
-
1445
- page_line_level_ocr_results_with_page = {"page": page, "results": reconstructed_results}
1446
-
 
 
 
1447
  return page_line_level_ocr_results_with_page
1448
 
1449
- def split_words_and_punctuation_from_line(line_of_words: List[OCRResult]) -> List[OCRResult]:
 
 
 
1450
  """
1451
  Takes a list of OCRResult objects and splits words with trailing/leading punctuation.
1452
 
@@ -1455,85 +1731,119 @@ def split_words_and_punctuation_from_line(line_of_words: List[OCRResult]) -> Lis
1455
  "high-tech" are preserved.
1456
  """
1457
  # Punctuation that will be split off. Hyphen is not included.
1458
- PUNCTUATION_TO_SPLIT = {'.', ',', '?', '!', ':', ';', '(', ')', '[', ']', '{', '}'}
1459
-
1460
  new_word_list = list()
1461
-
1462
  for word_result in line_of_words:
1463
  word_text = word_result.text
1464
-
1465
  # This regex finds a central "core" word, and captures leading and trailing punctuation
1466
  # Handles cases like "(word)." -> group1='(', group2='word', group3='.'
1467
  match = re.match(r"([(\[{]*)(.*?)_?([.,?!:;)\}\]]*)$", word_text)
1468
 
1469
  # Handle words with internal hyphens that might confuse the regex
1470
- if '-' in word_text and not match.group(2):
1471
- core_part_text = word_text
1472
- leading_punc = ""
1473
- trailing_punc = ""
1474
  elif match:
1475
  leading_punc, core_part_text, trailing_punc = match.groups()
1476
- else: # Failsafe
1477
  new_word_list.append(word_result)
1478
  continue
1479
-
1480
  # If no split is needed, just add the original and continue
1481
  if not leading_punc and not trailing_punc:
1482
  new_word_list.append(word_result)
1483
  continue
1484
-
1485
  # --- A split is required ---
1486
  # Estimate new bounding boxes by proportionally allocating width
1487
  original_width = word_result.width
1488
- if not word_text or original_width == 0: continue # Failsafe
1489
-
 
1490
  avg_char_width = original_width / len(word_text)
1491
  current_left = word_result.left
1492
 
1493
  # Add leading punctuation if it exists
1494
  if leading_punc:
1495
  punc_width = avg_char_width * len(leading_punc)
1496
- new_word_list.append(OCRResult(
1497
- text=leading_punc, left=current_left, top=word_result.top,
1498
- width=punc_width, height=word_result.height
1499
- ))
1500
  current_left += punc_width
1501
 
1502
  # Add the core part of the word
1503
  if core_part_text:
1504
  core_width = avg_char_width * len(core_part_text)
1505
- new_word_list.append(OCRResult(
1506
- text=core_part_text, left=current_left, top=word_result.top,
1507
- width=core_width, height=word_result.height
1508
- ))
 
 
 
 
 
1509
  current_left += core_width
1510
 
1511
  # Add trailing punctuation if it exists
1512
  if trailing_punc:
1513
  punc_width = avg_char_width * len(trailing_punc)
1514
- new_word_list.append(OCRResult(
1515
- text=trailing_punc, left=current_left, top=word_result.top,
1516
- width=punc_width, height=word_result.height
1517
- ))
1518
-
 
 
 
 
 
1519
  return new_word_list
1520
 
1521
- def create_ocr_result_with_children(combined_results:dict, i:int, current_bbox:dict, current_line:list):
1522
- combined_results["text_line_" + str(i)] = {
 
 
 
1523
  "line": i,
1524
- 'text': current_bbox.text,
1525
- 'bounding_box': (current_bbox.left, current_bbox.top,
1526
- current_bbox.left + current_bbox.width,
1527
- current_bbox.top + current_bbox.height),
1528
- 'words': [{'text': word.text,
1529
- 'bounding_box': (word.left, word.top,
1530
- word.left + word.width,
1531
- word.top + word.height)}
1532
- for word in current_line]
1533
  }
1534
- return combined_results["text_line_" + str(i)]
 
1535
 
1536
- def combine_ocr_results(ocr_results: List[OCRResult], x_threshold: float = 50.0, y_threshold: float = 12.0, page: int = 1):
 
 
 
 
 
1537
  """
1538
  Group OCR results into lines, splitting words from punctuation.
1539
  """
@@ -1568,28 +1878,36 @@ def combine_ocr_results(ocr_results: List[OCRResult], x_threshold: float = 50.0,
1568
  line_top = min(word.top for word in line)
1569
  line_right = max(word.left + word.width for word in line)
1570
  line_bottom = max(word.top + word.height for word in line)
1571
-
1572
  final_line_bbox = OCRResult(
1573
  text=line_text,
1574
  left=line_left,
1575
  top=line_top,
1576
  width=line_right - line_left,
1577
  height=line_bottom - line_top,
1578
- line=line_counter
1579
  )
1580
-
1581
  page_line_level_ocr_results.append(final_line_bbox)
1582
-
1583
  # Use the PROCESSED line to create the children. Creates a result within page_line_level_ocr_results_with_words
1584
- page_line_level_ocr_results_with_words["text_line_" + str(line_counter)] = create_ocr_result_with_children(
1585
- page_line_level_ocr_results_with_words,
1586
- line_counter,
1587
- final_line_bbox,
1588
- processed_line # <-- Use the new, split list of words
 
 
1589
  )
1590
  line_counter += 1
1591
 
1592
- page_level_results_with_page = {"page": page, "results": page_line_level_ocr_results}
1593
- page_level_results_with_words = {"page": page, "results": page_line_level_ocr_results_with_words}
 
 
 
 
 
 
1594
 
1595
  return page_level_results_with_page, page_level_results_with_words

1
  import copy
2
+ import re
3
+ import time
4
  from copy import deepcopy
5
+ from dataclasses import dataclass
6
+ from typing import Any, Dict, List, Optional, Tuple, Union
7
+
8
+ import botocore
9
+ import cv2
10
+ import gradio as gr
11
+ import numpy as np
12
+ import pytesseract
13
  from pdfminer.layout import LTChar
14
  from PIL import Image
15
+ from presidio_analyzer import AnalyzerEngine, RecognizerResult
16
+
17
+ from tools.config import (
18
+ AWS_PII_OPTION,
19
+ DEFAULT_LANGUAGE,
20
+ LOCAL_PII_OPTION,
21
+ PREPROCESS_LOCAL_OCR_IMAGES,
22
+ )
23
  from tools.helper_functions import clean_unicode_text
 
24
  from tools.load_spacy_model_custom_recognisers import custom_entities
25
+ from tools.presidio_analyzer_custom import recognizer_result_from_dict
26
 
27
+ if PREPROCESS_LOCAL_OCR_IMAGES == "True":
28
+ PREPROCESS_LOCAL_OCR_IMAGES = True
29
+ else:
30
+ PREPROCESS_LOCAL_OCR_IMAGES = False
31
 
32
  try:
33
  from paddleocr import PaddleOCR
34
  except ImportError:
35
  PaddleOCR = None
36
 
37
+
38
  # --- Language utilities ---
39
  def _normalize_lang(language: str) -> str:
40
  return language.strip().lower().replace("-", "_") if language else "en"
 
46
 
47
  mapping = {
48
  # Common
49
+ "en": "eng",
50
+ "eng": "eng",
51
+ "fr": "fra",
52
+ "fre": "fra",
53
+ "fra": "fra",
54
+ "de": "deu",
55
+ "ger": "deu",
56
+ "deu": "deu",
57
+ "es": "spa",
58
+ "spa": "spa",
59
+ "it": "ita",
60
+ "ita": "ita",
61
+ "nl": "nld",
62
+ "dut": "nld",
63
+ "nld": "nld",
64
+ "pt": "por",
65
+ "por": "por",
66
+ "ru": "rus",
67
+ "rus": "rus",
68
+ "ar": "ara",
69
+ "ara": "ara",
70
  # Nordics
71
+ "sv": "swe",
72
+ "swe": "swe",
73
+ "no": "nor",
74
+ "nb": "nor",
75
+ "nn": "nor",
76
+ "nor": "nor",
77
+ "fi": "fin",
78
+ "fin": "fin",
79
+ "da": "dan",
80
+ "dan": "dan",
81
  # Eastern/Central
82
+ "pl": "pol",
83
+ "pol": "pol",
84
+ "cs": "ces",
85
+ "cz": "ces",
86
+ "ces": "ces",
87
+ "hu": "hun",
88
+ "hun": "hun",
89
+ "ro": "ron",
90
+ "rum": "ron",
91
+ "ron": "ron",
92
+ "bg": "bul",
93
+ "bul": "bul",
94
+ "el": "ell",
95
+ "gre": "ell",
96
+ "ell": "ell",
97
  # Asian
98
+ "ja": "jpn",
99
+ "jp": "jpn",
100
+ "jpn": "jpn",
101
+ "zh": "chi_sim",
102
+ "zh_cn": "chi_sim",
103
+ "zh_hans": "chi_sim",
104
+ "chi_sim": "chi_sim",
105
+ "zh_tw": "chi_tra",
106
+ "zh_hk": "chi_tra",
107
+ "zh_tr": "chi_tra",
108
+ "chi_tra": "chi_tra",
109
+ "hi": "hin",
110
+ "hin": "hin",
111
+ "bn": "ben",
112
+ "ben": "ben",
113
+ "ur": "urd",
114
+ "urd": "urd",
115
+ "fa": "fas",
116
+ "per": "fas",
117
+ "fas": "fas",
118
  }
119
 
120
  return mapping.get(lang, "eng")
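A minimal standalone sketch of the language-code normalisation and lookup shown above. TESS_CODES, normalize_lang and tesseract_lang_code below are illustrative stand-ins for the module's private helpers, covering only a subset of the full table:

TESS_CODES = {"en": "eng", "fr": "fra", "de": "deu", "zh": "chi_sim", "zh_tw": "chi_tra"}

def normalize_lang(language: str) -> str:
    return language.strip().lower().replace("-", "_") if language else "en"

def tesseract_lang_code(language: str) -> str:
    # Unknown tags fall back to English traineddata, as in the mapping above
    return TESS_CODES.get(normalize_lang(language), "eng")

print([tesseract_lang_code(tag) for tag in ("EN", "de", "zh-TW", "xx")])
# -> ['eng', 'deu', 'chi_tra', 'eng']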
 
155
 
156
  return mapping.get(lang, "en")
157
 
158
+
159
  @dataclass
160
  class OCRResult:
161
  text: str
 
166
  conf: float = None
167
  line: int = None
168
 
169
+
170
  @dataclass
171
  class CustomImageRecognizerResult:
172
  entity_type: str
 
177
  top: int
178
  width: int
179
  height: int
180
+ text: str
181
+
182
+
183
  class ImagePreprocessor:
184
  """ImagePreprocessor class. Parent class for image preprocessing objects."""
185
+
186
  def __init__(self, use_greyscale: bool = True) -> None:
187
  self.use_greyscale = use_greyscale
188
 
 
199
  return img
200
 
201
  @staticmethod
202
+ def _get_bg_color(
203
+ image: np.ndarray, is_greyscale: bool, invert: bool = False
204
+ ) -> Union[int, Tuple[int, int, int]]:
205
  # Note: Modified to expect numpy array for bincount
206
  if invert:
207
+ image = 255 - image # Simple inversion for greyscale numpy array
208
+
209
  if is_greyscale:
210
  bg_color = int(np.bincount(image.flatten()).argmax())
211
  else:
 
213
  # For this pipeline, we only use greyscale, so it's fine.
214
  # A simple alternative:
215
  from scipy import stats
216
+
217
  bg_color = tuple(stats.mode(image.reshape(-1, 3), axis=0)[0][0])
218
  return bg_color
219
 
 
222
  contrast = np.std(image)
223
  mean_intensity = np.mean(image)
224
  return contrast, mean_intensity
225
+
226
+
227
  class BilateralFilter(ImagePreprocessor):
228
  """Applies bilateral filtering."""
229
+
230
+ def __init__(
231
+ self, diameter: int = 9, sigma_color: int = 75, sigma_space: int = 75
232
+ ) -> None:
233
  super().__init__(use_greyscale=True)
234
  self.diameter = diameter
235
  self.sigma_color = sigma_color
 
237
 
238
  def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, dict]:
239
  # Modified to accept and return numpy array for consistency in the pipeline
240
+ filtered_image = cv2.bilateralFilter(
241
+ image, self.diameter, self.sigma_color, self.sigma_space
242
+ )
243
+ metadata = {
244
+ "diameter": self.diameter,
245
+ "sigma_color": self.sigma_color,
246
+ "sigma_space": self.sigma_space,
247
+ }
248
  return filtered_image, metadata
249
+
250
+
251
  class SegmentedAdaptiveThreshold(ImagePreprocessor):
252
  """Applies adaptive thresholding."""
253
+
254
+ def __init__(
255
+ self,
256
+ block_size: int = 21,
257
+ contrast_threshold: int = 40,
258
+ c_low_contrast: int = 5,
259
+ c_high_contrast: int = 10,
260
+ bg_threshold: int = 127,
261
+ ) -> None:
262
  super().__init__(use_greyscale=True)
263
+ self.block_size = (
264
+ block_size if block_size % 2 == 1 else block_size + 1
265
+ ) # Ensure odd
266
  self.c_low_contrast = c_low_contrast
267
  self.c_high_contrast = c_high_contrast
268
  self.bg_threshold = bg_threshold
 
272
  # Modified to accept and return numpy array
273
  background_color = self._get_bg_color(image, True)
274
  contrast, _ = self._get_image_contrast(image)
275
+ c = (
276
+ self.c_low_contrast
277
+ if contrast <= self.contrast_threshold
278
+ else self.c_high_contrast
279
+ )
280
 
281
+ if background_color < self.bg_threshold: # Dark background, light text
282
  adaptive_threshold_image = cv2.adaptiveThreshold(
283
+ image,
284
+ 255,
285
+ cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
286
+ cv2.THRESH_BINARY_INV,
287
+ self.block_size,
288
+ -c,
289
  )
290
+ else: # Light background, dark text
291
  adaptive_threshold_image = cv2.adaptiveThreshold(
292
+ image,
293
+ 255,
294
+ cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
295
+ cv2.THRESH_BINARY,
296
+ self.block_size,
297
+ c,
298
  )
299
  metadata = {"C": c, "background_color": background_color, "contrast": contrast}
300
  return adaptive_threshold_image, metadata
301
+
302
+
303
  class ImageRescaling(ImagePreprocessor):
304
  """Rescales images based on their size."""
305
+
306
  def __init__(self, target_dpi: int = 300, assumed_input_dpi: int = 96) -> None:
307
  super().__init__(use_greyscale=True)
308
  self.target_dpi = target_dpi
 
317
  width = int(image.shape[1] * scale_factor)
318
  height = int(image.shape[0] * scale_factor)
319
  dimensions = (width, height)
320
+
321
  # Use better interpolation for upscaling vs downscaling
322
  interpolation = cv2.INTER_CUBIC if scale_factor > 1.0 else cv2.INTER_AREA
323
  rescaled_image = cv2.resize(image, dimensions, interpolation=interpolation)
324
  metadata["scale_factor"] = scale_factor
325
  return rescaled_image, metadata
326
+
327
  return image, metadata
328
 
329
+
330
  class ContrastSegmentedImageEnhancer(ImagePreprocessor):
331
  """Class containing all logic to perform contrastive segmentation."""
332
+
333
  def __init__(
334
  self,
335
  bilateral_filter: Optional[BilateralFilter] = None,
 
355
  adjusted_contrast = contrast
356
  return adjusted_image, contrast, adjusted_contrast
357
 
358
+ def preprocess_image(
359
+ self, image: Image.Image, perform_binarization: bool = False
360
+ ) -> Tuple[Image.Image, dict]:
361
  """
362
  A corrected, logical pipeline for OCR preprocessing.
363
  Order: Greyscale -> Rescale -> Denoise -> Enhance Contrast -> Binarize
 
368
  image_np = self.convert_image_to_array(image)
369
 
370
  # 2. Rescale image to optimal DPI (while still greyscale)
371
+ rescaled_image_np, scale_metadata = self.image_rescaling.preprocess_image(
372
+ image_np
373
+ )
374
 
375
  # 3. Apply bilateral filtering for noise reduction
376
  filtered_image_np, _ = self.bilateral_filter.preprocess_image(rescaled_image_np)
 
380
 
381
  # 5. Adaptive Thresholding (Binarization) - This is the final step
382
  if perform_binarization:
383
+ final_image_np, threshold_metadata = (
384
+ self.adaptive_threshold.preprocess_image(adjusted_image_np)
385
  )
386
  else:
387
  final_image_np = adjusted_image_np
388
  threshold_metadata = {}
389
+
390
  # Combine metadata
391
  final_metadata = {**scale_metadata, **threshold_metadata}
392
+
393
  # Convert final numpy array back to PIL Image for return
394
  return Image.fromarray(final_image_np), final_metadata
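A rough, self-contained sketch of the preprocessing order described in the docstring above (greyscale, rescale towards 300 DPI, bilateral denoise, optional adaptive threshold). Parameter values mirror the class defaults in the diff; the contrast-enhancement step is omitted, and preprocess_for_ocr is an illustrative name rather than the project API:

import cv2
import numpy as np
from PIL import Image

def preprocess_for_ocr(pil_img: Image.Image, target_dpi: int = 300,
                       assumed_dpi: int = 96, binarize: bool = False) -> Image.Image:
    grey = np.array(pil_img.convert("L"))            # 1. greyscale numpy array
    scale = target_dpi / assumed_dpi                 # 2. rescale (3.125x for 96 -> 300 DPI)
    interp = cv2.INTER_CUBIC if scale > 1 else cv2.INTER_AREA
    grey = cv2.resize(grey, None, fx=scale, fy=scale, interpolation=interp)
    grey = cv2.bilateralFilter(grey, 9, 75, 75)      # 3. edge-preserving denoise
    if binarize:                                     # 4. optional binarisation
        grey = cv2.adaptiveThreshold(grey, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                     cv2.THRESH_BINARY, 21, 10)
    return Image.fromarray(grey)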
395
 
396
+
397
+ def rescale_ocr_data(ocr_data, scale_factor: float):
398
+
399
  # We loop from 0 to the number of detected words.
400
+ num_boxes = len(ocr_data["text"])
401
  for i in range(num_boxes):
402
  # We only want to process actual words, not empty boxes Tesseract might find
403
+ if int(ocr_data["conf"][i]) > -1: # -1 confidence is for structural elements
404
  # Get coordinates from the processed image using the index 'i'
405
+ x_proc = ocr_data["left"][i]
406
+ y_proc = ocr_data["top"][i]
407
+ w_proc = ocr_data["width"][i]
408
+ h_proc = ocr_data["height"][i]
409
 
410
  # Apply the inverse transformation (division)
411
  x_orig = int(x_proc / scale_factor)
 
415
 
416
  # --- THE MAPPING STEP ---
417
  # Update the dictionary values in-place using the same index 'i'
418
+ ocr_data["left"][i] = x_orig
419
+ ocr_data["top"][i] = y_orig
420
+ ocr_data["width"][i] = w_orig
421
+ ocr_data["height"][i] = h_orig
422
+
423
  return ocr_data
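A worked example of the inverse scaling that rescale_ocr_data applies to Tesseract-style image_to_data output; the dictionary below is hand-made test data, not real OCR output:

ocr_data = {
    "text": ["Hello", "world"],
    "left": [300, 450], "top": [90, 90],
    "width": [120, 130], "height": [30, 30],
    "conf": [91, 88],
}
scale_factor = 3.0   # the image was upscaled 3x before OCR

for i in range(len(ocr_data["text"])):
    if int(ocr_data["conf"][i]) > -1:              # skip structural elements
        for key in ("left", "top", "width", "height"):
            ocr_data[key][i] = int(ocr_data[key][i] / scale_factor)

print(ocr_data["left"], ocr_data["width"])         # -> [100, 150] [40, 43]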
424
 
425
+
426
+ def filter_entities_for_language(
427
+ entities: List[str], valid_language_entities: List[str], language: str
428
+ ) -> List[str]:
429
 
430
  if not valid_language_entities:
431
  print(f"No valid entities supported for language: {language}")
432
+ # raise Warning(f"No valid entities supported for language: {language}")
433
  if not entities:
434
  print(f"No entities provided for language: {language}")
435
+ # raise Warning(f"No entities provided for language: {language}")
436
+
437
+ # print("entities:", entities)
438
+ # print("valid_language_entities:", valid_language_entities)
439
+ # print("language:", language)
440
 
441
  filtered_entities = [
442
+ entity for entity in entities if entity in valid_language_entities
 
443
  ]
444
 
445
  if not filtered_entities:
446
  print(f"No relevant entities supported for language: {language}")
447
+ # raise Warning(f"No relevant entities supported for language: {language}")
448
 
449
  if language != "en":
450
+ gr.Info(
451
+ f"Using {str(filtered_entities)} entities for local model analysis for language: {language}"
452
+ )
453
 
454
  return filtered_entities
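The same filtering idea as filter_entities_for_language, reduced to a standalone snippet with the Gradio notification left out; the entity names used below are illustrative:

def filter_entities(chosen, supported, language):
    kept = [entity for entity in chosen if entity in supported]
    if not kept:
        print(f"No relevant entities supported for language: {language}")
    return kept

print(filter_entities(["PERSON", "EMAIL_ADDRESS", "EXAMPLE_CUSTOM_ENTITY"],
                      ["PERSON", "EMAIL_ADDRESS"], "fr"))
# -> ['PERSON', 'EMAIL_ADDRESS']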
455
 
456
+
457
  class CustomImageAnalyzerEngine:
458
  def __init__(
459
  self,
460
  analyzer_engine: Optional[AnalyzerEngine] = None,
461
+ ocr_engine: str = "tesseract",
462
  tesseract_config: Optional[str] = None,
463
  paddle_kwargs: Optional[Dict[str, Any]] = None,
464
  image_preprocessor: Optional[ImagePreprocessor] = None,
465
+ language: Optional[str] = DEFAULT_LANGUAGE,
466
  ):
467
  """
468
  Initializes the CustomImageAnalyzerEngine.
 
475
  :param language: Preferred OCR language (e.g., "en", "fr", "de"). Defaults to DEFAULT_LANGUAGE.
476
  """
477
  if ocr_engine not in ["tesseract", "paddle", "hybrid"]:
478
+ raise ValueError(
479
+ "ocr_engine must be either 'tesseract', 'hybrid', or 'paddle'"
480
+ )
481
 
482
  self.ocr_engine = ocr_engine
483
 
 
485
  self.language = language or DEFAULT_LANGUAGE or "en"
486
  self.tesseract_lang = _tesseract_lang_code(self.language)
487
  self.paddle_lang = _paddle_lang_code(self.language)
488
+
489
  if self.ocr_engine == "paddle" or self.ocr_engine == "hybrid":
490
  if PaddleOCR is None:
491
+ raise ImportError(
492
+ "paddleocr is not installed. Please run 'pip install paddleocr paddlepaddle'"
493
+ )
494
  # Default paddle configuration if none provided
495
  if paddle_kwargs is None:
496
+ paddle_kwargs = {
497
+ "use_textline_orientation": True,
498
+ "lang": self.paddle_lang,
499
+ }
500
  else:
501
  # Enforce language if not explicitly provided
502
+ paddle_kwargs.setdefault("lang", self.paddle_lang)
503
  self.paddle_ocr = PaddleOCR(**paddle_kwargs)
504
 
505
  if not analyzer_engine:
506
  analyzer_engine = AnalyzerEngine()
507
  self.analyzer_engine = analyzer_engine
508
 
509
+ self.tesseract_config = tesseract_config or "--oem 3 --psm 11"
510
 
511
  if not image_preprocessor:
512
  image_preprocessor = ContrastSegmentedImageEnhancer()
 
515
  def _sanitize_filename(self, text: str, max_length: int = 20) -> str:
516
  """
517
  Sanitizes text for use in filenames by removing invalid characters and limiting length.
518
+
519
  :param text: The text to sanitize
520
  :param max_length: Maximum length of the sanitized text
521
  :return: Sanitized text safe for filenames
522
  """
523
+
524
  # Remove or replace invalid filename characters
525
  # Windows: < > : " | ? * \ /
526
  # Unix: / (forward slash)
527
  # Also remove control characters and other problematic chars
528
  invalid_chars = r'[<>:"|?*\\/\x00-\x1f\x7f-\x9f]'
529
+ sanitized = re.sub(invalid_chars, "_", text)
530
+
531
  # Replace multiple consecutive underscores with a single one
532
+ sanitized = re.sub(r"_+", "_", sanitized)
533
+
534
  # Remove leading/trailing underscores and spaces
535
+ sanitized = sanitized.strip("_ ")
536
+
537
  # If empty after sanitization, use a default value
538
  if not sanitized:
539
+ sanitized = "text"
540
+
541
  # Limit to max_length characters
542
  if len(sanitized) > max_length:
543
  sanitized = sanitized[:max_length]
544
  # Ensure we don't end with an underscore if we cut in the middle
545
+ sanitized = sanitized.rstrip("_")
546
+
547
  return sanitized
548
 
549
+ def _convert_paddle_to_tesseract_format(
550
+ self, paddle_results: List[Any]
551
+ ) -> Dict[str, List]:
552
  """Converts PaddleOCR result format to Tesseract's dictionary format. NOTE: This attempts to create word-level bounding boxes by estimating the distance between characters in sentence-level text output. This is currently quite inaccurate, and word-level bounding boxes should not be relied upon."""
553
 
554
+ output = {
555
+ "text": [],
556
+ "left": [],
557
+ "top": [],
558
+ "width": [],
559
+ "height": [],
560
+ "conf": [],
561
+ }
562
 
563
  # paddle_results is now a list of dictionaries with detailed information
564
  if not paddle_results:
565
  return output
566
+
567
  for page_result in paddle_results:
568
  # Extract text recognition results from the new format
569
+ rec_texts = page_result.get("rec_texts", [])
570
+ rec_scores = page_result.get("rec_scores", [])
571
+ rec_polys = page_result.get("rec_polys", [])
572
+
573
+ for line_text, line_confidence, bounding_box in zip(
574
+ rec_texts, rec_scores, rec_polys
575
+ ):
576
  # bounding_box is now a numpy array with shape (4, 2)
577
  # Convert to list of coordinates if it's a numpy array
578
+ if hasattr(bounding_box, "tolist"):
579
  box = bounding_box.tolist()
580
  else:
581
  box = bounding_box
582
+
583
  # box is [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
584
  x_coords = [p[0] for p in box]
585
  y_coords = [p[1] for p in box]
586
+
587
  line_left = int(min(x_coords))
588
  line_top = int(min(y_coords))
589
  line_width = int(max(x_coords) - line_left)
590
  line_height = int(max(y_coords) - line_top)
591
+ # line_y_center = (max(y_coords) + min(y_coords)) / 2
592
 
593
+ # 2. Split the line into words
594
  words = line_text.split()
595
  if not words:
596
  continue
 
605
  for word in words:
606
  word_width = int(len(word) * avg_char_width)
607
  word_left = line_left + int(current_char_offset * avg_char_width)
608
+
609
+ output["text"].append(word)
610
+ output["left"].append(word_left)
611
+ output["top"].append(line_top)
612
+ output["width"].append(word_width)
613
+ output["height"].append(line_height)
614
  # Use the line's confidence for each word derived from it
615
+ output["conf"].append(int(line_confidence * 100))
616
 
617
  # Update offset for the next word (add word length + 1 for the space)
618
  current_char_offset += len(word) + 1
619
+
620
  return output
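An illustrative walk-through of the conversion above for a single fabricated PaddleOCR line, showing how word boxes are approximated from the line's average character width (the docstring's accuracy caveat applies):

line_text, line_conf = "John Smith", 0.97
box = [[10, 20], [210, 20], [210, 50], [10, 50]]   # fabricated 4-point polygon

xs = [p[0] for p in box]
ys = [p[1] for p in box]
left, top = min(xs), min(ys)
width, height = max(xs) - left, max(ys) - top
avg_char_width = width / len(line_text)            # 200 / 10 = 20 px per character

output = {"text": [], "left": [], "top": [], "width": [], "height": [], "conf": []}
offset = 0
for word in line_text.split():
    output["text"].append(word)
    output["left"].append(left + int(offset * avg_char_width))
    output["top"].append(top)
    output["width"].append(int(len(word) * avg_char_width))
    output["height"].append(height)
    output["conf"].append(int(line_conf * 100))
    offset += len(word) + 1                        # +1 for the space

print(output["text"], output["left"], output["width"])
# -> ['John', 'Smith'] [10, 110] [80, 100]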
621
+
622
  def _perform_hybrid_ocr(
623
+ self,
624
+ image: Image.Image,
625
+ confidence_threshold: int = 65,
626
+ padding: int = 5,
627
+ ocr: Optional[Any] = None,
628
+ ) -> Dict[str, list]:
629
  """
630
  Performs OCR using Tesseract for bounding boxes and PaddleOCR for low-confidence text.
631
  Returns data in the same dictionary format as pytesseract.image_to_data.
632
  """
633
  if ocr is None:
634
+ if hasattr(self, "paddle_ocr") and self.paddle_ocr is not None:
635
  ocr = self.paddle_ocr
636
  else:
637
+ raise ValueError(
638
+ "No OCR object provided and 'paddle_ocr' is not initialized."
639
+ )
640
+
641
  print("Starting hybrid OCR process...")
642
+
643
  # 1. Get initial word-level results from Tesseract
644
  tesseract_data = pytesseract.image_to_data(
645
  image,
646
  output_type=pytesseract.Output.DICT,
647
  config=self.tesseract_config,
648
+ lang=self.tesseract_lang,
649
  )
650
 
651
+ # tesseract_data['abs_line_id'] = tesseract_data.groupby(['block_num', 'par_num', 'line_num']).ngroup()
652
+
653
+ final_data = {
654
+ "text": [],
655
+ "left": [],
656
+ "top": [],
657
+ "width": [],
658
+ "height": [],
659
+ "conf": [],
660
+ }
661
+
662
+ num_words = len(tesseract_data["text"])
663
 
664
  # This handles the "no text on page" case. If num_words is 0, the loop is skipped
665
  # and an empty dictionary with empty lists is returned, which is the correct behavior.
666
  for i in range(num_words):
667
+ text = tesseract_data["text"][i]
668
+ conf = int(tesseract_data["conf"][i])
669
+
670
  # Skip empty text boxes or non-word elements (like page/block markers)
671
  if not text.strip() or conf == -1:
672
  continue
673
 
674
+ left = tesseract_data["left"][i]
675
+ top = tesseract_data["top"][i]
676
+ width = tesseract_data["width"][i]
677
+ height = tesseract_data["height"][i]
678
+ # line_number = tesseract_data['abs_line_id'][i]
679
+
680
  # If confidence is low, use PaddleOCR for a second opinion
681
  if conf < confidence_threshold:
682
  img_width, img_height = image.size
 
684
  crop_top = max(0, top - padding)
685
  crop_right = min(img_width, left + width + padding + 15)
686
  crop_bottom = min(img_height, top + height + padding)
687
+
688
  # Ensure crop dimensions are valid
689
  if crop_right <= crop_left or crop_bottom <= crop_top:
690
+ continue # Skip invalid crops
691
 
692
+ cropped_image = image.crop(
693
+ (crop_left, crop_top, crop_right, crop_bottom)
694
+ )
695
  cropped_image_np = np.array(cropped_image)
696
+
697
  if len(cropped_image_np.shape) == 2:
698
  cropped_image_np = np.stack([cropped_image_np] * 3, axis=-1)
699
+
700
  paddle_results = ocr.predict(cropped_image_np)
701
+
702
  if paddle_results and paddle_results[0]:
703
+ rec_texts = paddle_results[0].get("rec_texts", [])
704
+ rec_scores = paddle_results[0].get("rec_scores", [])
705
+
706
  if rec_texts and rec_scores:
707
  new_text = " ".join(rec_texts)
708
+ new_conf = int(round(np.median(rec_scores) * 100, 0))
709
 
710
  # Only replace if Paddle's confidence is better
711
  if new_conf > conf:
712
+ print(
713
+ f" Re-OCR'd word: '{text}' (conf: {conf}) -> '{new_text}' (conf: {new_conf:.0f})"
714
+ )
715
 
716
  # For exporting example image comparisons, not used here
717
  safe_text = self._sanitize_filename(text, max_length=20)
718
+ self._sanitize_filename(new_text, max_length=20)
719
  output_image_path = f"examples/tess_vs_paddle_examples/{conf}_conf_{safe_text}_to_{new_text}_{new_conf}.png"
720
  cropped_image.save(output_image_path)
721
 
722
  text = new_text
723
  conf = new_conf
724
+
725
  else:
726
+ print(
727
+ f" '{text}' (conf: {conf}) -> Paddle result '{new_text}' (conf: {new_conf:.0f}) was not better. Keeping original."
728
+ )
729
  else:
730
  # Paddle ran but found nothing, so discard the original low-confidence word
731
+ print(
732
+ f" '{text}' (conf: {conf}) -> No text found by Paddle. Discarding."
733
+ )
734
+ text = ""
735
  else:
736
  # Paddle found nothing, discard original word
737
+ print(
738
+ f" '{text}' (conf: {conf}) -> No text found by Paddle. Discarding."
739
+ )
740
+ text = ""
741
 
742
  # Append the final result (either original, replaced, or skipped if empty)
743
  if text.strip():
744
+ final_data["text"].append(clean_unicode_text(text))
745
+ final_data["left"].append(left)
746
+ final_data["top"].append(top)
747
+ final_data["width"].append(width)
748
+ final_data["height"].append(height)
749
+ final_data["conf"].append(int(conf))
750
+ # final_data['line_number'].append(int(line_number))
751
+
752
  return final_data
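The replacement decision inside the hybrid loop above, reduced to a toy function. The real loop also crops the word region, runs PaddleOCR on the crop and saves comparison images; none of that is reproduced here:

def choose_word(tess_text, tess_conf, paddle_text, paddle_conf, threshold=65):
    if tess_conf >= threshold:
        return tess_text, tess_conf        # confident enough: keep Tesseract's word
    if not paddle_text:
        return "", tess_conf               # second pass found nothing: discard word
    if paddle_conf > tess_conf:
        return paddle_text, paddle_conf    # better second opinion: replace
    return tess_text, tess_conf            # second opinion no better: keep original

print(choose_word("Srnith", 41, "Smith", 96))   # -> ('Smith', 96)
print(choose_word("Srnith", 41, "", 0))         # -> ('', 41), i.e. word dropped
print(choose_word("Smith", 88, None, None))     # -> ('Smith', 88)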
753
+
754
+ def perform_ocr(
755
+ self, image: Union[str, Image.Image, np.ndarray], ocr: Optional[Any] = None
756
+ ) -> List[OCRResult]:
757
  """
758
  Performs OCR on the given image using the configured engine.
759
  """
 
761
  image = Image.open(image)
762
  elif isinstance(image, np.ndarray):
763
  image = Image.fromarray(image)
764
+
765
  # Pre-process image - currently seems to give worse results!
766
+ if str(PREPROCESS_LOCAL_OCR_IMAGES).lower() == "true":
767
+ image, preprocessing_metadata = self.image_preprocessor.preprocess_image(
768
+ image
769
+ )
770
  else:
771
  preprocessing_metadata = {}
772
 
 
781
  image,
782
  output_type=pytesseract.Output.DICT,
783
  config=self.tesseract_config,
784
+ lang=self.tesseract_lang, # Ensure the Tesseract language data (e.g., fra.traineddata) is installed on your system.
785
  )
786
 
787
+ # ocr_data['abs_line_id'] = ocr_data.groupby(['block_num', 'par_num', 'line_num']).ngroup()
788
 
789
  elif self.ocr_engine == "paddle":
790
 
791
+ image_np = np.array(image) # image_processed
792
+
793
  # PaddleOCR may need an RGB image. Ensure it has 3 channels.
794
  if len(image_np.shape) == 2:
795
  image_np = np.stack([image_np] * 3, axis=-1)
796
 
797
  if ocr is None:
798
+ if hasattr(self, "paddle_ocr") and self.paddle_ocr is not None:
799
  ocr = self.paddle_ocr
800
  else:
801
+ raise ValueError(
802
+ "No OCR object provided and 'paddle_ocr' is not initialised."
803
+ )
804
 
805
+ # ocr = PaddleOCR(use_textline_orientation=True, lang='en')
806
  paddle_results = ocr.predict(image_np)
807
  ocr_data = self._convert_paddle_to_tesseract_format(paddle_results)
808
 
809
  else:
810
  raise RuntimeError(f"Unsupported OCR engine: {self.ocr_engine}")
811
+
812
  if preprocessing_metadata:
813
+ scale_factor = preprocessing_metadata.get("scale_factor", 1.0)
814
  ocr_data = rescale_ocr_data(ocr_data, scale_factor)
815
 
816
  # The rest of your processing pipeline now works for both engines
817
  ocr_result = ocr_data
818
+
819
  # Filter out empty strings and low confidence results
820
  valid_indices = [
821
+ i
822
+ for i, text in enumerate(ocr_result["text"])
823
+ if text.strip() and int(ocr_result["conf"][i]) > 0
824
  ]
825
+
826
  return [
827
  OCRResult(
828
+ text=clean_unicode_text(ocr_result["text"][i]),
829
+ left=ocr_result["left"][i],
830
+ top=ocr_result["top"][i],
831
+ width=ocr_result["width"][i],
832
+ height=ocr_result["height"][i], # ,
833
+ # line_number=ocr_result['abs_line_id'][i]
834
  )
835
  for i in valid_indices
836
  ]
837
 
838
  def analyze_text(
839
+ self,
840
+ line_level_ocr_results: List[OCRResult],
841
  ocr_results_with_words: Dict[str, Dict],
842
  chosen_redact_comprehend_entities: List[str],
843
  pii_identification_method: str = LOCAL_PII_OPTION,
844
+ comprehend_client="",
845
+ custom_entities: List[str] = custom_entities,
846
  language: Optional[str] = DEFAULT_LANGUAGE,
847
  nlp_analyser: AnalyzerEngine = None,
848
+ **text_analyzer_kwargs,
849
  ) -> List[CustomImageRecognizerResult]:
850
 
851
  page_text = ""
 
866
  page_text_mapping.append((start_pos, i, line_level_ocr_result, None))
867
 
868
  # Determine language for downstream services
869
+ aws_language = language or getattr(self, "language", None) or "en"
870
 
871
+ valid_language_entities = nlp_analyser.registry.get_supported_entities(
872
+ languages=[language]
873
+ )
874
  if "CUSTOM" not in valid_language_entities:
875
  valid_language_entities.append("CUSTOM")
876
  if "CUSTOM_FUZZY" not in valid_language_entities:
 
879
  # Process using either Local or AWS Comprehend
880
  if pii_identification_method == LOCAL_PII_OPTION:
881
 
882
+ language_supported_entities = filter_entities_for_language(
883
+ custom_entities, valid_language_entities, language
884
+ )
885
+
886
  if language_supported_entities:
887
  text_analyzer_kwargs["entities"] = language_supported_entities
888
 
889
+ # if language != "en":
890
  # gr.Info(f"Using {str(language_supported_entities)} entities for local model analysis for language: {language}")
891
  else:
892
  print(f"No relevant entities supported for language: {language}")
893
+ raise Warning(
894
+ f"No relevant entities supported for language: {language}"
895
+ )
896
 
897
  analyzer_result = nlp_analyser.analyze(
898
+ text=page_text, language=language, **text_analyzer_kwargs
 
 
899
  )
900
  all_text_line_results = map_back_entity_results(
901
+ analyzer_result, page_text_mapping, all_text_line_results
 
 
902
  )
903
 
904
+ elif pii_identification_method == AWS_PII_OPTION:
905
 
906
  # Handle custom entities first
907
  if custom_entities:
908
  custom_redact_entities = [
909
+ entity
910
+ for entity in chosen_redact_comprehend_entities
911
  if entity in custom_entities
912
  ]
913
 
914
  if custom_redact_entities:
915
  # Filter entities to only include those supported by the language
916
+ language_supported_entities = filter_entities_for_language(
917
+ custom_redact_entities, valid_language_entities, language
918
+ )
919
 
920
  if language_supported_entities:
921
  text_analyzer_kwargs["entities"] = language_supported_entities
922
 
923
  page_analyser_result = nlp_analyser.analyze(
924
+ text=page_text, language=language, **text_analyzer_kwargs
 
 
925
  )
926
  all_text_line_results = map_back_entity_results(
927
+ page_analyser_result, page_text_mapping, all_text_line_results
 
 
928
  )
929
 
930
  # Process text in batches for AWS Comprehend
 
937
  words = text_line.text.split()
938
  word_start_positions = list()
939
  current_pos = 0
940
+
941
  for word in words:
942
  word_start_positions.append(current_pos)
943
  current_pos += len(word) + 1
944
 
945
  for word_idx, word in enumerate(words):
946
  new_batch_char_count = len(current_batch) + len(word) + 1
947
+
948
  if batch_word_count >= 50 or new_batch_char_count >= 200:
949
  # Process current batch
950
  all_text_line_results = do_aws_comprehend_call(
 
952
  current_batch_mapping,
953
  comprehend_client,
954
  aws_language,
955
+ text_analyzer_kwargs.get("allow_list", []),
956
  chosen_redact_comprehend_entities,
957
+ all_text_line_results,
958
  )
959
  comprehend_query_number += 1
960
+
961
  # Reset batch
962
  current_batch = word
963
  batch_word_count = 1
964
  batch_char_count = len(word)
965
+ current_batch_mapping = [
966
+ (0, i, text_line, None, word_start_positions[word_idx])
967
+ ]
968
  else:
969
  if current_batch:
970
  current_batch += " "
 
972
  current_batch += word
973
  batch_char_count += len(word)
974
  batch_word_count += 1
975
+
976
+ if (
977
+ not current_batch_mapping
978
+ or current_batch_mapping[-1][1] != i
979
+ ):
980
+ current_batch_mapping.append(
981
+ (
982
+ batch_char_count - len(word),
983
+ i,
984
+ text_line,
985
+ None,
986
+ word_start_positions[word_idx],
987
+ )
988
+ )
989
 
990
  # Process final batch if any
991
  if current_batch:
 
994
  current_batch_mapping,
995
  comprehend_client,
996
  aws_language,
997
+ text_analyzer_kwargs.get("allow_list", []),
998
  chosen_redact_comprehend_entities,
999
+ all_text_line_results,
1000
  )
1001
+ comprehend_query_number += 1
1002
 
1003
  # Process results and create bounding boxes
1004
  combined_results = list()
1005
  for i, text_line in enumerate(line_level_ocr_results):
1006
+ line_results = next(
1007
+ (results for idx, results in all_text_line_results if idx == i), []
1008
+ )
1009
  if line_results and i < len(ocr_results_with_words):
1010
  child_level_key = list(ocr_results_with_words.keys())[i]
1011
+ ocr_results_with_words_line_level = ocr_results_with_words[
1012
+ child_level_key
1013
+ ]
1014
+
1015
  for result in line_results:
1016
  bbox_results = self.map_analyzer_results_to_bounding_boxes(
1017
  [result],
1018
+ [
1019
+ OCRResult(
1020
+ text=text_line.text[result.start : result.end],
1021
+ left=text_line.left,
1022
+ top=text_line.top,
1023
+ width=text_line.width,
1024
+ height=text_line.height,
1025
+ )
1026
+ ],
1027
  text_line.text,
1028
+ text_analyzer_kwargs.get("allow_list", []),
1029
+ ocr_results_with_words_line_level,
1030
  )
1031
  combined_results.extend(bbox_results)
1032
 
 
1034
 
1035
  @staticmethod
1036
  def map_analyzer_results_to_bounding_boxes(
1037
+ text_analyzer_results: List[RecognizerResult],
1038
+ redaction_relevant_ocr_results: List[OCRResult],
1039
+ full_text: str,
1040
+ allow_list: List[str],
1041
+ ocr_results_with_words_child_info: Dict[str, Dict],
1042
+ ) -> List[CustomImageRecognizerResult]:
1043
  redaction_bboxes = list()
1044
 
1045
  for redaction_relevant_ocr_result in redaction_relevant_ocr_results:
1046
+ # print("ocr_results_with_words_child_info:", ocr_results_with_words_child_info)
1047
 
1048
+ line_text = ocr_results_with_words_child_info["text"]
1049
  line_length = len(line_text)
1050
  redaction_text = redaction_relevant_ocr_result.text
1051
+
1052
  for redaction_result in text_analyzer_results:
1053
  # Check if the redaction text is not in the allow list
1054
+
1055
  if redaction_text not in allow_list:
1056
+
1057
  # Adjust start and end to be within line bounds
1058
  start_in_line = max(0, redaction_result.start)
1059
  end_in_line = min(line_length, redaction_result.end)
1060
+
1061
  # Get the matched text from this line
1062
  matched_text = line_text[start_in_line:end_in_line]
1063
+ matched_text.split()
1064
+
1065
  # Find the corresponding words in the OCR results
1066
  matching_word_boxes = list()
1067
 
1068
  current_position = 0
1069
 
1070
+ for word_info in ocr_results_with_words_child_info.get("words", []):
1071
+ word_text = word_info["text"]
1072
  word_length = len(word_text)
1073
 
1074
  word_start = current_position
1075
  word_end = current_position + word_length
1076
 
1077
  # Update current position for the next word
1078
+ current_position += (
1079
+ word_length + 1
1080
+ ) # +1 for the space after the word
1081
+
1082
  # Check if the word's bounding box is within the start and end bounds
1083
+ if word_start >= start_in_line and word_end <= (
1084
+ end_in_line + 1
1085
+ ):
1086
+ matching_word_boxes.append(word_info["bounding_box"])
1087
+ # print(f"Matched word: {word_info['text']}")
1088
+
1089
  if matching_word_boxes:
1090
  # Calculate the combined bounding box for all matching words
1091
  left = min(box[0] for box in matching_word_boxes)
1092
  top = min(box[1] for box in matching_word_boxes)
1093
  right = max(box[2] for box in matching_word_boxes)
1094
  bottom = max(box[3] for box in matching_word_boxes)
1095
+
1096
  redaction_bboxes.append(
1097
  CustomImageRecognizerResult(
1098
  entity_type=redaction_result.entity_type,
 
1103
  top=top,
1104
  width=right - left,
1105
  height=bottom - top,
1106
+ text=matched_text,
1107
  )
1108
  )
1109
 
1110
  return redaction_bboxes
1111
+
1112
  @staticmethod
1113
  def remove_space_boxes(ocr_result: dict) -> dict:
1114
  """Remove OCR bboxes that are for spaces.
 
1128
  filtered_ocr_result[key] = [ocr_result[key][i] for i in idx]
1129
 
1130
  return filtered_ocr_result
1131
+
1132
  @staticmethod
1133
  def _scale_bbox_results(
1134
  ocr_result: Dict[str, List[Union[int, str]]], scale_factor: float
 
1159
  # Estimate the x-offset based on character position
1160
  # This is a simple estimation and might need refinement for variable-width fonts
1161
  return int(start / len(full_text) * len(full_text))
1162
+
1163
  def estimate_width(self, ocr_result: OCRResult, start: int, end: int) -> int:
1164
  # Extract the relevant text portion
1165
  relevant_text = ocr_result.text[start:end]
1166
+
1167
  # If the relevant text is the same as the full text, return the full width
1168
  if relevant_text == ocr_result.text:
1169
  return ocr_result.width
1170
+
1171
  # Estimate width based on the proportion of the relevant text length to the total text length
1172
  total_text_length = len(ocr_result.text)
1173
  relevant_text_length = len(relevant_text)
1174
+
1175
  if total_text_length == 0:
1176
  return 0 # Avoid division by zero
1177
+
1178
  # Proportion of the relevant text to the total text
1179
  proportion = relevant_text_length / total_text_length
1180
+
1181
  # Estimate the width based on the proportion
1182
  estimated_width = int(proportion * ocr_result.width)
1183
+
1184
  return estimated_width
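A worked example of the proportional width estimate above. (As written, estimate_x_offset reduces to int(start), since the two len(full_text) terms cancel, so the offset is effectively the character index.)

text, line_width = "Contact John Smith today", 240
start, end = 8, 18                       # the span covering "John Smith"
proportion = (end - start) / len(text)   # 10 / 24
print(int(proportion * line_width))      # -> 100 estimated pixels of width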
1185
 
1186
 
1187
+ def bounding_boxes_overlap(box1: List, box2: List):
1188
  """Check if two bounding boxes overlap."""
1189
+ return (
1190
+ box1[0] < box2[2]
1191
+ and box2[0] < box1[2]
1192
+ and box1[1] < box2[3]
1193
+ and box2[1] < box1[3]
1194
+ )
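A quick check of the overlap predicate above with [left, top, right, bottom] boxes; boxes that only share an edge do not count as overlapping:

def boxes_overlap(a, b):
    return a[0] < b[2] and b[0] < a[2] and a[1] < b[3] and b[1] < a[3]

print(boxes_overlap([0, 0, 10, 10], [5, 5, 15, 15]))    # True  (they intersect)
print(boxes_overlap([0, 0, 10, 10], [10, 0, 20, 10]))   # False (edge-adjacent only)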
1195
+
1196
+
1197
+ def map_back_entity_results(
1198
+ page_analyser_result: dict,
1199
+ page_text_mapping: dict,
1200
+ all_text_line_results: List[Tuple],
1201
+ ):
1202
  for entity in page_analyser_result:
1203
  entity_start = entity.start
1204
  entity_end = entity.end
1205
+
1206
  # Track if the entity has been added to any line
1207
  added_to_line = False
1208
+
1209
  for batch_start, line_idx, original_line, chars in page_text_mapping:
1210
  batch_end = batch_start + len(original_line.text)
1211
+
1212
  # Check if the entity overlaps with the current line
1213
+ if (
1214
+ batch_start < entity_end and batch_end > entity_start
1215
+ ): # Overlap condition
1216
+ relative_start = max(
1217
+ 0, entity_start - batch_start
1218
+ ) # Adjust start relative to the line
1219
+ relative_end = min(
1220
+ entity_end - batch_start, len(original_line.text)
1221
+ ) # Adjust end relative to the line
1222
+
1223
  # Create a new adjusted entity
1224
  adjusted_entity = copy.deepcopy(entity)
1225
  adjusted_entity.start = relative_start
1226
  adjusted_entity.end = relative_end
1227
+
1228
  # Check if this line already has an entry
1229
+ existing_entry = next(
1230
+ (entry for idx, entry in all_text_line_results if idx == line_idx),
1231
+ None,
1232
+ )
1233
+
1234
  if existing_entry is None:
1235
  all_text_line_results.append((line_idx, [adjusted_entity]))
1236
  else:
1237
+ existing_entry.append(
1238
+ adjusted_entity
1239
+ ) # Append to the existing list of entities
1240
+
1241
  added_to_line = True
1242
+
1243
  # If the entity spans multiple lines, you may want to handle that here
1244
  if not added_to_line:
1245
  # Handle cases where the entity does not fit in any line (optional)
 
1247
 
1248
  return all_text_line_results
1249
 
1250
+
1251
+ def map_back_comprehend_entity_results(
1252
+ response: object,
1253
+ current_batch_mapping: List[Tuple],
1254
+ allow_list: List[str],
1255
+ chosen_redact_comprehend_entities: List[str],
1256
+ all_text_line_results: List[Tuple],
1257
+ ):
1258
  if not response or "Entities" not in response:
1259
  return all_text_line_results
1260
 
 
1269
  added_to_line = False
1270
 
1271
  # Find the correct line and offset within that line
1272
+ for (
1273
+ batch_start,
1274
+ line_idx,
1275
+ original_line,
1276
+ chars,
1277
+ line_offset,
1278
+ ) in current_batch_mapping:
1279
  batch_end = batch_start + len(original_line.text[line_offset:])
1280
 
1281
  # Check if the entity overlaps with the current line
1282
+ if (
1283
+ batch_start < entity_end and batch_end > entity_start
1284
+ ): # Overlap condition
1285
  # Calculate the absolute position within the line
1286
  relative_start = max(0, entity_start - batch_start + line_offset)
1287
+ relative_end = min(
1288
+ entity_end - batch_start + line_offset, len(original_line.text)
1289
+ )
1290
 
1291
  result_text = original_line.text[relative_start:relative_end]
1292
 
1293
  if result_text not in allow_list:
1294
  adjusted_entity = entity.copy()
1295
+ adjusted_entity["BeginOffset"] = (
1296
+ relative_start # Now relative to the full line
1297
+ )
1298
  adjusted_entity["EndOffset"] = relative_end
1299
 
1300
  recogniser_entity = recognizer_result_from_dict(adjusted_entity)
1301
 
1302
+ existing_entry = next(
1303
+ (
1304
+ entry
1305
+ for idx, entry in all_text_line_results
1306
+ if idx == line_idx
1307
+ ),
1308
+ None,
1309
+ )
1310
  if existing_entry is None:
1311
  all_text_line_results.append((line_idx, [recogniser_entity]))
1312
  else:
1313
+ existing_entry.append(
1314
+ recogniser_entity
1315
+ ) # Append to the existing list of entities
1316
 
1317
  added_to_line = True
1318
 
 
1322
 
1323
  return all_text_line_results
1324
 
1325
+
1326
+ def do_aws_comprehend_call(
1327
+ current_batch: str,
1328
+ current_batch_mapping: List[Tuple],
1329
+ comprehend_client: botocore.client.BaseClient,
1330
+ language: str,
1331
+ allow_list: List[str],
1332
+ chosen_redact_comprehend_entities: List[str],
1333
+ all_text_line_results: List[Tuple],
1334
+ ):
1335
  if not current_batch:
1336
  return all_text_line_results
1337
 
 
1341
  for attempt in range(max_retries):
1342
  try:
1343
  response = comprehend_client.detect_pii_entities(
1344
+ Text=current_batch.strip(), LanguageCode=language
 
1345
  )
1346
 
1347
  all_text_line_results = map_back_comprehend_entity_results(
1348
+ response,
1349
+ current_batch_mapping,
1350
+ allow_list,
1351
+ chosen_redact_comprehend_entities,
1352
+ all_text_line_results,
1353
  )
1354
 
1355
  return all_text_line_results
1356
+
1357
  except Exception as e:
1358
  if attempt == max_retries - 1:
1359
  print("AWS Comprehend calls failed due to", e)
1360
  raise
1361
  time.sleep(retry_delay)
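The retry pattern used by do_aws_comprehend_call, sketched with a stand-in client so it runs without AWS credentials. FakeComprehend is purely illustrative; the real call is comprehend_client.detect_pii_entities on a boto3 Comprehend client:

import time

class FakeComprehend:
    def __init__(self):
        self.calls = 0

    def detect_pii_entities(self, Text, LanguageCode):
        self.calls += 1
        if self.calls < 2:
            raise RuntimeError("transient error")   # fail once, then succeed
        return {"Entities": []}

def call_with_retries(client, text, language, max_retries=3, retry_delay=0.1):
    for attempt in range(max_retries):
        try:
            return client.detect_pii_entities(Text=text.strip(), LanguageCode=language)
        except Exception:
            if attempt == max_retries - 1:
                raise
            time.sleep(retry_delay)

print(call_with_retries(FakeComprehend(), "John lives in Leeds", "en"))
# -> {'Entities': []} after one retried failure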
1362
 
1363
+
1364
  def run_page_text_redaction(
1365
  language: str,
1366
  chosen_redact_entities: List[str],
 
1369
  line_characters: List,
1370
  page_analyser_results: List = list(),
1371
  page_analysed_bounding_boxes: List = list(),
1372
+ comprehend_client=None,
1373
  allow_list: List[str] = None,
1374
  pii_identification_method: str = LOCAL_PII_OPTION,
1375
  nlp_analyser: AnalyzerEngine = None,
1376
  score_threshold: float = 0.0,
1377
  custom_entities: List[str] = None,
1378
+ comprehend_query_number: int = 0,
1379
  ):
1380
  """
1381
  This function performs text redaction on a page based on the specified language and chosen entities.
 
1396
  custom_entities (List[str], optional): A list of custom entities for redaction. Defaults to None.
1397
  comprehend_query_number (int, optional): A counter for the number of Comprehend queries made. Defaults to 0.
1398
  """
1399
+
1400
  page_text = ""
1401
  page_text_mapping = list()
1402
  all_text_line_results = list()
 
1407
  if chosen_redact_entities:
1408
  if page_text:
1409
  page_text += " "
1410
+
1411
  start_pos = len(page_text)
1412
  page_text += text_line.text
1413
  page_text_mapping.append((start_pos, i, text_line, line_characters[i]))
1414
 
1415
+ valid_language_entities = nlp_analyser.registry.get_supported_entities(
1416
+ languages=[language]
1417
+ )
1418
  if "CUSTOM" not in valid_language_entities:
1419
  valid_language_entities.append("CUSTOM")
1420
  if "CUSTOM_FUZZY" not in valid_language_entities:
 
1424
  if pii_identification_method == LOCAL_PII_OPTION:
1425
  if not nlp_analyser:
1426
  raise ValueError("nlp_analyser is required for Local identification method")
1427
+
1428
+ language_supported_entities = filter_entities_for_language(
1429
+ chosen_redact_entities, valid_language_entities, language
1430
+ )
1431
 
1432
  page_analyser_result = nlp_analyser.analyze(
1433
  text=page_text,
 
1435
  entities=language_supported_entities,
1436
  score_threshold=score_threshold,
1437
  return_decision_process=True,
1438
+ allow_list=allow_list,
1439
  )
1440
 
 
1441
  all_text_line_results = map_back_entity_results(
1442
+ page_analyser_result, page_text_mapping, all_text_line_results
 
 
1443
  )
1444
 
1445
  elif pii_identification_method == AWS_PII_OPTION:
 
1447
  # Process custom entities if any
1448
  if custom_entities:
1449
  custom_redact_entities = [
1450
+ entity
1451
+ for entity in chosen_redact_comprehend_entities
1452
  if entity in custom_entities
1453
  ]
1454
 
1455
+ language_supported_entities = filter_entities_for_language(
1456
+ custom_redact_entities, valid_language_entities, language
1457
+ )
1458
 
1459
  if language_supported_entities:
1460
  page_analyser_result = nlp_analyser.analyze(
 
1463
  entities=language_supported_entities,
1464
  score_threshold=score_threshold,
1465
  return_decision_process=True,
1466
+ allow_list=allow_list,
1467
  )
1468
 
1469
  all_text_line_results = map_back_entity_results(
1470
+ page_analyser_result, page_text_mapping, all_text_line_results
 
 
1471
  )
1472
 
1473
  current_batch = ""
 
1478
  for i, text_line in enumerate(line_level_text_results_list):
1479
  words = text_line.text.split()
1480
  word_start_positions = list()
1481
+
1482
  # Calculate word start positions within the line
1483
  current_pos = 0
1484
  for word in words:
1485
  word_start_positions.append(current_pos)
1486
  current_pos += len(word) + 1 # +1 for space
1487
+
1488
  for word_idx, word in enumerate(words):
1489
  new_batch_char_count = len(current_batch) + len(word) + 1
1490
+
1491
  if batch_word_count >= 50 or new_batch_char_count >= 200:
1492
  # Process current batch
1493
  all_text_line_results = do_aws_comprehend_call(
 
1497
  language,
1498
  allow_list,
1499
  chosen_redact_comprehend_entities,
1500
+ all_text_line_results,
1501
  )
1502
  comprehend_query_number += 1
1503
+
1504
  # Start new batch
1505
  current_batch = word
1506
  batch_word_count = 1
1507
  batch_char_count = len(word)
1508
+ current_batch_mapping = [
1509
+ (
1510
+ 0,
1511
+ i,
1512
+ text_line,
1513
+ line_characters[i],
1514
+ word_start_positions[word_idx],
1515
+ )
1516
+ ]
1517
  else:
1518
  if current_batch:
1519
  current_batch += " "
 
1521
  current_batch += word
1522
  batch_char_count += len(word)
1523
  batch_word_count += 1
1524
+
1525
  if not current_batch_mapping or current_batch_mapping[-1][1] != i:
1526
+ current_batch_mapping.append(
1527
+ (
1528
+ batch_char_count - len(word),
1529
+ i,
1530
+ text_line,
1531
+ line_characters[i],
1532
+ word_start_positions[
1533
+ word_idx
1534
+ ], # Add the word's start position within its line
1535
+ )
1536
+ )
1537
 
1538
  # Process final batch
1539
  if current_batch:
 
1544
  language,
1545
  allow_list,
1546
  chosen_redact_comprehend_entities,
1547
+ all_text_line_results,
1548
  )
1549
  comprehend_query_number += 1
1550
 
1551
  # Process results for each line
1552
  for i, text_line in enumerate(line_level_text_results_list):
1553
+ line_results = next(
1554
+ (results for idx, results in all_text_line_results if idx == i), []
1555
+ )
1556
+
1557
  if line_results:
1558
  text_line_bounding_boxes = merge_text_bounding_boxes(
1559
+ line_results, line_characters[i]
 
1560
  )
1561
+
1562
  page_analyser_results.extend(line_results)
1563
  page_analysed_bounding_boxes.extend(text_line_bounding_boxes)
1564
 
1565
  return page_analysed_bounding_boxes
1566
 
1567
+
1568
+ def merge_text_bounding_boxes(
1569
+ analyser_results: dict,
1570
+ characters: List[LTChar],
1571
+ combine_pixel_dist: int = 20,
1572
+ vertical_padding: int = 0,
1573
+ ):
1574
+ """
1575
  Merge identified bounding boxes containing PII that are very close to one another
1576
+ """
1577
  analysed_bounding_boxes = list()
1578
  original_bounding_boxes = list() # List to hold original bounding boxes
1579
 
 
1581
  # Extract bounding box coordinates for sorting
1582
  bounding_boxes = list()
1583
  for result in analyser_results:
1584
+ # print("Result:", result)
1585
+ char_boxes = [
1586
+ char.bbox
1587
+ for char in characters[result.start : result.end]
1588
+ if isinstance(char, LTChar)
1589
+ ]
1590
+ char_text = [
1591
+ char._text
1592
+ for char in characters[result.start : result.end]
1593
+ if isinstance(char, LTChar)
1594
+ ]
1595
  if char_boxes:
1596
  # Calculate the bounding box that encompasses all characters
1597
  left = min(box[0] for box in char_boxes)
 
1599
  right = max(box[2] for box in char_boxes)
1600
  top = max(box[3] for box in char_boxes) + vertical_padding
1601
  bbox = [left, bottom, right, top]
1602
+ bounding_boxes.append(
1603
+ (bottom, left, result, bbox, char_text)
1604
+ ) # (y, x, result, bbox, text)
1605
 
1606
  # Store original bounding boxes
1607
+ original_bounding_boxes.append(
1608
+ {
1609
+ "text": "".join(char_text),
1610
+ "boundingBox": bbox,
1611
+ "result": copy.deepcopy(result),
1612
+ }
1613
+ )
1614
+ # print("Original bounding boxes:", original_bounding_boxes)
1615
 
1616
  # Sort the results by y-coordinate and then by x-coordinate
1617
  bounding_boxes.sort()
 
1633
  vertical_diff_bboxes = abs(next_box[1] - current_y)
1634
  horizontal_diff_bboxes = abs(next_box[0] - current_box[2])
1635
 
1636
+ if (
1637
+ vertical_diff_bboxes <= 5
1638
+ and horizontal_diff_bboxes <= combine_pixel_dist
1639
+ ):
1640
  # Merge bounding boxes
1641
+ # print("Merging boxes")
1642
  merged_box = current_box.copy()
1643
  merged_result = current_result
1644
  merged_text = current_text.copy()
1645
 
1646
  merged_box[2] = next_box[2] # Extend horizontally
1647
  merged_box[3] = max(current_box[3], next_box[3]) # Adjust the top
1648
+ merged_result.end = max(
1649
+ current_result.end, result.end
1650
+ ) # Extend text range
1651
  try:
1652
  if current_result.entity_type != result.entity_type:
1653
+ merged_result.entity_type = (
1654
+ current_result.entity_type + " - " + result.entity_type
1655
+ )
1656
  else:
1657
  merged_result.entity_type = current_result.entity_type
1658
  except Exception as e:
 
1661
  merged_text.append(" ") # Add space between texts
1662
  merged_text.extend(text)
1663
 
1664
+ merged_bounding_boxes.append(
1665
+ {
1666
+ "text": "".join(merged_text),
1667
+ "boundingBox": merged_box,
1668
+ "result": merged_result,
1669
+ }
1670
+ )
1671
 
1672
  else:
1673
  # Start a new bounding box
 
1680
  analysed_bounding_boxes.extend(original_bounding_boxes)
1681
  analysed_bounding_boxes.extend(merged_bounding_boxes)
1682
 
1683
+ # print("Analysed bounding boxes:", analysed_bounding_boxes)
1684
 
1685
  return analysed_bounding_boxes
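The merge condition above, paraphrased as a standalone predicate over [left, bottom, right, top] boxes sorted left-to-right on one line:

def should_merge(current_box, next_box, combine_pixel_dist=20):
    vertical_diff = abs(next_box[1] - current_box[1])     # compare baselines
    horizontal_gap = abs(next_box[0] - current_box[2])    # gap after the previous box
    return vertical_diff <= 5 and horizontal_gap <= combine_pixel_dist

print(should_merge([10, 100, 60, 112], [70, 101, 120, 112]))    # True  (10 px gap)
print(should_merge([10, 100, 60, 112], [200, 101, 250, 112]))   # False (140 px gap)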
1686
 
1687
+
1688
+ def recreate_page_line_level_ocr_results_with_page(
1689
+ page_line_level_ocr_results_with_words: dict,
1690
+ ):
1691
  reconstructed_results = list()
1692
+
1693
  # Assume all lines belong to the same page, so we can just read it from one item
1694
+ # page = next(iter(page_line_level_ocr_results_with_words.values()))["page"]
1695
 
1696
  page = page_line_level_ocr_results_with_words["page"]
1697
+
1698
  for line_data in page_line_level_ocr_results_with_words["results"].values():
1699
  bbox = line_data["bounding_box"]
1700
  text = line_data["text"]
 
1708
  top=bbox[1],
1709
  width=bbox[2] - bbox[0],
1710
  height=bbox[3] - bbox[1],
1711
+ line=line_number,
1712
  )
1713
  reconstructed_results.append(line_result)
1714
+
1715
+ page_line_level_ocr_results_with_page = {
1716
+ "page": page,
1717
+ "results": reconstructed_results,
1718
+ }
1719
+
1720
  return page_line_level_ocr_results_with_page
1721
 
1722
+
1723
+ def split_words_and_punctuation_from_line(
1724
+ line_of_words: List[OCRResult],
1725
+ ) -> List[OCRResult]:
1726
  """
1727
  Takes a list of OCRResult objects and splits words with trailing/leading punctuation.
1728
 
 
1731
  "high-tech" are preserved.
1732
  """
1733
  # Punctuation that will be split off. Hyphen is not included.
1734
+
 
1735
  new_word_list = list()
1736
+
1737
  for word_result in line_of_words:
1738
  word_text = word_result.text
1739
+
1740
  # This regex finds a central "core" word, and captures leading and trailing punctuation
1741
  # Handles cases like "(word)." -> group1='(', group2='word', group3='.'
1742
  match = re.match(r"([(\[{]*)(.*?)_?([.,?!:;)\}\]]*)$", word_text)
1743
 
1744
  # Handle words with internal hyphens that might confuse the regex
1745
+ if "-" in word_text and not match.group(2):
1746
+ core_part_text = word_text
1747
+ leading_punc = ""
1748
+ trailing_punc = ""
1749
  elif match:
1750
  leading_punc, core_part_text, trailing_punc = match.groups()
1751
+ else: # Failsafe
1752
  new_word_list.append(word_result)
1753
  continue
1754
+
1755
  # If no split is needed, just add the original and continue
1756
  if not leading_punc and not trailing_punc:
1757
  new_word_list.append(word_result)
1758
  continue
1759
+
1760
  # --- A split is required ---
1761
  # Estimate new bounding boxes by proportionally allocating width
1762
  original_width = word_result.width
1763
+ if not word_text or original_width == 0:
1764
+ continue # Failsafe
1765
+
1766
  avg_char_width = original_width / len(word_text)
1767
  current_left = word_result.left
1768
 
1769
  # Add leading punctuation if it exists
1770
  if leading_punc:
1771
  punc_width = avg_char_width * len(leading_punc)
1772
+ new_word_list.append(
1773
+ OCRResult(
1774
+ text=leading_punc,
1775
+ left=current_left,
1776
+ top=word_result.top,
1777
+ width=punc_width,
1778
+ height=word_result.height,
1779
+ )
1780
+ )
1781
  current_left += punc_width
1782
 
1783
  # Add the core part of the word
1784
  if core_part_text:
1785
  core_width = avg_char_width * len(core_part_text)
1786
+ new_word_list.append(
1787
+ OCRResult(
1788
+ text=core_part_text,
1789
+ left=current_left,
1790
+ top=word_result.top,
1791
+ width=core_width,
1792
+ height=word_result.height,
1793
+ )
1794
+ )
1795
  current_left += core_width
1796
 
1797
  # Add trailing punctuation if it exists
1798
  if trailing_punc:
1799
  punc_width = avg_char_width * len(trailing_punc)
1800
+ new_word_list.append(
1801
+ OCRResult(
1802
+ text=trailing_punc,
1803
+ left=current_left,
1804
+ top=word_result.top,
1805
+ width=punc_width,
1806
+ height=word_result.height,
1807
+ )
1808
+ )
1809
+
1810
  return new_word_list
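A standalone check of the punctuation-splitting regex above; the real function additionally handles the hyphen special case and apportions bounding-box widths, so only the text split is shown here:

import re

def split_punct(word):
    match = re.match(r"([(\[{]*)(.*?)_?([.,?!:;)\}\]]*)$", word)
    leading, core, trailing = match.groups()
    return [part for part in (leading, core, trailing) if part]

print(split_punct("(word)."))     # -> ['(', 'word', ').']
print(split_punct("high-tech,"))  # -> ['high-tech', ','] hyphen kept inside the word
print(split_punct("plain"))       # -> ['plain'] unchanged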
1811
 
1812
+
1813
+ def create_ocr_result_with_children(
1814
+ combined_results: dict, i: int, current_bbox: dict, current_line: list
1815
+ ):
1816
+ combined_results["text_line_" + str(i)] = {
1817
  "line": i,
1818
+ "text": current_bbox.text,
1819
+ "bounding_box": (
1820
+ current_bbox.left,
1821
+ current_bbox.top,
1822
+ current_bbox.left + current_bbox.width,
1823
+ current_bbox.top + current_bbox.height,
1824
+ ),
1825
+ "words": [
1826
+ {
1827
+ "text": word.text,
1828
+ "bounding_box": (
1829
+ word.left,
1830
+ word.top,
1831
+ word.left + word.width,
1832
+ word.top + word.height,
1833
+ ),
1834
+ }
1835
+ for word in current_line
1836
+ ],
1837
  }
1838
+ return combined_results["text_line_" + str(i)]
1839
+
1840
 
1841
+ def combine_ocr_results(
1842
+ ocr_results: List[OCRResult],
1843
+ x_threshold: float = 50.0,
1844
+ y_threshold: float = 12.0,
1845
+ page: int = 1,
1846
+ ):
1847
  """
1848
  Group OCR results into lines, splitting words from punctuation.
1849
  """
 
1878
  line_top = min(word.top for word in line)
1879
  line_right = max(word.left + word.width for word in line)
1880
  line_bottom = max(word.top + word.height for word in line)
1881
+
1882
  final_line_bbox = OCRResult(
1883
  text=line_text,
1884
  left=line_left,
1885
  top=line_top,
1886
  width=line_right - line_left,
1887
  height=line_bottom - line_top,
1888
+ line=line_counter,
1889
  )
1890
+
1891
  page_line_level_ocr_results.append(final_line_bbox)
1892
+
1893
  # Use the PROCESSED line to create the children. Creates a result within page_line_level_ocr_results_with_words
1894
+ page_line_level_ocr_results_with_words["text_line_" + str(line_counter)] = (
1895
+ create_ocr_result_with_children(
1896
+ page_line_level_ocr_results_with_words,
1897
+ line_counter,
1898
+ final_line_bbox,
1899
+ processed_line, # <-- Use the new, split list of words
1900
+ )
1901
  )
1902
  line_counter += 1
1903
 
1904
+ page_level_results_with_page = {
1905
+ "page": page,
1906
+ "results": page_line_level_ocr_results,
1907
+ }
1908
+ page_level_results_with_words = {
1909
+ "page": page,
1910
+ "results": page_line_level_ocr_results_with_words,
1911
+ }
1912
 
1913
  return page_level_results_with_page, page_level_results_with_words
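For orientation, the two dictionaries returned here are expected to look roughly as follows (a sketch derived from the code above, with placeholder values):

# page_level_results_with_page  -> {"page": 1, "results": [OCRResult(...), ...]}
# page_level_results_with_words -> {
#     "page": 1,
#     "results": {
#         "text_line_1": {
#             "line": 1,
#             "text": "An example line",
#             "bounding_box": (x0, y0, x1, y1),
#             "words": [{"text": "An", "bounding_box": (x0, y0, x1, y1)}, ...],
#         },
#         ...
#     },
# }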
tools/data_anonymise.py CHANGED
@@ -1,65 +1,112 @@
1
- import re
2
  import os
 
3
  import secrets
4
- import base64
5
  import time
 
 
 
6
  import boto3
7
  import botocore
8
- import pandas as pd
9
- import polars as pl
10
- import unicodedata
11
  import docx
12
  import gradio as gr
13
- from openpyxl import Workbook
 
 
14
  from faker import Faker
15
  from gradio import Progress
16
- from typing import List, Dict, Any, Optional
17
- from botocore.client import BaseClient
18
- from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
19
- from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, DictAnalyzerResult, RecognizerResult
 
 
 
20
  from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
21
- from tools.config import RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, OUTPUT_FOLDER, DEFAULT_LANGUAGE, aws_comprehend_language_choices, DO_INITIAL_TABULAR_DATA_CLEAN, CUSTOM_ENTITIES, PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS, AWS_REGION, MAX_TABLE_ROWS, MAX_TABLE_COLUMNS, MAX_SIMULTANEOUS_FILES
22
- from tools.helper_functions import get_file_name_without_type, read_file, detect_file_type, _get_env_list
23
- from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_word_list_recogniser, CustomWordFuzzyRecognizer, create_nlp_analyser, load_spacy_model
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  # Use custom version of analyze_dict to be able to track progress
25
  from tools.presidio_analyzer_custom import analyze_dict
26
 
27
- if DO_INITIAL_TABULAR_DATA_CLEAN == "True": DO_INITIAL_TABULAR_DATA_CLEAN = True
28
- else: DO_INITIAL_TABULAR_DATA_CLEAN = False
 
 
29
 
30
- if CUSTOM_ENTITIES: CUSTOM_ENTITIES = _get_env_list(CUSTOM_ENTITIES)
 
31
 
32
  custom_entities = CUSTOM_ENTITIES
33
 
34
  fake = Faker("en_UK")
 
 
35
  def fake_first_name(x):
36
  return fake.first_name()
37
 
 
38
  # #### Some of my cleaning functions
39
- url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+|(?:www\.)[a-zA-Z0-9._-]+\.[a-zA-Z]{2,}'
40
- html_pattern_regex = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0|&nbsp;'
41
- html_start_pattern_end_dots_regex = r'<(.*?)\.\.'
42
- non_ascii_pattern = r'[^\x00-\x7F]+'
43
- and_sign_regex = r'&'
44
- multiple_spaces_regex = r'\s{2,}'
45
- multiple_new_lines_regex = r'(\r\n|\n)+'
46
  multiple_punctuation_regex = r"(\p{P})\p{P}+"
47
 
48
- def initial_clean(texts:pd.Series) -> pd.Series:
49
- '''
 
50
  This function cleans the text by removing URLs, HTML tags, and non-ASCII characters.
51
- '''
52
  for text in texts:
53
  if not text or pd.isnull(text):
54
  text = ""
55
 
56
  # Normalize unicode characters to decompose any special forms
57
- normalized_text = unicodedata.normalize('NFKC', text)
58
 
59
  # Replace smart quotes and special punctuation with standard ASCII equivalents
60
  replacements = {
61
- '‘': "'", '’': "'", '“': '"', '”': '"',
62
- '–': '-', '—': '-', '…': '...', '•': '*',
 
 
 
 
 
 
63
  }
64
 
65
  # Perform replacements
@@ -70,55 +117,70 @@ def initial_clean(texts:pd.Series) -> pd.Series:
70
 
71
  # Convert to polars Series
72
  texts = pl.Series(texts).str.strip_chars()
73
-
74
  # Define a list of patterns and their replacements
75
  patterns = [
76
- (multiple_new_lines_regex, ' '),
77
- (r'\r', ''),
78
- (url_pattern, ' '),
79
- (html_pattern_regex, ' '),
80
- (html_start_pattern_end_dots_regex, ' '),
81
- (non_ascii_pattern, ' '),
82
- (multiple_spaces_regex, ' '),
83
  (multiple_punctuation_regex, "${1}"),
84
- (and_sign_regex, 'and')
85
  ]
86
-
87
  # Apply each regex replacement
88
  for pattern, replacement in patterns:
89
  texts = texts.str.replace_all(pattern, replacement)
90
-
91
  # Convert the series back to a list
92
  texts = texts.to_list()
93
-
94
  return texts
95
 
96
- def process_recognizer_result(result:RecognizerResult, recognizer_result:RecognizerResult, data_row:int, dictionary_key:int, df_dict:Dict[str, List[Any]], keys_to_keep:List[str]) -> List[str]:
97
- output = list()
98
 
99
- if hasattr(result, 'value'):
100
- text = result.value[data_row]
101
- else:
102
- text = ""
103
-
104
- if isinstance(recognizer_result, list):
105
- for sub_result in recognizer_result:
106
- if isinstance(text, str):
107
- found_text = text[sub_result.start:sub_result.end]
108
- else:
109
- found_text = ''
110
- analysis_explanation = {key: sub_result.__dict__[key] for key in keys_to_keep}
111
- analysis_explanation.update({
112
- 'data_row': str(data_row),
113
- 'column': list(df_dict.keys())[dictionary_key],
114
- 'entity': found_text
115
- })
116
- output.append(str(analysis_explanation))
117
-
118
- return output
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
 
120
  # Writing decision making process to file
121
- def generate_decision_process_output(analyzer_results: List[DictAnalyzerResult], df_dict: Dict[str, List[Any]]) -> str:
 
 
122
  """
123
  Generate a detailed output of the decision process for entity recognition.
124
 
@@ -135,35 +197,46 @@ def generate_decision_process_output(analyzer_results: List[DictAnalyzerResult],
135
  str: A string containing the detailed decision process output.
136
  """
137
  decision_process_output = list()
138
- keys_to_keep = ['entity_type', 'start', 'end']
139
 
140
  # Run through each column to analyse for PII
141
  for i, result in enumerate(analyzer_results):
142
 
143
  # If a single result
144
  if isinstance(result, RecognizerResult):
145
- decision_process_output.extend(process_recognizer_result(result, result, 0, i, df_dict, keys_to_keep))
 
 
146
 
147
  # If a list of results
148
  elif isinstance(result, list) or isinstance(result, DictAnalyzerResult):
149
  for x, recognizer_result in enumerate(result.recognizer_results):
150
- decision_process_output.extend(process_recognizer_result(result, recognizer_result, x, i, df_dict, keys_to_keep))
 
 
 
 
151
 
152
  else:
153
  try:
154
- decision_process_output.extend(process_recognizer_result(result, result, 0, i, df_dict, keys_to_keep))
 
 
 
 
155
  except Exception as e:
156
  print(e)
157
 
158
- decision_process_output_str = '\n'.join(decision_process_output)
159
 
160
  return decision_process_output_str
161
 
162
- def anon_consistent_names(df:pd.DataFrame) -> pd.DataFrame:
 
163
  # ## Pick out common names and replace them with the same person value
164
  df_dict = df.to_dict(orient="list")
165
 
166
- #analyzer = AnalyzerEngine()
167
  batch_analyzer = BatchAnalyzerEngine(analyzer_engine=nlp_analyser)
168
 
169
  analyzer_results = batch_analyzer.analyze_dict(df_dict, language=DEFAULT_LANGUAGE)
@@ -177,17 +250,17 @@ def anon_consistent_names(df:pd.DataFrame) -> pd.DataFrame:
177
 
178
  # Adjusting the parse_dict function to handle trailing ']'
179
  # Splitting the main data string into individual list strings
180
- list_strs = data_str[1:-1].split('], [')
181
 
182
  def parse_dict(s):
183
- s = s.strip('[]') # Removing any surrounding brackets
184
- items = s.split(', ')
185
  d = {}
186
  for item in items:
187
- key, value = item.split(': ')
188
- if key == 'score':
189
  d[key] = float(value)
190
- elif key in ['start', 'end']:
191
  d[key] = int(value)
192
  else:
193
  d[key] = value
@@ -199,9 +272,11 @@ def anon_consistent_names(df:pd.DataFrame) -> pd.DataFrame:
199
 
200
  for lst_str in list_strs:
201
  # Splitting each list string into individual dictionary strings
202
- dict_strs = lst_str.split(', type: ')
203
- dict_strs = [dict_strs[0]] + ['type: ' + s for s in dict_strs[1:]] # Prepending "type: " back to the split strings
204
-
 
 
205
  # Parsing each dictionary string
206
  dicts = [parse_dict(d) for d in dict_strs]
207
  result.append(dicts)
@@ -211,29 +286,36 @@ def anon_consistent_names(df:pd.DataFrame) -> pd.DataFrame:
211
  for idx, paragraph in enumerate(text):
212
  paragraph_texts = list()
213
  for dictionary in result[idx]:
214
- if dictionary['type'] == 'PERSON':
215
- paragraph_texts.append(paragraph[dictionary['start']:dictionary['end']])
 
 
216
  names.append(paragraph_texts)
217
 
218
  # Flatten the list of lists and extract unique names
219
  unique_names = list(set(name for sublist in names for name in sublist))
220
-
221
  fake_names = pd.Series(unique_names).apply(fake_first_name)
222
 
223
- mapping_df = pd.DataFrame(data={"Unique names":unique_names,
224
- "Fake names": fake_names})
 
225
 
226
  # Convert mapping dataframe to dictionary, adding word boundaries for full-word match
227
- name_map = {r'\b' + k + r'\b': v for k, v in zip(mapping_df['Unique names'], mapping_df['Fake names'])}
 
 
 
228
 
229
  name_map
230
 
231
- scrubbed_df_consistent_names = df.replace(name_map, regex = True)
232
 
233
  scrubbed_df_consistent_names
234
 
235
  return scrubbed_df_consistent_names
236
 
 
237
  def handle_docx_anonymisation(
238
  file_path: str,
239
  output_folder: str,
@@ -247,7 +329,7 @@ def handle_docx_anonymisation(
247
  comprehend_query_number: int,
248
  comprehend_client: BaseClient,
249
  language: Optional[str] = DEFAULT_LANGUAGE,
250
- nlp_analyser: AnalyzerEngine = nlp_analyser
251
  ):
252
  """
253
  Anonymises a .docx file by extracting text, processing it, and re-inserting it.
@@ -255,11 +337,13 @@ def handle_docx_anonymisation(
255
  Returns:
256
  A tuple containing the output .docx path, the decision log path, the anonymised CSV path, and the updated Comprehend query count.
257
  """
258
-
259
  # 1. Load the document and extract text elements
260
  doc = docx.Document(file_path)
261
- text_elements = list() # This will store the actual docx objects (paragraphs, cells)
262
- original_texts = list() # This will store the text from those objects
 
 
263
 
264
  paragraph_count = len(doc.paragraphs)
265
 
@@ -278,18 +362,18 @@ def handle_docx_anonymisation(
278
  for table in doc.tables:
279
  for row in table.rows:
280
  for cell in row.cells:
281
- if cell.text.strip(): # Only process non-empty cells
282
  text_elements.append(cell)
283
  original_texts.append(cell.text)
284
-
285
  # If there's no text to process, return early
286
  if not original_texts:
287
  print(f"No text found in {file_path}. Skipping.")
288
  return None, None, None, comprehend_query_number
289
 
290
  # 2. Convert to a DataFrame for the existing anonymisation script
291
- df_to_anonymise = pd.DataFrame({'text_to_redact': original_texts})
292
-
293
  # 3. Call the core anonymisation script
294
  anonymised_df, _, decision_log, comprehend_query_number = anonymise_script(
295
  df=df_to_anonymise,
@@ -303,10 +387,10 @@ def handle_docx_anonymisation(
303
  chosen_redact_comprehend_entities=chosen_redact_comprehend_entities,
304
  comprehend_query_number=comprehend_query_number,
305
  comprehend_client=comprehend_client,
306
- nlp_analyser=nlp_analyser
307
  )
308
 
309
- anonymised_texts = anonymised_df['text_to_redact'].tolist()
310
 
311
  # 4. Re-insert the anonymised text back into the document objects
312
  for element, new_text in zip(text_elements, anonymised_texts):
@@ -321,44 +405,53 @@ def handle_docx_anonymisation(
321
  # 5. Save the redacted document and the log file
322
  base_name = os.path.basename(file_path)
323
  file_name_without_ext = os.path.splitext(base_name)[0]
324
-
325
- output_docx_path = os.path.join(output_folder, f"{file_name_without_ext}_redacted.docx")
326
- log_file_path = os.path.join(output_folder, f"{file_name_without_ext}_redacted_log.txt")
327
 
328
- output_xlsx_path = os.path.join(output_folder, f"{file_name_without_ext}_redacted.csv")
 
 
 
 
 
329
 
330
- anonymised_df.to_csv(output_xlsx_path, encoding="utf-8-sig", index=None)
 
 
 
 
331
  doc.save(output_docx_path)
332
-
333
  with open(log_file_path, "w", encoding="utf-8-sig") as f:
334
  f.write(decision_log)
335
 
336
  return output_docx_path, log_file_path, output_xlsx_path, comprehend_query_number
337
 
338
- def anonymise_files_with_open_text(file_paths: List[str],
339
- in_text: str,
340
- anon_strategy: str,
341
- chosen_cols: List[str],
342
- chosen_redact_entities: List[str],
343
- in_allow_list: List[str] = None,
344
- latest_file_completed: int = 0,
345
- out_message: list = list(),
346
- out_file_paths: list = list(),
347
- log_files_output_paths: list = list(),
348
- in_excel_sheets: list = list(),
349
- first_loop_state: bool = False,
350
- output_folder: str = OUTPUT_FOLDER,
351
- in_deny_list:list[str]=list(),
352
- max_fuzzy_spelling_mistakes_num:int=0,
353
- pii_identification_method:str="Local",
354
- chosen_redact_comprehend_entities:List[str]=list(),
355
- comprehend_query_number:int=0,
356
- aws_access_key_textbox:str='',
357
- aws_secret_key_textbox:str='',
358
- actual_time_taken_number:float=0,
359
- do_initial_clean:bool=DO_INITIAL_TABULAR_DATA_CLEAN,
360
- language: Optional[str] = None,
361
- progress: Progress = Progress(track_tqdm=True)):
 
 
 
362
  """
363
  This function anonymises data files based on the provided parameters.
364
 
@@ -379,7 +472,7 @@ def anonymise_files_with_open_text(file_paths: List[str],
379
  - output_folder (str, optional): The output folder path. Defaults to the global output_folder variable.
380
  - in_deny_list (list[str], optional): A list of specific terms to redact.
381
  - max_fuzzy_spelling_mistakes_num (int, optional): The maximum number of spelling mistakes allowed in a searched phrase for fuzzy matching. Can range from 0-9.
382
- - pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
383
  - chosen_redact_comprehend_entities (List[str]): A list of entity types to redact from files, chosen from the official list from AWS Comprehend service.
384
  - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
385
  - aws_access_key_textbox (str, optional): AWS access key for account with Textract and Comprehend permissions.
@@ -389,13 +482,14 @@ def anonymise_files_with_open_text(file_paths: List[str],
389
  - progress (Progress, optional): A Progress object to track progress. Defaults to a Progress object with track_tqdm=True.
390
  - do_initial_clean (bool, optional): Whether to perform an initial cleaning of the text. Defaults to True.
391
  """
392
-
393
  tic = time.perf_counter()
394
  comprehend_client = ""
395
 
396
  # If output folder doesn't end with a forward slash, add one
397
- if not output_folder.endswith('/'): output_folder = output_folder + '/'
398
-
 
399
  # Use provided language or default
400
  language = language or DEFAULT_LANGUAGE
401
 
@@ -405,7 +499,7 @@ def anonymise_files_with_open_text(file_paths: List[str],
405
  raise Warning(out_message)
406
 
407
  # If this is the first time around, set variables to 0/blank
408
- if first_loop_state==True:
409
  latest_file_completed = 0
410
  out_message = list()
411
  out_file_paths = list()
@@ -415,14 +509,14 @@ def anonymise_files_with_open_text(file_paths: List[str],
415
  if isinstance(out_message, str):
416
  out_message = [out_message]
417
 
418
- #print("log_files_output_paths:",log_files_output_paths)
419
 
420
  if isinstance(log_files_output_paths, str):
421
  log_files_output_paths = list()
422
 
423
  if not out_file_paths:
424
- out_file_paths = list()
425
-
426
  if isinstance(in_allow_list, list):
427
  if in_allow_list:
428
  in_allow_list_flat = in_allow_list
@@ -435,39 +529,45 @@ def anonymise_files_with_open_text(file_paths: List[str],
435
  in_allow_list_flat = list()
436
  else:
437
  in_allow_list_flat = list()
438
-
439
  anon_df = pd.DataFrame()
440
 
441
- # Try to connect to AWS services directly only if RUN_AWS_FUNCTIONS environmental variable is 1, otherwise an environment variable or direct textbox input is needed.
442
  if pii_identification_method == "AWS Comprehend":
443
  print("Trying to connect to AWS Comprehend service")
444
  if RUN_AWS_FUNCTIONS == "1" and PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS == "1":
445
  print("Connecting to Comprehend via existing SSO connection")
446
- comprehend_client = boto3.client('comprehend', region_name=AWS_REGION)
447
  elif aws_access_key_textbox and aws_secret_key_textbox:
448
- print("Connecting to Comprehend using AWS access key and secret keys from textboxes.")
 
 
449
  print("aws_access_key_textbox:", aws_access_key_textbox)
450
  print("aws_secret_access_key:", aws_secret_key_textbox)
451
- comprehend_client = boto3.client('comprehend',
452
- aws_access_key_id=aws_access_key_textbox,
453
- aws_secret_access_key=aws_secret_key_textbox)
 
 
454
  elif RUN_AWS_FUNCTIONS == "1":
455
  print("Connecting to Comprehend via existing SSO connection")
456
- comprehend_client = boto3.client('comprehend')
457
  elif AWS_ACCESS_KEY and AWS_SECRET_KEY:
458
  print("Getting Comprehend credentials from environment variables")
459
- comprehend_client = boto3.client('comprehend',
460
- aws_access_key_id=AWS_ACCESS_KEY,
461
- aws_secret_access_key=AWS_SECRET_KEY)
 
 
462
  else:
463
  comprehend_client = ""
464
  out_message = "Cannot connect to AWS Comprehend service. Please provide access keys under Textract settings on the Redaction settings tab, or choose another PII identification method."
465
- raise(out_message)
466
-
467
  # Check if files and text exist
468
  if not file_paths:
469
  if in_text:
470
- file_paths=['open_text']
471
  else:
472
  out_message = "Please enter text or a file to redact."
473
  raise Exception(out_message)
@@ -479,31 +579,73 @@ def anonymise_files_with_open_text(file_paths: List[str],
479
  out_message = f"Number of files to anonymise is greater than {MAX_SIMULTANEOUS_FILES}. Please submit a smaller number of files."
480
  print(out_message)
481
  raise Exception(out_message)
482
-
483
  # If we have already redacted the last file, return the input out_message and file list to the relevant components
484
  if latest_file_completed >= len(file_paths):
485
- print("Last file reached") #, returning files:", str(latest_file_completed))
486
  # Set to a very high number so as not to mess with subsequent file processing by the user
487
- #latest_file_completed = 99
488
- final_out_message = '\n'.join(out_message)
489
- return final_out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, actual_time_taken_number, comprehend_query_number
490
-
 
 
 
 
 
 
 
 
 
491
  file_path_loop = [file_paths[int(latest_file_completed)]]
492
 
493
- for anon_file in progress.tqdm(file_path_loop, desc="Anonymising files", unit = "files"):
 
 
494
 
495
  # Get a string file path
496
- if isinstance(anon_file, str): file_path = anon_file
497
- else: file_path = anon_file
 
 
498
 
499
- if anon_file=='open_text':
500
- anon_df = pd.DataFrame(data={'text':[in_text]})
501
- chosen_cols=['text']
502
  out_file_part = anon_file
503
  sheet_name = ""
504
  file_type = ""
505
 
506
- out_file_paths, out_message, key_string, log_files_output_paths, comprehend_query_number = tabular_anonymise_wrapper_func(file_path, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strategy, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, output_folder=OUTPUT_FOLDER, do_initial_clean=do_initial_clean)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
507
  else:
508
  # If file is an xlsx, we are going to run through all the Excel sheets to anonymise them separately.
509
  file_type = detect_file_type(file_path)
@@ -511,20 +653,22 @@ def anonymise_files_with_open_text(file_paths: List[str],
511
 
512
  out_file_part = get_file_name_without_type(file_path)
513
 
514
- if file_type == 'docx':
515
- output_path, log_path, output_xlsx_path, comprehend_query_number = handle_docx_anonymisation(
516
- file_path=file_path,
517
- output_folder=output_folder,
518
- anon_strategy=anon_strategy,
519
- chosen_redact_entities=chosen_redact_entities,
520
- in_allow_list=in_allow_list_flat,
521
- in_deny_list=in_deny_list,
522
- max_fuzzy_spelling_mistakes_num=max_fuzzy_spelling_mistakes_num,
523
- pii_identification_method=pii_identification_method,
524
- chosen_redact_comprehend_entities=chosen_redact_comprehend_entities,
525
- comprehend_query_number=comprehend_query_number,
526
- comprehend_client=comprehend_client,
527
- language=language
 
 
528
  )
529
  if output_path:
530
  out_file_paths.append(output_path)
@@ -532,34 +676,100 @@ def anonymise_files_with_open_text(file_paths: List[str],
532
  out_file_paths.append(output_xlsx_path)
533
  if log_path:
534
  log_files_output_paths.append(log_path)
535
-
536
- elif file_type == 'xlsx':
537
  print("Running through all xlsx sheets")
538
- #anon_xlsx = pd.ExcelFile(anon_file)
539
  if not in_excel_sheets:
540
- out_message.append("No Excel sheets selected. Please select at least one to anonymise.")
 
 
541
  continue
542
 
543
  # Create xlsx file:
544
- anon_xlsx = pd.ExcelFile(file_path)
545
- anon_xlsx_export_file_name = output_folder + out_file_part + "_redacted.xlsx"
 
 
546
 
547
  # Iterate through the sheet names
548
- for sheet_name in progress.tqdm(in_excel_sheets, desc="Anonymising sheets", unit = "sheets"):
 
 
549
  # Read each sheet into a DataFrame
550
  if sheet_name not in anon_xlsx.sheet_names:
551
  continue
552
 
553
  anon_df = pd.read_excel(file_path, sheet_name=sheet_name)
554
 
555
- out_file_paths, out_message, key_string, log_files_output_paths, comprehend_query_number = tabular_anonymise_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strategy, language, chosen_redact_entities, in_allow_list, file_type, anon_xlsx_export_file_name, log_files_output_paths, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, language, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, output_folder=output_folder, do_initial_clean=do_initial_clean)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
556
 
557
  else:
558
  sheet_name = ""
559
  anon_df = read_file(file_path)
560
  out_file_part = get_file_name_without_type(file_path)
561
 
562
- out_file_paths, out_message, key_string, log_files_output_paths, comprehend_query_number = tabular_anonymise_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strategy, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, language, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, output_folder=output_folder, do_initial_clean=do_initial_clean)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
563
 
564
  # Increase latest file completed count unless we are at the last file
565
  if latest_file_completed != len(file_paths):
@@ -575,46 +785,61 @@ def anonymise_files_with_open_text(file_paths: List[str],
575
 
576
  if isinstance(out_message, str):
577
  out_message = [out_message]
578
-
579
- out_message.append("Anonymisation of file '" + out_file_part + "' successfully completed in")
580
 
581
- out_message_out = '\n'.join(out_message)
 
 
 
 
582
  out_message_out = out_message_out + " " + out_time
583
 
584
  if anon_strategy == "encrypt":
585
- out_message_out.append(". Your decryption key is " + key_string)
586
-
587
- out_message_out = out_message_out + "\n\nGo to the Redaction settings tab to see redaction logs. Please give feedback on the results below to help improve this app."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
588
 
589
- out_message_out = re.sub(r'^\n+|^\. ', '', out_message_out).strip()
590
-
591
- return out_message_out, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, actual_time_taken_number, comprehend_query_number
592
 
593
  def tabular_anonymise_wrapper_func(
594
- anon_file: str,
595
- anon_df: pd.DataFrame,
596
- chosen_cols: List[str],
597
- out_file_paths: List[str],
598
- out_file_part: str,
599
- out_message: str,
600
- excel_sheet_name: str,
601
- anon_strategy: str,
602
  language: str,
603
- chosen_redact_entities: List[str],
604
- in_allow_list: List[str],
605
- file_type: str,
606
- anon_xlsx_export_file_name: str,
607
  log_files_output_paths: List[str],
608
- in_deny_list: List[str]=list(),
609
- max_fuzzy_spelling_mistakes_num:int=0,
610
- pii_identification_method:str="Local",
611
  comprehend_language: Optional[str] = None,
612
- chosen_redact_comprehend_entities:List[str]=list(),
613
- comprehend_query_number:int=0,
614
- comprehend_client:botocore.client.BaseClient="",
615
  nlp_analyser: AnalyzerEngine = nlp_analyser,
616
  output_folder: str = OUTPUT_FOLDER,
617
- do_initial_clean:bool=DO_INITIAL_TABULAR_DATA_CLEAN
618
  ):
619
  """
620
  This function wraps the anonymisation process for a given dataframe. It filters the dataframe based on chosen columns, applies the specified anonymisation strategy using the anonymise_script function, and exports the anonymised data to a file.
@@ -636,16 +861,17 @@ def tabular_anonymise_wrapper_func(
636
  - log_files_output_paths: A list of paths where the log files will be saved.
637
  - in_deny_list: List of specific terms to remove from the data.
638
  - max_fuzzy_spelling_mistakes_num (int, optional): The maximum number of spelling mistakes allowed in a searched phrase for fuzzy matching. Can range from 0-9.
639
- - pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
640
  - chosen_redact_comprehend_entities (List[str]): A list of entity types to redact from files, chosen from the official list from AWS Comprehend service.
641
  - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
642
- - comprehend_client (optional): The client object from AWS containing a client connection to AWS Comprehend if that option is chosen on the first tab.
643
  - output_folder: The folder where the anonymized files will be saved. Defaults to the 'output_folder' variable.
644
  - do_initial_clean (bool, optional): Whether to perform an initial cleaning of the text. Defaults to True.
645
  """
 
646
  def check_lists(list1, list2):
647
- return any(string in list2 for string in list1)
648
-
649
  def get_common_strings(list1, list2):
650
  """
651
  Finds the common strings between two lists.
@@ -664,62 +890,95 @@ def tabular_anonymise_wrapper_func(
664
  return common_strings
665
 
666
  if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
667
- raise("Connection to AWS Comprehend service not found, please check connection details.")
668
-
 
 
669
  # Check for chosen col, skip file if not found
670
  all_cols_original_order = list(anon_df.columns)
671
 
672
  any_cols_found = check_lists(chosen_cols, all_cols_original_order)
673
 
674
- if any_cols_found == False:
675
  out_message = "No chosen columns found in dataframe: " + out_file_part
676
  key_string = ""
677
  print(out_message)
678
- return out_file_paths, out_message, key_string, log_files_output_paths, comprehend_query_number
 
 
 
 
 
 
679
  else:
680
- chosen_cols_in_anon_df = get_common_strings(chosen_cols, all_cols_original_order)
 
 
681
 
682
  # Split dataframe to keep only selected columns
683
- #print("Remaining columns to redact:", chosen_cols_in_anon_df)
684
-
685
  if not anon_df.index.is_unique:
686
  anon_df = anon_df.reset_index(drop=True)
687
 
688
  anon_df_part = anon_df[chosen_cols_in_anon_df]
689
- anon_df_remain = anon_df.drop(chosen_cols_in_anon_df, axis = 1)
690
 
691
  row_count = anon_df_part.shape[0]
692
-
693
  if row_count > MAX_TABLE_ROWS:
694
  out_message = f"Number of rows in dataframe is greater than {MAX_TABLE_ROWS}. Please submit a smaller dataframe."
695
  print(out_message)
696
  raise Exception(out_message)
697
-
698
  column_count = anon_df_part.shape[1]
699
-
700
  if column_count > MAX_TABLE_COLUMNS:
701
  out_message = f"Number of columns in dataframe is greater than {MAX_TABLE_COLUMNS}. Please submit a smaller dataframe."
702
  print(out_message)
703
  raise Exception(out_message)
704
 
705
  # Anonymise the selected columns
706
- anon_df_part_out, key_string, decision_process_output_str, comprehend_query_number = anonymise_script(anon_df_part, anon_strategy, language, chosen_redact_entities, in_allow_list, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, nlp_analyser=nlp_analyser, do_initial_clean=do_initial_clean)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
707
 
708
  anon_df_part_out.replace("^nan$", "", regex=True, inplace=True)
709
 
710
  # Rejoin the dataframe together
711
- anon_df_out = pd.concat([anon_df_part_out, anon_df_remain], axis = 1)
712
  anon_df_out = anon_df_out[all_cols_original_order]
713
-
714
  # Export file
715
  # Rename anonymisation strategy for file path naming
716
- if anon_strategy == "replace with 'REDACTED'": anon_strat_txt = "redact_replace"
717
- elif anon_strategy == "replace with <ENTITY_NAME>": anon_strat_txt = "redact_entity_type"
718
- elif anon_strategy == "redact completely": anon_strat_txt = "redact_remove"
719
- else: anon_strat_txt = anon_strategy
 
 
 
 
720
 
721
  # If the file is an xlsx, add a new sheet to the existing xlsx. Otherwise, write to csv
722
- if file_type == 'xlsx':
723
 
724
  anon_export_file_name = anon_xlsx_export_file_name
725
 
@@ -730,19 +989,33 @@ def tabular_anonymise_wrapper_func(
730
  wb.save(anon_xlsx_export_file_name)
731
 
732
  # Create a Pandas Excel writer using XlsxWriter as the engine.
733
- with pd.ExcelWriter(anon_xlsx_export_file_name, engine='openpyxl', mode='a', if_sheet_exists='replace') as writer:
 
 
 
 
 
734
  # Write each DataFrame to a different worksheet.
735
  anon_df_out.to_excel(writer, sheet_name=excel_sheet_name, index=None)
736
 
737
- decision_process_log_output_file = anon_xlsx_export_file_name + "_" + excel_sheet_name + "_decision_process_output.txt"
 
 
 
 
 
738
  with open(decision_process_log_output_file, "w") as f:
739
  f.write(decision_process_output_str)
740
 
741
  else:
742
- anon_export_file_name = output_folder + out_file_part + "_anon_" + anon_strat_txt + ".csv"
743
- anon_df_out.to_csv(anon_export_file_name, index = None, encoding="utf-8-sig")
744
-
745
- decision_process_log_output_file = anon_export_file_name + "_decision_process_output.txt"
 
 
 
 
746
  with open(decision_process_log_output_file, "w") as f:
747
  f.write(decision_process_output_str)
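The xlsx branch above appends each anonymised sheet into one workbook; the pattern in isolation, as a hedged standalone sketch with placeholder file and sheet names, is:

import pandas as pd
from openpyxl import Workbook

# Illustrative only: placeholder path and sheet name.
xlsx_path = "example_redacted.xlsx"
Workbook().save(xlsx_path)  # make sure the workbook exists before opening in append mode

anon_df_out = pd.DataFrame({"Case Note": ["REDACTED"]})
with pd.ExcelWriter(
    xlsx_path, engine="openpyxl", mode="a", if_sheet_exists="replace"
) as writer:
    anon_df_out.to_excel(writer, sheet_name="Sheet1", index=None)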
748
 
@@ -753,27 +1026,36 @@ def tabular_anonymise_wrapper_func(
753
  out_file_paths = list(set(out_file_paths))
754
 
755
  # Print result text to output text box if just anonymising open text
756
- if anon_file=='open_text':
757
- out_message = ["'" + anon_df_out['text'][0] + "'"]
758
-
759
- return out_file_paths, out_message, key_string, log_files_output_paths, comprehend_query_number
760
-
761
- def anonymise_script(df:pd.DataFrame,
762
- anon_strategy:str,
763
- language:str,
764
- chosen_redact_entities:List[str],
765
- in_allow_list:List[str]=list(),
766
- in_deny_list:List[str]=list(),
767
- max_fuzzy_spelling_mistakes_num:int=0,
768
- pii_identification_method:str="Local",
769
- chosen_redact_comprehend_entities:List[str]=list(),
770
- comprehend_query_number:int=0,
771
- comprehend_client:botocore.client.BaseClient="",
772
- custom_entities:List[str]=custom_entities,
773
- nlp_analyser: AnalyzerEngine = nlp_analyser,
774
- do_initial_clean:bool=DO_INITIAL_TABULAR_DATA_CLEAN,
775
- progress:Progress=Progress(track_tqdm=True)):
776
- '''
 
 
 
 
 
 
 
 
 
777
  Conduct anonymisation of a dataframe using Presidio and/or AWS Comprehend if chosen.
778
 
779
  Args:
@@ -792,14 +1074,14 @@ def anonymise_script(df:pd.DataFrame,
792
  nlp_analyser (AnalyzerEngine, optional): The Presidio AnalyzerEngine instance to use. Defaults to `nlp_analyser`.
793
  do_initial_clean (bool, optional): Whether to perform an initial cleaning of the text. Defaults to True.
794
  progress (Progress, optional): Gradio Progress object for tracking progress. Defaults to Progress(track_tqdm=False).
795
- '''
796
 
797
  print("Identifying personal information")
798
  analyse_tic = time.perf_counter()
799
 
800
  # Initialize analyzer_results as an empty dictionary to store results by column
801
  results_by_column = dict()
802
- key_string = ""
803
 
804
  if isinstance(in_allow_list, list):
805
  if in_allow_list:
@@ -818,20 +1100,22 @@ def anonymise_script(df:pd.DataFrame,
818
  try:
819
  if language != "en":
820
  progress(0.1, desc=f"Loading spaCy model for {language}")
821
-
822
  load_spacy_model(language)
823
-
824
  except Exception as e:
825
  out_message = f"Error downloading language packs for {language}: {e}"
826
  print(out_message)
827
  raise Exception(out_message)
828
-
829
  # Try updating the supported languages for the spacy analyser
830
  try:
831
  nlp_analyser = create_nlp_analyser(language, existing_nlp_analyser=nlp_analyser)
832
  # Check list of nlp_analyser recognisers and languages
833
  if language != "en":
834
- gr.Info(f"Language: {language} only supports the following entity detection: {str(nlp_analyser.registry.get_supported_entities(languages=[language]))}")
 
 
835
 
836
  except Exception as e:
837
  out_message = f"Error creating nlp_analyser for {language}: {e}"
@@ -848,40 +1132,49 @@ def anonymise_script(df:pd.DataFrame,
848
  # Sort the strings in order from the longest string to the shortest
849
  in_deny_list = sorted(in_deny_list, key=len, reverse=True)
850
 
851
- if in_deny_list:
852
  nlp_analyser.registry.remove_recognizer("CUSTOM")
853
  new_custom_recogniser = custom_word_list_recogniser(in_deny_list)
854
  nlp_analyser.registry.add_recognizer(new_custom_recogniser)
855
 
856
  nlp_analyser.registry.remove_recognizer("CustomWordFuzzyRecognizer")
857
- new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=in_deny_list, spelling_mistakes_max=in_deny_list, search_whole_phrase=max_fuzzy_spelling_mistakes_num)
 
 
 
 
 
858
  nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
859
 
860
- #analyzer = nlp_analyser #AnalyzerEngine()
861
  batch_analyzer = BatchAnalyzerEngine(analyzer_engine=nlp_analyser)
862
- anonymizer = AnonymizerEngine()#conflict_resolution=ConflictResolutionStrategy.MERGE_SIMILAR_OR_CONTAINED)
863
- batch_anonymizer = BatchAnonymizerEngine(anonymizer_engine = anonymizer)
 
 
864
  analyzer_results = list()
865
 
866
  if do_initial_clean:
867
  progress(0.2, desc="Cleaning text")
868
- for col in progress.tqdm(df.columns, desc="Cleaning text", unit = "Columns"):
869
  df[col] = initial_clean(df[col])
870
 
871
  # DataFrame to dict
872
- df_dict = df.to_dict(orient="list")
873
 
874
  if pii_identification_method == "Local":
875
 
876
  # Use custom analyzer to be able to track progress with Gradio
877
- custom_results = analyze_dict(batch_analyzer,
878
- df_dict,
879
- language=language,
880
- entities=chosen_redact_entities,
881
- score_threshold=score_threshold,
882
- return_decision_process=True,
883
- allow_list=in_allow_list_flat)
884
-
 
 
885
  # Initialize results_by_column with custom entity results
886
  for result in custom_results:
887
  results_by_column[result.key] = result
@@ -891,23 +1184,26 @@ def anonymise_script(df:pd.DataFrame,
891
 
892
  # AWS Comprehend calls
893
  elif pii_identification_method == "AWS Comprehend" and comprehend_client:
894
-
895
  # Only run Local anonymisation for entities that are not covered by AWS Comprehend
896
  if custom_entities:
897
  custom_redact_entities = [
898
- entity for entity in chosen_redact_comprehend_entities
 
899
  if entity in custom_entities
900
  ]
901
  if custom_redact_entities:
902
  # Get results from analyze_dict
903
- custom_results = analyze_dict(batch_analyzer,
904
- df_dict,
905
- language=language,
906
- entities=custom_redact_entities,
907
- score_threshold=score_threshold,
908
- return_decision_process=True,
909
- allow_list=in_allow_list_flat)
910
-
 
 
911
  # Initialize results_by_column with custom entity results
912
  for result in custom_results:
913
  results_by_column[result.key] = result
@@ -916,47 +1212,56 @@ def anonymise_script(df:pd.DataFrame,
916
  retry_delay = 3
917
 
918
  # Process each text column in the dictionary
919
- for column_name, texts in progress.tqdm(df_dict.items(), desc="Querying AWS Comprehend service.", unit = "Columns"):
 
 
920
  # Get or create DictAnalyzerResult for this column
921
  if column_name in results_by_column:
922
  column_results = results_by_column[column_name]
923
  else:
924
  column_results = DictAnalyzerResult(
925
- recognizer_results=[[] for _ in texts],
926
- key=column_name,
927
- value=texts
928
  )
929
 
930
  # Process each text in the column
931
- for text_idx, text in progress.tqdm(enumerate(texts), desc="Querying AWS Comprehend service.", unit = "Row"):
 
 
932
 
933
  for attempt in range(max_retries):
934
  try:
935
  response = comprehend_client.detect_pii_entities(
936
- Text=str(text),
937
- LanguageCode=language
938
  )
939
 
940
  comprehend_query_number += 1
941
 
942
  # Add all entities from this text to the column's recognizer_results
943
  for entity in response["Entities"]:
944
- if entity.get("Type") not in chosen_redact_comprehend_entities:
 
 
 
945
  continue
946
 
947
  recognizer_result = RecognizerResult(
948
  entity_type=entity["Type"],
949
  start=entity["BeginOffset"],
950
  end=entity["EndOffset"],
951
- score=entity["Score"]
952
  )
953
- column_results.recognizer_results[text_idx].append(recognizer_result)
954
-
 
 
955
  break # Success, exit retry loop
956
-
957
  except Exception as e:
958
  if attempt == max_retries - 1:
959
- print(f"AWS Comprehend calls failed for text: {text[:100]}... due to", e)
 
 
 
960
  raise
961
  time.sleep(retry_delay)
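Read on its own, the retry loop around the Comprehend call follows this pattern (a generic sketch, not code from this commit; the fixed delay mirrors the retry_delay set earlier in the function):

import time

def call_with_retries(fn, max_retries=3, retry_delay=3):
    # Generic retry helper: re-raise on the final attempt,
    # otherwise wait a fixed delay and try again.
    for attempt in range(max_retries):
        try:
            return fn()
        except Exception:
            if attempt == max_retries - 1:
                raise
            time.sleep(retry_delay)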
962
 
@@ -967,56 +1272,79 @@ def anonymise_script(df:pd.DataFrame,
967
  analyzer_results = list(results_by_column.values())
968
 
969
  elif (pii_identification_method == "AWS Comprehend") & (not comprehend_client):
970
- raise("Unable to redact, Comprehend connection details not found.")
971
-
972
  else:
973
  print("Unable to redact.")
974
 
975
  # Usage in the main function:
976
- decision_process_output_str = generate_decision_process_output(analyzer_results, df_dict)
 
 
977
 
978
  analyse_toc = time.perf_counter()
979
- analyse_time_out = f"Analysing the text took {analyse_toc - analyse_tic:0.1f} seconds."
 
 
980
  print(analyse_time_out)
981
 
982
  # Set up the anonymization configuration WITHOUT DATE_TIME
983
- simple_replace_config = eval('{"DEFAULT": OperatorConfig("replace", {"new_value": "REDACTED"})}')
984
- replace_config = eval('{"DEFAULT": OperatorConfig("replace")}')
985
- redact_config = eval('{"DEFAULT": OperatorConfig("redact")}')
986
- hash_config = eval('{"DEFAULT": OperatorConfig("hash")}')
987
- mask_config = eval('{"DEFAULT": OperatorConfig("mask", {"masking_char":"*", "chars_to_mask":100, "from_end":True})}')
988
- people_encrypt_config = eval('{"PERSON": OperatorConfig("encrypt", {"key": key_string})}') # The encryption is using AES cypher in CBC mode and requires a cryptographic key as an input for both the encryption and the decryption.
989
- fake_first_name_config = eval('{"PERSON": OperatorConfig("custom", {"lambda": fake_first_name})}')
990
-
991
- if anon_strategy == "replace with 'REDACTED'": chosen_mask_config = simple_replace_config
992
- elif anon_strategy == "replace_redacted": chosen_mask_config = simple_replace_config
993
- elif anon_strategy == "replace with <ENTITY_NAME>": chosen_mask_config = replace_config
994
- elif anon_strategy == "entity_type": chosen_mask_config = replace_config
995
- elif anon_strategy == "redact completely": chosen_mask_config = redact_config
996
- elif anon_strategy == "redact": chosen_mask_config = redact_config
997
- elif anon_strategy == "hash": chosen_mask_config = hash_config
998
- elif anon_strategy == "mask": chosen_mask_config = mask_config
999
- elif anon_strategy == "encrypt":
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1000
  chosen_mask_config = people_encrypt_config
1001
  key = secrets.token_bytes(16) # 128 bits = 16 bytes
1002
- key_string = base64.b64encode(key).decode('utf-8')
1003
-
1004
  # Now inject the key into the operator config
1005
  for entity, operator in chosen_mask_config.items():
1006
  if operator.operator_name == "encrypt":
1007
  operator.params = {"key": key_string}
1008
- elif anon_strategy == "fake_first_name": chosen_mask_config = fake_first_name_config
 
1009
  else:
1010
  print("Anonymisation strategy not found. Redacting completely by default.")
1011
- chosen_mask_config = redact_config # Redact completely by default
1012
-
1013
- # I think in general people will want to keep date / times - removed Mar 2025 as I don't want to assume for people.
1014
- #keep_date_config = eval('{"DATE_TIME": OperatorConfig("keep")}')
1015
 
1016
- combined_config = {**chosen_mask_config} #, **keep_date_config}
1017
 
1018
- anonymizer_results = batch_anonymizer.anonymize_dict(analyzer_results, operators=combined_config)
 
1019
 
1020
  scrubbed_df = pd.DataFrame(anonymizer_results)
1021
-
1022
- return scrubbed_df, key_string, decision_process_output_str, comprehend_query_number
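The block removed above constructed the Presidio operator configurations through eval; an equivalent eval-free sketch, which may not match the exact replacement introduced by this commit, would be:

import base64
import secrets

from presidio_anonymizer.entities import OperatorConfig

# Plain dict literals instead of eval'd strings.
simple_replace_config = {"DEFAULT": OperatorConfig("replace", {"new_value": "REDACTED"})}
replace_config = {"DEFAULT": OperatorConfig("replace")}
redact_config = {"DEFAULT": OperatorConfig("redact")}
hash_config = {"DEFAULT": OperatorConfig("hash")}
mask_config = {
    "DEFAULT": OperatorConfig(
        "mask", {"masking_char": "*", "chars_to_mask": 100, "from_end": True}
    )
}

# Presidio's encrypt operator uses AES-CBC and needs a caller-supplied key.
key = secrets.token_bytes(16)  # 128-bit key
key_string = base64.b64encode(key).decode("utf-8")
people_encrypt_config = {"PERSON": OperatorConfig("encrypt", {"key": key_string})}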
 
1
+ import base64
2
  import os
3
+ import re
4
  import secrets
 
5
  import time
6
+ import unicodedata
7
+ from typing import Any, Dict, List, Optional
8
+
9
  import boto3
10
  import botocore
 
 
 
11
  import docx
12
  import gradio as gr
13
+ import pandas as pd
14
+ import polars as pl
15
+ from botocore.client import BaseClient
16
  from faker import Faker
17
  from gradio import Progress
18
+ from openpyxl import Workbook
19
+ from presidio_analyzer import (
20
+ AnalyzerEngine,
21
+ BatchAnalyzerEngine,
22
+ DictAnalyzerResult,
23
+ RecognizerResult,
24
+ )
25
  from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
26
+ from presidio_anonymizer.entities import OperatorConfig
27
+
28
+ from tools.config import (
29
+ AWS_ACCESS_KEY,
30
+ AWS_REGION,
31
+ AWS_SECRET_KEY,
32
+ CUSTOM_ENTITIES,
33
+ DEFAULT_LANGUAGE,
34
+ DO_INITIAL_TABULAR_DATA_CLEAN,
35
+ MAX_SIMULTANEOUS_FILES,
36
+ MAX_TABLE_COLUMNS,
37
+ MAX_TABLE_ROWS,
38
+ OUTPUT_FOLDER,
39
+ PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS,
40
+ RUN_AWS_FUNCTIONS,
41
+ aws_comprehend_language_choices,
42
+ )
43
+ from tools.helper_functions import (
44
+ _get_env_list,
45
+ detect_file_type,
46
+ get_file_name_without_type,
47
+ read_file,
48
+ )
49
+ from tools.load_spacy_model_custom_recognisers import (
50
+ CustomWordFuzzyRecognizer,
51
+ create_nlp_analyser,
52
+ custom_word_list_recogniser,
53
+ load_spacy_model,
54
+ nlp_analyser,
55
+ score_threshold,
56
+ )
57
+
58
  # Use custom version of analyze_dict to be able to track progress
59
  from tools.presidio_analyzer_custom import analyze_dict
60
 
61
+ if DO_INITIAL_TABULAR_DATA_CLEAN == "True":
62
+ DO_INITIAL_TABULAR_DATA_CLEAN = True
63
+ else:
64
+ DO_INITIAL_TABULAR_DATA_CLEAN = False
65
 
66
+ if CUSTOM_ENTITIES:
67
+ CUSTOM_ENTITIES = _get_env_list(CUSTOM_ENTITIES)
68
 
69
  custom_entities = CUSTOM_ENTITIES
70
 
71
  fake = Faker("en_UK")
72
+
73
+
74
  def fake_first_name(x):
75
  return fake.first_name()
76
 
77
+
78
  # #### Some of my cleaning functions
79
+ url_pattern = r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+|(?:www\.)[a-zA-Z0-9._-]+\.[a-zA-Z]{2,}"
80
+ html_pattern_regex = r"<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0|&nbsp;"
81
+ html_start_pattern_end_dots_regex = r"<(.*?)\.\."
82
+ non_ascii_pattern = r"[^\x00-\x7F]+"
83
+ and_sign_regex = r"&"
84
+ multiple_spaces_regex = r"\s{2,}"
85
+ multiple_new_lines_regex = r"(\r\n|\n)+"
86
  multiple_punctuation_regex = r"(\p{P})\p{P}+"
87
 
88
+
89
+ def initial_clean(texts: pd.Series) -> pd.Series:
90
+ """
91
  This function cleans the text by removing URLs, HTML tags, and non-ASCII characters.
92
+ """
93
  for text in texts:
94
  if not text or pd.isnull(text):
95
  text = ""
96
 
97
  # Normalize unicode characters to decompose any special forms
98
+ normalized_text = unicodedata.normalize("NFKC", text)
99
 
100
  # Replace smart quotes and special punctuation with standard ASCII equivalents
101
  replacements = {
102
+ "‘": "'",
103
+ "’": "'",
104
+ "“": '"',
105
+ "”": '"',
106
+ "–": "-",
107
+ "—": "-",
108
+ "…": "...",
109
+ "•": "*",
110
  }
111
 
112
  # Perform replacements
 
117
 
118
  # Convert to polars Series
119
  texts = pl.Series(texts).str.strip_chars()
120
+
121
  # Define a list of patterns and their replacements
122
  patterns = [
123
+ (multiple_new_lines_regex, " "),
124
+ (r"\r", ""),
125
+ (url_pattern, " "),
126
+ (html_pattern_regex, " "),
127
+ (html_start_pattern_end_dots_regex, " "),
128
+ (non_ascii_pattern, " "),
129
+ (multiple_spaces_regex, " "),
130
  (multiple_punctuation_regex, "${1}"),
131
+ (and_sign_regex, "and"),
132
  ]
133
+
134
  # Apply each regex replacement
135
  for pattern, replacement in patterns:
136
  texts = texts.str.replace_all(pattern, replacement)
137
+
138
  # Convert the series back to a list
139
  texts = texts.to_list()
140
+
141
  return texts
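A minimal usage sketch for initial_clean, with invented input strings:

import pandas as pd

dirty = pd.Series([
    "See  https://example.com for more…",
    "<b>Smart “quotes” &amp; bullets</b>",
])
cleaned = initial_clean(dirty)
# Expect roughly: URLs and HTML tags removed, smart punctuation normalised to
# ASCII, repeated whitespace collapsed and '&' rewritten as 'and'.
print(cleaned)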
142
 
 
 
143
 
144
+ def process_recognizer_result(
145
+ result: RecognizerResult,
146
+ recognizer_result: RecognizerResult,
147
+ data_row: int,
148
+ dictionary_key: int,
149
+ df_dict: Dict[str, List[Any]],
150
+ keys_to_keep: List[str],
151
+ ) -> List[str]:
152
+ output = list()
153
+
154
+ if hasattr(result, "value"):
155
+ text = result.value[data_row]
156
+ else:
157
+ text = ""
158
+
159
+ if isinstance(recognizer_result, list):
160
+ for sub_result in recognizer_result:
161
+ if isinstance(text, str):
162
+ found_text = text[sub_result.start : sub_result.end]
163
+ else:
164
+ found_text = ""
165
+ analysis_explanation = {
166
+ key: sub_result.__dict__[key] for key in keys_to_keep
167
+ }
168
+ analysis_explanation.update(
169
+ {
170
+ "data_row": str(data_row),
171
+ "column": list(df_dict.keys())[dictionary_key],
172
+ "entity": found_text,
173
+ }
174
+ )
175
+ output.append(str(analysis_explanation))
176
+
177
+ return output
178
+
179
 
180
  # Writing decision making process to file
181
+ def generate_decision_process_output(
182
+ analyzer_results: List[DictAnalyzerResult], df_dict: Dict[str, List[Any]]
183
+ ) -> str:
184
  """
185
  Generate a detailed output of the decision process for entity recognition.
186
 
 
197
  str: A string containing the detailed decision process output.
198
  """
199
  decision_process_output = list()
200
+ keys_to_keep = ["entity_type", "start", "end"]
201
 
202
  # Run through each column to analyse for PII
203
  for i, result in enumerate(analyzer_results):
204
 
205
  # If a single result
206
  if isinstance(result, RecognizerResult):
207
+ decision_process_output.extend(
208
+ process_recognizer_result(result, result, 0, i, df_dict, keys_to_keep)
209
+ )
210
 
211
  # If a list of results
212
  elif isinstance(result, list) or isinstance(result, DictAnalyzerResult):
213
  for x, recognizer_result in enumerate(result.recognizer_results):
214
+ decision_process_output.extend(
215
+ process_recognizer_result(
216
+ result, recognizer_result, x, i, df_dict, keys_to_keep
217
+ )
218
+ )
219
 
220
  else:
221
  try:
222
+ decision_process_output.extend(
223
+ process_recognizer_result(
224
+ result, result, 0, i, df_dict, keys_to_keep
225
+ )
226
+ )
227
  except Exception as e:
228
  print(e)
229
 
230
+ decision_process_output_str = "\n".join(decision_process_output)
231
 
232
  return decision_process_output_str
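Each line of the resulting decision log is the string form of a small dictionary; an illustrative (made-up) example:

# Example decision-log line (values are illustrative):
# {'entity_type': 'PERSON', 'start': 0, 'end': 8, 'data_row': '3',
#  'column': 'Case Note', 'entity': 'John Doe'}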
233
 
234
+
235
+ def anon_consistent_names(df: pd.DataFrame) -> pd.DataFrame:
236
  # ## Pick out common names and replace them with the same person value
237
  df_dict = df.to_dict(orient="list")
238
 
239
+ # analyzer = AnalyzerEngine()
240
  batch_analyzer = BatchAnalyzerEngine(analyzer_engine=nlp_analyser)
241
 
242
  analyzer_results = batch_analyzer.analyze_dict(df_dict, language=DEFAULT_LANGUAGE)
 
250
 
251
  # Adjusting the parse_dict function to handle trailing ']'
252
  # Splitting the main data string into individual list strings
253
+ list_strs = data_str[1:-1].split("], [")
254
 
255
  def parse_dict(s):
256
+ s = s.strip("[]") # Removing any surrounding brackets
257
+ items = s.split(", ")
258
  d = {}
259
  for item in items:
260
+ key, value = item.split(": ")
261
+ if key == "score":
262
  d[key] = float(value)
263
+ elif key in ["start", "end"]:
264
  d[key] = int(value)
265
  else:
266
  d[key] = value
 
272
 
273
  for lst_str in list_strs:
274
  # Splitting each list string into individual dictionary strings
275
+ dict_strs = lst_str.split(", type: ")
276
+ dict_strs = [dict_strs[0]] + [
277
+ "type: " + s for s in dict_strs[1:]
278
+ ] # Prepending "type: " back to the split strings
279
+
280
  # Parsing each dictionary string
281
  dicts = [parse_dict(d) for d in dict_strs]
282
  result.append(dicts)
 
286
  for idx, paragraph in enumerate(text):
287
  paragraph_texts = list()
288
  for dictionary in result[idx]:
289
+ if dictionary["type"] == "PERSON":
290
+ paragraph_texts.append(
291
+ paragraph[dictionary["start"] : dictionary["end"]]
292
+ )
293
  names.append(paragraph_texts)
294
 
295
  # Flatten the list of lists and extract unique names
296
  unique_names = list(set(name for sublist in names for name in sublist))
297
+
298
  fake_names = pd.Series(unique_names).apply(fake_first_name)
299
 
300
+ mapping_df = pd.DataFrame(
301
+ data={"Unique names": unique_names, "Fake names": fake_names}
302
+ )
303
 
304
  # Convert mapping dataframe to dictionary, adding word boundaries for full-word match
305
+ name_map = {
306
+ r"\b" + k + r"\b": v
307
+ for k, v in zip(mapping_df["Unique names"], mapping_df["Fake names"])
308
+ }
309
 
310
  name_map
311
 
312
+ scrubbed_df_consistent_names = df.replace(name_map, regex=True)
313
 
314
  scrubbed_df_consistent_names
315
 
316
  return scrubbed_df_consistent_names
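The word-boundary mapping used above behaves like this minimal sketch (names invented for illustration):

import pandas as pd

name_map = {r"\bJohn\b": "Gary", r"\bJo\b": "Priya"}
df = pd.DataFrame({"note": ["John met Jo", "Johnson was away"]})
print(df.replace(name_map, regex=True))
# Roughly:
#   0    Gary met Priya
#   1    Johnson was away   <- \b prevents partial matches inside 'Johnson'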
317
 
318
+
319
  def handle_docx_anonymisation(
320
  file_path: str,
321
  output_folder: str,
 
329
  comprehend_query_number: int,
330
  comprehend_client: BaseClient,
331
  language: Optional[str] = DEFAULT_LANGUAGE,
332
+ nlp_analyser: AnalyzerEngine = nlp_analyser,
333
  ):
334
  """
335
  Anonymises a .docx file by extracting text, processing it, and re-inserting it.
 
337
  Returns:
338
  A tuple containing the output file path and the log file path.
339
  """
340
+
341
  # 1. Load the document and extract text elements
342
  doc = docx.Document(file_path)
343
+ text_elements = (
344
+ list()
345
+ ) # This will store the actual docx objects (paragraphs, cells)
346
+ original_texts = list() # This will store the text from those objects
347
 
348
  paragraph_count = len(doc.paragraphs)
349
 
 
362
  for table in doc.tables:
363
  for row in table.rows:
364
  for cell in row.cells:
365
+ if cell.text.strip(): # Only process non-empty cells
366
  text_elements.append(cell)
367
  original_texts.append(cell.text)
368
+
369
  # If there's no text to process, return early
370
  if not original_texts:
371
  print(f"No text found in {file_path}. Skipping.")
372
  return None, None, 0
373
 
374
  # 2. Convert to a DataFrame for the existing anonymisation script
375
+ df_to_anonymise = pd.DataFrame({"text_to_redact": original_texts})
376
+
377
  # 3. Call the core anonymisation script
378
  anonymised_df, _, decision_log, comprehend_query_number = anonymise_script(
379
  df=df_to_anonymise,
 
387
  chosen_redact_comprehend_entities=chosen_redact_comprehend_entities,
388
  comprehend_query_number=comprehend_query_number,
389
  comprehend_client=comprehend_client,
390
+ nlp_analyser=nlp_analyser,
391
  )
392
 
393
+ anonymised_texts = anonymised_df["text_to_redact"].tolist()
394
 
395
  # 4. Re-insert the anonymised text back into the document objects
396
  for element, new_text in zip(text_elements, anonymised_texts):
 
405
  # 5. Save the redacted document and the log file
406
  base_name = os.path.basename(file_path)
407
  file_name_without_ext = os.path.splitext(base_name)[0]
 
 
 
408
 
409
+ output_docx_path = os.path.join(
410
+ output_folder, f"{file_name_without_ext}_redacted.docx"
411
+ )
412
+ log_file_path = os.path.join(
413
+ output_folder, f"{file_name_without_ext}_redacted_log.txt"
414
+ )
415
 
416
+ output_xlsx_path = os.path.join(
417
+ output_folder, f"{file_name_without_ext}_redacted.csv"
418
+ )
419
+
420
+ anonymised_df.to_csv(output_xlsx_path, encoding="utf-8-sig", index=None)
421
  doc.save(output_docx_path)
422
+
423
  with open(log_file_path, "w", encoding="utf-8-sig") as f:
424
  f.write(decision_log)
425
 
426
  return output_docx_path, log_file_path, output_xlsx_path, comprehend_query_number
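Step 4 writes the anonymised text back into either a paragraph or a table cell; the loop body is elided in this diff, but one common python-docx approach, shown here as a hypothetical helper rather than the repository's own implementation, is:

from docx.table import _Cell

def reinsert_text(element, new_text):
    # Hypothetical helper: table cells expose a .text setter, while
    # paragraphs are updated through their runs to keep formatting.
    if isinstance(element, _Cell):
        element.text = new_text
    elif element.runs:
        element.runs[0].text = new_text
        for run in element.runs[1:]:
            run.text = ""
    else:
        element.add_run(new_text)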
427
 
428
+
429
+ def anonymise_files_with_open_text(
430
+ file_paths: List[str],
431
+ in_text: str,
432
+ anon_strategy: str,
433
+ chosen_cols: List[str],
434
+ chosen_redact_entities: List[str],
435
+ in_allow_list: List[str] = None,
436
+ latest_file_completed: int = 0,
437
+ out_message: list = list(),
438
+ out_file_paths: list = list(),
439
+ log_files_output_paths: list = list(),
440
+ in_excel_sheets: list = list(),
441
+ first_loop_state: bool = False,
442
+ output_folder: str = OUTPUT_FOLDER,
443
+ in_deny_list: list[str] = list(),
444
+ max_fuzzy_spelling_mistakes_num: int = 0,
445
+ pii_identification_method: str = "Local",
446
+ chosen_redact_comprehend_entities: List[str] = list(),
447
+ comprehend_query_number: int = 0,
448
+ aws_access_key_textbox: str = "",
449
+ aws_secret_key_textbox: str = "",
450
+ actual_time_taken_number: float = 0,
451
+ do_initial_clean: bool = DO_INITIAL_TABULAR_DATA_CLEAN,
452
+ language: Optional[str] = None,
453
+ progress: Progress = Progress(track_tqdm=True),
454
+ ):
455
  """
456
  This function anonymises data files based on the provided parameters.
457
 
 
472
  - output_folder (str, optional): The output folder path. Defaults to the global output_folder variable.
473
  - in_deny_list (list[str], optional): A list of specific terms to redact.
474
  - max_fuzzy_spelling_mistakes_num (int, optional): The maximum number of spelling mistakes allowed in a searched phrase for fuzzy matching. Can range from 0-9.
475
+ - pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
476
  - chosen_redact_comprehend_entities (List[str]): A list of entity types to redact from files, chosen from the official list from AWS Comprehend service.
477
  - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
478
  - aws_access_key_textbox (str, optional): AWS access key for account with Textract and Comprehend permissions.
 
482
  - progress (Progress, optional): A Progress object to track progress. Defaults to a Progress object with track_tqdm=True.
483
  - do_initial_clean (bool, optional): Whether to perform an initial cleaning of the text. Defaults to True.
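+
+ Returns a tuple of (output message, output file paths, output file paths, latest file completed index, log file paths, log file paths, time taken, Comprehend query count), matching the final return statement of this function.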
484
  """
485
+
486
  tic = time.perf_counter()
487
  comprehend_client = ""
488
 
489
  # If output folder doesn't end with a forward slash, add one
490
+ if not output_folder.endswith("/"):
491
+ output_folder = output_folder + "/"
492
+
493
  # Use provided language or default
494
  language = language or DEFAULT_LANGUAGE
495
 
 
499
  raise Warning(out_message)
500
 
501
  # If this is the first time around, set variables to 0/blank
502
+ if first_loop_state is True:
503
  latest_file_completed = 0
504
  out_message = list()
505
  out_file_paths = list()
 
509
  if isinstance(out_message, str):
510
  out_message = [out_message]
511
 
512
+ # print("log_files_output_paths:",log_files_output_paths)
513
 
514
  if isinstance(log_files_output_paths, str):
515
  log_files_output_paths = list()
516
 
517
  if not out_file_paths:
518
+ out_file_paths = list()
519
+
520
  if isinstance(in_allow_list, list):
521
  if in_allow_list:
522
  in_allow_list_flat = in_allow_list
 
529
  in_allow_list_flat = list()
530
  else:
531
  in_allow_list_flat = list()
532
+
533
  anon_df = pd.DataFrame()
534
 
535
+ # Try to connect to AWS services directly only if RUN_AWS_FUNCTIONS environmental variable is 1, otherwise an environment variable or direct textbox input is needed.
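+ # Order of preference below: an existing SSO session (when PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS is set), access keys typed into the UI, an SSO session, then the AWS_ACCESS_KEY/AWS_SECRET_KEY environment variables; if none are available an error is raised.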
536
  if pii_identification_method == "AWS Comprehend":
537
  print("Trying to connect to AWS Comprehend service")
538
  if RUN_AWS_FUNCTIONS == "1" and PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS == "1":
539
  print("Connecting to Comprehend via existing SSO connection")
540
+ comprehend_client = boto3.client("comprehend", region_name=AWS_REGION)
541
  elif aws_access_key_textbox and aws_secret_key_textbox:
542
+ print(
543
+ "Connecting to Comprehend using AWS access key and secret keys from textboxes."
544
+ )
545
  print("aws_access_key_textbox:", aws_access_key_textbox)
546
  print("aws_secret_access_key:", aws_secret_key_textbox)
547
+ comprehend_client = boto3.client(
548
+ "comprehend",
549
+ aws_access_key_id=aws_access_key_textbox,
550
+ aws_secret_access_key=aws_secret_key_textbox,
551
+ )
552
  elif RUN_AWS_FUNCTIONS == "1":
553
  print("Connecting to Comprehend via existing SSO connection")
554
+ comprehend_client = boto3.client("comprehend")
555
  elif AWS_ACCESS_KEY and AWS_SECRET_KEY:
556
  print("Getting Comprehend credentials from environment variables")
557
+ comprehend_client = boto3.client(
558
+ "comprehend",
559
+ aws_access_key_id=AWS_ACCESS_KEY,
560
+ aws_secret_access_key=AWS_SECRET_KEY,
561
+ )
562
  else:
563
  comprehend_client = ""
564
  out_message = "Cannot connect to AWS Comprehend service. Please provide access keys under Textract settings on the Redaction settings tab, or choose another PII identification method."
565
+ raise Exception(out_message)
566
+
567
  # Check if files and text exist
568
  if not file_paths:
569
  if in_text:
570
+ file_paths = ["open_text"]
571
  else:
572
  out_message = "Please enter text or a file to redact."
573
  raise Exception(out_message)
 
579
  out_message = f"Number of files to anonymise is greater than {MAX_SIMULTANEOUS_FILES}. Please submit a smaller number of files."
580
  print(out_message)
581
  raise Exception(out_message)
582
+
583
  # If we have already redacted the last file, return the input out_message and file list to the relevant components
584
  if latest_file_completed >= len(file_paths):
585
+ print("Last file reached") # , returning files:", str(latest_file_completed))
586
  # Set to a very high number so as not to mess with subsequent file processing by the user
587
+ # latest_file_completed = 99
588
+ final_out_message = "\n".join(out_message)
589
+ return (
590
+ final_out_message,
591
+ out_file_paths,
592
+ out_file_paths,
593
+ latest_file_completed,
594
+ log_files_output_paths,
595
+ log_files_output_paths,
596
+ actual_time_taken_number,
597
+ comprehend_query_number,
598
+ )
599
+
600
  file_path_loop = [file_paths[int(latest_file_completed)]]
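+ # Only one file is processed per call; latest_file_completed tracks how far through the list previous calls have got.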
601
 
602
+ for anon_file in progress.tqdm(
603
+ file_path_loop, desc="Anonymising files", unit="files"
604
+ ):
605
 
606
  # Get a string file path
607
+ if isinstance(anon_file, str):
608
+ file_path = anon_file
609
+ else:
610
+ file_path = anon_file
611
 
612
+ if anon_file == "open_text":
613
+ anon_df = pd.DataFrame(data={"text": [in_text]})
614
+ chosen_cols = ["text"]
615
  out_file_part = anon_file
616
  sheet_name = ""
617
  file_type = ""
618
 
619
+ (
620
+ out_file_paths,
621
+ out_message,
622
+ key_string,
623
+ log_files_output_paths,
624
+ comprehend_query_number,
625
+ ) = tabular_anonymise_wrapper_func(
626
+ file_path,
627
+ anon_df,
628
+ chosen_cols,
629
+ out_file_paths,
630
+ out_file_part,
631
+ out_message,
632
+ sheet_name,
633
+ anon_strategy,
634
+ language,
635
+ chosen_redact_entities,
636
+ in_allow_list,
637
+ file_type,
638
+ "",
639
+ log_files_output_paths,
640
+ in_deny_list,
641
+ max_fuzzy_spelling_mistakes_num,
642
+ pii_identification_method,
+ language,
643
+ chosen_redact_comprehend_entities,
644
+ comprehend_query_number,
645
+ comprehend_client,
646
+ output_folder=output_folder,
647
+ do_initial_clean=do_initial_clean,
648
+ )
649
  else:
650
  # If file is an xlsx, we are going to run through all the Excel sheets to anonymise them separately.
651
  file_type = detect_file_type(file_path)
 
653
 
654
  out_file_part = get_file_name_without_type(file_path)
655
 
656
+ if file_type == "docx":
657
+ output_path, log_path, output_xlsx_path, comprehend_query_number = (
658
+ handle_docx_anonymisation(
659
+ file_path=file_path,
660
+ output_folder=output_folder,
661
+ anon_strategy=anon_strategy,
662
+ chosen_redact_entities=chosen_redact_entities,
663
+ in_allow_list=in_allow_list_flat,
664
+ in_deny_list=in_deny_list,
665
+ max_fuzzy_spelling_mistakes_num=max_fuzzy_spelling_mistakes_num,
666
+ pii_identification_method=pii_identification_method,
667
+ chosen_redact_comprehend_entities=chosen_redact_comprehend_entities,
668
+ comprehend_query_number=comprehend_query_number,
669
+ comprehend_client=comprehend_client,
670
+ language=language,
671
+ )
672
  )
673
  if output_path:
674
  out_file_paths.append(output_path)
 
676
  out_file_paths.append(output_xlsx_path)
677
  if log_path:
678
  log_files_output_paths.append(log_path)
679
+
680
+ elif file_type == "xlsx":
681
  print("Running through all xlsx sheets")
682
+ # anon_xlsx = pd.ExcelFile(anon_file)
683
  if not in_excel_sheets:
684
+ out_message.append(
685
+ "No Excel sheets selected. Please select at least one to anonymise."
686
+ )
687
  continue
688
 
689
  # Create xlsx file:
690
+ anon_xlsx = pd.ExcelFile(file_path)
691
+ anon_xlsx_export_file_name = (
692
+ output_folder + out_file_part + "_redacted.xlsx"
693
+ )
694
 
695
  # Iterate through the sheet names
696
+ for sheet_name in progress.tqdm(
697
+ in_excel_sheets, desc="Anonymising sheets", unit="sheets"
698
+ ):
699
  # Read each sheet into a DataFrame
700
  if sheet_name not in anon_xlsx.sheet_names:
701
  continue
702
 
703
  anon_df = pd.read_excel(file_path, sheet_name=sheet_name)
704
 
705
+ (
706
+ out_file_paths,
707
+ out_message,
708
+ key_string,
709
+ log_files_output_paths,
710
+ comprehend_query_number,
711
+ ) = tabular_anonymise_wrapper_func(
712
+ anon_file,
713
+ anon_df,
714
+ chosen_cols,
715
+ out_file_paths,
716
+ out_file_part,
717
+ out_message,
718
+ sheet_name,
719
+ anon_strategy,
720
+ language,
721
+ chosen_redact_entities,
722
+ in_allow_list,
723
+ file_type,
724
+ anon_xlsx_export_file_name,
725
+ log_files_output_paths,
726
+ in_deny_list,
727
+ max_fuzzy_spelling_mistakes_num,
728
+ pii_identification_method,
729
+ language,
730
+ chosen_redact_comprehend_entities,
731
+ comprehend_query_number,
732
+ comprehend_client,
733
+ output_folder=output_folder,
734
+ do_initial_clean=do_initial_clean,
735
+ )
736
 
737
  else:
738
  sheet_name = ""
739
  anon_df = read_file(file_path)
740
  out_file_part = get_file_name_without_type(file_path)
741
 
742
+ (
743
+ out_file_paths,
744
+ out_message,
745
+ key_string,
746
+ log_files_output_paths,
747
+ comprehend_query_number,
748
+ ) = tabular_anonymise_wrapper_func(
749
+ anon_file,
750
+ anon_df,
751
+ chosen_cols,
752
+ out_file_paths,
753
+ out_file_part,
754
+ out_message,
755
+ sheet_name,
756
+ anon_strategy,
757
+ language,
758
+ chosen_redact_entities,
759
+ in_allow_list,
760
+ file_type,
761
+ "",
762
+ log_files_output_paths,
763
+ in_deny_list,
764
+ max_fuzzy_spelling_mistakes_num,
765
+ pii_identification_method,
766
+ language,
767
+ chosen_redact_comprehend_entities,
768
+ comprehend_query_number,
769
+ comprehend_client,
770
+ output_folder=output_folder,
771
+ do_initial_clean=do_initial_clean,
772
+ )
773
 
774
  # Increase latest file completed count unless we are at the last file
775
  if latest_file_completed != len(file_paths):
 
785
 
786
  if isinstance(out_message, str):
787
  out_message = [out_message]
 
 
788
 
789
+ out_message.append(
790
+ "Anonymisation of file '" + out_file_part + "' successfully completed in"
791
+ )
792
+
793
+ out_message_out = "\n".join(out_message)
794
  out_message_out = out_message_out + " " + out_time
795
 
796
  if anon_strategy == "encrypt":
797
+ out_message_out = out_message_out + ". Your decryption key is " + key_string
798
+
799
+ out_message_out = (
800
+ out_message_out
801
+ + "\n\nGo to to the Redaction settings tab to see redaction logs. Please give feedback on the results below to help improve this app."
802
+ )
803
+
804
+ out_message_out = re.sub(r"^\n+|^\. ", "", out_message_out).strip()
805
+
806
+ return (
807
+ out_message_out,
808
+ out_file_paths,
809
+ out_file_paths,
810
+ latest_file_completed,
811
+ log_files_output_paths,
812
+ log_files_output_paths,
813
+ actual_time_taken_number,
814
+ comprehend_query_number,
815
+ )
816
 
 
 
 
817
 
818
  def tabular_anonymise_wrapper_func(
819
+ anon_file: str,
820
+ anon_df: pd.DataFrame,
821
+ chosen_cols: List[str],
822
+ out_file_paths: List[str],
823
+ out_file_part: str,
824
+ out_message: str,
825
+ excel_sheet_name: str,
826
+ anon_strategy: str,
827
  language: str,
828
+ chosen_redact_entities: List[str],
829
+ in_allow_list: List[str],
830
+ file_type: str,
831
+ anon_xlsx_export_file_name: str,
832
  log_files_output_paths: List[str],
833
+ in_deny_list: List[str] = list(),
834
+ max_fuzzy_spelling_mistakes_num: int = 0,
835
+ pii_identification_method: str = "Local",
836
  comprehend_language: Optional[str] = None,
837
+ chosen_redact_comprehend_entities: List[str] = list(),
838
+ comprehend_query_number: int = 0,
839
+ comprehend_client: botocore.client.BaseClient = "",
840
  nlp_analyser: AnalyzerEngine = nlp_analyser,
841
  output_folder: str = OUTPUT_FOLDER,
842
+ do_initial_clean: bool = DO_INITIAL_TABULAR_DATA_CLEAN,
843
  ):
844
  """
845
  This function wraps the anonymisation process for a given dataframe. It filters the dataframe based on chosen columns, applies the specified anonymisation strategy using the anonymise_script function, and exports the anonymised data to a file.
 
861
  - log_files_output_paths: A list of paths where the log files will be saved.
862
  - in_deny_list: List of specific terms to remove from the data.
863
  - max_fuzzy_spelling_mistakes_num (int, optional): The maximum number of spelling mistakes allowed in a searched phrase for fuzzy matching. Can range from 0-9.
864
+ - pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
865
  - chosen_redact_comprehend_entities (List[str]): A list of entity types to redact from files, chosen from the official list from AWS Comprehend service.
866
  - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
867
+ - comprehend_client (optional): The client object from AWS containing a client connection to AWS Comprehend if that option is chosen on the first tab.
868
  - output_folder: The folder where the anonymized files will be saved. Defaults to the 'output_folder' variable.
869
  - do_initial_clean (bool, optional): Whether to perform an initial cleaning of the text. Defaults to True.
870
  """
871
+
872
  def check_lists(list1, list2):
873
+ return any(string in list2 for string in list1)
874
+
875
  def get_common_strings(list1, list2):
876
  """
877
  Finds the common strings between two lists.
 
890
  return common_strings
891
 
892
  if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
893
+ raise Exception(
894
+ "Connection to AWS Comprehend service not found, please check connection details."
895
+ )
896
+
897
  # Check for chosen col, skip file if not found
898
  all_cols_original_order = list(anon_df.columns)
899
 
900
  any_cols_found = check_lists(chosen_cols, all_cols_original_order)
901
 
902
+ if any_cols_found is False:
903
  out_message = "No chosen columns found in dataframe: " + out_file_part
904
  key_string = ""
905
  print(out_message)
906
+ return (
907
+ out_file_paths,
908
+ out_message,
909
+ key_string,
910
+ log_files_output_paths,
911
+ comprehend_query_number,
912
+ )
913
  else:
914
+ chosen_cols_in_anon_df = get_common_strings(
915
+ chosen_cols, all_cols_original_order
916
+ )
917
 
918
  # Split dataframe to keep only selected columns
919
+ # print("Remaining columns to redact:", chosen_cols_in_anon_df)
920
+
921
  if not anon_df.index.is_unique:
922
  anon_df = anon_df.reset_index(drop=True)
923
 
924
  anon_df_part = anon_df[chosen_cols_in_anon_df]
925
+ anon_df_remain = anon_df.drop(chosen_cols_in_anon_df, axis=1)
926
 
927
  row_count = anon_df_part.shape[0]
928
+
929
  if row_count > MAX_TABLE_ROWS:
930
  out_message = f"Number of rows in dataframe is greater than {MAX_TABLE_ROWS}. Please submit a smaller dataframe."
931
  print(out_message)
932
  raise Exception(out_message)
933
+
934
  column_count = anon_df_part.shape[1]
935
+
936
  if column_count > MAX_TABLE_COLUMNS:
937
  out_message = f"Number of columns in dataframe is greater than {MAX_TABLE_COLUMNS}. Please submit a smaller dataframe."
938
  print(out_message)
939
  raise Exception(out_message)
940
 
941
  # Anonymise the selected columns
942
+ (
943
+ anon_df_part_out,
944
+ key_string,
945
+ decision_process_output_str,
946
+ comprehend_query_number,
947
+ ) = anonymise_script(
948
+ anon_df_part,
949
+ anon_strategy,
950
+ language,
951
+ chosen_redact_entities,
952
+ in_allow_list,
953
+ in_deny_list,
954
+ max_fuzzy_spelling_mistakes_num,
955
+ pii_identification_method,
956
+ chosen_redact_comprehend_entities,
957
+ comprehend_query_number,
958
+ comprehend_client,
959
+ nlp_analyser=nlp_analyser,
960
+ do_initial_clean=do_initial_clean,
961
+ )
962
 
963
  anon_df_part_out.replace("^nan$", "", regex=True, inplace=True)
964
 
965
  # Rejoin the dataframe together
966
+ anon_df_out = pd.concat([anon_df_part_out, anon_df_remain], axis=1)
967
  anon_df_out = anon_df_out[all_cols_original_order]
968
+
969
  # Export file
970
  # Rename anonymisation strategy for file path naming
971
+ if anon_strategy == "replace with 'REDACTED'":
972
+ anon_strat_txt = "redact_replace"
973
+ elif anon_strategy == "replace with <ENTITY_NAME>":
974
+ anon_strat_txt = "redact_entity_type"
975
+ elif anon_strategy == "redact completely":
976
+ anon_strat_txt = "redact_remove"
977
+ else:
978
+ anon_strat_txt = anon_strategy
979
 
980
  # If the file is an xlsx, add a new sheet to the existing xlsx. Otherwise, write to csv
981
+ if file_type == "xlsx":
982
 
983
  anon_export_file_name = anon_xlsx_export_file_name
984
 
 
989
  wb.save(anon_xlsx_export_file_name)
990
 
991
  # Create a Pandas Excel writer using XlsxWriter as the engine.
992
+ with pd.ExcelWriter(
993
+ anon_xlsx_export_file_name,
994
+ engine="openpyxl",
995
+ mode="a",
996
+ if_sheet_exists="replace",
997
+ ) as writer:
998
  # Write each DataFrame to a different worksheet.
999
  anon_df_out.to_excel(writer, sheet_name=excel_sheet_name, index=None)
1000
 
1001
+ decision_process_log_output_file = (
1002
+ anon_xlsx_export_file_name
1003
+ + "_"
1004
+ + excel_sheet_name
1005
+ + "_decision_process_output.txt"
1006
+ )
1007
  with open(decision_process_log_output_file, "w") as f:
1008
  f.write(decision_process_output_str)
1009
 
1010
  else:
1011
+ anon_export_file_name = (
1012
+ output_folder + out_file_part + "_anon_" + anon_strat_txt + ".csv"
1013
+ )
1014
+ anon_df_out.to_csv(anon_export_file_name, index=None, encoding="utf-8-sig")
1015
+
1016
+ decision_process_log_output_file = (
1017
+ anon_export_file_name + "_decision_process_output.txt"
1018
+ )
1019
  with open(decision_process_log_output_file, "w") as f:
1020
  f.write(decision_process_output_str)
1021
 
 
1026
  out_file_paths = list(set(out_file_paths))
1027
 
1028
  # Print result text to output text box if just anonymising open text
1029
+ if anon_file == "open_text":
1030
+ out_message = ["'" + anon_df_out["text"][0] + "'"]
1031
+
1032
+ return (
1033
+ out_file_paths,
1034
+ out_message,
1035
+ key_string,
1036
+ log_files_output_paths,
1037
+ comprehend_query_number,
1038
+ )
1039
+
1040
+
1041
+ def anonymise_script(
1042
+ df: pd.DataFrame,
1043
+ anon_strategy: str,
1044
+ language: str,
1045
+ chosen_redact_entities: List[str],
1046
+ in_allow_list: List[str] = list(),
1047
+ in_deny_list: List[str] = list(),
1048
+ max_fuzzy_spelling_mistakes_num: int = 0,
1049
+ pii_identification_method: str = "Local",
1050
+ chosen_redact_comprehend_entities: List[str] = list(),
1051
+ comprehend_query_number: int = 0,
1052
+ comprehend_client: botocore.client.BaseClient = "",
1053
+ custom_entities: List[str] = custom_entities,
1054
+ nlp_analyser: AnalyzerEngine = nlp_analyser,
1055
+ do_initial_clean: bool = DO_INITIAL_TABULAR_DATA_CLEAN,
1056
+ progress: Progress = Progress(track_tqdm=True),
1057
+ ):
1058
+ """
1059
  Conduct anonymisation of a dataframe using Presidio and/or AWS Comprehend if chosen.
1060
 
1061
  Args:
 
1074
  nlp_analyser (AnalyzerEngine, optional): The Presidio AnalyzerEngine instance to use. Defaults to `nlp_analyser`.
1075
  do_initial_clean (bool, optional): Whether to perform an initial cleaning of the text. Defaults to True.
1076
  progress (Progress, optional): Gradio Progress object for tracking progress. Defaults to Progress(track_tqdm=False).
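+
+ A minimal usage sketch (assumes the default local Presidio pipeline and its standard entity labels, e.g. PERSON and PHONE_NUMBER):
+ scrubbed_df, key_string, decision_log, n_queries = anonymise_script(
+ pd.DataFrame({"notes": ["Call John Smith on 07700 900123"]}),
+ anon_strategy="replace with 'REDACTED'",
+ language="en",
+ chosen_redact_entities=["PERSON", "PHONE_NUMBER"],
+ )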
1077
+ """
1078
 
1079
  print("Identifying personal information")
1080
  analyse_tic = time.perf_counter()
1081
 
1082
  # Initialize analyzer_results as an empty dictionary to store results by column
1083
  results_by_column = dict()
1084
+ key_string = ""
1085
 
1086
  if isinstance(in_allow_list, list):
1087
  if in_allow_list:
 
1100
  try:
1101
  if language != "en":
1102
  progress(0.1, desc=f"Loading spaCy model for {language}")
1103
+
1104
  load_spacy_model(language)
1105
+
1106
  except Exception as e:
1107
  out_message = f"Error downloading language packs for {language}: {e}"
1108
  print(out_message)
1109
  raise Exception(out_message)
1110
+
1111
  # Try updating the supported languages for the spacy analyser
1112
  try:
1113
  nlp_analyser = create_nlp_analyser(language, existing_nlp_analyser=nlp_analyser)
1114
  # Check list of nlp_analyser recognisers and languages
1115
  if language != "en":
1116
+ gr.Info(
1117
+ f"Language: {language} only supports the following entity detection: {str(nlp_analyser.registry.get_supported_entities(languages=[language]))}"
1118
+ )
1119
 
1120
  except Exception as e:
1121
  out_message = f"Error creating nlp_analyser for {language}: {e}"
 
1132
  # Sort the strings in order from the longest string to the shortest
1133
  in_deny_list = sorted(in_deny_list, key=len, reverse=True)
1134
 
1135
+ if in_deny_list:
1136
  nlp_analyser.registry.remove_recognizer("CUSTOM")
1137
  new_custom_recogniser = custom_word_list_recogniser(in_deny_list)
1138
  nlp_analyser.registry.add_recognizer(new_custom_recogniser)
1139
 
1140
  nlp_analyser.registry.remove_recognizer("CustomWordFuzzyRecognizer")
1141
+ new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(
1142
+ supported_entities=["CUSTOM_FUZZY"],
1143
+ custom_list=in_deny_list,
1144
+ spelling_mistakes_max=max_fuzzy_spelling_mistakes_num,
1145
+ search_whole_phrase=True,
1146
+ )
1147
  nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
1148
 
1149
+ # analyzer = nlp_analyser #AnalyzerEngine()
1150
  batch_analyzer = BatchAnalyzerEngine(analyzer_engine=nlp_analyser)
1151
+ anonymizer = (
1152
+ AnonymizerEngine()
1153
+ ) # conflict_resolution=ConflictResolutionStrategy.MERGE_SIMILAR_OR_CONTAINED)
1154
+ batch_anonymizer = BatchAnonymizerEngine(anonymizer_engine=anonymizer)
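+ # The batch analyzer finds PII spans for each column; the batch anonymizer then rewrites those spans using the operator config chosen further down.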
1155
  analyzer_results = list()
1156
 
1157
  if do_initial_clean:
1158
  progress(0.2, desc="Cleaning text")
1159
+ for col in progress.tqdm(df.columns, desc="Cleaning text", unit="Columns"):
1160
  df[col] = initial_clean(df[col])
1161
 
1162
  # DataFrame to dict
1163
+ df_dict = df.to_dict(orient="list")
1164
 
1165
  if pii_identification_method == "Local":
1166
 
1167
  # Use custom analyzer to be able to track progress with Gradio
1168
+ custom_results = analyze_dict(
1169
+ batch_analyzer,
1170
+ df_dict,
1171
+ language=language,
1172
+ entities=chosen_redact_entities,
1173
+ score_threshold=score_threshold,
1174
+ return_decision_process=True,
1175
+ allow_list=in_allow_list_flat,
1176
+ )
1177
+
1178
  # Initialize results_by_column with custom entity results
1179
  for result in custom_results:
1180
  results_by_column[result.key] = result
 
1184
 
1185
  # AWS Comprehend calls
1186
  elif pii_identification_method == "AWS Comprehend" and comprehend_client:
1187
+
1188
  # Only run Local anonymisation for entities that are not covered by AWS Comprehend
1189
  if custom_entities:
1190
  custom_redact_entities = [
1191
+ entity
1192
+ for entity in chosen_redact_comprehend_entities
1193
  if entity in custom_entities
1194
  ]
1195
  if custom_redact_entities:
1196
  # Get results from analyze_dict
1197
+ custom_results = analyze_dict(
1198
+ batch_analyzer,
1199
+ df_dict,
1200
+ language=language,
1201
+ entities=custom_redact_entities,
1202
+ score_threshold=score_threshold,
1203
+ return_decision_process=True,
1204
+ allow_list=in_allow_list_flat,
1205
+ )
1206
+
1207
  # Initialize results_by_column with custom entity results
1208
  for result in custom_results:
1209
  results_by_column[result.key] = result
 
1212
  retry_delay = 3
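+ # Each detect_pii_entities call below is retried up to max_retries times, sleeping retry_delay seconds between attempts.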
1213
 
1214
  # Process each text column in the dictionary
1215
+ for column_name, texts in progress.tqdm(
1216
+ df_dict.items(), desc="Querying AWS Comprehend service.", unit="Columns"
1217
+ ):
1218
  # Get or create DictAnalyzerResult for this column
1219
  if column_name in results_by_column:
1220
  column_results = results_by_column[column_name]
1221
  else:
1222
  column_results = DictAnalyzerResult(
1223
+ recognizer_results=[[] for _ in texts], key=column_name, value=texts
 
 
1224
  )
1225
 
1226
  # Process each text in the column
1227
+ for text_idx, text in progress.tqdm(
1228
+ enumerate(texts), desc="Querying AWS Comprehend service.", unit="Row"
1229
+ ):
1230
 
1231
  for attempt in range(max_retries):
1232
  try:
1233
  response = comprehend_client.detect_pii_entities(
1234
+ Text=str(text), LanguageCode=language
 
1235
  )
1236
 
1237
  comprehend_query_number += 1
1238
 
1239
  # Add all entities from this text to the column's recognizer_results
1240
  for entity in response["Entities"]:
1241
+ if (
1242
+ entity.get("Type")
1243
+ not in chosen_redact_comprehend_entities
1244
+ ):
1245
  continue
1246
 
1247
  recognizer_result = RecognizerResult(
1248
  entity_type=entity["Type"],
1249
  start=entity["BeginOffset"],
1250
  end=entity["EndOffset"],
1251
+ score=entity["Score"],
1252
  )
1253
+ column_results.recognizer_results[text_idx].append(
1254
+ recognizer_result
1255
+ )
1256
+
1257
  break # Success, exit retry loop
1258
+
1259
  except Exception as e:
1260
  if attempt == max_retries - 1:
1261
+ print(
1262
+ f"AWS Comprehend calls failed for text: {text[:100]}... due to",
1263
+ e,
1264
+ )
1265
  raise
1266
  time.sleep(retry_delay)
1267
 
 
1272
  analyzer_results = list(results_by_column.values())
1273
 
1274
  elif (pii_identification_method == "AWS Comprehend") & (not comprehend_client):
1275
+ raise ("Unable to redact, Comprehend connection details not found.")
1276
+
1277
  else:
1278
  print("Unable to redact.")
1279
 
1280
  # Usage in the main function:
1281
+ decision_process_output_str = generate_decision_process_output(
1282
+ analyzer_results, df_dict
1283
+ )
1284
 
1285
  analyse_toc = time.perf_counter()
1286
+ analyse_time_out = (
1287
+ f"Analysing the text took {analyse_toc - analyse_tic:0.1f} seconds."
1288
+ )
1289
  print(analyse_time_out)
1290
 
1291
  # Set up the anonymization configuration WITHOUT DATE_TIME
1292
+ simple_replace_config = {
1293
+ "DEFAULT": OperatorConfig("replace", {"new_value": "REDACTED"})
1294
+ }
1295
+ replace_config = {"DEFAULT": OperatorConfig("replace")}
1296
+ redact_config = {"DEFAULT": OperatorConfig("redact")}
1297
+ hash_config = {"DEFAULT": OperatorConfig("hash")}
1298
+ mask_config = {
1299
+ "DEFAULT": OperatorConfig("mask", {
1300
+ "masking_char": "*",
1301
+ "chars_to_mask": 100,
1302
+ "from_end": True
1303
+ })
1304
+ }
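+ # e.g. with the "mask" strategy a detected email such as "john.doe@example.com" becomes a same-length string of "*" (up to 100 characters are masked, working from the end).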
1305
+ people_encrypt_config = {
1306
+ "PERSON": OperatorConfig("encrypt", {"key": key_string})
1307
+ } # The encryption is using AES cypher in CBC mode and requires a cryptographic key as an input for both the encryption and the decryption.
1308
+ fake_first_name_config = {
1309
+ "PERSON": OperatorConfig("custom", {"lambda": fake_first_name})
1310
+ }
1311
+
1312
+ if anon_strategy == "replace with 'REDACTED'":
1313
+ chosen_mask_config = simple_replace_config
1314
+ elif anon_strategy == "replace_redacted":
1315
+ chosen_mask_config = simple_replace_config
1316
+ elif anon_strategy == "replace with <ENTITY_NAME>":
1317
+ chosen_mask_config = replace_config
1318
+ elif anon_strategy == "entity_type":
1319
+ chosen_mask_config = replace_config
1320
+ elif anon_strategy == "redact completely":
1321
+ chosen_mask_config = redact_config
1322
+ elif anon_strategy == "redact":
1323
+ chosen_mask_config = redact_config
1324
+ elif anon_strategy == "hash":
1325
+ chosen_mask_config = hash_config
1326
+ elif anon_strategy == "mask":
1327
+ chosen_mask_config = mask_config
1328
+ elif anon_strategy == "encrypt":
1329
  chosen_mask_config = people_encrypt_config
1330
  key = secrets.token_bytes(16) # 128 bits = 16 bytes
1331
+ key_string = base64.b64encode(key).decode("utf-8")
1332
+
1333
  # Now inject the key into the operator config
1334
  for entity, operator in chosen_mask_config.items():
1335
  if operator.operator_name == "encrypt":
1336
  operator.params = {"key": key_string}
1337
+ elif anon_strategy == "fake_first_name":
1338
+ chosen_mask_config = fake_first_name_config
1339
  else:
1340
  print("Anonymisation strategy not found. Redacting completely by default.")
1341
+ chosen_mask_config = redact_config # Redact completely by default
 
 
 
1342
 
1343
+ combined_config = {**chosen_mask_config}
1344
 
1345
+ anonymizer_results = batch_anonymizer.anonymize_dict(
1346
+ analyzer_results, operators=combined_config)
1347
 
1348
  scrubbed_df = pd.DataFrame(anonymizer_results)
1349
+
1350
+ return scrubbed_df, key_string, decision_process_output_str, comprehend_query_number
tools/file_conversion.py CHANGED
The diff for this file is too large to render. See raw diff
 
tools/file_redaction.py CHANGED
The diff for this file is too large to render. See raw diff
 
tools/find_duplicate_pages.py CHANGED
@@ -1,24 +1,30 @@
1
- import pandas as pd
2
  import os
3
  import re
4
  import time
5
- from sklearn.feature_extraction.text import TfidfVectorizer
6
- from sklearn.metrics.pairwise import cosine_similarity
7
- from typing import List, Tuple, Optional, Dict, Union
8
  from collections import defaultdict
 
 
 
9
  import gradio as gr
 
10
  from gradio import Progress
11
- from pathlib import Path
12
- from typing import List
13
- from tools.helper_functions import OUTPUT_FOLDER
14
  from tools.config import MAX_SIMULTANEOUS_FILES
15
- from tools.file_conversion import redact_whole_pymupdf_page, convert_annotation_data_to_dataframe, fill_missing_box_ids_each_box
 
 
 
 
 
16
  from tools.load_spacy_model_custom_recognisers import nlp
17
 
18
- number_of_zeros_to_add_to_index = 7 # Number of zeroes to add between page number and line numbers to get a unique page/line index value
19
  ID_MULTIPLIER = 100000
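+ # e.g. page 12, line 345 -> 12 * 100000 + 345 = 1200345, giving a unique, sortable page/line ID (assumes fewer than 100000 lines per page).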
20
  # Define the set of punctuation characters for efficient lookup
21
- PUNCTUATION_TO_STRIP = {'.', ',', '?', '!', ':', ';'}
 
22
 
23
  def split_text_with_punctuation(text: str) -> List[str]:
24
  """
@@ -29,84 +35,89 @@ def split_text_with_punctuation(text: str) -> List[str]:
29
  # 1. A sequence of one or more punctuation marks `[.,?!:;]+`
30
  # 2. OR a sequence of one or more characters that are NOT punctuation or whitespace `[^.,?!:;\s]+`
31
  pattern = re.compile(r"([.,?!:;]+|[^.,?!:;\s]+)")
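+ # e.g. "Hello, world!" -> ["Hello", ",", "world", "!"]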
32
-
33
  final_list = []
34
  # We first split by whitespace to handle sentences correctly
35
  for word in text.split():
36
  # Then, for each whitespace-separated word, we tokenize it further
37
  final_list.extend(pattern.findall(word))
38
-
39
  return final_list
40
 
 
41
  def extract_indices_from_page_ranges(
42
  results_df: pd.DataFrame,
43
- start_col: str = 'Page2_Start_Page',
44
- end_col: str = 'Page2_End_Page',
45
- modulo_divisor_number_of_zeros: int = number_of_zeros_to_add_to_index, # Search for number of added
46
- converted_index: bool = False # Has the index been converted to the page_no + 0000 + line number format that needs the modulo divisor to convert back?
47
  ) -> List[int]:
48
  all_indices = set()
49
- modulo_divisor = int("1" + modulo_divisor_number_of_zeros*"0")
50
 
51
  for _, row in results_df.iterrows():
52
  start_page = row[start_col]
53
  end_page = row[end_col]
54
  for encoded_page_id in range(start_page, end_page + 1):
55
- if converted_index == True:
56
- original_page, original_index = _parse_page_line_id(encoded_page_id)#(encoded_page_id % modulo_divisor) - 1
 
 
57
  else:
58
  original_index = encoded_page_id
59
 
60
  all_indices.add(original_index)
61
  return sorted(list(all_indices))
62
 
 
63
  def punctuation_at_word_text_end(word_level_df_orig: pd.DataFrame) -> bool:
64
  """
65
- Check the first 1000 rows of word_level_df_orig to see if any of the strings
66
  in 'word_text' end with a full stop '.', exclamation mark '!', or question mark '?',
67
  for strings that do not contain these characters alone.
68
-
69
  Args:
70
  word_level_df_orig (pd.DataFrame): DataFrame containing word-level OCR data with 'word_text' column
71
-
72
  Returns:
73
  bool: True if any strings end with punctuation marks, False otherwise
74
  """
75
  # Get the first 1000 rows or all rows if less than 1000
76
  sample_df = word_level_df_orig.head(1000)
77
-
78
  # Check if 'word_text' column exists
79
- if 'word_text' not in sample_df.columns:
80
  return False
81
-
82
  # Define punctuation marks to check for
83
- punctuation_marks = ['.', '!', '?']
84
-
85
  # Check each word_text string
86
- for word_text in sample_df['word_text']:
87
  if pd.isna(word_text) or not isinstance(word_text, str):
88
  continue
89
-
90
  # Skip strings that contain only punctuation marks
91
  if word_text.strip() in punctuation_marks:
92
  continue
93
-
94
  # Check if the string ends with any of the punctuation marks
95
  if any(word_text.rstrip().endswith(punct) for punct in punctuation_marks):
96
  return True
97
-
98
  return False
99
 
 
100
  def run_full_search_and_analysis(
101
  search_query_text: str,
102
  word_level_df_orig: pd.DataFrame,
103
  similarity_threshold: float = 1,
104
- combine_pages: bool = False,
105
  min_word_count: int = 1,
106
  min_consecutive_pages: int = 1,
107
  greedy_match: bool = True,
108
  remake_index: bool = False,
109
- progress=gr.Progress(track_tqdm=True)
110
  ):
111
  """
112
  This function orchestrates the entire pipeline for finding duplicate pages based on a user's search query. It takes in the search query text, the original word-level OCR data, and various parameters to control the analysis. The function then:
@@ -120,7 +131,7 @@ def run_full_search_and_analysis(
120
  - search_query_text (str): The text entered by the user to search for in the OCR data.
121
  - word_level_df_orig (pd.DataFrame): The original DataFrame containing word-level OCR data.
122
  - similarity_threshold (float, optional): The minimum similarity score required for two pages to be considered duplicates. Defaults to 1.
123
- - combine_pages (bool, optional): A flag indicating whether to combine text from the same page number within a file. Defaults to False.
124
  - min_word_count (int, optional): The minimum number of words required for a page to be considered in the analysis. Defaults to 1.
125
  - min_consecutive_pages (int, optional): The minimum number of consecutive pages required to be considered a match. Defaults to 1.
126
  - greedy_match (bool, optional): A flag indicating whether to use a greedy strategy for matching consecutive pages. Defaults to True.
@@ -133,25 +144,32 @@ def run_full_search_and_analysis(
133
  if len(search_query_text) > 100:
134
  raise Warning("Please use a search query with at less than 100 characters.")
135
 
136
- if punctuation_at_word_text_end(word_level_df_orig) == True: do_punctuation_split = False
137
- else: do_punctuation_split = True
 
 
138
 
139
  # Step 1: Process the user's search query string
140
- search_query_data, query_word_length = create_dataframe_from_string(search_query_text, file_name="user_search_query", split_words=True, split_punctuation=do_punctuation_split)
 
 
 
 
 
141
  if not search_query_data:
142
  # Handle case where user submits an empty search string
143
- raise Warning("Could not convert search string to required format")
144
 
145
  if query_word_length > 25:
146
  # Handle case where user submits an empty search string
147
- raise Warning("Please use a query with less than 25 words")
148
 
149
  # Overwrite min_consecutive_pages with the search string length
150
  min_consecutive_pages = query_word_length
151
-
152
  # Create word index from reference table
153
  word_level_df_orig["index"] = word_level_df_orig.index
154
- word_level_df = word_level_df_orig.copy()
155
 
156
  # Step 2: Process the main word-level OCR DataFrame
157
  word_level_data = convert_word_level_df(word_level_df, file_name="source_document")
@@ -160,13 +178,13 @@ def run_full_search_and_analysis(
160
  all_data_to_process = search_query_data + word_level_data
161
  if not all_data_to_process:
162
  raise gr.Error("No data to process. Please check your inputs.")
163
-
164
  # Step 4: Run the combination logic
165
  combined_df, _, full_out_ocr_df = combine_ocr_dataframes(
166
  input_data=all_data_to_process,
167
  combine_pages=combine_pages,
168
- output_folder=None, # No need to save this intermediate file
169
- remake_index=remake_index
170
  )
171
 
172
  # Step 5: Run the final similarity analysis on the combined data
@@ -181,28 +199,37 @@ def run_full_search_and_analysis(
181
  do_text_clean=False,
182
  file1_name="user_search_query",
183
  file2_name="source_document",
184
- progress=progress
185
  )
186
 
187
  print("Finished text search")
188
 
189
  # Map the results back to the reference data file
190
- if remake_index == True:
191
- results_df_index_list = extract_indices_from_page_ranges(results_df, converted_index=True)
 
 
192
  else:
193
- results_df_index_list = extract_indices_from_page_ranges(results_df, converted_index=False)
 
 
194
 
195
- word_level_df_out = word_level_df_orig.loc[word_level_df_orig["index"].isin(results_df_index_list)]
 
 
196
 
197
  return word_level_df_out, duplicate_files, full_data
198
 
199
- def create_all_data_to_process(converted_data:pd.DataFrame, other_data_list:List[Tuple]):
 
 
 
200
  all_data_to_process = converted_data + other_data_list
201
  return all_data_to_process
202
 
 
203
  def convert_word_level_df(
204
- word_level_df: pd.DataFrame,
205
- file_name: str = "converted_dataframe"
206
  ) -> List[Tuple[str, pd.DataFrame]]:
207
  """
208
  Converts a word-level OCR DataFrame to the format for
@@ -225,34 +252,37 @@ def convert_word_level_df(
225
  DataFrame will have 'page' and 'text' columns.
226
  """
227
  # --- 1. Validate Input ---
228
- required_columns = ['page', 'line', 'word_text']
229
  if not all(col in word_level_df.columns for col in required_columns):
230
- raise ValueError(f"Input DataFrame must contain all of the following columns: {required_columns}")
 
 
231
 
232
  df = word_level_df.copy()
233
 
234
  # --- 2. Process the DataFrame ---
235
  # Ensure word_text is a string to allow for joining
236
- df['word_text'] = df['word_text'].astype(str)
237
 
238
  # Group by page and line number, then join the words with a space (not needed for word level search)
239
  # The result is a Series with a MultiIndex (page, line)
240
- #line_text_series = df.groupby(['page', 'line'])['word_text'].apply(' '.join)
241
 
242
  # Convert the Series back to a DataFrame and reset the index
243
- #line_level_df = line_text_series.reset_index()
244
 
245
  # Rename the aggregated column from 'word_text' to the required 'text'
246
- df = df.rename(columns={'word_text': 'text'})
247
 
248
  # --- 3. Finalise the structure ---
249
  # We now have a DataFrame with columns [page, line, text].
250
- final_df = df[['page', 'text']]
251
 
252
  # --- 4. Package for output ---
253
  # Return in the required List[Tuple[str, DataFrame]] format
254
  return [(file_name, final_df)]
255
 
 
256
  def create_dataframe_from_string(
257
  text_string: str,
258
  file_name: str = "user_search_query",
@@ -292,24 +322,22 @@ def create_dataframe_from_string(
292
 
293
  if split_words:
294
  # --- Split string into words, one per row, based on similar punctuation split technique used to create ocr_results_with_words objects ---
295
- if split_punctuation == True:
296
  words = split_text_with_punctuation(text_string)
297
  else:
298
  words = text_string.split()
299
-
300
- #words = text_string.split()
301
  len_words = len(words)
302
- data = {
303
- 'page': [page_number] * len_words, # Assign the same page number to every word
304
- 'text': words # The list of words becomes the text column
 
305
  }
306
  else:
307
  # --- Entire string in one row ---
308
  len_words = 1
309
- data = {
310
- 'page': [page_number],
311
- 'text': [text_string]
312
- }
313
 
314
  # Create the DataFrame from the prepared data
315
  df = pd.DataFrame(data)
@@ -319,13 +347,14 @@ def create_dataframe_from_string(
319
  # Return it in the required format: a list containing one (name, df) tuple
320
  return [(file_name, df)], len_words
321
 
 
322
  def combine_ocr_dataframes(
323
  input_data: List[Tuple[str, pd.DataFrame]],
324
  combine_pages: bool = True,
325
  output_folder: str = OUTPUT_FOLDER,
326
  output_filename: str = "combined_ocr_output.csv",
327
  number_of_added_zeros: int = number_of_zeros_to_add_to_index,
328
- remake_index:bool = True
329
  ) -> Tuple[pd.DataFrame, List[str]]:
330
  """
331
  Combines text from multiple pandas DataFrames containing page and text columns.
@@ -358,18 +387,20 @@ def combine_ocr_dataframes(
358
  df = df_initial.copy() # Work on a copy to avoid side effects
359
 
360
  # --- Validation ---
361
- if 'page' not in df.columns or 'text' not in df.columns:
362
- print(f"Warning: Skipping data for '{file_identifier}' - missing required columns 'page' and 'text'.")
 
 
363
  continue
364
 
365
  # --- Processing ---
366
- df['text'] = df['text'].fillna('').astype(str)
367
 
368
  if combine_pages:
369
  # Group by page and concatenate text into a single string
370
- processed_df = df.groupby('page')['text'].apply(' '.join).reset_index()
371
  else:
372
- if remake_index == True:
373
  # # Create a unique, sortable page ID for each line without combining
374
  # df['line_number_by_page'] = df.groupby('page').cumcount() + 1
375
  # df['original_page'] = df['page']
@@ -377,41 +408,47 @@ def combine_ocr_dataframes(
377
  # df['page'] = (
378
  # df['page'].astype(str).str.zfill(number_of_added_zeros) +
379
  # df['line_number_by_page'].astype(str).str.zfill(number_of_added_zeros)
380
- # ).astype(int)
381
 
382
  # Define the multiplier based on the max expected lines per page.
383
  # If you expect up to 99,999 lines, use 100,000.
384
 
385
- df['line_number_by_page'] = df.groupby('page').cumcount() + 1
386
- df['original_page'] = df['page']
387
 
388
  # Create the new combined ID using arithmetic
389
- df['page'] = (df['original_page'] * ID_MULTIPLIER) + df['line_number_by_page']
390
-
 
 
391
  else:
392
- if not 'index' in df.columns:
393
- df['index'] = df.index
394
- df['page'] = df['index']
395
-
396
  processed_df = df
397
 
398
  # Add the file identifier column
399
- processed_df['file'] = file_identifier
400
  all_data.append(processed_df)
401
 
402
  if not all_data:
403
- raise ValueError("No valid DataFrames were processed. Ensure input data is not empty and DataFrames have 'page' and 'text' columns.")
 
 
404
 
405
  # --- Final Combination ---
406
  combined_df = pd.concat(all_data, ignore_index=True)
407
 
408
  # Reorder columns to a standard format, dropping intermediate columns
409
- final_columns = ['file', 'page', 'text']
410
- if 'original_page' in combined_df.columns:
411
- final_columns.append('original_page') # Keep for context if created
412
-
413
  # Ensure all final columns exist before trying to select them
414
- existing_final_columns = [col for col in final_columns if col in combined_df.columns]
 
 
415
 
416
  full_out_ocr_df = combined_df
417
  combined_df = combined_df.copy()[existing_final_columns]
@@ -427,11 +464,12 @@ def combine_ocr_dataframes(
427
 
428
  return combined_df, output_files, full_out_ocr_df
429
 
 
430
  def combine_ocr_output_text(
431
  input_files: Union[str, List[str]],
432
  combine_pages: bool = True,
433
  remake_index: bool = True,
434
- output_folder: str = OUTPUT_FOLDER
435
  ) -> Tuple[pd.DataFrame, List[str]]:
436
  """
437
  Reads multiple OCR CSV files, combines them, and saves the result.
@@ -472,19 +510,20 @@ def combine_ocr_output_text(
472
  input_data=data_to_process,
473
  combine_pages=combine_pages,
474
  output_folder=output_folder,
475
- output_filename="combined_ocr_from_files.csv", # Specific name for this path
476
- remake_index=remake_index
477
  )
478
 
479
- def clean_and_stem_text_series(df:pd.DataFrame, column:str):
480
- '''
 
481
  Clean and stem text columns in a data frame
482
- '''
483
-
484
  def _clean_text(raw_text):
485
  # Remove HTML tags
486
- clean = re.sub(r'<.*?>', '', raw_text)
487
- clean = ' '.join(clean.split())
488
  # Join the cleaned words back into a string
489
  return clean
490
 
@@ -492,59 +531,106 @@ def clean_and_stem_text_series(df:pd.DataFrame, column:str):
492
  def _apply_lemmatization(text):
493
  doc = nlp(text)
494
  # Keep only alphabetic tokens and remove stopwords
495
- lemmatized_words = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]
496
- return ' '.join(lemmatized_words)
497
-
498
- df['text_clean'] = df[column].apply(_clean_text)
 
 
 
 
499
 
500
- df['text_clean'] = df['text_clean'].apply(_apply_lemmatization)
501
-
502
  return df
503
 
504
- def map_metadata_single_page(similarity_df:pd.DataFrame, metadata_source_df:pd.DataFrame, preview_length:int=200):
 
 
 
 
 
505
  """Helper to map metadata for single page results."""
506
- metadata_df = metadata_source_df[['file', 'page', 'text']]
507
- results_df = similarity_df.merge(metadata_df, left_on='Page1_Index', right_index=True)\
508
- .rename(columns={'file': 'Page1_File', 'page': 'Page1_Page', 'text': 'Page1_Text'})
509
- results_df = results_df.merge(metadata_df, left_on='Page2_Index', right_index=True, suffixes=('_1', '_2'))\
510
- .rename(columns={'file': 'Page2_File', 'page': 'Page2_Page', 'text': 'Page2_Text'})
 
 
511
  results_df["Similarity_Score"] = results_df["Similarity_Score"].round(3)
512
- final_df = results_df[['Page1_File', 'Page1_Page', 'Page2_File', 'Page2_Page', 'Similarity_Score', 'Page1_Text', 'Page2_Text']]
513
- final_df = final_df.sort_values(["Page1_File", "Page1_Page", "Page2_File", "Page2_Page"])
514
- final_df['Page1_Text'] = final_df['Page1_Text'].str[:preview_length]
515
- final_df['Page2_Text'] = final_df['Page2_Text'].str[:preview_length]
 
 
 
 
 
 
 
 
 
 
 
 
516
  return final_df
517
 
518
- def map_metadata_subdocument(subdocument_df:pd.DataFrame, metadata_source_df:pd.DataFrame, preview_length:int=200):
 
 
 
 
 
519
  """Helper to map metadata for subdocument results."""
520
- metadata_df = metadata_source_df[['file', 'page', 'text']]
521
-
522
- subdocument_df = subdocument_df.merge(metadata_df, left_on='Page1_Start_Index', right_index=True)\
523
- .rename(columns={'file': 'Page1_File', 'page': 'Page1_Start_Page', 'text': 'Page1_Text'})
524
- subdocument_df = subdocument_df.merge(metadata_df[['page']], left_on='Page1_End_Index', right_index=True)\
525
- .rename(columns={'page': 'Page1_End_Page'})
526
- subdocument_df = subdocument_df.merge(metadata_df, left_on='Page2_Start_Index', right_index=True)\
527
- .rename(columns={'file': 'Page2_File', 'page': 'Page2_Start_Page', 'text': 'Page2_Text'})
528
- subdocument_df = subdocument_df.merge(metadata_df[['page']], left_on='Page2_End_Index', right_index=True)\
529
- .rename(columns={'page': 'Page2_End_Page'})
530
-
531
- cols = ['Page1_File', 'Page1_Start_Page', 'Page1_End_Page',
532
- 'Page2_File', 'Page2_Start_Page', 'Page2_End_Page',
533
- 'Match_Length', 'Page1_Text', 'Page2_Text']
534
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
535
  # Add Avg_Similarity if it exists (it won't for greedy match unless we add it)
536
- if 'Avg_Similarity' in subdocument_df.columns:
537
- subdocument_df['Avg_Similarity'] = subdocument_df['Avg_Similarity'].round(3)
538
- cols.insert(7, 'Avg_Similarity')
539
 
540
  final_df = subdocument_df[cols]
541
- final_df = final_df.sort_values(['Page1_File', 'Page1_Start_Page', 'Page2_File', 'Page2_Start_Page'])
542
- final_df['Page1_Text'] = final_df['Page1_Text'].str[:preview_length]
543
- final_df['Page2_Text'] = final_df['Page2_Text'].str[:preview_length]
 
 
544
 
545
  return final_df
546
 
547
- def save_results_and_redaction_lists(final_df: pd.DataFrame, output_folder: str, combine_pages:bool = True) -> list:
 
 
 
548
  """
549
  Saves the main results DataFrame and generates per-file redaction lists.
550
  This function is extracted to be reusable.
@@ -566,44 +652,53 @@ def save_results_and_redaction_lists(final_df: pd.DataFrame, output_folder: str,
566
  return []
567
 
568
  # 1. Save the main results DataFrame
569
- similarity_file_output_path = output_folder_path / 'page_similarity_results.csv'
570
  final_df.to_csv(similarity_file_output_path, index=False, encoding="utf-8-sig")
571
 
572
  output_paths.append(str(similarity_file_output_path))
573
- #print(f"Main results saved to {similarity_file_output_path}")
574
 
575
  # 2. Save per-file redaction lists
576
  # Use 'Page2_File' as the source of duplicate content
577
- if combine_pages == True:
578
- grouping_col = 'Page2_File'
579
  if grouping_col not in final_df.columns:
580
- print("Warning: 'Page2_File' column not found. Cannot generate redaction lists.")
 
 
581
  return output_paths
582
 
583
  for redact_file, group in final_df.groupby(grouping_col):
584
  output_file_name_stem = Path(redact_file).stem
585
- output_file_path = output_folder_path / f"{output_file_name_stem}_pages_to_redact.csv"
586
-
 
 
587
  all_pages_to_redact = set()
588
- is_subdocument_match = 'Page2_Start_Page' in group.columns
589
 
590
  if is_subdocument_match:
591
  for _, row in group.iterrows():
592
- pages_in_range = range(int(row['Page2_Start_Page']), int(row['Page2_End_Page']) + 1)
 
 
593
  all_pages_to_redact.update(pages_in_range)
594
  else:
595
- pages = group['Page2_Page'].unique()
596
  all_pages_to_redact.update(pages)
597
-
598
  if all_pages_to_redact:
599
- redaction_df = pd.DataFrame(sorted(list(all_pages_to_redact)), columns=['Page_to_Redact'])
 
 
600
  redaction_df.to_csv(output_file_path, header=False, index=False)
601
 
602
  output_paths.append(str(output_file_path))
603
  print(f"Redaction list for {redact_file} saved to {output_file_path}")
604
-
605
  return output_paths
606
 
 
607
  def _sequences_match(query_seq: List[str], ref_seq: List[str]) -> bool:
608
  """
609
  Helper function to compare two sequences of tokens with punctuation flexibility.
@@ -627,9 +722,9 @@ def _sequences_match(query_seq: List[str], ref_seq: List[str]) -> bool:
627
  # - Its last character must be in our punctuation set
628
  # - The token without its last character must match the query token
629
  if (
630
- len(ref_token) > 1 and
631
- ref_token[-1] in PUNCTUATION_TO_STRIP and
632
- ref_token[:-1] == query_token
633
  ):
634
  continue
635
 
@@ -639,10 +734,9 @@ def _sequences_match(query_seq: List[str], ref_seq: List[str]) -> bool:
639
  # If the loop completes, every token has matched.
640
  return True
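+ # e.g. the query tokens ["find", "this"] match the reference tokens ["find", "this,"], because a single trailing punctuation mark on a reference token is ignored.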
641
 
 
642
  def find_consecutive_sequence_matches(
643
- df_filtered: pd.DataFrame,
644
- search_file_name: str,
645
- reference_file_name: str
646
  ) -> pd.DataFrame:
647
  """
648
  Finds all occurrences of a consecutive sequence of tokens from a search file
@@ -659,22 +753,22 @@ def find_consecutive_sequence_matches(
659
  A DataFrame with two columns ('Page1_Index', 'Page2_Index') mapping the
660
  consecutive match, or an empty DataFrame if no match is found.
661
  """
662
- #print(f"Starting sequence search for '{search_file_name}' in '{reference_file_name}'...")
663
 
664
  # Step 1: Isolate the data for each file
665
- search_df = df_filtered[df_filtered['file'] == search_file_name]
666
- reference_df = df_filtered[df_filtered['file'] == reference_file_name]
667
 
668
  if search_df.empty or reference_df.empty:
669
  print("Error: One or both files not found or are empty.")
670
- return pd.DataFrame(columns=['Page1_Index', 'Page2_Index'])
671
 
672
  # Step 2: Convert the token data into lists for easy comparison.
673
  # We need both the text tokens and their original global indices.
674
- query_tokens = search_df['text_clean'].tolist()
675
  query_indices = search_df.index.tolist()
676
-
677
- reference_tokens = reference_df['text_clean'].tolist()
678
  reference_indices = reference_df.index.tolist()
679
 
680
  query_len = len(query_tokens)
@@ -689,29 +783,32 @@ def find_consecutive_sequence_matches(
689
 
690
  # Step 4: If the window matches the query with or without punctuation on end
691
  if _sequences_match(query_tokens, window):
692
- #print(f"Found a consecutive match starting at reference index: {reference_indices[i]}")
693
-
694
  # Get the global indices for this entire matching block
695
  matching_reference_indices = reference_indices[i : i + query_len]
696
-
697
  # Create the mapping between query indices and the found reference indices
698
  for j in range(query_len):
699
  all_found_matches.append(
700
  (query_indices[j], matching_reference_indices[j], 1)
701
  )
702
-
703
  # If you only want the *first* match, you can uncomment the next line:
704
- # break
705
 
706
  if not all_found_matches:
707
  print("No matches found")
708
  gr.Info("No matches found")
709
- return pd.DataFrame(columns=['Page1_Index', 'Page2_Index', 'Similarity_Score'])
710
 
711
  # Step 5: Create the final DataFrame in the desired format
712
- result_df = pd.DataFrame(all_found_matches, columns=['Page1_Index', 'Page2_Index', 'Similarity_Score'])
 
 
713
  return result_df
714
 
 
715
  def identify_similar_text_sequences(
716
  df_combined: pd.DataFrame,
717
  similarity_threshold: float = 1,
@@ -720,11 +817,11 @@ def identify_similar_text_sequences(
720
  greedy_match: bool = True,
721
  combine_pages: bool = False,
722
  inter_file_only: bool = False,
723
- do_text_clean:bool = True,
724
- file1_name: str = '',
725
- file2_name: str = '',
726
  output_folder: str = OUTPUT_FOLDER,
727
- progress=Progress(track_tqdm=True)
728
  ) -> Tuple[pd.DataFrame, List[str], pd.DataFrame]:
729
  """
730
  Identifies similar pages. Uses a highly optimized path for inter_file_only=True.
@@ -732,14 +829,17 @@ def identify_similar_text_sequences(
732
  progress(0.1, desc="Processing and filtering text")
733
 
734
  if do_text_clean:
735
- df = clean_and_stem_text_series(df_combined, 'text') # Will produce the column 'text_clean'
 
 
736
  else:
737
  df = df_combined.copy()
738
- df['text_clean'] = df['text'].str.lower()#.str.replace(r'[^\w\s]', '', regex=True)
739
-
 
740
 
741
- df['word_count'] = df['text_clean'].str.split().str.len().fillna(0)
742
- #df['word_count'] = pd.to_numeric(df['word_count'], errors='coerce').fillna(0).astype('int64')
743
 
744
  # ensure min_word_count is an int (e.g., from Gradio/text input)
745
  try:
@@ -748,57 +848,66 @@ def identify_similar_text_sequences(
748
  min_word_count = 0 # or raise/log, depending on your preference
749
 
750
  original_row_count = len(df)
751
- df_filtered = df[df['word_count'] >= min_word_count].copy()
752
  df_filtered.reset_index(drop=True, inplace=True)
753
-
754
- print(f"Filtered out {original_row_count - len(df_filtered)} pages with fewer than {min_word_count} words.")
 
 
755
  if len(df_filtered) < 2:
756
  return pd.DataFrame(), [], df_combined
757
 
758
-
759
-
760
  # Similarity calculated differently if comparing between files only (inter_file_only==True), or within the same file
761
  if inter_file_only:
762
 
763
  progress(0.2, desc="Finding direct text matches...")
764
-
765
- #base_similarity_df = _debug_similarity_between_two_files(df_filtered, vectorizer, similarity_threshold, file1_name, file2_name)
766
- base_similarity_df = find_consecutive_sequence_matches(df_filtered, file1_name, file2_name)
 
 
767
  if base_similarity_df.empty:
768
- return pd.DataFrame(), [], df_combined
769
-
770
  else:
771
  # Use the original, simpler path for all-to-all comparisons (including intra-file).
772
  vectorizer = TfidfVectorizer()
773
  print("Standard Path: Calculating all-to-all similarity.")
774
  progress(0.2, desc="Vectorizing text...")
775
- tfidf_matrix = vectorizer.fit_transform(df_filtered['text_clean'])
776
 
777
  progress(0.3, desc="Calculating similarity matrix...")
778
  similarity_matrix = cosine_similarity(tfidf_matrix, dense_output=False)
779
  coo_matrix = similarity_matrix.tocoo()
780
 
781
  similar_pages = [
782
- (r, c, v) for r, c, v in zip(coo_matrix.row, coo_matrix.col, coo_matrix.data)
 
783
  if r < c and v >= similarity_threshold
784
  ]
785
 
786
  if not similar_pages:
787
  return pd.DataFrame(), [], df_combined
788
-
789
- base_similarity_df = pd.DataFrame(similar_pages, columns=['Page1_Index', 'Page2_Index', 'Similarity_Score'])
 
 
790
 
791
  progress(0.7, desc="Aggregating results based on matching strategy")
792
 
793
  if greedy_match or min_consecutive_pages > 1:
794
- #print("Finding all consecutive page matches of minimum length:", min_consecutive_pages)
795
-
796
  # Sort the dataframe to ensure consecutive pages are adjacent
797
- similarity_df = base_similarity_df #.sort_values(['Page1_Index', 'Page2_Index']).copy()
 
 
798
 
799
  # A new sequence starts if the difference from the previous row is not (1, 1)
800
  # is_consecutive will be True if a row continues the sequence, False if it's a new one.
801
- is_consecutive = (similarity_df['Page1_Index'].diff() == 1) & (similarity_df['Page2_Index'].diff() == 1)
 
 
802
 
803
  # Use cumsum() on the inverted boolean series to create a unique ID for each block.
804
  # Every time a 'False' appears (a new block starts), the sum increases.
@@ -809,46 +918,53 @@ def identify_similar_text_sequences(
809
 
810
  # Aggregate each group to get the start, end, and length of the match
811
  agg_results = grouped.agg(
812
- Page1_Start_Index=('Page1_Index', 'first'),
813
- Page2_Start_Index=('Page2_Index', 'first'),
814
- Page1_End_Index=('Page1_Index', 'last'),
815
- Page2_End_Index=('Page2_Index', 'last'),
816
- Match_Length=('Page1_Index', 'size'),
817
- Avg_Similarity=('Similarity_Score', 'mean')
818
  ).reset_index(drop=True)
819
 
820
  # If greedy_match=True, we keep all matches. If min_consecutive_pages > 1, we filter.
821
  if greedy_match and min_consecutive_pages <= 1:
822
  subdocument_df = agg_results
823
  else:
824
- # This handles the case for min_consecutive_pages > 1
825
- subdocument_df = agg_results[agg_results['Match_Length'] >= min_consecutive_pages].copy()
 
 
826
 
827
  if subdocument_df.empty:
828
  gr.Info("No matches found")
829
  return pd.DataFrame(), [], df_combined
830
-
831
  final_df = map_metadata_subdocument(subdocument_df, df_filtered)
832
  else:
833
- print(f"Finding single page matches, not greedy (min_consecutive_pages=1)")
834
  # This part of your code would handle the non-sequential case
835
  final_df = map_metadata_single_page(base_similarity_df, df_filtered)
836
- #subdocument_df = final_df # To align variable names for saving
837
 
838
  if final_df.empty:
839
  gr.Info("No matches found")
840
  return pd.DataFrame(), [], df_combined
841
 
842
  progress(0.9, desc="Saving output files")
843
-
844
- output_paths = save_results_and_redaction_lists(final_df, output_folder, combine_pages)
 
 
845
 
846
  gr.Info(f"Found {final_df.shape[0]} match(es)")
847
  print(f"Found {final_df.shape[0]} match(es)")
848
 
849
  return final_df, output_paths, df_combined
850
-
851
- def handle_selection_and_preview(evt: gr.SelectData, results_df:pd.DataFrame, full_duplicate_data_by_file: dict):
 
 
 
852
  """
853
  This single function handles a user selecting a row. It:
854
  1. Determines the selected row index.
@@ -857,18 +973,23 @@ def handle_selection_and_preview(evt: gr.SelectData, results_df:pd.DataFrame, fu
857
  """
858
  # If the user deselects, the event might be None.
859
  if not evt:
860
- return None, None, None # Clear state and both preview panes
861
 
862
  # 1. Get the selected index
863
  selected_index = evt.index[0]
864
 
865
  # 2. Get the preview data
866
- page1_data, page2_data = show_page_previews(full_duplicate_data_by_file, results_df, evt)
 
 
867
 
868
  # 3. Return all three outputs in the correct order
869
  return selected_index, page1_data, page2_data
870
 
871
- def exclude_match(results_df:pd.DataFrame, selected_index:int, output_folder=OUTPUT_FOLDER):
 
 
 
872
  """
873
  Removes a selected row from the results DataFrame, regenerates output files,
874
  and clears the text preview panes.
@@ -877,23 +998,34 @@ def exclude_match(results_df:pd.DataFrame, selected_index:int, output_folder=OUT
877
  gr.Warning("No match selected. Please click on a row in the table first.")
878
  # Return the original dataframe and update=False for the files
879
  return results_df, gr.update(), None, None
880
-
881
  if results_df.empty:
882
  gr.Warning("No duplicate page results found, nothing to exclude.")
883
  return results_df, gr.update(), None, None
884
 
885
  # Drop the selected row
886
  updated_df = results_df.drop(selected_index).reset_index(drop=True)
887
-
888
  # Recalculate all output files using the helper function
889
  new_output_paths = save_results_and_redaction_lists(updated_df, output_folder)
890
-
891
  gr.Info(f"Match at row {selected_index} excluded. Output files have been updated.")
892
-
893
  # Return the updated dataframe, the new file list, and clear the preview panes
894
  return updated_df, new_output_paths, None, None
895
 
896
- def run_duplicate_analysis(files:list[str], threshold:float, min_words:int, min_consecutive:int, greedy_match:bool, combine_pages:bool=True, preview_length:int=500, output_folder:str=OUTPUT_FOLDER, progress=gr.Progress(track_tqdm=True)):
897
  """
898
  Main wrapper function to orchestrate the duplicate page analysis process.
899
  It handles file loading, text combination, similarity identification,
@@ -911,9 +1043,11 @@ def run_duplicate_analysis(files:list[str], threshold:float, min_words:int, min_
911
  progress (gr.Progress, optional): A Gradio progress tracker object to display progress in the UI.
912
  """
913
 
914
- if not files: raise Warning("Please upload files to analyse.")
 
915
 
916
- if isinstance(files, str): files = [files]
 
917
 
918
  if len(files) > MAX_SIMULTANEOUS_FILES:
919
  out_message = f"Number of files to deduplicate is greater than {MAX_SIMULTANEOUS_FILES}. Please submit a smaller number of files."
@@ -923,9 +1057,11 @@ def run_duplicate_analysis(files:list[str], threshold:float, min_words:int, min_
923
  start_time = time.time()
924
 
925
  task_textbox = "deduplicate"
926
-
927
  progress(0, desc="Combining input files...")
928
- df_combined, _, full_out_ocr_df = combine_ocr_output_text(files, combine_pages=combine_pages)
 
 
929
 
930
  if df_combined.empty:
931
  raise Warning("No data found in the uploaded files.")
@@ -939,26 +1075,32 @@ def run_duplicate_analysis(files:list[str], threshold:float, min_words:int, min_
939
  greedy_match=greedy_match,
940
  combine_pages=combine_pages,
941
  output_folder=output_folder,
942
- progress=progress
943
  )
944
 
945
  # Clip text to first 200 characters
946
- full_df['text'] = full_df['text'].str[:preview_length]
947
  # Preprocess full_data (without preview text) for fast access (run once)
948
  full_data_by_file = {
949
- file: df.sort_values('page').set_index('page')
950
- for file, df in full_df.drop(["text_clean"],axis=1).groupby('file')
951
  }
952
 
953
  if results_df.empty:
954
- gr.Info(f"No duplicate pages found, no results returned.")
955
 
956
  end_time = time.time()
957
  processing_time = round(end_time - start_time, 2)
958
-
959
  return results_df, output_paths, full_data_by_file, processing_time, task_textbox
960
 
961
- def show_page_previews(full_data_by_file: dict, results_df: pd.DataFrame, evt: gr.SelectData, preview_length:int=500):
 
 
 
 
 
 
962
  """
963
  Optimized version using pre-partitioned and indexed full_data.
964
  Triggered when a user selects a row in the results DataFrame.
@@ -968,26 +1110,35 @@ def show_page_previews(full_data_by_file: dict, results_df: pd.DataFrame, evt: g
968
 
969
  selected_row = results_df.iloc[evt.index[0], :]
970
 
971
- is_subdocument_match = 'Page1_Start_Page' in selected_row
972
 
973
  if is_subdocument_match:
974
- file1, start1, end1 = selected_row['Page1_File'], selected_row['Page1_Start_Page'], selected_row['Page1_End_Page']
975
- file2, start2, end2 = selected_row['Page2_File'], selected_row['Page2_Start_Page'], selected_row['Page2_End_Page']
976
-
977
- page1_data = full_data_by_file[file1].loc[start1:end1, ['text']].reset_index()
978
- page2_data = full_data_by_file[file2].loc[start2:end2, ['text']].reset_index()
979
 
980
  else:
981
- file1, page1 = selected_row['Page1_File'], selected_row['Page1_Page']
982
- file2, page2 = selected_row['Page2_File'], selected_row['Page2_Page']
 
 
 
983
 
984
- page1_data = full_data_by_file[file1].loc[[page1], ['text']].reset_index()
985
- page2_data = full_data_by_file[file2].loc[[page2], ['text']].reset_index()
986
 
987
- page1_data['text'] = page1_data['text'].str[:preview_length]
988
- page2_data['text'] = page2_data['text'].str[:preview_length]
989
 
990
- return page1_data[['page', 'text']], page2_data[['page', 'text']]
991
 
992
  def get_page_image_info(page_num: int, page_sizes: List[Dict]) -> Optional[Dict]:
993
  """
@@ -995,10 +1146,9 @@ def get_page_image_info(page_num: int, page_sizes: List[Dict]) -> Optional[Dict]
995
  """
996
  return next((size for size in page_sizes if size["page"] == page_num), None)
997
 
 
998
  def add_new_annotations_to_existing_page_annotations(
999
- all_annotations: List[Dict],
1000
- image_path: str,
1001
- new_annotation_boxes: List[Dict]
1002
  ) -> Tuple[List[Dict], Dict]:
1003
  """
1004
  Adds a list of new annotation boxes to the annotations for a specific page.
@@ -1018,8 +1168,12 @@ def add_new_annotations_to_existing_page_annotations(
1018
  """
1019
  # Find the annotation group for the current page/image
1020
  current_page_group = next(
1021
- (annot_group for annot_group in all_annotations if annot_group["image"] == image_path),
1022
- None
 
 
 
 
1023
  )
1024
 
1025
  if current_page_group:
@@ -1027,22 +1181,27 @@ def add_new_annotations_to_existing_page_annotations(
1027
  current_page_group["boxes"].extend(new_annotation_boxes)
1028
  else:
1029
  # This is the first set of annotations for this page, create a new group
1030
- new_group = {
1031
- "image": image_path,
1032
- "boxes": new_annotation_boxes
1033
- }
1034
  all_annotations.append(new_group)
1035
 
1036
  # This object represents all annotations that were just added for this page
1037
- newly_added_annotation_group = {
1038
- "image": image_path,
1039
- "boxes": new_annotation_boxes
1040
- }
1041
 
1042
  return all_annotations, newly_added_annotation_group
1043
 
1044
- def apply_whole_page_redactions_from_list(duplicate_page_numbers_df: pd.DataFrame, doc_file_name_with_extension_textbox: str, review_file_state: pd.DataFrame, duplicate_output_paths: list[str], pymupdf_doc: object, page_sizes: list[dict], all_existing_annotations: list[dict], combine_pages:bool=True, new_annotations_with_bounding_boxes:List[dict]=list()):
1045
- '''
1046
  This function applies redactions to whole pages based on a provided list of duplicate page numbers. It supports two modes of operation: combining pages and not combining pages. When combining pages is enabled, it attempts to identify duplicate pages across different files and applies redactions accordingly. If combining pages is disabled, it relies on new annotations with bounding boxes to determine which pages to redact. The function utilises a PyMuPDF document object to manipulate the PDF file, and it also considers the sizes of pages to ensure accurate redaction application.
1047
 
1048
  Args:
@@ -1055,7 +1214,7 @@ def apply_whole_page_redactions_from_list(duplicate_page_numbers_df: pd.DataFram
1055
  all_existing_annotations (list[dict]): A list of all existing annotations in the document.
1056
  combine_pages (bool, optional): A flag indicating whether to combine pages for redaction. Defaults to True.
1057
  new_annotations_with_bounding_boxes (List[dict], optional): A list of new annotations with bounding boxes. Defaults to an empty list.
1058
- '''
1059
  if all_existing_annotations is None:
1060
  all_existing_annotations = []
1061
 
@@ -1069,31 +1228,39 @@ def apply_whole_page_redactions_from_list(duplicate_page_numbers_df: pd.DataFram
1069
  print(f"Warning: {message}")
1070
  raise Warning(message)
1071
 
1072
- list_whole_pages_to_redact = []
1073
 
1074
- if combine_pages == True:
1075
  # Get list of pages to redact from either dataframe or file
1076
  if not duplicate_page_numbers_df.empty:
1077
  list_whole_pages_to_redact = duplicate_page_numbers_df.iloc[:, 0].tolist()
1078
  elif duplicate_output_paths:
1079
- expected_duplicate_pages_to_redact_name = f"{doc_file_name_with_extension_textbox}"
 
 
1080
  whole_pages_list = pd.DataFrame() # Initialize empty DataFrame
1081
-
1082
  for output_file in duplicate_output_paths:
1083
  # Note: output_file.name might not be available if output_file is just a string path
1084
  # If it's a Path object or similar, .name is fine. Otherwise, parse from string.
1085
- file_name_from_path = output_file.split('/')[-1] if isinstance(output_file, str) else output_file.name
 
 
 
 
1086
  if expected_duplicate_pages_to_redact_name in file_name_from_path:
1087
- whole_pages_list = pd.read_csv(output_file, header=None) # Use output_file directly if it's a path
1088
- break
 
 
1089
  else:
1090
  message = "No relevant list of whole pages to redact found."
1091
  print(message)
1092
  raise Warning(message)
1093
-
1094
  if not whole_pages_list.empty:
1095
  list_whole_pages_to_redact = whole_pages_list.iloc[:, 0].tolist()
1096
-
1097
  list_whole_pages_to_redact = list(set(list_whole_pages_to_redact))
1098
 
1099
  else:
@@ -1101,19 +1268,20 @@ def apply_whole_page_redactions_from_list(duplicate_page_numbers_df: pd.DataFram
1101
  message = "Can't find any new annotations to add"
1102
  print(message)
1103
  raise Warning(message)
1104
-
1105
  list_whole_pages_to_redact = []
1106
  for annotation in new_annotations_with_bounding_boxes:
1107
- match = re.search(r'_(\d+)\.png$', annotation["image"])
1108
  if match:
1109
  page = int(match.group(1)) + 1
1110
  list_whole_pages_to_redact.append(page)
1111
  else:
1112
- print(f"Warning: Could not extract page number from {annotation['image']}")
 
 
1113
 
1114
  list_whole_pages_to_redact = list(set(list_whole_pages_to_redact))
1115
 
1116
-
1117
  new_annotations = []
1118
  # Process each page for redaction
1119
  for page in list_whole_pages_to_redact:
@@ -1130,47 +1298,65 @@ def apply_whole_page_redactions_from_list(duplicate_page_numbers_df: pd.DataFram
1130
  continue
1131
 
1132
  image_path = page_info["image_path"]
1133
- page_annotation_group = next((g for g in all_annotations if g["image"] == image_path), None)
1134
- if page_annotation_group and any(box["label"] == "Whole page" for box in page_annotation_group["boxes"]):
1135
- print(f"Whole page redaction for page {page_num} already exists, skipping.")
 
 
 
 
 
 
1136
  continue
1137
-
1138
  # --- Create a LIST of boxes to add.---
1139
  boxes_to_add = []
1140
-
1141
  pymupdf_page = pymupdf_doc[page_index]
1142
 
1143
- if combine_pages==True:
1144
  whole_page_box = redact_whole_pymupdf_page(
1145
  rect_height=page_info["cropbox_height"],
1146
  rect_width=page_info["cropbox_width"],
1147
- page=pymupdf_page, border=0.005, redact_pdf=False
 
 
1148
  )
1149
  boxes_to_add.append(whole_page_box)
1150
  else:
1151
  # Find the specific annotation group that matches the current page's image path
1152
  relevant_box_group = next(
1153
- (group for group in new_annotations_with_bounding_boxes if group.get('image') == image_path),
1154
- None # Default to None if no match is found
 
 
 
 
1155
  )
1156
-
1157
  # Check if we found a matching group of boxes for this page
1158
  if relevant_box_group:
1159
- boxes_to_add.extend(relevant_box_group['boxes'])
1160
  else:
1161
  # This case would be unexpected, but it's good to handle.
1162
  # It means a page was in list_whole_pages_to_redact but had no
1163
  # corresponding boxes generated in new_annotations_with_bounding_boxes.
1164
- print(f"Warning: No new annotation boxes found for page {page_num} ({image_path}).")
1165
-
 
 
1166
  # === Use the modified helper function to add a LIST of boxes ===
1167
- all_annotations, new_annotations_for_page = add_new_annotations_to_existing_page_annotations(
1168
- all_annotations=all_annotations,
1169
- image_path=image_path,
1170
- new_annotation_boxes=boxes_to_add # Pass the list here
 
 
1171
  )
1172
 
1173
- new_annotations_for_page = fill_missing_box_ids_each_box(new_annotations_for_page)
 
 
1174
  new_annotations.append(new_annotations_for_page)
1175
 
1176
  except Exception as e:
@@ -1185,36 +1371,58 @@ def apply_whole_page_redactions_from_list(duplicate_page_numbers_df: pd.DataFram
1185
  gr.Info(message)
1186
  return review_file_state, all_annotations
1187
 
1188
- expected_cols = ['image', 'page', 'label', 'color', 'xmin', 'ymin', 'xmax', 'ymax', 'text', 'id']
1189
  for col in expected_cols:
1190
- if col not in review_file_state.columns: review_file_state[col] = pd.NA
1191
- if col not in whole_page_review_file.columns: whole_page_review_file[col] = pd.NA
1192
 
1193
- review_file_out = pd.concat([review_file_state, whole_page_review_file], ignore_index=True)
1194
- review_file_out = review_file_out.sort_values(by=["page", "ymin", "xmin"]).reset_index(drop=True)
1195
- review_file_out = review_file_out.drop_duplicates(subset=['page', 'label', 'text', 'id'], keep='first')
1196
-
1197
  out_message = "Successfully created duplicate text redactions."
1198
  print(out_message)
1199
  gr.Info(out_message)
1200
 
1201
  return review_file_out, all_annotations
1202
 
 
1203
  def _parse_page_line_id(combined_id: int) -> Tuple[int, int]:
1204
  """Parses a combined ID using modular arithmetic."""
1205
  if int(combined_id) < ID_MULTIPLIER:
1206
  # Handle cases where page is 0 (or just an edge case)
1207
  return 0, combined_id
1208
-
1209
  page = combined_id // ID_MULTIPLIER
1210
  line = combined_id % ID_MULTIPLIER
1211
  return page, line
1212
 
 
1213
  def create_annotation_objects_from_duplicates(
1214
- duplicates_df: pd.DataFrame,
1215
  ocr_results_df: pd.DataFrame,
1216
  page_sizes: List[Dict],
1217
- combine_pages:bool=False) -> List[Dict]:
 
1218
  """
1219
  Creates structured annotation objects from duplicate line ranges, mapping
1220
  page numbers to image paths.
@@ -1233,33 +1441,44 @@ def create_annotation_objects_from_duplicates(
1233
  if duplicates_df.empty:
1234
  raise Warning("No duplicates found")
1235
  if ocr_results_df.empty:
1236
- raise Warning("No OCR results found for file under review. Please upload relevant OCR_output file and original PDF document on the review tab.")
 
 
1237
 
1238
- if combine_pages == False:
1239
- page_to_image_map = {item['page']: item['image_path'] for item in page_sizes}
1240
 
1241
  # Prepare OCR Data: Add a line number column if it doesn't exist
1242
- if 'line_number_by_page' not in ocr_results_df.columns:
1243
- ocr_results_df = ocr_results_df.sort_values(by=['page', 'top', 'left']).reset_index(drop=True)
1244
- ocr_results_df['line_number_by_page'] = ocr_results_df.groupby('page').cumcount() + 1
1245
-
 
 
 
 
1246
  annotations_by_page = defaultdict(list)
1247
 
1248
  # Iterate through each duplicate range (this logic is unchanged)
1249
  for _, row in duplicates_df.iterrows():
1250
- start_page, start_line = _parse_page_line_id(row['Page2_Start_Page'])
1251
- end_page, end_line = _parse_page_line_id(row['Page2_End_Page'])
1252
-
1253
  # Select OCR Lines based on the range (this logic is unchanged)
1254
  if start_page == end_page:
1255
- condition = (
1256
- (ocr_results_df['page'] == start_page) &
1257
- (ocr_results_df['line_number_by_page'].between(start_line, end_line))
1258
  )
1259
  else:
1260
- cond_start = (ocr_results_df['page'] == start_page) & (ocr_results_df['line_number_by_page'] >= start_line)
1261
- cond_middle = ocr_results_df['page'].between(start_page + 1, end_page - 1)
1262
- cond_end = (ocr_results_df['page'] == end_page) & (ocr_results_df['line_number_by_page'] <= end_line)
 
 
 
 
 
 
1263
  condition = cond_start | cond_middle | cond_end
1264
 
1265
  lines_to_annotate = ocr_results_df[condition]
@@ -1268,30 +1487,27 @@ def create_annotation_objects_from_duplicates(
1268
  for _, line_row in lines_to_annotate.iterrows():
1269
  box = {
1270
  "label": "Duplicate text",
1271
- "color": (0,0,0),
1272
- "xmin": line_row['left'],
1273
- "ymin": line_row['top'],
1274
- "xmax": line_row['left'] + line_row['width'],
1275
- "ymax": line_row['top'] + line_row['height'],
1276
- "text": line_row['text'],
1277
- "id": "" # to be filled in after
1278
  }
1279
- page_number = line_row['page']
1280
-
1281
  annotations_by_page[page_number].append(box)
1282
-
1283
  # --- Format the final output list using the page-to-image map ---
1284
  final_output = []
1285
  # Sort by page number for a predictable order
1286
  for page_num, boxes in sorted(annotations_by_page.items()):
1287
  # Look up the image path using the page number
1288
  image_path = page_to_image_map.get(page_num)
1289
-
1290
  if image_path:
1291
- page_boxes = {
1292
- "image": image_path,
1293
- "boxes": boxes
1294
- }
1295
 
1296
  # Fill in missing IDs for the new data entries
1297
  page_boxes = fill_missing_box_ids_each_box(page_boxes)
@@ -1300,7 +1516,9 @@ def create_annotation_objects_from_duplicates(
1300
  final_output.append(page_boxes)
1301
  else:
1302
  # Handle cases where a page might not have a corresponding image path
1303
- print(f"Warning: Page {page_num} found in OCR data but has no corresponding "
1304
- f"entry in the 'page_sizes' object. This page's annotations will be skipped.")
1305
-
1306
- return final_output
 
 
 
 
1
  import os
2
  import re
3
  import time
 
 
 
4
  from collections import defaultdict
5
+ from pathlib import Path
6
+ from typing import Dict, List, Optional, Tuple, Union
7
+
8
  import gradio as gr
9
+ import pandas as pd
10
  from gradio import Progress
11
+ from sklearn.feature_extraction.text import TfidfVectorizer
12
+ from sklearn.metrics.pairwise import cosine_similarity
13
+
14
  from tools.config import MAX_SIMULTANEOUS_FILES
15
+ from tools.file_conversion import (
16
+ convert_annotation_data_to_dataframe,
17
+ fill_missing_box_ids_each_box,
18
+ redact_whole_pymupdf_page,
19
+ )
20
+ from tools.helper_functions import OUTPUT_FOLDER
21
  from tools.load_spacy_model_custom_recognisers import nlp
22
 
23
+ number_of_zeros_to_add_to_index = 7 # Number of zeroes to add between page number and line numbers to get a unique page/line index value
24
  ID_MULTIPLIER = 100000
25
  # Define the set of punctuation characters for efficient lookup
26
+ PUNCTUATION_TO_STRIP = {".", ",", "?", "!", ":", ";"}
27
+
28
 
29
  def split_text_with_punctuation(text: str) -> List[str]:
30
  """
 
35
  # 1. A sequence of one or more punctuation marks `[.,?!:;]+`
36
  # 2. OR a sequence of one or more characters that are NOT punctuation or whitespace `[^.,?!:;\s]+`
37
  pattern = re.compile(r"([.,?!:;]+|[^.,?!:;\s]+)")
38
+
39
  final_list = []
40
  # We first split by whitespace to handle sentences correctly
41
  for word in text.split():
42
  # Then, for each whitespace-separated word, we tokenize it further
43
  final_list.extend(pattern.findall(word))
44
+
45
  return final_list
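
As a quick illustration (the sentence below is made up), the tokeniser above keeps trailing punctuation as separate tokens, which is what later lets the sequence matcher tolerate punctuation differences:

import re

# Worked example of the pattern used by split_text_with_punctuation above.
pattern = re.compile(r"([.,?!:;]+|[^.,?!:;\s]+)")
tokens = []
for word in "Dear Prof. Smith, hello!".split():
    tokens.extend(pattern.findall(word))
print(tokens)  # ['Dear', 'Prof', '.', 'Smith', ',', 'hello', '!']
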
46
 
47
+
48
  def extract_indices_from_page_ranges(
49
  results_df: pd.DataFrame,
50
+ start_col: str = "Page2_Start_Page",
51
+ end_col: str = "Page2_End_Page",
52
+ modulo_divisor_number_of_zeros: int = number_of_zeros_to_add_to_index, # Number of zeros added between the page and line numbers when the combined index was created
53
+ converted_index: bool = False, # Has the index been converted to the page_no + 0000 + line number format that needs the modulo divisor to convert back?
54
  ) -> List[int]:
55
  all_indices = set()
56
+ int("1" + modulo_divisor_number_of_zeros * "0")
57
 
58
  for _, row in results_df.iterrows():
59
  start_page = row[start_col]
60
  end_page = row[end_col]
61
  for encoded_page_id in range(start_page, end_page + 1):
62
+ if converted_index is True:
63
+ original_page, original_index = _parse_page_line_id(
64
+ encoded_page_id
65
+ ) # (encoded_page_id % modulo_divisor) - 1
66
  else:
67
  original_index = encoded_page_id
68
 
69
  all_indices.add(original_index)
70
  return sorted(list(all_indices))
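
For context, a minimal sketch of the combined page/line index that this helper decodes; the page and line values below are made up, and it assumes ID_MULTIPLIER = 100000 as defined above:

ID_MULTIPLIER = 100000  # supports up to 99,999 lines per page

def encode_page_line(page: int, line: int) -> int:
    # e.g. page 12, line 7 -> 1200007
    return page * ID_MULTIPLIER + line

def decode_page_line(combined_id: int) -> tuple:
    # Mirrors _parse_page_line_id: integer division recovers the page,
    # the remainder recovers the line number.
    return combined_id // ID_MULTIPLIER, combined_id % ID_MULTIPLIER

assert decode_page_line(encode_page_line(12, 7)) == (12, 7)
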
71
 
72
+
73
  def punctuation_at_word_text_end(word_level_df_orig: pd.DataFrame) -> bool:
74
  """
75
+ Check the first 1000 rows of word_level_df_orig to see if any of the strings
76
  in 'word_text' end with a full stop '.', exclamation mark '!', or question mark '?',
77
  ignoring strings that consist of these punctuation characters alone.
78
+
79
  Args:
80
  word_level_df_orig (pd.DataFrame): DataFrame containing word-level OCR data with 'word_text' column
81
+
82
  Returns:
83
  bool: True if any strings end with punctuation marks, False otherwise
84
  """
85
  # Get the first 1000 rows or all rows if less than 1000
86
  sample_df = word_level_df_orig.head(1000)
87
+
88
  # Check if 'word_text' column exists
89
+ if "word_text" not in sample_df.columns:
90
  return False
91
+
92
  # Define punctuation marks to check for
93
+ punctuation_marks = [".", "!", "?"]
94
+
95
  # Check each word_text string
96
+ for word_text in sample_df["word_text"]:
97
  if pd.isna(word_text) or not isinstance(word_text, str):
98
  continue
99
+
100
  # Skip strings that contain only punctuation marks
101
  if word_text.strip() in punctuation_marks:
102
  continue
103
+
104
  # Check if the string ends with any of the punctuation marks
105
  if any(word_text.rstrip().endswith(punct) for punct in punctuation_marks):
106
  return True
107
+
108
  return False
109
 
110
+
111
  def run_full_search_and_analysis(
112
  search_query_text: str,
113
  word_level_df_orig: pd.DataFrame,
114
  similarity_threshold: float = 1,
115
+ combine_pages: bool = False,
116
  min_word_count: int = 1,
117
  min_consecutive_pages: int = 1,
118
  greedy_match: bool = True,
119
  remake_index: bool = False,
120
+ progress=gr.Progress(track_tqdm=True),
121
  ):
122
  """
123
  This function orchestrates the entire pipeline for finding duplicate pages based on a user's search query. It takes in the search query text, the original word-level OCR data, and various parameters to control the analysis. The function then:
 
131
  - search_query_text (str): The text entered by the user to search for in the OCR data.
132
  - word_level_df_orig (pd.DataFrame): The original DataFrame containing word-level OCR data.
133
  - similarity_threshold (float, optional): The minimum similarity score required for two pages to be considered duplicates. Defaults to 1.
134
+ - combine_pages (bool, optional): A flag indicating whether to combine text from the same page number within a file. Defaults to False.
135
  - min_word_count (int, optional): The minimum number of words required for a page to be considered in the analysis. Defaults to 1.
136
  - min_consecutive_pages (int, optional): The minimum number of consecutive pages required to be considered a match. Defaults to 1.
137
  - greedy_match (bool, optional): A flag indicating whether to use a greedy strategy for matching consecutive pages. Defaults to True.
 
144
  if len(search_query_text) > 100:
145
  raise Warning("Please use a search query with at less than 100 characters.")
146
 
147
+ if punctuation_at_word_text_end(word_level_df_orig) is True:
148
+ do_punctuation_split = False
149
+ else:
150
+ do_punctuation_split = True
151
 
152
  # Step 1: Process the user's search query string
153
+ search_query_data, query_word_length = create_dataframe_from_string(
154
+ search_query_text,
155
+ file_name="user_search_query",
156
+ split_words=True,
157
+ split_punctuation=do_punctuation_split,
158
+ )
159
  if not search_query_data:
160
  # Handle case where user submits an empty search string
161
+ raise Warning("Could not convert search string to required format")
162
 
163
  if query_word_length > 25:
164
  # Handle case where the search query is longer than the allowed number of words
165
+ raise Warning("Please use a query with fewer than 25 words")
166
 
167
  # Overwrite min_consecutive_pages with the search string length
168
  min_consecutive_pages = query_word_length
169
+
170
  # Create word index from reference table
171
  word_level_df_orig["index"] = word_level_df_orig.index
172
+ word_level_df = word_level_df_orig.copy()
173
 
174
  # Step 2: Process the main word-level OCR DataFrame
175
  word_level_data = convert_word_level_df(word_level_df, file_name="source_document")
 
178
  all_data_to_process = search_query_data + word_level_data
179
  if not all_data_to_process:
180
  raise gr.Error("No data to process. Please check your inputs.")
181
+
182
  # Step 4: Run the combination logic
183
  combined_df, _, full_out_ocr_df = combine_ocr_dataframes(
184
  input_data=all_data_to_process,
185
  combine_pages=combine_pages,
186
+ output_folder=None, # No need to save this intermediate file
187
+ remake_index=remake_index,
188
  )
189
 
190
  # Step 5: Run the final similarity analysis on the combined data
 
199
  do_text_clean=False,
200
  file1_name="user_search_query",
201
  file2_name="source_document",
202
+ progress=progress,
203
  )
204
 
205
  print("Finished text search")
206
 
207
  # Map the results back to the reference data file
208
+ if remake_index is True:
209
+ results_df_index_list = extract_indices_from_page_ranges(
210
+ results_df, converted_index=True
211
+ )
212
  else:
213
+ results_df_index_list = extract_indices_from_page_ranges(
214
+ results_df, converted_index=False
215
+ )
216
 
217
+ word_level_df_out = word_level_df_orig.loc[
218
+ word_level_df_orig["index"].isin(results_df_index_list)
219
+ ]
220
 
221
  return word_level_df_out, duplicate_files, full_data
222
 
223
+
224
+ def create_all_data_to_process(
225
+ converted_data: pd.DataFrame, other_data_list: List[Tuple]
226
+ ):
227
  all_data_to_process = converted_data + other_data_list
228
  return all_data_to_process
229
 
230
+
231
  def convert_word_level_df(
232
+ word_level_df: pd.DataFrame, file_name: str = "converted_dataframe"
 
233
  ) -> List[Tuple[str, pd.DataFrame]]:
234
  """
235
  Converts a word-level OCR DataFrame to the format for
 
252
  DataFrame will have 'page' and 'text' columns.
253
  """
254
  # --- 1. Validate Input ---
255
+ required_columns = ["page", "line", "word_text"]
256
  if not all(col in word_level_df.columns for col in required_columns):
257
+ raise ValueError(
258
+ f"Input DataFrame must contain all of the following columns: {required_columns}"
259
+ )
260
 
261
  df = word_level_df.copy()
262
 
263
  # --- 2. Process the DataFrame ---
264
  # Ensure word_text is a string to allow for joining
265
+ df["word_text"] = df["word_text"].astype(str)
266
 
267
  # Group by page and line number, then join the words with a space (not needed for word level search)
268
  # The result is a Series with a MultiIndex (page, line)
269
+ # line_text_series = df.groupby(['page', 'line'])['word_text'].apply(' '.join)
270
 
271
  # Convert the Series back to a DataFrame and reset the index
272
+ # line_level_df = line_text_series.reset_index()
273
 
274
  # Rename the aggregated column from 'word_text' to the required 'text'
275
+ df = df.rename(columns={"word_text": "text"})
276
 
277
  # --- 3. Finalise the structure ---
278
  # We now have a DataFrame with columns [page, line, text].
279
+ final_df = df[["page", "text"]]
280
 
281
  # --- 4. Package for output ---
282
  # Return in the required List[Tuple[str, DataFrame]] format
283
  return [(file_name, final_df)]
284
 
285
+
286
  def create_dataframe_from_string(
287
  text_string: str,
288
  file_name: str = "user_search_query",
 
322
 
323
  if split_words:
324
  # --- Split string into words, one per row, based on similar punctuation split technique used to create ocr_results_with_words objects ---
325
+ if split_punctuation is True:
326
  words = split_text_with_punctuation(text_string)
327
  else:
328
  words = text_string.split()
329
+
330
+ # words = text_string.split()
331
  len_words = len(words)
332
+ data = {
333
+ "page": [page_number]
334
+ * len_words, # Assign the same page number to every word
335
+ "text": words, # The list of words becomes the text column
336
  }
337
  else:
338
  # --- Entire string in one row ---
339
  len_words = 1
340
+ data = {"page": [page_number], "text": [text_string]}
 
 
 
341
 
342
  # Create the DataFrame from the prepared data
343
  df = pd.DataFrame(data)
 
347
  # Return it in the required format: a list containing one (name, df) tuple
348
  return [(file_name, df)], len_words
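
A small illustration of the output format produced here (the query words are invented): one row per token, all assigned the same page number, wrapped in the (file_name, DataFrame) tuple that the combination step expects.

import pandas as pd

words = ["flat", "deposit", "refund"]  # hypothetical search query tokens
df = pd.DataFrame({"page": [1] * len(words), "text": words})
search_query_data = [("user_search_query", df)]
print(search_query_data[0][0], len(df))  # user_search_query 3
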
349
 
350
+
351
  def combine_ocr_dataframes(
352
  input_data: List[Tuple[str, pd.DataFrame]],
353
  combine_pages: bool = True,
354
  output_folder: str = OUTPUT_FOLDER,
355
  output_filename: str = "combined_ocr_output.csv",
356
  number_of_added_zeros: int = number_of_zeros_to_add_to_index,
357
+ remake_index: bool = True,
358
  ) -> Tuple[pd.DataFrame, List[str]]:
359
  """
360
  Combines text from multiple pandas DataFrames containing page and text columns.
 
387
  df = df_initial.copy() # Work on a copy to avoid side effects
388
 
389
  # --- Validation ---
390
+ if "page" not in df.columns or "text" not in df.columns:
391
+ print(
392
+ f"Warning: Skipping data for '{file_identifier}' - missing required columns 'page' and 'text'."
393
+ )
394
  continue
395
 
396
  # --- Processing ---
397
+ df["text"] = df["text"].fillna("").astype(str)
398
 
399
  if combine_pages:
400
  # Group by page and concatenate text into a single string
401
+ processed_df = df.groupby("page")["text"].apply(" ".join).reset_index()
402
  else:
403
+ if remake_index is True:
404
  # # Create a unique, sortable page ID for each line without combining
405
  # df['line_number_by_page'] = df.groupby('page').cumcount() + 1
406
  # df['original_page'] = df['page']
 
408
  # df['page'] = (
409
  # df['page'].astype(str).str.zfill(number_of_added_zeros) +
410
  # df['line_number_by_page'].astype(str).str.zfill(number_of_added_zeros)
411
+ # ).astype(int)
412
 
413
  # Define the multiplier based on the max expected lines per page.
414
  # If you expect up to 99,999 lines, use 100,000.
415
 
416
+ df["line_number_by_page"] = df.groupby("page").cumcount() + 1
417
+ df["original_page"] = df["page"]
418
 
419
  # Create the new combined ID using arithmetic
420
+ df["page"] = (df["original_page"] * ID_MULTIPLIER) + df[
421
+ "line_number_by_page"
422
+ ]
423
+
424
  else:
425
+ if "index" not in df.columns:
426
+ df["index"] = df.index
427
+ df["page"] = df["index"]
428
+
429
  processed_df = df
430
 
431
  # Add the file identifier column
432
+ processed_df["file"] = file_identifier
433
  all_data.append(processed_df)
434
 
435
  if not all_data:
436
+ raise ValueError(
437
+ "No valid DataFrames were processed. Ensure input data is not empty and DataFrames have 'page' and 'text' columns."
438
+ )
439
 
440
  # --- Final Combination ---
441
  combined_df = pd.concat(all_data, ignore_index=True)
442
 
443
  # Reorder columns to a standard format, dropping intermediate columns
444
+ final_columns = ["file", "page", "text"]
445
+ if "original_page" in combined_df.columns:
446
+ final_columns.append("original_page") # Keep for context if created
447
+
448
  # Ensure all final columns exist before trying to select them
449
+ existing_final_columns = [
450
+ col for col in final_columns if col in combined_df.columns
451
+ ]
452
 
453
  full_out_ocr_df = combined_df
454
  combined_df = combined_df.copy()[existing_final_columns]
 
464
 
465
  return combined_df, output_files, full_out_ocr_df
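
As a rough sketch of the combine_pages=True branch above (toy data only), all text rows that share a page number are concatenated into a single row before similarity is computed:

import pandas as pd

df = pd.DataFrame({"page": [1, 1, 2], "text": ["Dear", "Professor", "Thanks"]})
per_page = df.groupby("page")["text"].apply(" ".join).reset_index()
print(per_page["text"].tolist())  # ['Dear Professor', 'Thanks']
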
466
 
467
+
468
  def combine_ocr_output_text(
469
  input_files: Union[str, List[str]],
470
  combine_pages: bool = True,
471
  remake_index: bool = True,
472
+ output_folder: str = OUTPUT_FOLDER,
473
  ) -> Tuple[pd.DataFrame, List[str]]:
474
  """
475
  Reads multiple OCR CSV files, combines them, and saves the result.
 
510
  input_data=data_to_process,
511
  combine_pages=combine_pages,
512
  output_folder=output_folder,
513
+ output_filename="combined_ocr_from_files.csv", # Specific name for this path
514
+ remake_index=remake_index,
515
  )
516
 
517
+
518
+ def clean_and_stem_text_series(df: pd.DataFrame, column: str):
519
+ """
520
  Clean and stem text columns in a data frame
521
+ """
522
+
523
  def _clean_text(raw_text):
524
  # Remove HTML tags
525
+ clean = re.sub(r"<.*?>", "", raw_text)
526
+ clean = " ".join(clean.split())
527
  # Join the cleaned words back into a string
528
  return clean
529
 
 
531
  def _apply_lemmatization(text):
532
  doc = nlp(text)
533
  # Keep only alphabetic tokens and remove stopwords
534
+ lemmatized_words = [
535
+ token.lemma_ for token in doc if token.is_alpha and not token.is_stop
536
+ ]
537
+ return " ".join(lemmatized_words)
538
+
539
+ df["text_clean"] = df[column].apply(_clean_text)
540
+
541
+ df["text_clean"] = df["text_clean"].apply(_apply_lemmatization)
542
 
 
 
543
  return df
544
 
545
+
546
+ def map_metadata_single_page(
547
+ similarity_df: pd.DataFrame,
548
+ metadata_source_df: pd.DataFrame,
549
+ preview_length: int = 200,
550
+ ):
551
  """Helper to map metadata for single page results."""
552
+ metadata_df = metadata_source_df[["file", "page", "text"]]
553
+ results_df = similarity_df.merge(
554
+ metadata_df, left_on="Page1_Index", right_index=True
555
+ ).rename(columns={"file": "Page1_File", "page": "Page1_Page", "text": "Page1_Text"})
556
+ results_df = results_df.merge(
557
+ metadata_df, left_on="Page2_Index", right_index=True, suffixes=("_1", "_2")
558
+ ).rename(columns={"file": "Page2_File", "page": "Page2_Page", "text": "Page2_Text"})
559
  results_df["Similarity_Score"] = results_df["Similarity_Score"].round(3)
560
+ final_df = results_df[
561
+ [
562
+ "Page1_File",
563
+ "Page1_Page",
564
+ "Page2_File",
565
+ "Page2_Page",
566
+ "Similarity_Score",
567
+ "Page1_Text",
568
+ "Page2_Text",
569
+ ]
570
+ ]
571
+ final_df = final_df.sort_values(
572
+ ["Page1_File", "Page1_Page", "Page2_File", "Page2_Page"]
573
+ )
574
+ final_df["Page1_Text"] = final_df["Page1_Text"].str[:preview_length]
575
+ final_df["Page2_Text"] = final_df["Page2_Text"].str[:preview_length]
576
  return final_df
577
 
578
+
579
+ def map_metadata_subdocument(
580
+ subdocument_df: pd.DataFrame,
581
+ metadata_source_df: pd.DataFrame,
582
+ preview_length: int = 200,
583
+ ):
584
  """Helper to map metadata for subdocument results."""
585
+ metadata_df = metadata_source_df[["file", "page", "text"]]
586
+
587
+ subdocument_df = subdocument_df.merge(
588
+ metadata_df, left_on="Page1_Start_Index", right_index=True
589
+ ).rename(
590
+ columns={"file": "Page1_File", "page": "Page1_Start_Page", "text": "Page1_Text"}
591
+ )
592
+ subdocument_df = subdocument_df.merge(
593
+ metadata_df[["page"]], left_on="Page1_End_Index", right_index=True
594
+ ).rename(columns={"page": "Page1_End_Page"})
595
+ subdocument_df = subdocument_df.merge(
596
+ metadata_df, left_on="Page2_Start_Index", right_index=True
597
+ ).rename(
598
+ columns={"file": "Page2_File", "page": "Page2_Start_Page", "text": "Page2_Text"}
599
+ )
600
+ subdocument_df = subdocument_df.merge(
601
+ metadata_df[["page"]], left_on="Page2_End_Index", right_index=True
602
+ ).rename(columns={"page": "Page2_End_Page"})
603
+
604
+ cols = [
605
+ "Page1_File",
606
+ "Page1_Start_Page",
607
+ "Page1_End_Page",
608
+ "Page2_File",
609
+ "Page2_Start_Page",
610
+ "Page2_End_Page",
611
+ "Match_Length",
612
+ "Page1_Text",
613
+ "Page2_Text",
614
+ ]
615
+
616
  # Add Avg_Similarity if it exists (it won't for greedy match unless we add it)
617
+ if "Avg_Similarity" in subdocument_df.columns:
618
+ subdocument_df["Avg_Similarity"] = subdocument_df["Avg_Similarity"].round(3)
619
+ cols.insert(7, "Avg_Similarity")
620
 
621
  final_df = subdocument_df[cols]
622
+ final_df = final_df.sort_values(
623
+ ["Page1_File", "Page1_Start_Page", "Page2_File", "Page2_Start_Page"]
624
+ )
625
+ final_df["Page1_Text"] = final_df["Page1_Text"].str[:preview_length]
626
+ final_df["Page2_Text"] = final_df["Page2_Text"].str[:preview_length]
627
 
628
  return final_df
629
 
630
+
631
+ def save_results_and_redaction_lists(
632
+ final_df: pd.DataFrame, output_folder: str, combine_pages: bool = True
633
+ ) -> list:
634
  """
635
  Saves the main results DataFrame and generates per-file redaction lists.
636
  This function is extracted to be reusable.
 
652
  return []
653
 
654
  # 1. Save the main results DataFrame
655
+ similarity_file_output_path = output_folder_path / "page_similarity_results.csv"
656
  final_df.to_csv(similarity_file_output_path, index=False, encoding="utf-8-sig")
657
 
658
  output_paths.append(str(similarity_file_output_path))
659
+ # print(f"Main results saved to {similarity_file_output_path}")
660
 
661
  # 2. Save per-file redaction lists
662
  # Use 'Page2_File' as the source of duplicate content
663
+ if combine_pages is True:
664
+ grouping_col = "Page2_File"
665
  if grouping_col not in final_df.columns:
666
+ print(
667
+ "Warning: 'Page2_File' column not found. Cannot generate redaction lists."
668
+ )
669
  return output_paths
670
 
671
  for redact_file, group in final_df.groupby(grouping_col):
672
  output_file_name_stem = Path(redact_file).stem
673
+ output_file_path = (
674
+ output_folder_path / f"{output_file_name_stem}_pages_to_redact.csv"
675
+ )
676
+
677
  all_pages_to_redact = set()
678
+ is_subdocument_match = "Page2_Start_Page" in group.columns
679
 
680
  if is_subdocument_match:
681
  for _, row in group.iterrows():
682
+ pages_in_range = range(
683
+ int(row["Page2_Start_Page"]), int(row["Page2_End_Page"]) + 1
684
+ )
685
  all_pages_to_redact.update(pages_in_range)
686
  else:
687
+ pages = group["Page2_Page"].unique()
688
  all_pages_to_redact.update(pages)
689
+
690
  if all_pages_to_redact:
691
+ redaction_df = pd.DataFrame(
692
+ sorted(list(all_pages_to_redact)), columns=["Page_to_Redact"]
693
+ )
694
  redaction_df.to_csv(output_file_path, header=False, index=False)
695
 
696
  output_paths.append(str(output_file_path))
697
  print(f"Redaction list for {redact_file} saved to {output_file_path}")
698
+
699
  return output_paths
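
A minimal sketch of how the per-file redaction list is built from sub-document matches (the page ranges here are invented): each Page2 start/end range is expanded, de-duplicated and sorted.

import pandas as pd

group = pd.DataFrame({"Page2_Start_Page": [3, 10], "Page2_End_Page": [5, 11]})

all_pages_to_redact = set()
for _, row in group.iterrows():
    all_pages_to_redact.update(
        range(int(row["Page2_Start_Page"]), int(row["Page2_End_Page"]) + 1)
    )

redaction_df = pd.DataFrame(sorted(all_pages_to_redact), columns=["Page_to_Redact"])
print(redaction_df["Page_to_Redact"].tolist())  # [3, 4, 5, 10, 11]
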
700
 
701
+
702
  def _sequences_match(query_seq: List[str], ref_seq: List[str]) -> bool:
703
  """
704
  Helper function to compare two sequences of tokens with punctuation flexibility.
 
722
  # - Its last character must be in our punctuation set
723
  # - The token without its last character must match the query token
724
  if (
725
+ len(ref_token) > 1
726
+ and ref_token[-1] in PUNCTUATION_TO_STRIP
727
+ and ref_token[:-1] == query_token
728
  ):
729
  continue
730
 
 
734
  # If the loop completes, every token has matched.
735
  return True
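
A simplified restatement of the per-token rule applied above (example tokens are made up): a reference token matches if it equals the query token, or is the query token plus a single trailing punctuation mark.

PUNCTUATION_TO_STRIP = {".", ",", "?", "!", ":", ";"}

def tokens_match(query_token: str, ref_token: str) -> bool:
    if query_token == ref_token:
        return True
    return (
        len(ref_token) > 1
        and ref_token[-1] in PUNCTUATION_TO_STRIP
        and ref_token[:-1] == query_token
    )

assert tokens_match("smith", "smith,")
assert not tokens_match("smith", "smithy")
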
736
 
737
+
738
  def find_consecutive_sequence_matches(
739
+ df_filtered: pd.DataFrame, search_file_name: str, reference_file_name: str
 
 
740
  ) -> pd.DataFrame:
741
  """
742
  Finds all occurrences of a consecutive sequence of tokens from a search file
 
753
  A DataFrame with two columns ('Page1_Index', 'Page2_Index') mapping the
754
  consecutive match, or an empty DataFrame if no match is found.
755
  """
756
+ # print(f"Starting sequence search for '{search_file_name}' in '{reference_file_name}'...")
757
 
758
  # Step 1: Isolate the data for each file
759
+ search_df = df_filtered[df_filtered["file"] == search_file_name]
760
+ reference_df = df_filtered[df_filtered["file"] == reference_file_name]
761
 
762
  if search_df.empty or reference_df.empty:
763
  print("Error: One or both files not found or are empty.")
764
+ return pd.DataFrame(columns=["Page1_Index", "Page2_Index"])
765
 
766
  # Step 2: Convert the token data into lists for easy comparison.
767
  # We need both the text tokens and their original global indices.
768
+ query_tokens = search_df["text_clean"].tolist()
769
  query_indices = search_df.index.tolist()
770
+
771
+ reference_tokens = reference_df["text_clean"].tolist()
772
  reference_indices = reference_df.index.tolist()
773
 
774
  query_len = len(query_tokens)
 
783
 
784
  # Step 4: If the window matches the query with or without punctuation on end
785
  if _sequences_match(query_tokens, window):
786
+ # print(f"Found a consecutive match starting at reference index: {reference_indices[i]}")
787
+
788
  # Get the global indices for this entire matching block
789
  matching_reference_indices = reference_indices[i : i + query_len]
790
+
791
  # Create the mapping between query indices and the found reference indices
792
  for j in range(query_len):
793
  all_found_matches.append(
794
  (query_indices[j], matching_reference_indices[j], 1)
795
  )
796
+
797
  # If you only want the *first* match, you can uncomment the next line:
798
+ # break
799
 
800
  if not all_found_matches:
801
  print("No matches found")
802
  gr.Info("No matches found")
803
+ return pd.DataFrame(columns=["Page1_Index", "Page2_Index", "Similarity_Score"])
804
 
805
  # Step 5: Create the final DataFrame in the desired format
806
+ result_df = pd.DataFrame(
807
+ all_found_matches, columns=["Page1_Index", "Page2_Index", "Similarity_Score"]
808
+ )
809
  return result_df
810
 
811
+
812
  def identify_similar_text_sequences(
813
  df_combined: pd.DataFrame,
814
  similarity_threshold: float = 1,
 
817
  greedy_match: bool = True,
818
  combine_pages: bool = False,
819
  inter_file_only: bool = False,
820
+ do_text_clean: bool = True,
821
+ file1_name: str = "",
822
+ file2_name: str = "",
823
  output_folder: str = OUTPUT_FOLDER,
824
+ progress=Progress(track_tqdm=True),
825
  ) -> Tuple[pd.DataFrame, List[str], pd.DataFrame]:
826
  """
827
  Identifies similar pages. Uses a highly optimized path for inter_file_only=True.
 
829
  progress(0.1, desc="Processing and filtering text")
830
 
831
  if do_text_clean:
832
+ df = clean_and_stem_text_series(
833
+ df_combined, "text"
834
+ ) # Will produce the column 'text_clean'
835
  else:
836
  df = df_combined.copy()
837
+ df["text_clean"] = df[
838
+ "text"
839
+ ].str.lower() # .str.replace(r'[^\w\s]', '', regex=True)
840
 
841
+ df["word_count"] = df["text_clean"].str.split().str.len().fillna(0)
842
+ # df['word_count'] = pd.to_numeric(df['word_count'], errors='coerce').fillna(0).astype('int64')
843
 
844
  # ensure min_word_count is an int (e.g., from Gradio/text input)
845
  try:
 
848
  min_word_count = 0 # or raise/log, depending on your preference
849
 
850
  original_row_count = len(df)
851
+ df_filtered = df[df["word_count"] >= min_word_count].copy()
852
  df_filtered.reset_index(drop=True, inplace=True)
853
+
854
+ print(
855
+ f"Filtered out {original_row_count - len(df_filtered)} pages with fewer than {min_word_count} words."
856
+ )
857
  if len(df_filtered) < 2:
858
  return pd.DataFrame(), [], df_combined
859
 
 
 
860
  # Similarity calculated differently if comparing between files only (inter_file_only==True), or within the same file
861
  if inter_file_only:
862
 
863
  progress(0.2, desc="Finding direct text matches...")
864
+
865
+ # base_similarity_df = _debug_similarity_between_two_files(df_filtered, vectorizer, similarity_threshold, file1_name, file2_name)
866
+ base_similarity_df = find_consecutive_sequence_matches(
867
+ df_filtered, file1_name, file2_name
868
+ )
869
  if base_similarity_df.empty:
870
+ return pd.DataFrame(), [], df_combined
871
+
872
  else:
873
  # Use the original, simpler path for all-to-all comparisons (including intra-file).
874
  vectorizer = TfidfVectorizer()
875
  print("Standard Path: Calculating all-to-all similarity.")
876
  progress(0.2, desc="Vectorizing text...")
877
+ tfidf_matrix = vectorizer.fit_transform(df_filtered["text_clean"])
878
 
879
  progress(0.3, desc="Calculating similarity matrix...")
880
  similarity_matrix = cosine_similarity(tfidf_matrix, dense_output=False)
881
  coo_matrix = similarity_matrix.tocoo()
882
 
883
  similar_pages = [
884
+ (r, c, v)
885
+ for r, c, v in zip(coo_matrix.row, coo_matrix.col, coo_matrix.data)
886
  if r < c and v >= similarity_threshold
887
  ]
888
 
889
  if not similar_pages:
890
  return pd.DataFrame(), [], df_combined
891
+
892
+ base_similarity_df = pd.DataFrame(
893
+ similar_pages, columns=["Page1_Index", "Page2_Index", "Similarity_Score"]
894
+ )
895
 
896
  progress(0.7, desc="Aggregating results based on matching strategy")
897
 
898
  if greedy_match or min_consecutive_pages > 1:
899
+ # print("Finding all consecutive page matches of minimum length:", min_consecutive_pages)
900
+
901
  # Sort the dataframe to ensure consecutive pages are adjacent
902
+ similarity_df = (
903
+ base_similarity_df # .sort_values(['Page1_Index', 'Page2_Index']).copy()
904
+ )
905
 
906
  # A new sequence starts if the difference from the previous row is not (1, 1)
907
  # is_consecutive will be True if a row continues the sequence, False if it's a new one.
908
+ is_consecutive = (similarity_df["Page1_Index"].diff() == 1) & (
909
+ similarity_df["Page2_Index"].diff() == 1
910
+ )
911
 
912
  # Use cumsum() on the inverted boolean series to create a unique ID for each block.
913
  # Every time a 'False' appears (a new block starts), the sum increases.
 
918
 
919
  # Aggregate each group to get the start, end, and length of the match
920
  agg_results = grouped.agg(
921
+ Page1_Start_Index=("Page1_Index", "first"),
922
+ Page2_Start_Index=("Page2_Index", "first"),
923
+ Page1_End_Index=("Page1_Index", "last"),
924
+ Page2_End_Index=("Page2_Index", "last"),
925
+ Match_Length=("Page1_Index", "size"),
926
+ Avg_Similarity=("Similarity_Score", "mean"),
927
  ).reset_index(drop=True)
928
 
929
  # If greedy_match=True, we keep all matches. If min_consecutive_pages > 1, we filter.
930
  if greedy_match and min_consecutive_pages <= 1:
931
  subdocument_df = agg_results
932
  else:
933
+ # This handles the case for min_consecutive_pages > 1
934
+ subdocument_df = agg_results[
935
+ agg_results["Match_Length"] >= min_consecutive_pages
936
+ ].copy()
937
 
938
  if subdocument_df.empty:
939
  gr.Info("No matches found")
940
  return pd.DataFrame(), [], df_combined
941
+
942
  final_df = map_metadata_subdocument(subdocument_df, df_filtered)
943
  else:
944
+ print("Finding single page matches, not greedy (min_consecutive_pages=1)")
945
  # This part of your code would handle the non-sequential case
946
  final_df = map_metadata_single_page(base_similarity_df, df_filtered)
947
+ # subdocument_df = final_df # To align variable names for saving
948
 
949
  if final_df.empty:
950
  gr.Info("No matches found")
951
  return pd.DataFrame(), [], df_combined
952
 
953
  progress(0.9, desc="Saving output files")
954
+
955
+ output_paths = save_results_and_redaction_lists(
956
+ final_df, output_folder, combine_pages
957
+ )
958
 
959
  gr.Info(f"Found {final_df.shape[0]} match(es)")
960
  print(f"Found {final_df.shape[0]} match(es)")
961
 
962
  return final_df, output_paths, df_combined
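
For the aggregation step above, a minimal pandas sketch with invented index pairs shows how diff() and cumsum() turn runs of consecutive (Page1, Page2) pairs into sub-document blocks:

import pandas as pd

pairs = pd.DataFrame(
    {"Page1_Index": [3, 4, 5, 9], "Page2_Index": [10, 11, 12, 30],
     "Similarity_Score": [1.0, 0.95, 0.9, 1.0]}
)
# True while both indices advance by exactly 1, i.e. the run continues
is_consecutive = (pairs["Page1_Index"].diff() == 1) & (pairs["Page2_Index"].diff() == 1)
block_id = (~is_consecutive).cumsum()  # increments whenever a new run starts

blocks = pairs.groupby(block_id).agg(
    Page1_Start_Index=("Page1_Index", "first"),
    Page1_End_Index=("Page1_Index", "last"),
    Page2_Start_Index=("Page2_Index", "first"),
    Page2_End_Index=("Page2_Index", "last"),
    Match_Length=("Page1_Index", "size"),
    Avg_Similarity=("Similarity_Score", "mean"),
).reset_index(drop=True)
# Rows (3,10)-(5,12) collapse into one block of length 3; (9,30) stands alone.
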
963
+
964
+
965
+ def handle_selection_and_preview(
966
+ evt: gr.SelectData, results_df: pd.DataFrame, full_duplicate_data_by_file: dict
967
+ ):
968
  """
969
  This single function handles a user selecting a row. It:
970
  1. Determines the selected row index.
 
973
  """
974
  # If the user deselects, the event might be None.
975
  if not evt:
976
+ return None, None, None # Clear state and both preview panes
977
 
978
  # 1. Get the selected index
979
  selected_index = evt.index[0]
980
 
981
  # 2. Get the preview data
982
+ page1_data, page2_data = show_page_previews(
983
+ full_duplicate_data_by_file, results_df, evt
984
+ )
985
 
986
  # 3. Return all three outputs in the correct order
987
  return selected_index, page1_data, page2_data
988
 
989
+
990
+ def exclude_match(
991
+ results_df: pd.DataFrame, selected_index: int, output_folder=OUTPUT_FOLDER
992
+ ):
993
  """
994
  Removes a selected row from the results DataFrame, regenerates output files,
995
  and clears the text preview panes.
 
998
  gr.Warning("No match selected. Please click on a row in the table first.")
999
  # Return the original dataframe and update=False for the files
1000
  return results_df, gr.update(), None, None
1001
+
1002
  if results_df.empty:
1003
  gr.Warning("No duplicate page results found, nothing to exclude.")
1004
  return results_df, gr.update(), None, None
1005
 
1006
  # Drop the selected row
1007
  updated_df = results_df.drop(selected_index).reset_index(drop=True)
1008
+
1009
  # Recalculate all output files using the helper function
1010
  new_output_paths = save_results_and_redaction_lists(updated_df, output_folder)
1011
+
1012
  gr.Info(f"Match at row {selected_index} excluded. Output files have been updated.")
1013
+
1014
  # Return the updated dataframe, the new file list, and clear the preview panes
1015
  return updated_df, new_output_paths, None, None
1016
 
1017
+
1018
+ def run_duplicate_analysis(
1019
+ files: list[str],
1020
+ threshold: float,
1021
+ min_words: int,
1022
+ min_consecutive: int,
1023
+ greedy_match: bool,
1024
+ combine_pages: bool = True,
1025
+ preview_length: int = 500,
1026
+ output_folder: str = OUTPUT_FOLDER,
1027
+ progress=gr.Progress(track_tqdm=True),
1028
+ ):
1029
  """
1030
  Main wrapper function to orchestrate the duplicate page analysis process.
1031
  It handles file loading, text combination, similarity identification,
 
1043
  progress (gr.Progress, optional): A Gradio progress tracker object to display progress in the UI.
1044
  """
1045
 
1046
+ if not files:
1047
+ raise Warning("Please upload files to analyse.")
1048
 
1049
+ if isinstance(files, str):
1050
+ files = [files]
1051
 
1052
  if len(files) > MAX_SIMULTANEOUS_FILES:
1053
  out_message = f"Number of files to deduplicate is greater than {MAX_SIMULTANEOUS_FILES}. Please submit a smaller number of files."
 
1057
  start_time = time.time()
1058
 
1059
  task_textbox = "deduplicate"
1060
+
1061
  progress(0, desc="Combining input files...")
1062
+ df_combined, _, full_out_ocr_df = combine_ocr_output_text(
1063
+ files, combine_pages=combine_pages
1064
+ )
1065
 
1066
  if df_combined.empty:
1067
  raise Warning("No data found in the uploaded files.")
 
1075
  greedy_match=greedy_match,
1076
  combine_pages=combine_pages,
1077
  output_folder=output_folder,
1078
+ progress=progress,
1079
  )
1080
 
1081
  # Clip text to the first preview_length characters for the preview
1082
+ full_df["text"] = full_df["text"].str[:preview_length]
1083
  # Preprocess full_data (without preview text) for fast access (run once)
1084
  full_data_by_file = {
1085
+ file: df.sort_values("page").set_index("page")
1086
+ for file, df in full_df.drop(["text_clean"], axis=1).groupby("file")
1087
  }
1088
 
1089
  if results_df.empty:
1090
+ gr.Info("No duplicate pages found, no results returned.")
1091
 
1092
  end_time = time.time()
1093
  processing_time = round(end_time - start_time, 2)
1094
+
1095
  return results_df, output_paths, full_data_by_file, processing_time, task_textbox
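
The per-file preview index built above can be pictured with toy data: one page-indexed frame per file, so later row selections can slice page ranges with .loc directly.

import pandas as pd

full_df = pd.DataFrame(
    {"file": ["a.csv", "a.csv", "b.csv"], "page": [2, 1, 1],
     "text": ["second page", "first page", "other file"]}
)
full_data_by_file = {
    file: df.sort_values("page").set_index("page")
    for file, df in full_df.groupby("file")
}
print(full_data_by_file["a.csv"].loc[1:2, "text"].tolist())
# ['first page', 'second page']
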
1096
 
1097
+
1098
+ def show_page_previews(
1099
+ full_data_by_file: dict,
1100
+ results_df: pd.DataFrame,
1101
+ evt: gr.SelectData,
1102
+ preview_length: int = 500,
1103
+ ):
1104
  """
1105
  Optimized version using pre-partitioned and indexed full_data.
1106
  Triggered when a user selects a row in the results DataFrame.
 
1110
 
1111
  selected_row = results_df.iloc[evt.index[0], :]
1112
 
1113
+ is_subdocument_match = "Page1_Start_Page" in selected_row
1114
 
1115
  if is_subdocument_match:
1116
+ file1, start1, end1 = (
1117
+ selected_row["Page1_File"],
1118
+ selected_row["Page1_Start_Page"],
1119
+ selected_row["Page1_End_Page"],
1120
+ )
1121
+ file2, start2, end2 = (
1122
+ selected_row["Page2_File"],
1123
+ selected_row["Page2_Start_Page"],
1124
+ selected_row["Page2_End_Page"],
1125
+ )
1126
+
1127
+ page1_data = full_data_by_file[file1].loc[start1:end1, ["text"]].reset_index()
1128
+ page2_data = full_data_by_file[file2].loc[start2:end2, ["text"]].reset_index()
1129
 
1130
  else:
1131
+ file1, page1 = selected_row["Page1_File"], selected_row["Page1_Page"]
1132
+ file2, page2 = selected_row["Page2_File"], selected_row["Page2_Page"]
1133
+
1134
+ page1_data = full_data_by_file[file1].loc[[page1], ["text"]].reset_index()
1135
+ page2_data = full_data_by_file[file2].loc[[page2], ["text"]].reset_index()
1136
 
1137
+ page1_data["text"] = page1_data["text"].str[:preview_length]
1138
+ page2_data["text"] = page2_data["text"].str[:preview_length]
1139
 
1140
+ return page1_data[["page", "text"]], page2_data[["page", "text"]]
 
1141
 
 
1142
 
1143
  def get_page_image_info(page_num: int, page_sizes: List[Dict]) -> Optional[Dict]:
1144
  """
 
1146
  """
1147
  return next((size for size in page_sizes if size["page"] == page_num), None)
1148
 
1149
+
1150
  def add_new_annotations_to_existing_page_annotations(
1151
+ all_annotations: List[Dict], image_path: str, new_annotation_boxes: List[Dict]
 
 
1152
  ) -> Tuple[List[Dict], Dict]:
1153
  """
1154
  Adds a list of new annotation boxes to the annotations for a specific page.
 
1168
  """
1169
  # Find the annotation group for the current page/image
1170
  current_page_group = next(
1171
+ (
1172
+ annot_group
1173
+ for annot_group in all_annotations
1174
+ if annot_group["image"] == image_path
1175
+ ),
1176
+ None,
1177
  )
1178
 
1179
  if current_page_group:
 
1181
  current_page_group["boxes"].extend(new_annotation_boxes)
1182
  else:
1183
  # This is the first set of annotations for this page, create a new group
1184
+ new_group = {"image": image_path, "boxes": new_annotation_boxes}
 
 
 
1185
  all_annotations.append(new_group)
1186
 
1187
  # This object represents all annotations that were just added for this page
1188
+ newly_added_annotation_group = {"image": image_path, "boxes": new_annotation_boxes}
 
 
 
1189
 
1190
  return all_annotations, newly_added_annotation_group
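
The whole-page redaction logic below relies on image paths ending in "_<zero-based page index>.png"; a small sketch of that conversion (the file name is invented):

import re
from typing import Optional

def page_from_image_path(image_path: str) -> Optional[int]:
    match = re.search(r"_(\d+)\.png$", image_path)
    return int(match.group(1)) + 1 if match else None

print(page_from_image_path("example_doc_0.png"))  # 1
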
1191
 
1192
+
1193
+ def apply_whole_page_redactions_from_list(
1194
+ duplicate_page_numbers_df: pd.DataFrame,
1195
+ doc_file_name_with_extension_textbox: str,
1196
+ review_file_state: pd.DataFrame,
1197
+ duplicate_output_paths: list[str],
1198
+ pymupdf_doc: object,
1199
+ page_sizes: list[dict],
1200
+ all_existing_annotations: list[dict],
1201
+ combine_pages: bool = True,
1202
+ new_annotations_with_bounding_boxes: List[dict] = list(),
1203
+ ):
1204
+ """
1205
  This function applies redactions to whole pages based on a provided list of duplicate page numbers. It supports two modes of operation: combining pages and not combining pages. When combining pages is enabled, it attempts to identify duplicate pages across different files and applies redactions accordingly. If combining pages is disabled, it relies on new annotations with bounding boxes to determine which pages to redact. The function utilises a PyMuPDF document object to manipulate the PDF file, and it also considers the sizes of pages to ensure accurate redaction application.
1206
 
1207
  Args:
 
1214
  all_existing_annotations (list[dict]): A list of all existing annotations in the document.
1215
  combine_pages (bool, optional): A flag indicating whether to combine pages for redaction. Defaults to True.
1216
  new_annotations_with_bounding_boxes (List[dict], optional): A list of new annotations with bounding boxes. Defaults to an empty list.
1217
+ """
1218
  if all_existing_annotations is None:
1219
  all_existing_annotations = []
1220
 
 
1228
  print(f"Warning: {message}")
1229
  raise Warning(message)
1230
 
1231
+ list_whole_pages_to_redact = []
1232
 
1233
+ if combine_pages is True:
1234
  # Get list of pages to redact from either dataframe or file
1235
  if not duplicate_page_numbers_df.empty:
1236
  list_whole_pages_to_redact = duplicate_page_numbers_df.iloc[:, 0].tolist()
1237
  elif duplicate_output_paths:
1238
+ expected_duplicate_pages_to_redact_name = (
1239
+ f"{doc_file_name_with_extension_textbox}"
1240
+ )
1241
  whole_pages_list = pd.DataFrame() # Initialize empty DataFrame
1242
+
1243
  for output_file in duplicate_output_paths:
1244
  # Note: output_file.name might not be available if output_file is just a string path
1245
  # If it's a Path object or similar, .name is fine. Otherwise, parse from string.
1246
+ file_name_from_path = (
1247
+ output_file.split("/")[-1]
1248
+ if isinstance(output_file, str)
1249
+ else output_file.name
1250
+ )
1251
  if expected_duplicate_pages_to_redact_name in file_name_from_path:
1252
+ whole_pages_list = pd.read_csv(
1253
+ output_file, header=None
1254
+ ) # Use output_file directly if it's a path
1255
+ break
1256
  else:
1257
  message = "No relevant list of whole pages to redact found."
1258
  print(message)
1259
  raise Warning(message)
1260
+
1261
  if not whole_pages_list.empty:
1262
  list_whole_pages_to_redact = whole_pages_list.iloc[:, 0].tolist()
1263
+
1264
  list_whole_pages_to_redact = list(set(list_whole_pages_to_redact))
1265
 
1266
  else:
 
1268
  message = "Can't find any new annotations to add"
1269
  print(message)
1270
  raise Warning(message)
1271
+
1272
  list_whole_pages_to_redact = []
1273
  for annotation in new_annotations_with_bounding_boxes:
1274
+ match = re.search(r"_(\d+)\.png$", annotation["image"])
1275
  if match:
1276
  page = int(match.group(1)) + 1
1277
  list_whole_pages_to_redact.append(page)
1278
  else:
1279
+ print(
1280
+ f"Warning: Could not extract page number from {annotation['image']}"
1281
+ )
1282
 
1283
  list_whole_pages_to_redact = list(set(list_whole_pages_to_redact))
1284
 
 
1285
  new_annotations = []
1286
  # Process each page for redaction
1287
  for page in list_whole_pages_to_redact:
 
1298
  continue
1299
 
1300
  image_path = page_info["image_path"]
1301
+ page_annotation_group = next(
1302
+ (g for g in all_annotations if g["image"] == image_path), None
1303
+ )
1304
+ if page_annotation_group and any(
1305
+ box["label"] == "Whole page" for box in page_annotation_group["boxes"]
1306
+ ):
1307
+ print(
1308
+ f"Whole page redaction for page {page_num} already exists, skipping."
1309
+ )
1310
  continue
1311
+
1312
  # --- Create a LIST of boxes to add.---
1313
  boxes_to_add = []
1314
+
1315
  pymupdf_page = pymupdf_doc[page_index]
1316
 
1317
+ if combine_pages is True:
1318
  whole_page_box = redact_whole_pymupdf_page(
1319
  rect_height=page_info["cropbox_height"],
1320
  rect_width=page_info["cropbox_width"],
1321
+ page=pymupdf_page,
1322
+ border=0.005,
1323
+ redact_pdf=False,
1324
  )
1325
  boxes_to_add.append(whole_page_box)
1326
  else:
1327
  # Find the specific annotation group that matches the current page's image path
1328
  relevant_box_group = next(
1329
+ (
1330
+ group
1331
+ for group in new_annotations_with_bounding_boxes
1332
+ if group.get("image") == image_path
1333
+ ),
1334
+ None, # Default to None if no match is found
1335
  )
1336
+
1337
  # Check if we found a matching group of boxes for this page
1338
  if relevant_box_group:
1339
+ boxes_to_add.extend(relevant_box_group["boxes"])
1340
  else:
1341
  # This case would be unexpected, but it's good to handle.
1342
  # It means a page was in list_whole_pages_to_redact but had no
1343
  # corresponding boxes generated in new_annotations_with_bounding_boxes.
1344
+ print(
1345
+ f"Warning: No new annotation boxes found for page {page_num} ({image_path})."
1346
+ )
1347
+
1348
  # === Use the modified helper function to add a LIST of boxes ===
1349
+ all_annotations, new_annotations_for_page = (
1350
+ add_new_annotations_to_existing_page_annotations(
1351
+ all_annotations=all_annotations,
1352
+ image_path=image_path,
1353
+ new_annotation_boxes=boxes_to_add, # Pass the list here
1354
+ )
1355
  )
1356
 
1357
+ new_annotations_for_page = fill_missing_box_ids_each_box(
1358
+ new_annotations_for_page
1359
+ )
1360
  new_annotations.append(new_annotations_for_page)
1361
 
1362
  except Exception as e:
 
1371
  gr.Info(message)
1372
  return review_file_state, all_annotations
1373
 
1374
+ expected_cols = [
1375
+ "image",
1376
+ "page",
1377
+ "label",
1378
+ "color",
1379
+ "xmin",
1380
+ "ymin",
1381
+ "xmax",
1382
+ "ymax",
1383
+ "text",
1384
+ "id",
1385
+ ]
1386
  for col in expected_cols:
1387
+ if col not in review_file_state.columns:
1388
+ review_file_state[col] = pd.NA
1389
+ if col not in whole_page_review_file.columns:
1390
+ whole_page_review_file[col] = pd.NA
1391
+
1392
+ review_file_out = pd.concat(
1393
+ [review_file_state, whole_page_review_file], ignore_index=True
1394
+ )
1395
+ review_file_out = review_file_out.sort_values(
1396
+ by=["page", "ymin", "xmin"]
1397
+ ).reset_index(drop=True)
1398
+ review_file_out = review_file_out.drop_duplicates(
1399
+ subset=["page", "label", "text", "id"], keep="first"
1400
+ )
1401
 
 
 
 
 
1402
  out_message = "Successfully created duplicate text redactions."
1403
  print(out_message)
1404
  gr.Info(out_message)
1405
 
1406
  return review_file_out, all_annotations
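A rough usage sketch for the combine_pages=True path; the PDF name, page_sizes entries and empty review state are placeholders, the import path is an assumption, and page_sizes only includes the keys actually read above:

import pandas as pd
import fitz  # PyMuPDF

from tools.find_duplicate_pages import apply_whole_page_redactions_from_list  # path assumed

duplicate_pages = pd.DataFrame({"Page": [2, 5]})  # first column = pages to redact
pymupdf_doc = fitz.open("example.pdf")  # hypothetical input document
page_sizes = [
    {"page": 2, "image_path": "example_2.png", "cropbox_width": 595, "cropbox_height": 842},
    {"page": 5, "image_path": "example_5.png", "cropbox_width": 595, "cropbox_height": 842},
]

review_file_out, all_annotations = apply_whole_page_redactions_from_list(
    duplicate_page_numbers_df=duplicate_pages,
    doc_file_name_with_extension_textbox="example.pdf",
    review_file_state=pd.DataFrame(),
    duplicate_output_paths=[],
    pymupdf_doc=pymupdf_doc,
    page_sizes=page_sizes,
    all_existing_annotations=[],
    combine_pages=True,
)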
1407
 
1408
+
1409
  def _parse_page_line_id(combined_id: int) -> Tuple[int, int]:
1410
  """Parses a combined ID using modular arithmetic."""
1411
  if int(combined_id) < ID_MULTIPLIER:
1412
  # Handle cases where page is 0 (or just an edge case)
1413
  return 0, combined_id
1414
+
1415
  page = combined_id // ID_MULTIPLIER
1416
  line = combined_id % ID_MULTIPLIER
1417
  return page, line
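For example (ID_MULTIPLIER is defined elsewhere in this module; the value below is assumed purely for illustration), the combined ID packs the page into the high digits and the line into the low digits:

ID_MULTIPLIER = 1000  # assumed value, for illustration only

combined_id = 42 * ID_MULTIPLIER + 7  # page 42, line 7 -> 42007
page, line = combined_id // ID_MULTIPLIER, combined_id % ID_MULTIPLIER
assert (page, line) == (42, 7)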
1418
 
1419
+
1420
  def create_annotation_objects_from_duplicates(
1421
+ duplicates_df: pd.DataFrame,
1422
  ocr_results_df: pd.DataFrame,
1423
  page_sizes: List[Dict],
1424
+ combine_pages: bool = False,
1425
+ ) -> List[Dict]:
1426
  """
1427
  Creates structured annotation objects from duplicate line ranges, mapping
1428
  page numbers to image paths.
 
1441
  if duplicates_df.empty:
1442
  raise Warning("No duplicates found")
1443
  if ocr_results_df.empty:
1444
+ raise Warning(
1445
+ "No OCR results found for file under review. Please upload relevant OCR_output file and original PDF document on the review tab."
1446
+ )
1447
 
1448
+ if combine_pages is False:
1449
+ page_to_image_map = {item["page"]: item["image_path"] for item in page_sizes}
1450
 
1451
  # Prepare OCR Data: Add a line number column if it doesn't exist
1452
+ if "line_number_by_page" not in ocr_results_df.columns:
1453
+ ocr_results_df = ocr_results_df.sort_values(
1454
+ by=["page", "top", "left"]
1455
+ ).reset_index(drop=True)
1456
+ ocr_results_df["line_number_by_page"] = (
1457
+ ocr_results_df.groupby("page").cumcount() + 1
1458
+ )
1459
+
1460
  annotations_by_page = defaultdict(list)
1461
 
1462
  # Iterate through each duplicate range (this logic is unchanged)
1463
  for _, row in duplicates_df.iterrows():
1464
+ start_page, start_line = _parse_page_line_id(row["Page2_Start_Page"])
1465
+ end_page, end_line = _parse_page_line_id(row["Page2_End_Page"])
1466
+
1467
  # Select OCR Lines based on the range (this logic is unchanged)
1468
  if start_page == end_page:
1469
+ condition = (ocr_results_df["page"] == start_page) & (
1470
+ ocr_results_df["line_number_by_page"].between(start_line, end_line)
 
1471
  )
1472
  else:
1473
+ cond_start = (ocr_results_df["page"] == start_page) & (
1474
+ ocr_results_df["line_number_by_page"] >= start_line
1475
+ )
1476
+ cond_middle = ocr_results_df["page"].between(
1477
+ start_page + 1, end_page - 1
1478
+ )
1479
+ cond_end = (ocr_results_df["page"] == end_page) & (
1480
+ ocr_results_df["line_number_by_page"] <= end_line
1481
+ )
1482
  condition = cond_start | cond_middle | cond_end
1483
 
1484
  lines_to_annotate = ocr_results_df[condition]
 
1487
  for _, line_row in lines_to_annotate.iterrows():
1488
  box = {
1489
  "label": "Duplicate text",
1490
+ "color": (0, 0, 0),
1491
+ "xmin": line_row["left"],
1492
+ "ymin": line_row["top"],
1493
+ "xmax": line_row["left"] + line_row["width"],
1494
+ "ymax": line_row["top"] + line_row["height"],
1495
+ "text": line_row["text"],
1496
+ "id": "", # to be filled in after
1497
  }
1498
+ page_number = line_row["page"]
1499
+
1500
  annotations_by_page[page_number].append(box)
1501
+
1502
  # --- Format the final output list using the page-to-image map ---
1503
  final_output = []
1504
  # Sort by page number for a predictable order
1505
  for page_num, boxes in sorted(annotations_by_page.items()):
1506
  # Look up the image path using the page number
1507
  image_path = page_to_image_map.get(page_num)
1508
+
1509
  if image_path:
1510
+ page_boxes = {"image": image_path, "boxes": boxes}
 
 
 
1511
 
1512
  # Fill in missing IDs for the new data entries
1513
  page_boxes = fill_missing_box_ids_each_box(page_boxes)
 
1516
  final_output.append(page_boxes)
1517
  else:
1518
  # Handle cases where a page might not have a corresponding image path
1519
+ print(
1520
+ f"Warning: Page {page_num} found in OCR data but has no corresponding "
1521
+ f"entry in the 'page_sizes' object. This page's annotations will be skipped."
1522
+ )
1523
+
1524
+ return final_output
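The returned final_output holds one entry per page that contains duplicated lines; all values below are invented and only show the shape produced by the loop above:

final_output = [
    {
        "image": "output/doc_1.png",
        "boxes": [
            {
                "label": "Duplicate text",
                "color": (0, 0, 0),
                "xmin": 0.12, "ymin": 0.30, "xmax": 0.88, "ymax": 0.33,
                "text": "A line flagged as duplicated",
                "id": "abc123",  # populated by fill_missing_box_ids_each_box
            }
        ],
    }
]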
tools/find_duplicate_tabular.py CHANGED
@@ -1,22 +1,36 @@
1
- import pandas as pd
2
  import os
3
  import re
4
  import time
5
- from sklearn.feature_extraction.text import TfidfVectorizer
6
- from sklearn.metrics.pairwise import cosine_similarity
7
- from typing import List, Tuple, Dict
8
  import gradio as gr
 
9
  from gradio import Progress
10
- from pathlib import Path
11
- from tools.helper_functions import OUTPUT_FOLDER, read_file
 
 
 
 
 
 
 
12
  from tools.data_anonymise import initial_clean
 
13
  from tools.load_spacy_model_custom_recognisers import nlp
14
- from tools.config import DO_INITIAL_TABULAR_DATA_CLEAN, REMOVE_DUPLICATE_ROWS, MAX_SIMULTANEOUS_FILES, MAX_TABLE_ROWS
15
 
16
- if REMOVE_DUPLICATE_ROWS == "True": REMOVE_DUPLICATE_ROWS = True
17
- else: REMOVE_DUPLICATE_ROWS = False
 
 
 
18
 
19
- def clean_and_stem_text_series(df: pd.DataFrame, column: str, do_initial_clean_dup: bool = DO_INITIAL_TABULAR_DATA_CLEAN):
 
 
 
 
20
  """
21
  Clean and stem text columns in a data frame for tabular data
22
  """
@@ -25,31 +39,34 @@ def clean_and_stem_text_series(df: pd.DataFrame, column: str, do_initial_clean_d
25
  def _apply_lemmatization(text):
26
  doc = nlp(text)
27
  # Keep only alphabetic tokens and remove stopwords
28
- lemmatized_words = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]
29
- return ' '.join(lemmatized_words)
30
-
 
 
31
  if do_initial_clean_dup:
32
- df['text_clean'] = initial_clean(df[column])
 
 
 
 
 
33
 
34
- df['text_clean'] = df['text_clean'].apply(_apply_lemmatization)
35
- df['text_clean'] = df[column].str.lower()#.str.replace(r'[^\w\s]', '', regex=True)
36
-
37
  return df
38
 
 
39
  def convert_tabular_data_to_analysis_format(
40
- df: pd.DataFrame,
41
- file_name: str,
42
- text_columns: List[str] = None
43
  ) -> List[Tuple[str, pd.DataFrame]]:
44
  """
45
  Convert tabular data (CSV/XLSX) to the format needed for duplicate analysis.
46
-
47
  Args:
48
  df (pd.DataFrame): The input DataFrame
49
  file_name (str): Name of the file
50
- text_columns (List[str], optional): Columns to analyze for duplicates.
51
  If None, uses all string columns.
52
-
53
  Returns:
54
  List[Tuple[str, pd.DataFrame]]: List containing (file_name, processed_df) tuple
55
  """
@@ -60,34 +77,39 @@ def convert_tabular_data_to_analysis_format(
60
  # text_columns = df.select_dtypes(include=['object', 'string']).columns.tolist()
61
 
62
  text_columns = [col for col in text_columns if col in df.columns]
63
-
64
  if not text_columns:
65
  print(f"No text columns found in {file_name}")
66
  return list()
67
-
68
  # Create a copy to avoid modifying original
69
  df_copy = df.copy()
70
-
71
  # Create a combined text column from all text columns
72
- df_copy['combined_text'] = df_copy[text_columns].fillna('').astype(str).agg(' '.join, axis=1)
73
-
 
 
74
  # Add row identifier
75
- df_copy['row_id'] = df_copy.index
76
-
77
  # Create the format expected by the duplicate detection system
78
  # Using 'row_number' as row number and 'text' as the combined text
79
- processed_df = pd.DataFrame({
80
- 'row_number': df_copy['row_id'],
81
- 'text': df_copy['combined_text'],
82
- 'file': file_name
83
- })
84
-
 
 
85
  # Add original row data for reference
86
  for col in text_columns:
87
- processed_df[f'original_{col}'] = df_copy[col]
88
-
89
  return [(file_name, processed_df)]
90
 
 
91
  def find_duplicate_cells_in_tabular_data(
92
  input_files: List[str],
93
  similarity_threshold: float = 0.95,
@@ -97,11 +119,11 @@ def find_duplicate_cells_in_tabular_data(
97
  do_initial_clean_dup: bool = DO_INITIAL_TABULAR_DATA_CLEAN,
98
  remove_duplicate_rows: bool = REMOVE_DUPLICATE_ROWS,
99
  in_excel_tabular_sheets: str = "",
100
- progress: Progress = Progress(track_tqdm=True)
101
  ) -> Tuple[pd.DataFrame, List[str], Dict[str, pd.DataFrame]]:
102
  """
103
  Find duplicate cells/text in tabular data files (CSV, XLSX, Parquet).
104
-
105
  Args:
106
  input_files (List[str]): List of file paths to analyze
107
  similarity_threshold (float): Minimum similarity score to consider duplicates
@@ -110,27 +132,27 @@ def find_duplicate_cells_in_tabular_data(
110
  output_folder (str, optional): Output folder for results
111
  do_initial_clean_dup (bool, optional): Whether to do initial clean of text
112
  progress (Progress): Progress tracking object
113
-
114
  Returns:
115
  Tuple containing:
116
  - results_df: DataFrame with duplicate matches
117
  - output_paths: List of output file paths
118
  - full_data_by_file: Dictionary of processed data by file
119
  """
120
-
121
  if not input_files:
122
  raise gr.Error("Please upload files to analyze.")
123
-
124
  progress(0.1, desc="Loading and processing files...")
125
-
126
  all_data_to_process = list()
127
  full_data_by_file = dict()
128
- file_paths = list()
129
-
130
  # Process each file
131
  for file_path in input_files:
132
  try:
133
- if file_path.endswith('.xlsx') or file_path.endswith('.xls'):
134
  temp_df = pd.DataFrame()
135
 
136
  # Try finding each sheet in the given list until a match is found
@@ -147,12 +169,12 @@ def find_duplicate_cells_in_tabular_data(
147
 
148
  file_name = os.path.basename(file_path) + "_" + sheet_name
149
  file_paths.append(file_path)
150
-
151
  # Convert to analysis format
152
  processed_data = convert_tabular_data_to_analysis_format(
153
  temp_df, file_name, text_columns
154
  )
155
-
156
  if processed_data:
157
  all_data_to_process.extend(processed_data)
158
  full_data_by_file[file_name] = processed_data[0][1]
@@ -165,99 +187,129 @@ def find_duplicate_cells_in_tabular_data(
165
  out_message = f"Number of rows in {file_path} is greater than {MAX_TABLE_ROWS}. Please submit a smaller file."
166
  print(out_message)
167
  raise Exception(out_message)
168
-
169
  file_name = os.path.basename(file_path)
170
  file_paths.append(file_path)
171
-
172
  # Convert to analysis format
173
  processed_data = convert_tabular_data_to_analysis_format(
174
  temp_df, file_name, text_columns
175
  )
176
-
177
  if processed_data:
178
  all_data_to_process.extend(processed_data)
179
  full_data_by_file[file_name] = processed_data[0][1]
180
-
181
  except Exception as e:
182
  print(f"Error processing {file_path}: {e}")
183
  continue
184
-
185
  if not all_data_to_process:
186
  raise gr.Error("No valid data found in uploaded files.")
187
-
188
  progress(0.2, desc="Combining data...")
189
-
190
  # Combine all data
191
- combined_df = pd.concat([data[1] for data in all_data_to_process], ignore_index=True)
 
 
 
 
192
 
193
- combined_df = combined_df.drop_duplicates(subset=['row_number', 'file'])
194
-
195
  progress(0.3, desc="Cleaning and preparing text...")
196
-
197
  # Clean and prepare text
198
- combined_df = clean_and_stem_text_series(combined_df, 'text', do_initial_clean_dup=do_initial_clean_dup)
199
-
 
 
200
  # Filter by minimum word count
201
- combined_df['word_count'] = combined_df['text_clean'].str.split().str.len().fillna(0)
202
- combined_df = combined_df[combined_df['word_count'] >= min_word_count].copy()
203
-
 
 
204
  if len(combined_df) < 2:
205
  return pd.DataFrame(), [], full_data_by_file
206
-
207
  progress(0.4, desc="Calculating similarities...")
208
-
209
  # Calculate similarities
210
  vectorizer = TfidfVectorizer()
211
- tfidf_matrix = vectorizer.fit_transform(combined_df['text_clean'])
212
  similarity_matrix = cosine_similarity(tfidf_matrix, dense_output=False)
213
-
214
  # Find similar pairs
215
  coo_matrix = similarity_matrix.tocoo()
216
  similar_pairs = [
217
- (r, c, v) for r, c, v in zip(coo_matrix.row, coo_matrix.col, coo_matrix.data)
 
218
  if r < c and v >= similarity_threshold
219
  ]
220
-
221
  if not similar_pairs:
222
  gr.Info("No duplicate cells found.")
223
  return pd.DataFrame(), [], full_data_by_file
224
-
225
  progress(0.7, desc="Processing results...")
226
-
227
  # Create results DataFrame
228
  results_data = []
229
  for row1, row2, similarity in similar_pairs:
230
  row1_data = combined_df.iloc[row1]
231
  row2_data = combined_df.iloc[row2]
232
-
233
- results_data.append({
234
- 'File1': row1_data['file'],
235
- 'Row1': int(row1_data['row_number']),
236
- 'File2': row2_data['file'],
237
- 'Row2': int(row2_data['row_number']),
238
- 'Similarity_Score': round(similarity, 3),
239
- 'Text1': row1_data['text'][:200] + '...' if len(row1_data['text']) > 200 else row1_data['text'],
240
- 'Text2': row2_data['text'][:200] + '...' if len(row2_data['text']) > 200 else row2_data['text'],
241
- 'Original_Index1': row1,
242
- 'Original_Index2': row2
243
- })
244
-
 
 
 
 
 
 
 
 
 
 
245
  results_df = pd.DataFrame(results_data)
246
- results_df = results_df.sort_values(['File1', 'Row1', 'File2', 'Row2'])
247
-
248
  progress(0.9, desc="Saving results...")
249
-
250
  # Save results
251
- output_paths = save_tabular_duplicate_results(results_df, output_folder, file_paths, remove_duplicate_rows=remove_duplicate_rows, in_excel_tabular_sheets=in_excel_tabular_sheets)
252
-
 
 
 
 
 
 
253
  gr.Info(f"Found {len(results_df)} duplicate cell matches")
254
-
255
  return results_df, output_paths, full_data_by_file
256
 
257
- def save_tabular_duplicate_results(results_df: pd.DataFrame, output_folder: str, file_paths: List[str], remove_duplicate_rows: bool = REMOVE_DUPLICATE_ROWS, in_excel_tabular_sheets: List[str] = []) -> List[str]:
 
 
 
 
 
 
 
258
  """
259
  Save tabular duplicate detection results to files.
260
-
261
  Args:
262
  results_df (pd.DataFrame): Results DataFrame
263
  output_folder (str): Output folder path
@@ -270,78 +322,87 @@ def save_tabular_duplicate_results(results_df: pd.DataFrame, output_folder: str,
270
  output_paths = list()
271
  output_folder_path = Path(output_folder)
272
  output_folder_path.mkdir(exist_ok=True)
273
-
274
  if results_df.empty:
275
  print("No duplicate matches to save.")
276
  return list()
277
-
278
  # Save main results
279
- results_file = output_folder_path / 'tabular_duplicate_results.csv'
280
  results_df.to_csv(results_file, index=False, encoding="utf-8-sig")
281
  output_paths.append(str(results_file))
282
-
283
  # Group results by original file to handle Excel files properly
284
- excel_files_processed = dict() # Track which Excel files have been processed
285
-
286
  # Save per-file duplicate lists
287
- for file_name, group in results_df.groupby('File2'):
288
  # Check for matches with original file names
289
  for original_file in file_paths:
290
  original_file_name = os.path.basename(original_file)
291
 
292
  if original_file_name in file_name:
293
  original_file_extension = os.path.splitext(original_file)[-1]
294
- if original_file_extension in ['.xlsx', '.xls']:
295
-
296
  # Split the string using a regex to handle both .xlsx_ and .xls_ delimiters
297
  # The regex r'\.xlsx_|\.xls_' correctly matches either ".xlsx_" or ".xls_" as a delimiter.
298
- parts = re.split(r'\.xlsx_|\.xls_', os.path.basename(file_name))
299
  # The sheet name is the last part after splitting
300
  file_sheet_name = parts[-1]
301
 
302
  file_path = original_file
303
-
304
  # Initialize Excel file tracking if not already done
305
  if file_path not in excel_files_processed:
306
  excel_files_processed[file_path] = {
307
- 'sheets_data': dict(),
308
- 'all_sheets': list(),
309
- 'processed_sheets': set()
310
  }
311
-
312
  # Read the original Excel file to get all sheet names
313
- if not excel_files_processed[file_path]['all_sheets']:
314
  try:
315
  excel_file = pd.ExcelFile(file_path)
316
- excel_files_processed[file_path]['all_sheets'] = excel_file.sheet_names
 
 
317
  except Exception as e:
318
  print(f"Error reading Excel file {file_path}: {e}")
319
  continue
320
-
321
  # Read the current sheet
322
  df = read_file(file_path, excel_sheet_name=file_sheet_name)
323
-
324
  # Create duplicate rows file for this sheet
325
  file_stem = Path(file_name).stem
326
- duplicate_rows_file = output_folder_path / f"{file_stem}_{file_sheet_name}_duplicate_rows.csv"
327
-
 
 
 
328
  # Get unique row numbers to remove
329
- rows_to_remove = sorted(group['Row2'].unique())
330
- duplicate_df = pd.DataFrame({'Row_to_Remove': rows_to_remove})
331
  duplicate_df.to_csv(duplicate_rows_file, index=False)
332
  output_paths.append(str(duplicate_rows_file))
333
-
334
  # Process the sheet data
335
  df_cleaned = df.copy()
336
  df_cleaned["duplicated"] = False
337
  df_cleaned.loc[rows_to_remove, "duplicated"] = True
338
  if remove_duplicate_rows:
339
  df_cleaned = df_cleaned.drop(index=rows_to_remove)
340
-
341
  # Store the processed sheet data
342
- excel_files_processed[file_path]['sheets_data'][file_sheet_name] = df_cleaned
343
- excel_files_processed[file_path]['processed_sheets'].add(file_sheet_name)
344
-
 
 
 
 
345
  else:
346
  file_sheet_name = ""
347
  file_path = original_file
@@ -350,11 +411,13 @@ def save_tabular_duplicate_results(results_df: pd.DataFrame, output_folder: str,
350
  df = read_file(file_path)
351
 
352
  file_stem = Path(file_name).stem
353
- duplicate_rows_file = output_folder_path / f"{file_stem}_duplicate_rows.csv"
354
-
 
 
355
  # Get unique row numbers to remove
356
- rows_to_remove = sorted(group['Row2'].unique())
357
- duplicate_df = pd.DataFrame({'Row_to_Remove': rows_to_remove})
358
  duplicate_df.to_csv(duplicate_rows_file, index=False)
359
  output_paths.append(str(duplicate_rows_file))
360
 
@@ -366,63 +429,66 @@ def save_tabular_duplicate_results(results_df: pd.DataFrame, output_folder: str,
366
 
367
  file_ext = os.path.splitext(file_name)[-1]
368
 
369
- if file_ext in ['.parquet']:
370
- output_path = os.path.join(output_folder, f"{file_base_name}_deduplicated.parquet")
 
 
371
  df_cleaned.to_parquet(output_path, index=False)
372
  else:
373
- output_path = os.path.join(output_folder, f"{file_base_name}_deduplicated.csv")
374
- df_cleaned.to_csv(output_path, index=False, encoding="utf-8-sig")
375
-
 
 
 
 
376
  output_paths.append(str(output_path))
377
  break
378
-
379
  # Process Excel files to create complete deduplicated files
380
  for file_path, file_data in excel_files_processed.items():
381
  try:
382
  # Create output filename
383
  file_base_name = os.path.splitext(os.path.basename(file_path))[0]
384
  file_ext = os.path.splitext(file_path)[-1]
385
- output_path = os.path.join(output_folder, f"{file_base_name}_deduplicated{file_ext}")
386
-
 
 
387
  # Create Excel writer
388
- with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
389
  # Write all sheets
390
- for sheet_name in file_data['all_sheets']:
391
- if sheet_name in file_data['processed_sheets']:
392
  # Use the processed (deduplicated) version
393
- file_data['sheets_data'][sheet_name].to_excel(
394
- writer,
395
- sheet_name=sheet_name,
396
- index=False
397
  )
398
  else:
399
  # Use the original sheet (no duplicates found)
400
  original_df = read_file(file_path, excel_sheet_name=sheet_name)
401
- original_df.to_excel(
402
- writer,
403
- sheet_name=sheet_name,
404
- index=False
405
- )
406
-
407
  output_paths.append(str(output_path))
408
  print(f"Created deduplicated Excel file: {output_path}")
409
-
410
  except Exception as e:
411
  print(f"Error creating deduplicated Excel file for {file_path}: {e}")
412
  continue
413
-
414
  return output_paths
415
 
 
416
  def remove_duplicate_rows_from_tabular_data(
417
  file_path: str,
418
  duplicate_rows: List[int],
419
  output_folder: str = OUTPUT_FOLDER,
420
  in_excel_tabular_sheets: List[str] = [],
421
- remove_duplicate_rows: bool = REMOVE_DUPLICATE_ROWS
422
  ) -> str:
423
  """
424
  Remove duplicate rows from a tabular data file.
425
-
426
  Args:
427
  file_path (str): Path to the input file
428
  duplicate_rows (List[int]): List of row indices to remove
@@ -434,31 +500,39 @@ def remove_duplicate_rows_from_tabular_data(
434
  """
435
  try:
436
  # Load the file
437
- df = read_file(file_path, excel_sheet_name=in_excel_tabular_sheets if in_excel_tabular_sheets else "")
438
-
 
 
 
439
  # Remove duplicate rows (0-indexed)
440
  df_cleaned = df.drop(index=duplicate_rows).reset_index(drop=True)
441
-
442
  # Save cleaned file
443
  file_name = os.path.basename(file_path)
444
  file_stem = os.path.splitext(file_name)[0]
445
  file_ext = os.path.splitext(file_name)[-1]
446
-
447
  output_path = os.path.join(output_folder, f"{file_stem}_deduplicated{file_ext}")
448
-
449
- if file_ext in ['.xlsx', '.xls']:
450
- df_cleaned.to_excel(output_path, index=False, sheet_name=in_excel_tabular_sheets if in_excel_tabular_sheets else [])
451
- elif file_ext in ['.parquet']:
 
 
 
 
452
  df_cleaned.to_parquet(output_path, index=False)
453
  else:
454
  df_cleaned.to_csv(output_path, index=False, encoding="utf-8-sig")
455
-
456
  return output_path
457
-
458
  except Exception as e:
459
  print(f"Error removing duplicates from {file_path}: {e}")
460
  raise
461
 
 
462
  def run_tabular_duplicate_analysis(
463
  files: List[str],
464
  threshold: float,
@@ -468,11 +542,11 @@ def run_tabular_duplicate_analysis(
468
  do_initial_clean_dup: bool = DO_INITIAL_TABULAR_DATA_CLEAN,
469
  remove_duplicate_rows: bool = REMOVE_DUPLICATE_ROWS,
470
  in_excel_tabular_sheets: List[str] = [],
471
- progress: Progress = Progress(track_tqdm=True)
472
  ) -> Tuple[pd.DataFrame, List[str], Dict[str, pd.DataFrame]]:
473
  """
474
  Main function to run tabular duplicate analysis.
475
-
476
  Args:
477
  files (List[str]): List of file paths
478
  threshold (float): Similarity threshold
@@ -480,7 +554,7 @@ def run_tabular_duplicate_analysis(
480
  text_columns (List[str], optional): Specific columns to analyze
481
  output_folder (str, optional): Output folder for results
482
  progress (Progress): Progress tracking
483
-
484
  Returns:
485
  Tuple containing results DataFrame, output paths, and full data by file
486
  """
@@ -491,43 +565,58 @@ def run_tabular_duplicate_analysis(
491
  text_columns=text_columns if text_columns else [],
492
  output_folder=output_folder,
493
  do_initial_clean_dup=do_initial_clean_dup,
494
- in_excel_tabular_sheets=in_excel_tabular_sheets if in_excel_tabular_sheets else [],
495
- remove_duplicate_rows=remove_duplicate_rows
 
 
496
  )
497
 
498
 
499
-
500
  # Function to update column choices when files are uploaded
501
  def update_tabular_column_choices(files, in_excel_tabular_sheets: List[str] = []):
502
  if not files:
503
  return gr.update(choices=[])
504
-
505
  all_columns = set()
506
  for file in files:
507
  try:
508
  file_extension = os.path.splitext(file.name)[-1]
509
- if file_extension in ['.xlsx', '.xls']:
510
  for sheet_name in in_excel_tabular_sheets:
511
  df = read_file(file.name, excel_sheet_name=sheet_name)
512
- text_cols = df.select_dtypes(include=['object', 'string']).columns.tolist()
 
 
513
  all_columns.update(text_cols)
514
  else:
515
  df = read_file(file.name)
516
- text_cols = df.select_dtypes(include=['object', 'string']).columns.tolist()
 
 
517
  all_columns.update(text_cols)
518
 
519
  # Get text columns
520
- text_cols = df.select_dtypes(include=['object', 'string']).columns.tolist()
521
 
522
  all_columns.update(text_cols)
523
  except Exception as e:
524
  print(f"Error reading {file.name}: {e}")
525
  continue
526
-
527
  return gr.Dropdown(choices=sorted(list(all_columns)))
528
 
 
529
  # Function to handle tabular duplicate detection
530
- def run_tabular_duplicate_detection(files, threshold, min_words, text_columns, output_folder: str = OUTPUT_FOLDER, do_initial_clean_dup: bool = DO_INITIAL_TABULAR_DATA_CLEAN, in_excel_tabular_sheets: List[str] = [], remove_duplicate_rows: bool = REMOVE_DUPLICATE_ROWS):
 
 
 
 
 
 
 
 
 
531
  if not files:
532
  print("No files uploaded")
533
  return pd.DataFrame(), [], gr.Dropdown(choices=[]), 0, "deduplicate"
@@ -537,8 +626,9 @@ def run_tabular_duplicate_detection(files, threshold, min_words, text_columns, o
537
  task_textbox = "deduplicate"
538
 
539
  # If output folder doesn't end with a forward slash, add one
540
- if not output_folder.endswith('/'): output_folder = output_folder + '/'
541
-
 
542
  file_paths = list()
543
  if isinstance(files, str):
544
  # If 'files' is a single string, treat it as a list with one element
@@ -549,18 +639,22 @@ def run_tabular_duplicate_detection(files, threshold, min_words, text_columns, o
549
  if isinstance(f_item, str):
550
  # If an element is a string, it's a direct file path
551
  file_paths.append(f_item)
552
- elif hasattr(f_item, 'name'):
553
  # If an element has a '.name' attribute (e.g., a Gradio File object), use its name
554
  file_paths.append(f_item.name)
555
  else:
556
  # Log a warning for unexpected element types within the list
557
- print(f"Warning: Skipping an element in 'files' list that is neither a string nor has a '.name' attribute: {type(f_item)}")
558
- elif hasattr(files, 'name'):
 
 
559
  # Handle the case where a single file object (e.g., gr.File) is passed directly, not in a list
560
  file_paths.append(files.name)
561
  else:
562
  # Raise an error for any other unexpected type of the 'files' argument itself
563
- raise TypeError(f"Unexpected type for 'files' argument: {type(files)}. Expected str, list of str/file objects, or a single file object.")
 
 
564
 
565
  if len(file_paths) > MAX_SIMULTANEOUS_FILES:
566
  out_message = f"Number of files to deduplicate is greater than {MAX_SIMULTANEOUS_FILES}. Please submit a smaller number of files."
@@ -574,21 +668,30 @@ def run_tabular_duplicate_detection(files, threshold, min_words, text_columns, o
574
  text_columns=text_columns if text_columns else [],
575
  output_folder=output_folder,
576
  do_initial_clean_dup=do_initial_clean_dup,
577
- in_excel_tabular_sheets=in_excel_tabular_sheets if in_excel_tabular_sheets else None,
578
- remove_duplicate_rows=remove_duplicate_rows
 
 
579
  )
580
-
581
  # Update file choices for cleaning
582
  file_choices = list(set([f for f in file_paths]))
583
 
584
  end_time = time.time()
585
  processing_time = round(end_time - start_time, 2)
586
-
587
- return results_df, output_paths, gr.Dropdown(choices=file_choices), processing_time, task_textbox
 
 
 
 
 
 
 
588
 
589
  # Function to handle row selection for preview
590
- def handle_tabular_row_selection(results_df, evt:gr.SelectData):
591
-
592
  if not evt:
593
  return None, "", ""
594
 
@@ -596,25 +699,32 @@ def handle_tabular_row_selection(results_df, evt:gr.SelectData):
596
  return None, "", ""
597
  elif results_df.empty:
598
  return None, "", ""
599
-
600
  selected_index = evt.index[0]
601
  if selected_index >= len(results_df):
602
  return None, "", ""
603
-
604
  row = results_df.iloc[selected_index]
605
- return selected_index, row['Text1'], row['Text2']
 
606
 
607
  # Function to clean duplicates from selected file
608
- def clean_tabular_duplicates(file_name, results_df, output_folder, in_excel_tabular_sheets: str = "", remove_duplicate_rows: bool = REMOVE_DUPLICATE_ROWS):
 
 
 
 
 
 
609
  if not file_name or results_df.empty:
610
  return None
611
-
612
  # Get duplicate rows for this file
613
- file_duplicates = results_df[results_df['File2'] == file_name]['Row2'].tolist()
614
-
615
  if not file_duplicates:
616
  return None
617
-
618
  try:
619
  # Find the original file path
620
  # This is a simplified approach - in practice you might want to store file paths
@@ -623,9 +733,9 @@ def clean_tabular_duplicates(file_name, results_df, output_folder, in_excel_tabu
623
  duplicate_rows=file_duplicates,
624
  output_folder=output_folder,
625
  in_excel_tabular_sheets=in_excel_tabular_sheets,
626
- remove_duplicate_rows=remove_duplicate_rows
627
  )
628
  return cleaned_file
629
  except Exception as e:
630
  print(f"Error cleaning duplicates: {e}")
631
- return None
 
 
1
  import os
2
  import re
3
  import time
4
+ from pathlib import Path
5
+ from typing import Dict, List, Tuple
6
+
7
  import gradio as gr
8
+ import pandas as pd
9
  from gradio import Progress
10
+ from sklearn.feature_extraction.text import TfidfVectorizer
11
+ from sklearn.metrics.pairwise import cosine_similarity
12
+
13
+ from tools.config import (
14
+ DO_INITIAL_TABULAR_DATA_CLEAN,
15
+ MAX_SIMULTANEOUS_FILES,
16
+ MAX_TABLE_ROWS,
17
+ REMOVE_DUPLICATE_ROWS,
18
+ )
19
  from tools.data_anonymise import initial_clean
20
+ from tools.helper_functions import OUTPUT_FOLDER, read_file
21
  from tools.load_spacy_model_custom_recognisers import nlp
 
22
 
23
+ if REMOVE_DUPLICATE_ROWS == "True":
24
+ REMOVE_DUPLICATE_ROWS = True
25
+ else:
26
+ REMOVE_DUPLICATE_ROWS = False
27
+
28
 
29
+ def clean_and_stem_text_series(
30
+ df: pd.DataFrame,
31
+ column: str,
32
+ do_initial_clean_dup: bool = DO_INITIAL_TABULAR_DATA_CLEAN,
33
+ ):
34
  """
35
  Clean and stem text columns in a data frame for tabular data
36
  """
 
39
  def _apply_lemmatization(text):
40
  doc = nlp(text)
41
  # Keep only alphabetic tokens and remove stopwords
42
+ lemmatized_words = [
43
+ token.lemma_ for token in doc if token.is_alpha and not token.is_stop
44
+ ]
45
+ return " ".join(lemmatized_words)
46
+
47
  if do_initial_clean_dup:
48
+ df["text_clean"] = initial_clean(df[column])
49
+
50
+ df["text_clean"] = df["text_clean"].apply(_apply_lemmatization)
51
+ df["text_clean"] = df[
52
+ column
53
+ ].str.lower() # .str.replace(r'[^\w\s]', '', regex=True)
54
 
 
 
 
55
  return df
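The inner _apply_lemmatization step boils down to spaCy lemmas with stopwords and non-alphabetic tokens dropped; a self-contained sketch, assuming a standard English pipeline such as en_core_web_sm is installed (the app itself supplies its nlp object via tools.load_spacy_model_custom_recognisers):

import spacy

nlp = spacy.load("en_core_web_sm")  # assumed model, for illustration only
doc = nlp("The clients were visiting the offices")
lemmas = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]
print(" ".join(lemmas))  # e.g. "client visit office"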
56
 
57
+
58
  def convert_tabular_data_to_analysis_format(
59
+ df: pd.DataFrame, file_name: str, text_columns: List[str] = None
 
 
60
  ) -> List[Tuple[str, pd.DataFrame]]:
61
  """
62
  Convert tabular data (CSV/XLSX) to the format needed for duplicate analysis.
63
+
64
  Args:
65
  df (pd.DataFrame): The input DataFrame
66
  file_name (str): Name of the file
67
+ text_columns (List[str], optional): Columns to analyze for duplicates.
68
  If None, uses all string columns.
69
+
70
  Returns:
71
  List[Tuple[str, pd.DataFrame]]: List containing (file_name, processed_df) tuple
72
  """
 
77
  # text_columns = df.select_dtypes(include=['object', 'string']).columns.tolist()
78
 
79
  text_columns = [col for col in text_columns if col in df.columns]
80
+
81
  if not text_columns:
82
  print(f"No text columns found in {file_name}")
83
  return list()
84
+
85
  # Create a copy to avoid modifying original
86
  df_copy = df.copy()
87
+
88
  # Create a combined text column from all text columns
89
+ df_copy["combined_text"] = (
90
+ df_copy[text_columns].fillna("").astype(str).agg(" ".join, axis=1)
91
+ )
92
+
93
  # Add row identifier
94
+ df_copy["row_id"] = df_copy.index
95
+
96
  # Create the format expected by the duplicate detection system
97
  # Using 'row_number' as row number and 'text' as the combined text
98
+ processed_df = pd.DataFrame(
99
+ {
100
+ "row_number": df_copy["row_id"],
101
+ "text": df_copy["combined_text"],
102
+ "file": file_name,
103
+ }
104
+ )
105
+
106
  # Add original row data for reference
107
  for col in text_columns:
108
+ processed_df[f"original_{col}"] = df_copy[col]
109
+
110
  return [(file_name, processed_df)]
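A minimal sketch of the conversion output; the frame and column names are invented, and the import relies on the module path shown in this diff:

import pandas as pd

from tools.find_duplicate_tabular import convert_tabular_data_to_analysis_format

df = pd.DataFrame({"Case Note": ["Visited client", "Visited client"], "Officer": ["A", "B"]})
[(name, processed)] = convert_tabular_data_to_analysis_format(
    df, "notes.csv", text_columns=["Case Note", "Officer"]
)
# processed carries one row per input row: row_number 0/1, text "Visited client A" /
# "Visited client B", file "notes.csv", plus original_* copies of the source columns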
111
 
112
+
113
  def find_duplicate_cells_in_tabular_data(
114
  input_files: List[str],
115
  similarity_threshold: float = 0.95,
 
119
  do_initial_clean_dup: bool = DO_INITIAL_TABULAR_DATA_CLEAN,
120
  remove_duplicate_rows: bool = REMOVE_DUPLICATE_ROWS,
121
  in_excel_tabular_sheets: str = "",
122
+ progress: Progress = Progress(track_tqdm=True),
123
  ) -> Tuple[pd.DataFrame, List[str], Dict[str, pd.DataFrame]]:
124
  """
125
  Find duplicate cells/text in tabular data files (CSV, XLSX, Parquet).
126
+
127
  Args:
128
  input_files (List[str]): List of file paths to analyze
129
  similarity_threshold (float): Minimum similarity score to consider duplicates
 
132
  output_folder (str, optional): Output folder for results
133
  do_initial_clean_dup (bool, optional): Whether to do initial clean of text
134
  progress (Progress): Progress tracking object
135
+
136
  Returns:
137
  Tuple containing:
138
  - results_df: DataFrame with duplicate matches
139
  - output_paths: List of output file paths
140
  - full_data_by_file: Dictionary of processed data by file
141
  """
142
+
143
  if not input_files:
144
  raise gr.Error("Please upload files to analyze.")
145
+
146
  progress(0.1, desc="Loading and processing files...")
147
+
148
  all_data_to_process = list()
149
  full_data_by_file = dict()
150
+ file_paths = list()
151
+
152
  # Process each file
153
  for file_path in input_files:
154
  try:
155
+ if file_path.endswith(".xlsx") or file_path.endswith(".xls"):
156
  temp_df = pd.DataFrame()
157
 
158
  # Try finding each sheet in the given list until a match is found
 
169
 
170
  file_name = os.path.basename(file_path) + "_" + sheet_name
171
  file_paths.append(file_path)
172
+
173
  # Convert to analysis format
174
  processed_data = convert_tabular_data_to_analysis_format(
175
  temp_df, file_name, text_columns
176
  )
177
+
178
  if processed_data:
179
  all_data_to_process.extend(processed_data)
180
  full_data_by_file[file_name] = processed_data[0][1]
 
187
  out_message = f"Number of rows in {file_path} is greater than {MAX_TABLE_ROWS}. Please submit a smaller file."
188
  print(out_message)
189
  raise Exception(out_message)
190
+
191
  file_name = os.path.basename(file_path)
192
  file_paths.append(file_path)
193
+
194
  # Convert to analysis format
195
  processed_data = convert_tabular_data_to_analysis_format(
196
  temp_df, file_name, text_columns
197
  )
198
+
199
  if processed_data:
200
  all_data_to_process.extend(processed_data)
201
  full_data_by_file[file_name] = processed_data[0][1]
202
+
203
  except Exception as e:
204
  print(f"Error processing {file_path}: {e}")
205
  continue
206
+
207
  if not all_data_to_process:
208
  raise gr.Error("No valid data found in uploaded files.")
209
+
210
  progress(0.2, desc="Combining data...")
211
+
212
  # Combine all data
213
+ combined_df = pd.concat(
214
+ [data[1] for data in all_data_to_process], ignore_index=True
215
+ )
216
+
217
+ combined_df = combined_df.drop_duplicates(subset=["row_number", "file"])
218
 
 
 
219
  progress(0.3, desc="Cleaning and preparing text...")
220
+
221
  # Clean and prepare text
222
+ combined_df = clean_and_stem_text_series(
223
+ combined_df, "text", do_initial_clean_dup=do_initial_clean_dup
224
+ )
225
+
226
  # Filter by minimum word count
227
+ combined_df["word_count"] = (
228
+ combined_df["text_clean"].str.split().str.len().fillna(0)
229
+ )
230
+ combined_df = combined_df[combined_df["word_count"] >= min_word_count].copy()
231
+
232
  if len(combined_df) < 2:
233
  return pd.DataFrame(), [], full_data_by_file
234
+
235
  progress(0.4, desc="Calculating similarities...")
236
+
237
  # Calculate similarities
238
  vectorizer = TfidfVectorizer()
239
+ tfidf_matrix = vectorizer.fit_transform(combined_df["text_clean"])
240
  similarity_matrix = cosine_similarity(tfidf_matrix, dense_output=False)
241
+
242
  # Find similar pairs
243
  coo_matrix = similarity_matrix.tocoo()
244
  similar_pairs = [
245
+ (r, c, v)
246
+ for r, c, v in zip(coo_matrix.row, coo_matrix.col, coo_matrix.data)
247
  if r < c and v >= similarity_threshold
248
  ]
249
+
250
  if not similar_pairs:
251
  gr.Info("No duplicate cells found.")
252
  return pd.DataFrame(), [], full_data_by_file
253
+
254
  progress(0.7, desc="Processing results...")
255
+
256
  # Create results DataFrame
257
  results_data = []
258
  for row1, row2, similarity in similar_pairs:
259
  row1_data = combined_df.iloc[row1]
260
  row2_data = combined_df.iloc[row2]
261
+
262
+ results_data.append(
263
+ {
264
+ "File1": row1_data["file"],
265
+ "Row1": int(row1_data["row_number"]),
266
+ "File2": row2_data["file"],
267
+ "Row2": int(row2_data["row_number"]),
268
+ "Similarity_Score": round(similarity, 3),
269
+ "Text1": (
270
+ row1_data["text"][:200] + "..."
271
+ if len(row1_data["text"]) > 200
272
+ else row1_data["text"]
273
+ ),
274
+ "Text2": (
275
+ row2_data["text"][:200] + "..."
276
+ if len(row2_data["text"]) > 200
277
+ else row2_data["text"]
278
+ ),
279
+ "Original_Index1": row1,
280
+ "Original_Index2": row2,
281
+ }
282
+ )
283
+
284
  results_df = pd.DataFrame(results_data)
285
+ results_df = results_df.sort_values(["File1", "Row1", "File2", "Row2"])
286
+
287
  progress(0.9, desc="Saving results...")
288
+
289
  # Save results
290
+ output_paths = save_tabular_duplicate_results(
291
+ results_df,
292
+ output_folder,
293
+ file_paths,
294
+ remove_duplicate_rows=remove_duplicate_rows,
295
+ in_excel_tabular_sheets=in_excel_tabular_sheets,
296
+ )
297
+
298
  gr.Info(f"Found {len(results_df)} duplicate cell matches")
299
+
300
  return results_df, output_paths, full_data_by_file
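The matching step above reduces to TF-IDF vectors plus sparse cosine similarity; a self-contained sketch of that core (texts and threshold invented):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

texts = ["the cat sat on the mat", "the cat sat on the mat", "an entirely different row"]
tfidf = TfidfVectorizer().fit_transform(texts)
sim = cosine_similarity(tfidf, dense_output=False).tocoo()

threshold = 0.95
pairs = [
    (r, c, v)
    for r, c, v in zip(sim.row, sim.col, sim.data)
    if r < c and v >= threshold
]
# e.g. [(0, 1, 1.0)]: rows 0 and 1 are flagged as near-duplicates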
301
 
302
+
303
+ def save_tabular_duplicate_results(
304
+ results_df: pd.DataFrame,
305
+ output_folder: str,
306
+ file_paths: List[str],
307
+ remove_duplicate_rows: bool = REMOVE_DUPLICATE_ROWS,
308
+ in_excel_tabular_sheets: List[str] = [],
309
+ ) -> List[str]:
310
  """
311
  Save tabular duplicate detection results to files.
312
+
313
  Args:
314
  results_df (pd.DataFrame): Results DataFrame
315
  output_folder (str): Output folder path
 
322
  output_paths = list()
323
  output_folder_path = Path(output_folder)
324
  output_folder_path.mkdir(exist_ok=True)
325
+
326
  if results_df.empty:
327
  print("No duplicate matches to save.")
328
  return list()
329
+
330
  # Save main results
331
+ results_file = output_folder_path / "tabular_duplicate_results.csv"
332
  results_df.to_csv(results_file, index=False, encoding="utf-8-sig")
333
  output_paths.append(str(results_file))
334
+
335
  # Group results by original file to handle Excel files properly
336
+ excel_files_processed = dict() # Track which Excel files have been processed
337
+
338
  # Save per-file duplicate lists
339
+ for file_name, group in results_df.groupby("File2"):
340
  # Check for matches with original file names
341
  for original_file in file_paths:
342
  original_file_name = os.path.basename(original_file)
343
 
344
  if original_file_name in file_name:
345
  original_file_extension = os.path.splitext(original_file)[-1]
346
+ if original_file_extension in [".xlsx", ".xls"]:
347
+
348
  # Split the string using a regex to handle both .xlsx_ and .xls_ delimiters
349
  # The regex r'\.xlsx_|\.xls_' correctly matches either ".xlsx_" or ".xls_" as a delimiter.
350
+ parts = re.split(r"\.xlsx_|\.xls_", os.path.basename(file_name))
351
  # The sheet name is the last part after splitting
352
  file_sheet_name = parts[-1]
353
 
354
  file_path = original_file
355
+
356
  # Initialize Excel file tracking if not already done
357
  if file_path not in excel_files_processed:
358
  excel_files_processed[file_path] = {
359
+ "sheets_data": dict(),
360
+ "all_sheets": list(),
361
+ "processed_sheets": set(),
362
  }
363
+
364
  # Read the original Excel file to get all sheet names
365
+ if not excel_files_processed[file_path]["all_sheets"]:
366
  try:
367
  excel_file = pd.ExcelFile(file_path)
368
+ excel_files_processed[file_path][
369
+ "all_sheets"
370
+ ] = excel_file.sheet_names
371
  except Exception as e:
372
  print(f"Error reading Excel file {file_path}: {e}")
373
  continue
374
+
375
  # Read the current sheet
376
  df = read_file(file_path, excel_sheet_name=file_sheet_name)
377
+
378
  # Create duplicate rows file for this sheet
379
  file_stem = Path(file_name).stem
380
+ duplicate_rows_file = (
381
+ output_folder_path
382
+ / f"{file_stem}_{file_sheet_name}_duplicate_rows.csv"
383
+ )
384
+
385
  # Get unique row numbers to remove
386
+ rows_to_remove = sorted(group["Row2"].unique())
387
+ duplicate_df = pd.DataFrame({"Row_to_Remove": rows_to_remove})
388
  duplicate_df.to_csv(duplicate_rows_file, index=False)
389
  output_paths.append(str(duplicate_rows_file))
390
+
391
  # Process the sheet data
392
  df_cleaned = df.copy()
393
  df_cleaned["duplicated"] = False
394
  df_cleaned.loc[rows_to_remove, "duplicated"] = True
395
  if remove_duplicate_rows:
396
  df_cleaned = df_cleaned.drop(index=rows_to_remove)
397
+
398
  # Store the processed sheet data
399
+ excel_files_processed[file_path]["sheets_data"][
400
+ file_sheet_name
401
+ ] = df_cleaned
402
+ excel_files_processed[file_path]["processed_sheets"].add(
403
+ file_sheet_name
404
+ )
405
+
406
  else:
407
  file_sheet_name = ""
408
  file_path = original_file
 
411
  df = read_file(file_path)
412
 
413
  file_stem = Path(file_name).stem
414
+ duplicate_rows_file = (
415
+ output_folder_path / f"{file_stem}_duplicate_rows.csv"
416
+ )
417
+
418
  # Get unique row numbers to remove
419
+ rows_to_remove = sorted(group["Row2"].unique())
420
+ duplicate_df = pd.DataFrame({"Row_to_Remove": rows_to_remove})
421
  duplicate_df.to_csv(duplicate_rows_file, index=False)
422
  output_paths.append(str(duplicate_rows_file))
423
 
 
429
 
430
  file_ext = os.path.splitext(file_name)[-1]
431
 
432
+ if file_ext in [".parquet"]:
433
+ output_path = os.path.join(
434
+ output_folder, f"{file_base_name}_deduplicated.parquet"
435
+ )
436
  df_cleaned.to_parquet(output_path, index=False)
437
  else:
438
+ output_path = os.path.join(
439
+ output_folder, f"{file_base_name}_deduplicated.csv"
440
+ )
441
+ df_cleaned.to_csv(
442
+ output_path, index=False, encoding="utf-8-sig"
443
+ )
444
+
445
  output_paths.append(str(output_path))
446
  break
447
+
448
  # Process Excel files to create complete deduplicated files
449
  for file_path, file_data in excel_files_processed.items():
450
  try:
451
  # Create output filename
452
  file_base_name = os.path.splitext(os.path.basename(file_path))[0]
453
  file_ext = os.path.splitext(file_path)[-1]
454
+ output_path = os.path.join(
455
+ output_folder, f"{file_base_name}_deduplicated{file_ext}"
456
+ )
457
+
458
  # Create Excel writer
459
+ with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
460
  # Write all sheets
461
+ for sheet_name in file_data["all_sheets"]:
462
+ if sheet_name in file_data["processed_sheets"]:
463
  # Use the processed (deduplicated) version
464
+ file_data["sheets_data"][sheet_name].to_excel(
465
+ writer, sheet_name=sheet_name, index=False
 
 
466
  )
467
  else:
468
  # Use the original sheet (no duplicates found)
469
  original_df = read_file(file_path, excel_sheet_name=sheet_name)
470
+ original_df.to_excel(writer, sheet_name=sheet_name, index=False)
471
+
 
 
 
 
472
  output_paths.append(str(output_path))
473
  print(f"Created deduplicated Excel file: {output_path}")
474
+
475
  except Exception as e:
476
  print(f"Error creating deduplicated Excel file for {file_path}: {e}")
477
  continue
478
+
479
  return output_paths
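For a single CSV input, the paths collected above typically look like the following (folder and file names invented; the patterns are taken from the code above):

output_paths = [
    "output/tabular_duplicate_results.csv",  # all matches across files
    "output/notes_duplicate_rows.csv",       # Row_to_Remove list for notes.csv
    "output/notes_deduplicated.csv",         # copy with a 'duplicated' flag / rows removed
]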
480
 
481
+
482
  def remove_duplicate_rows_from_tabular_data(
483
  file_path: str,
484
  duplicate_rows: List[int],
485
  output_folder: str = OUTPUT_FOLDER,
486
  in_excel_tabular_sheets: List[str] = [],
487
+ remove_duplicate_rows: bool = REMOVE_DUPLICATE_ROWS,
488
  ) -> str:
489
  """
490
  Remove duplicate rows from a tabular data file.
491
+
492
  Args:
493
  file_path (str): Path to the input file
494
  duplicate_rows (List[int]): List of row indices to remove
 
500
  """
501
  try:
502
  # Load the file
503
+ df = read_file(
504
+ file_path,
505
+ excel_sheet_name=in_excel_tabular_sheets if in_excel_tabular_sheets else "",
506
+ )
507
+
508
  # Remove duplicate rows (0-indexed)
509
  df_cleaned = df.drop(index=duplicate_rows).reset_index(drop=True)
510
+
511
  # Save cleaned file
512
  file_name = os.path.basename(file_path)
513
  file_stem = os.path.splitext(file_name)[0]
514
  file_ext = os.path.splitext(file_name)[-1]
515
+
516
  output_path = os.path.join(output_folder, f"{file_stem}_deduplicated{file_ext}")
517
+
518
+ if file_ext in [".xlsx", ".xls"]:
519
+ df_cleaned.to_excel(
520
+ output_path,
521
+ index=False,
522
+ sheet_name=in_excel_tabular_sheets if in_excel_tabular_sheets else [],
523
+ )
524
+ elif file_ext in [".parquet"]:
525
  df_cleaned.to_parquet(output_path, index=False)
526
  else:
527
  df_cleaned.to_csv(output_path, index=False, encoding="utf-8-sig")
528
+
529
  return output_path
530
+
531
  except Exception as e:
532
  print(f"Error removing duplicates from {file_path}: {e}")
533
  raise
534
 
535
+
536
  def run_tabular_duplicate_analysis(
537
  files: List[str],
538
  threshold: float,
 
542
  do_initial_clean_dup: bool = DO_INITIAL_TABULAR_DATA_CLEAN,
543
  remove_duplicate_rows: bool = REMOVE_DUPLICATE_ROWS,
544
  in_excel_tabular_sheets: List[str] = [],
545
+ progress: Progress = Progress(track_tqdm=True),
546
  ) -> Tuple[pd.DataFrame, List[str], Dict[str, pd.DataFrame]]:
547
  """
548
  Main function to run tabular duplicate analysis.
549
+
550
  Args:
551
  files (List[str]): List of file paths
552
  threshold (float): Similarity threshold
 
554
  text_columns (List[str], optional): Specific columns to analyze
555
  output_folder (str, optional): Output folder for results
556
  progress (Progress): Progress tracking
557
+
558
  Returns:
559
  Tuple containing results DataFrame, output paths, and full data by file
560
  """
 
565
  text_columns=text_columns if text_columns else [],
566
  output_folder=output_folder,
567
  do_initial_clean_dup=do_initial_clean_dup,
568
+ in_excel_tabular_sheets=(
569
+ in_excel_tabular_sheets if in_excel_tabular_sheets else []
570
+ ),
571
+ remove_duplicate_rows=remove_duplicate_rows,
572
  )
573
 
574
 
 
575
  # Function to update column choices when files are uploaded
576
  def update_tabular_column_choices(files, in_excel_tabular_sheets: List[str] = []):
577
  if not files:
578
  return gr.update(choices=[])
579
+
580
  all_columns = set()
581
  for file in files:
582
  try:
583
  file_extension = os.path.splitext(file.name)[-1]
584
+ if file_extension in [".xlsx", ".xls"]:
585
  for sheet_name in in_excel_tabular_sheets:
586
  df = read_file(file.name, excel_sheet_name=sheet_name)
587
+ text_cols = df.select_dtypes(
588
+ include=["object", "string"]
589
+ ).columns.tolist()
590
  all_columns.update(text_cols)
591
  else:
592
  df = read_file(file.name)
593
+ text_cols = df.select_dtypes(
594
+ include=["object", "string"]
595
+ ).columns.tolist()
596
  all_columns.update(text_cols)
597
 
598
  # Get text columns
599
+ text_cols = df.select_dtypes(include=["object", "string"]).columns.tolist()
600
 
601
  all_columns.update(text_cols)
602
  except Exception as e:
603
  print(f"Error reading {file.name}: {e}")
604
  continue
605
+
606
  return gr.Dropdown(choices=sorted(list(all_columns)))
607
 
608
+
609
  # Function to handle tabular duplicate detection
610
+ def run_tabular_duplicate_detection(
611
+ files,
612
+ threshold,
613
+ min_words,
614
+ text_columns,
615
+ output_folder: str = OUTPUT_FOLDER,
616
+ do_initial_clean_dup: bool = DO_INITIAL_TABULAR_DATA_CLEAN,
617
+ in_excel_tabular_sheets: List[str] = [],
618
+ remove_duplicate_rows: bool = REMOVE_DUPLICATE_ROWS,
619
+ ):
620
  if not files:
621
  print("No files uploaded")
622
  return pd.DataFrame(), [], gr.Dropdown(choices=[]), 0, "deduplicate"
 
626
  task_textbox = "deduplicate"
627
 
628
  # If output folder doesn't end with a forward slash, add one
629
+ if not output_folder.endswith("/"):
630
+ output_folder = output_folder + "/"
631
+
632
  file_paths = list()
633
  if isinstance(files, str):
634
  # If 'files' is a single string, treat it as a list with one element
 
639
  if isinstance(f_item, str):
640
  # If an element is a string, it's a direct file path
641
  file_paths.append(f_item)
642
+ elif hasattr(f_item, "name"):
643
  # If an element has a '.name' attribute (e.g., a Gradio File object), use its name
644
  file_paths.append(f_item.name)
645
  else:
646
  # Log a warning for unexpected element types within the list
647
+ print(
648
+ f"Warning: Skipping an element in 'files' list that is neither a string nor has a '.name' attribute: {type(f_item)}"
649
+ )
650
+ elif hasattr(files, "name"):
651
  # Handle the case where a single file object (e.g., gr.File) is passed directly, not in a list
652
  file_paths.append(files.name)
653
  else:
654
  # Raise an error for any other unexpected type of the 'files' argument itself
655
+ raise TypeError(
656
+ f"Unexpected type for 'files' argument: {type(files)}. Expected str, list of str/file objects, or a single file object."
657
+ )
658
 
659
  if len(file_paths) > MAX_SIMULTANEOUS_FILES:
660
  out_message = f"Number of files to deduplicate is greater than {MAX_SIMULTANEOUS_FILES}. Please submit a smaller number of files."
 
668
  text_columns=text_columns if text_columns else [],
669
  output_folder=output_folder,
670
  do_initial_clean_dup=do_initial_clean_dup,
671
+ in_excel_tabular_sheets=(
672
+ in_excel_tabular_sheets if in_excel_tabular_sheets else None
673
+ ),
674
+ remove_duplicate_rows=remove_duplicate_rows,
675
  )
676
+
677
  # Update file choices for cleaning
678
  file_choices = list(set([f for f in file_paths]))
679
 
680
  end_time = time.time()
681
  processing_time = round(end_time - start_time, 2)
682
+
683
+ return (
684
+ results_df,
685
+ output_paths,
686
+ gr.Dropdown(choices=file_choices),
687
+ processing_time,
688
+ task_textbox,
689
+ )
690
+
691
 
692
  # Function to handle row selection for preview
693
+ def handle_tabular_row_selection(results_df, evt: gr.SelectData):
694
+
695
  if not evt:
696
  return None, "", ""
697
 
 
699
  return None, "", ""
700
  elif results_df.empty:
701
  return None, "", ""
702
+
703
  selected_index = evt.index[0]
704
  if selected_index >= len(results_df):
705
  return None, "", ""
706
+
707
  row = results_df.iloc[selected_index]
708
+ return selected_index, row["Text1"], row["Text2"]
709
+
710
 
711
  # Function to clean duplicates from selected file
712
+ def clean_tabular_duplicates(
713
+ file_name,
714
+ results_df,
715
+ output_folder,
716
+ in_excel_tabular_sheets: str = "",
717
+ remove_duplicate_rows: bool = REMOVE_DUPLICATE_ROWS,
718
+ ):
719
  if not file_name or results_df.empty:
720
  return None
721
+
722
  # Get duplicate rows for this file
723
+ file_duplicates = results_df[results_df["File2"] == file_name]["Row2"].tolist()
724
+
725
  if not file_duplicates:
726
  return None
727
+
728
  try:
729
  # Find the original file path
730
  # This is a simplified approach - in practice you might want to store file paths
 
733
  duplicate_rows=file_duplicates,
734
  output_folder=output_folder,
735
  in_excel_tabular_sheets=in_excel_tabular_sheets,
736
+ remove_duplicate_rows=remove_duplicate_rows,
737
  )
738
  return cleaned_file
739
  except Exception as e:
740
  print(f"Error cleaning duplicates: {e}")
741
+ return None
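A hedged sketch of how the row-selection handler could be wired into a Gradio Blocks layout; the component names and layout are assumptions, not the app's actual interface code:

import gradio as gr

from tools.find_duplicate_tabular import handle_tabular_row_selection

with gr.Blocks() as demo:
    results_df = gr.Dataframe(interactive=False, label="Duplicate matches")
    selected_row_index = gr.Number(visible=False)
    text1_preview = gr.Textbox(label="Text 1")
    text2_preview = gr.Textbox(label="Text 2")

    # Clicking a row passes a gr.SelectData event into the handler,
    # which returns the selected index and the two text previews
    results_df.select(
        handle_tabular_row_selection,
        inputs=[results_df],
        outputs=[selected_row_index, text1_preview, text2_preview],
    )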
tools/helper_functions.py CHANGED
@@ -1,66 +1,119 @@
1
  import os
2
  import re
 
 
 
 
3
  import boto3
4
- from botocore.exceptions import ClientError
5
  import gradio as gr
6
- import pandas as pd
7
  import numpy as np
8
- import unicodedata
9
- from typing import List
10
- from math import ceil
11
  from gradio_image_annotation import image_annotator
12
- from tools.config import CUSTOM_HEADER_VALUE, CUSTOM_HEADER, OUTPUT_FOLDER, INPUT_FOLDER, SESSION_OUTPUT_FOLDER, AWS_USER_POOL_ID, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, SELECTABLE_TEXT_EXTRACT_OPTION, TESSERACT_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION, NO_REDACTION_PII_OPTION, AWS_PII_OPTION, MAPPED_LANGUAGE_CHOICES, LANGUAGE_CHOICES, textract_language_choices, aws_comprehend_language_choices, DEFAULT_LANGUAGE
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
  def _get_env_list(env_var_name: str) -> List[str]:
15
  """Parses a comma-separated environment variable into a list of strings."""
16
- value = env_var_name[1:-1].strip().replace('\"', '').replace("\'","")
17
  if not value:
18
  return []
19
  # Split by comma and filter out any empty strings that might result from extra commas
20
- return [s.strip() for s in value.split(',') if s.strip()]
 
21
 
22
- if textract_language_choices: textract_language_choices = _get_env_list(textract_language_choices)
23
- if aws_comprehend_language_choices: aws_comprehend_language_choices = _get_env_list(aws_comprehend_language_choices)
 
 
24
 
25
- if MAPPED_LANGUAGE_CHOICES: MAPPED_LANGUAGE_CHOICES = _get_env_list(MAPPED_LANGUAGE_CHOICES)
26
- if LANGUAGE_CHOICES: LANGUAGE_CHOICES = _get_env_list(LANGUAGE_CHOICES)
 
 
27
 
28
  LANGUAGE_MAP = dict(zip(MAPPED_LANGUAGE_CHOICES, LANGUAGE_CHOICES))
29
 
 
30
  def reset_state_vars():
31
- return [], pd.DataFrame(), pd.DataFrame(), 0, "", image_annotator(
 
 
 
 
 
 
32
  label="Modify redaction boxes",
33
  label_list=["Redaction"],
34
  label_colors=[(0, 0, 0)],
35
  show_label=False,
36
- sources=None,#["upload"],
37
  show_clear_button=False,
38
  show_share_button=False,
39
  show_remove_button=False,
40
- interactive=False
41
- ), [], [], pd.DataFrame(), pd.DataFrame(), [], [], "", False, 0, []
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
  def reset_ocr_results_state():
44
  return pd.DataFrame(), pd.DataFrame(), []
45
 
 
46
  def reset_review_vars():
47
  return pd.DataFrame(), pd.DataFrame()
48
 
 
49
  def reset_data_vars():
50
  return 0, [], 0
51
 
 
52
  def reset_aws_call_vars():
53
  return 0, 0
54
 
 
55
  def load_in_default_allow_list(allow_list_file_path):
56
  if isinstance(allow_list_file_path, str):
57
  allow_list_file_path = [allow_list_file_path]
58
  return allow_list_file_path
59
 
60
- def load_in_default_cost_codes(cost_codes_path:str, default_cost_code:str=""):
61
- '''
 
62
  Load in the cost codes list from file.
63
- '''
64
  cost_codes_df = pd.read_csv(cost_codes_path)
65
  dropdown_choices = cost_codes_df.iloc[:, 0].astype(str).tolist()
66
 
@@ -76,36 +129,50 @@ def load_in_default_cost_codes(cost_codes_path:str, default_cost_code:str=""):
76
  value=default_cost_code if default_cost_code in dropdown_choices else "",
77
  label="Choose cost code for analysis",
78
  choices=dropdown_choices,
79
- allow_custom_value=False
80
  )
81
-
82
  return cost_codes_df, cost_codes_df, out_dropdown
83
 
84
- def enforce_cost_codes(enforce_cost_code_textbox:str, cost_code_choice:str, cost_code_df:pd.DataFrame, verify_cost_codes:bool=True):
85
- '''
 
 
 
 
 
 
86
  Check if the enforce cost codes variable is set to true, and then check that a cost code has been chosen. If not, raise an error. Then, check against the values in the cost code dataframe to ensure that the cost code exists.
87
- '''
88
 
89
  if enforce_cost_code_textbox == "True":
90
  if not cost_code_choice:
91
  raise Exception("Please choose a cost code before continuing")
92
-
93
- if verify_cost_codes == True:
94
  if cost_code_df.empty:
95
  raise Exception("No cost codes present in dataframe for verification")
96
  else:
97
- valid_cost_codes_list = list(cost_code_df.iloc[:,0].unique())
98
 
99
- if not cost_code_choice in valid_cost_codes_list:
100
- raise Exception("Selected cost code not found in list. Please contact Finance if you cannot find the correct cost code from the given list of suggestions.")
 
 
101
  return
102
 
103
- def update_cost_code_dataframe_from_dropdown_select(cost_dropdown_selection:str, cost_code_df:pd.DataFrame):
104
- cost_code_df = cost_code_df.loc[cost_code_df.iloc[:,0] == cost_dropdown_selection, :]
 
 
 
 
 
105
  return cost_code_df
106
 
107
- def ensure_folder_exists(output_folder:str):
108
- """Checks if the specified folder exists, creates it if not."""
 
109
 
110
  if not os.path.exists(output_folder):
111
  # Create the folder if it doesn't exist
@@ -114,58 +181,80 @@ def ensure_folder_exists(output_folder:str):
114
  else:
115
  print(f"The {output_folder} folder already exists.")
116
 
117
- def update_dataframe(df:pd.DataFrame):
 
118
  df_copy = df.copy()
119
  return df_copy
120
 
 
121
  def get_file_name_without_type(file_path):
122
  # First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
123
  basename = os.path.basename(file_path)
124
-
125
  # Then, split the basename and its extension and return only the basename without the extension
126
  filename_without_extension, _ = os.path.splitext(basename)
127
 
128
- #print(filename_without_extension)
129
-
130
  return filename_without_extension
131
 
132
- def detect_file_type(filename:str):
 
133
  """Detect the file type based on its extension."""
134
  if not isinstance(filename, str):
135
  filename = str(filename)
136
 
137
- if (filename.endswith('.csv')) | (filename.endswith('.csv.gz')) | (filename.endswith('.zip')): return 'csv'
138
- elif filename.endswith('.xlsx'): return 'xlsx'
139
- elif filename.endswith('.xls'): return 'xls'
140
- elif filename.endswith('.parquet'): return 'parquet'
141
- elif filename.endswith('.pdf'): return 'pdf'
142
- elif filename.endswith('.jpg'): return 'jpg'
143
- elif filename.endswith('.jpeg'): return 'jpeg'
144
- elif filename.endswith('.png'): return 'png'
145
- elif filename.endswith('.xfdf'): return 'xfdf'
146
- elif filename.endswith('.docx'): return 'docx'
147
- else: raise ValueError("Unsupported file type.")
148
-
149
- def read_file(filename:str, excel_sheet_name: str = ""):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150
  """Read the file based on its detected type."""
151
  file_type = detect_file_type(filename)
152
-
153
- if file_type == 'csv':
154
  return pd.read_csv(filename, low_memory=False)
155
- elif file_type == 'xlsx':
156
  if excel_sheet_name:
157
  try:
158
  return pd.read_excel(filename, sheet_name=excel_sheet_name)
159
  except Exception as e:
160
- print(f"Error reading {filename} with sheet name {excel_sheet_name}: {e}")
 
 
161
  return pd.DataFrame()
162
  else:
163
  return pd.read_excel(filename)
164
- elif file_type == 'parquet':
165
  return pd.read_parquet(filename)
166
 
167
- def ensure_output_folder_exists(output_folder:str):
168
- """Checks if the specified folder exists, creates it if not."""
 
169
 
170
  if not os.path.exists(output_folder):
171
  # Create the folder if it doesn't exist
@@ -175,11 +264,10 @@ def ensure_output_folder_exists(output_folder:str):
175
  print(f"The {output_folder} folder already exists.")
176
 
177
 
178
-
179
- def custom_regex_load(in_file:List[str], file_type:str = "allow_list"):
180
- '''
181
  When file is loaded, update the column dropdown choices and write to relevant data states.
182
- '''
183
  custom_regex_df = pd.DataFrame()
184
 
185
  if in_file:
@@ -188,11 +276,13 @@ def custom_regex_load(in_file:List[str], file_type:str = "allow_list"):
188
  regex_file_names = [string for string in file_list if "csv" in string.lower()]
189
  if regex_file_names:
190
  regex_file_name = regex_file_names[0]
191
- custom_regex_df = pd.read_csv(regex_file_name, low_memory=False, header=None)
192
-
 
 
193
  # Select just first columns
194
- custom_regex_df = pd.DataFrame(custom_regex_df.iloc[:,[0]])
195
- custom_regex_df.rename(columns={0:file_type}, inplace=True)
196
 
197
  custom_regex_df.columns = custom_regex_df.columns.astype(str)
198
 
@@ -200,23 +290,24 @@ def custom_regex_load(in_file:List[str], file_type:str = "allow_list"):
200
  print(output_text)
201
  else:
202
  output_text = "No file provided."
203
- #print(output_text)
204
  return output_text, custom_regex_df
205
-
206
  return output_text, custom_regex_df
207
 
208
- def put_columns_in_df(in_file:List[str]):
 
209
  new_choices = []
210
  concat_choices = []
211
  all_sheet_names = []
212
  number_of_excel_files = 0
213
-
214
  for file in in_file:
215
  file_name = file.name
216
  file_type = detect_file_type(file_name)
217
  print("File type is:", file_type)
218
 
219
- if (file_type == 'xlsx') | (file_type == 'xls'):
220
  number_of_excel_files += 1
221
  new_choices = []
222
  print("Running through all xlsx sheets")
@@ -240,47 +331,65 @@ def put_columns_in_df(in_file:List[str]):
240
  new_choices = []
241
 
242
  concat_choices.extend(new_choices)
243
-
244
  # Drop duplicate columns
245
  concat_choices = list(set(concat_choices))
246
 
247
- if number_of_excel_files > 0:
248
- return gr.Dropdown(choices=concat_choices, value=concat_choices), gr.Dropdown(choices=all_sheet_names, value=all_sheet_names, visible=True)
 
 
249
  else:
250
- return gr.Dropdown(choices=concat_choices, value=concat_choices), gr.Dropdown(visible=False)
 
 
251
 
252
- def check_for_existing_textract_file(doc_file_name_no_extension_textbox:str, output_folder:str=OUTPUT_FOLDER):
253
- textract_output_path = os.path.join(output_folder, doc_file_name_no_extension_textbox + "_textract.json")
 
 
 
 
 
254
 
255
  if os.path.exists(textract_output_path):
256
- print("Existing Textract analysis output file found.")
257
  return True
258
-
259
  else:
260
  return False
261
-
262
- def check_for_relevant_ocr_output_with_words(doc_file_name_no_extension_textbox:str, text_extraction_method:str, output_folder:str=OUTPUT_FOLDER):
263
- if text_extraction_method == SELECTABLE_TEXT_EXTRACT_OPTION: file_ending = "_ocr_results_with_words_local_text.json"
264
- elif text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION: file_ending = "_ocr_results_with_words_local_ocr.json"
265
- elif text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION: file_ending = "_ocr_results_with_words_textract.json"
 
 
 
 
 
 
 
 
266
  else:
267
  print("No valid text extraction method found. Returning False")
268
  return False
269
-
270
  doc_file_with_ending = doc_file_name_no_extension_textbox + file_ending
271
 
272
  local_ocr_output_path = os.path.join(output_folder, doc_file_with_ending)
273
 
274
  if os.path.exists(local_ocr_output_path):
275
- print("Existing OCR with words analysis output file found.")
276
- return True
277
  else:
278
  return False
279
 
 
280
  def add_folder_to_path(folder_path: str):
281
- '''
282
  Check if a folder exists on your system. If so, get the absolute path and then add it to the system Path variable if it doesn't already exist. This function is only relevant for locally-created executable files based on this app (when using pyinstaller it creates an _internal folder that contains tesseract and poppler; these need to be added to the system path to enable the app to run)
283
- '''
284
 
285
  if os.path.exists(folder_path) and os.path.isdir(folder_path):
286
  print(folder_path, "folder exists.")
@@ -288,21 +397,31 @@ def add_folder_to_path(folder_path: str):
288
  # Resolve relative path to absolute path
289
  absolute_path = os.path.abspath(folder_path)
290
 
291
- current_path = os.environ['PATH']
292
  if absolute_path not in current_path.split(os.pathsep):
293
  full_path_extension = absolute_path + os.pathsep + current_path
294
- os.environ['PATH'] = full_path_extension
295
- #print(f"Updated PATH with: ", full_path_extension)
296
  else:
297
  print(f"Directory {folder_path} already exists in PATH.")
298
  else:
299
  print(f"Folder not found at {folder_path} - not added to PATH")
300
 
 
301
  # Upon running a process, the feedback buttons are revealed
302
  def reveal_feedback_buttons():
303
- return gr.Radio(visible=True, label="Please give some feedback about the results of the redaction. A reminder that the app is only expected to identify about 60% of personally identifiable information in a given (typed) document."), gr.Textbox(visible=True), gr.Button(visible=True), gr.Markdown(visible=True)
 
 
 
 
 
 
 
 
304
 
305
- def wipe_logs(feedback_logs_loc:str, usage_logs_loc:str):
 
306
  try:
307
  os.remove(feedback_logs_loc)
308
  except Exception as e:
@@ -312,7 +431,8 @@ def wipe_logs(feedback_logs_loc:str, usage_logs_loc:str):
312
  except Exception as e:
313
  print("Could not remove usage logs file", e)
314
 
315
- def merge_csv_files(file_list:List[str], output_folder:str=OUTPUT_FOLDER):
 
316
 
317
  # Initialise an empty list to hold DataFrames
318
  dataframes = []
@@ -325,14 +445,16 @@ def merge_csv_files(file_list:List[str], output_folder:str=OUTPUT_FOLDER):
325
  dataframes.append(df)
326
 
327
  # Concatenate all DataFrames into a single DataFrame
328
- merged_df = pd.concat(dataframes, ignore_index=True)
329
 
330
- for col in ['xmin', 'xmax', 'ymin', 'ymax']:
331
  merged_df[col] = np.floor(merged_df[col])
332
 
333
- merged_df = merged_df.drop_duplicates(subset=['page', 'label', 'color', 'xmin', 'ymin', 'xmax', 'ymax'])
 
 
334
 
335
- merged_df = merged_df.sort_values(['page', 'ymin', 'xmin', 'label'])
336
 
337
  file_out_name = os.path.basename(file_list[0])
338
 
@@ -344,51 +466,58 @@ def merge_csv_files(file_list:List[str], output_folder:str=OUTPUT_FOLDER):
344
 
345
  return output_files
346
 
347
- async def get_connection_params(request: gr.Request,
348
- output_folder_textbox:str=OUTPUT_FOLDER,
349
- input_folder_textbox:str=INPUT_FOLDER,
350
- session_output_folder:str=SESSION_OUTPUT_FOLDER,
351
- textract_document_upload_input_folder:str=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER,
352
- textract_document_upload_output_folder:str=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER,
353
- s3_textract_document_logs_subfolder:str=TEXTRACT_JOBS_S3_LOC,
354
- local_textract_document_logs_subfolder:str=TEXTRACT_JOBS_LOCAL_LOC):
355
 
356
- #print("Session hash:", request.session_hash)
 
 
 
 
 
 
 
 
 
 
 
357
 
358
  if CUSTOM_HEADER and CUSTOM_HEADER_VALUE:
359
- if CUSTOM_HEADER in request.headers:
360
- supplied_custom_header_value = request.headers[CUSTOM_HEADER]
361
- if supplied_custom_header_value == CUSTOM_HEADER_VALUE:
362
- print("Custom header supplied and matches CUSTOM_HEADER_VALUE")
363
- else:
364
- print("Custom header value does not match expected value.")
365
- raise ValueError("Custom header value does not match expected value.")
366
  else:
367
- print("Custom header value not found.")
368
- raise ValueError("Custom header value not found.")
 
 
 
369
 
370
  # Get output save folder from 1 - username passed in from direct Cognito login, 2 - Cognito ID header passed through a Lambda authenticator, 3 - the session hash.
371
 
372
  if request.username:
373
  out_session_hash = request.username
374
- #print("Request username found:", out_session_hash)
375
 
376
- elif 'x-cognito-id' in request.headers:
377
- out_session_hash = request.headers['x-cognito-id']
378
- #print("Cognito ID found:", out_session_hash)
379
 
380
- elif 'x-amzn-oidc-identity' in request.headers:
381
- out_session_hash = request.headers['x-amzn-oidc-identity']
382
 
383
  # Fetch email address using Cognito client
384
- cognito_client = boto3.client('cognito-idp')
385
  try:
386
  response = cognito_client.admin_get_user(
387
  UserPoolId=AWS_USER_POOL_ID, # Replace with your User Pool ID
388
- Username=out_session_hash
389
  )
390
- email = next(attr['Value'] for attr in response['UserAttributes'] if attr['Name'] == 'email')
391
- #print("Email address found:", email)
 
 
 
 
392
 
393
  out_session_hash = email
394
  except ClientError as e:
@@ -400,33 +529,59 @@ async def get_connection_params(request: gr.Request,
400
  else:
401
  out_session_hash = request.session_hash
402
 
403
- if session_output_folder == 'True':
404
  output_folder = output_folder_textbox + out_session_hash + "/"
405
  input_folder = input_folder_textbox + out_session_hash + "/"
406
 
407
- textract_document_upload_input_folder = textract_document_upload_input_folder + "/" + out_session_hash
408
- textract_document_upload_output_folder = textract_document_upload_output_folder + "/" + out_session_hash
 
 
 
 
409
 
410
- s3_textract_document_logs_subfolder = s3_textract_document_logs_subfolder + "/" + out_session_hash
411
- local_textract_document_logs_subfolder = local_textract_document_logs_subfolder + "/" + out_session_hash + "/"
 
 
 
 
412
 
413
  else:
414
  output_folder = output_folder_textbox
415
  input_folder = input_folder_textbox
416
 
417
- if not os.path.exists(output_folder): os.mkdir(output_folder)
418
- if not os.path.exists(input_folder): os.mkdir(input_folder)
 
 
 
 
 
 
 
 
 
 
 
 
 
419
 
420
- return out_session_hash, output_folder, out_session_hash, input_folder, textract_document_upload_input_folder, textract_document_upload_output_folder, s3_textract_document_logs_subfolder, local_textract_document_logs_subfolder
421
 
422
- def clean_unicode_text(text:str):
423
  # Step 1: Normalise unicode characters to decompose any special forms
424
- normalized_text = unicodedata.normalize('NFKC', text)
425
 
426
  # Step 2: Replace smart quotes and special punctuation with standard ASCII equivalents
427
  replacements = {
428
- '‘': "'", '’': "'", '“': '"', '”': '"',
429
- '–': '-', '—': '-', '…': '...', '•': '*',
 
 
 
 
 
 
430
  }
431
 
432
  # Perform replacements
@@ -436,14 +591,15 @@ def clean_unicode_text(text:str):
436
  # Step 3: Optionally remove non-ASCII characters if needed
437
  # This regex removes any remaining non-ASCII characters, if desired.
438
  # Comment this line if you want to keep all Unicode characters.
439
- cleaned_text = re.sub(r'[^\x00-\x7F]+', '', normalized_text)
440
 
441
  return cleaned_text
442
-
443
- def load_all_output_files(folder_path:str=OUTPUT_FOLDER) -> List[str]:
 
444
  """Get the file paths of all files in the given folder."""
445
  file_paths = []
446
-
447
  # List all files in the specified folder
448
  for filename in os.listdir(folder_path):
449
  # Construct full file path
@@ -451,26 +607,29 @@ def load_all_output_files(folder_path:str=OUTPUT_FOLDER) -> List[str]:
451
  # Check if it's a file (not a directory)
452
  if os.path.isfile(full_path):
453
  file_paths.append(full_path)
454
-
455
  return file_paths
456
 
457
- def calculate_aws_costs(number_of_pages:str,
458
- text_extract_method_radio:str,
459
- handwrite_signature_checkbox:List[str],
460
- pii_identification_method:str,
461
- textract_output_found_checkbox:bool,
462
- only_extract_text_radio:bool,
463
- convert_to_gbp:bool=True,
464
- usd_gbp_conversion_rate:float=0.76,
465
- textract_page_cost:float=1.5/1000,
466
- textract_signature_cost:float=2.0/1000,
467
- comprehend_unit_cost:float=0.0001,
468
- comprehend_size_unit_average:float=250,
469
- average_characters_per_page:float=2000,
470
- TEXTRACT_TEXT_EXTRACT_OPTION:str=TEXTRACT_TEXT_EXTRACT_OPTION,
471
- NO_REDACTION_PII_OPTION:str=NO_REDACTION_PII_OPTION,
472
- AWS_PII_OPTION:str=AWS_PII_OPTION):
473
- '''
 
 
 
474
  Calculate the approximate cost of submitting a document to AWS Textract and/or AWS Comprehend, assuming that Textract outputs do not already exist in the output folder.
475
 
476
  - number_of_pages: The number of pages in the uploaded document(s).
@@ -489,49 +648,57 @@ def calculate_aws_costs(number_of_pages:str,
489
  - TEXTRACT_TEXT_EXTRACT_OPTION (str, optional): String label for the text_extract_method_radio button for AWS Textract.
490
  - NO_REDACTION_PII_OPTION (str, optional): String label for pii_identification_method_drop for no redaction.
491
  - AWS_PII_OPTION (str, optional): String label for pii_identification_method_drop for AWS Comprehend.
492
- '''
493
  text_extraction_cost = 0
494
  pii_identification_cost = 0
495
  calculated_aws_cost = 0
496
  number_of_pages = int(number_of_pages)
497
-
498
- if textract_output_found_checkbox != True:
499
  if text_extract_method_radio == TEXTRACT_TEXT_EXTRACT_OPTION:
500
  text_extraction_cost = number_of_pages * textract_page_cost
501
 
502
  if "Extract signatures" in handwrite_signature_checkbox:
503
- text_extraction_cost += (textract_signature_cost * number_of_pages)
504
 
505
  if pii_identification_method != NO_REDACTION_PII_OPTION:
506
  if pii_identification_method == AWS_PII_OPTION:
507
- comprehend_page_cost = ceil(average_characters_per_page / comprehend_size_unit_average) * comprehend_unit_cost
 
 
 
508
  pii_identification_cost = comprehend_page_cost * number_of_pages
509
 
510
- calculated_aws_cost = calculated_aws_cost + text_extraction_cost + pii_identification_cost
 
 
511
 
512
- if convert_to_gbp == True:
513
  calculated_aws_cost *= usd_gbp_conversion_rate
514
 
515
  return calculated_aws_cost
516
 
517
- def calculate_time_taken(number_of_pages:str,
518
- text_extract_method_radio:str,
519
- pii_identification_method:str,
520
- textract_output_found_checkbox:bool,
521
- only_extract_text_radio:bool,
522
- local_ocr_output_found_checkbox:bool,
523
- convert_page_time:float=0.5,
524
- textract_page_time:float=1.2,
525
- comprehend_page_time:float=1.2,
526
- local_text_extraction_page_time:float=0.3,
527
- local_pii_redaction_page_time:float=0.5,
528
- local_ocr_extraction_page_time:float=1.5,
529
- TEXTRACT_TEXT_EXTRACT_OPTION:str=TEXTRACT_TEXT_EXTRACT_OPTION,
530
- SELECTABLE_TEXT_EXTRACT_OPTION:str=SELECTABLE_TEXT_EXTRACT_OPTION,
531
- local_ocr_option:str=TESSERACT_TEXT_EXTRACT_OPTION,
532
- NO_REDACTION_PII_OPTION:str=NO_REDACTION_PII_OPTION,
533
- AWS_PII_OPTION:str=AWS_PII_OPTION):
534
- '''
 
 
 
535
  Calculate the approximate time to redact a document.
536
 
537
  - number_of_pages: The number of pages in the uploaded document(s).
@@ -548,9 +715,9 @@ def calculate_time_taken(number_of_pages:str,
548
  - TEXTRACT_TEXT_EXTRACT_OPTION (str, optional): String label for the text_extract_method_radio button for AWS Textract.
549
  - SELECTABLE_TEXT_EXTRACT_OPTION (str, optional): String label for text_extract_method_radio for text extraction.
550
  - local_ocr_option (str, optional): String label for text_extract_method_radio for local OCR.
551
- - NO_REDACTION_PII_OPTION (str, optional): String label for pii_identification_method_drop for no redaction.
552
  - AWS_PII_OPTION (str, optional): String label for pii_identification_method_drop for AWS Comprehend.
553
- '''
554
  calculated_time_taken = 0
555
  page_conversion_time_taken = 0
556
  page_extraction_time_taken = 0
@@ -559,16 +726,20 @@ def calculate_time_taken(number_of_pages:str,
559
  number_of_pages = int(number_of_pages)
560
 
561
  # Page preparation/conversion to image time
562
- if (text_extract_method_radio != SELECTABLE_TEXT_EXTRACT_OPTION) and (textract_output_found_checkbox != True):
 
 
563
  page_conversion_time_taken = number_of_pages * convert_page_time
564
 
565
  # Page text extraction time
566
  if text_extract_method_radio == TEXTRACT_TEXT_EXTRACT_OPTION:
567
- if textract_output_found_checkbox != True:
568
  page_extraction_time_taken = number_of_pages * textract_page_time
569
  elif text_extract_method_radio == local_ocr_option:
570
- if local_ocr_output_found_checkbox != True:
571
- page_extraction_time_taken = number_of_pages * local_ocr_extraction_page_time
 
 
572
  elif text_extract_method_radio == SELECTABLE_TEXT_EXTRACT_OPTION:
573
  page_conversion_time_taken = number_of_pages * local_text_extraction_page_time
574
 
@@ -579,47 +750,91 @@ def calculate_time_taken(number_of_pages:str,
579
  else:
580
  page_redaction_time_taken = number_of_pages * local_pii_redaction_page_time
581
 
582
- calculated_time_taken = (page_conversion_time_taken + page_extraction_time_taken + page_redaction_time_taken)/60
 
 
 
 
583
 
584
  return calculated_time_taken
585
 
586
- def reset_base_dataframe(df:pd.DataFrame):
 
587
  return df
588
 
589
- def reset_ocr_base_dataframe(df:pd.DataFrame):
 
590
  if df.empty:
591
  return pd.DataFrame(columns=["page", "line", "text"])
592
  else:
593
  return df.loc[:, ["page", "line", "text"]]
594
 
595
- def reset_ocr_with_words_base_dataframe(df:pd.DataFrame, page_entity_dropdown_redaction_value:str):
596
-
 
 
 
597
  df["index"] = df.index
598
  output_df = df.copy()
599
 
600
- df["page"]=df["page"].astype(str)
601
-
602
- output_df_filtered = df.loc[df["page"]==str(page_entity_dropdown_redaction_value), ["page", "line", "word_text", "word_x0", "word_y0", "word_x1", "word_y1", "index"]]
 
 
 
 
 
 
 
 
 
 
 
 
603
  return output_df_filtered, output_df
604
 
605
- def update_language_dropdown(chosen_language_full_name_drop, textract_language_choices=textract_language_choices, aws_comprehend_language_choices=aws_comprehend_language_choices, LANGUAGE_MAP=LANGUAGE_MAP):
 
 
 
 
 
 
606
 
607
  try:
608
  full_language_name = chosen_language_full_name_drop.lower()
609
  matched_language = LANGUAGE_MAP[full_language_name]
610
 
611
- chosen_language_drop = gr.Dropdown(value = matched_language, choices = LANGUAGE_CHOICES, label="Chosen language short code", multiselect=False, visible=True)
612
-
613
- if matched_language not in aws_comprehend_language_choices and matched_language not in textract_language_choices:
614
- gr.Info(f"Note that {full_language_name} is not supported by AWS Comprehend or AWS Textract")
 
 
 
 
 
 
 
 
 
 
 
615
  elif matched_language not in aws_comprehend_language_choices:
616
- gr.Info(f"Note that {full_language_name} is not supported by AWS Comprehend")
 
 
617
  elif matched_language not in textract_language_choices:
618
  gr.Info(f"Note that {full_language_name} is not supported by AWS Textract")
619
  except Exception as e:
620
  print(e)
621
  gr.Info("Could not find language in list")
622
- chosen_language_drop = gr.Dropdown(value = DEFAULT_LANGUAGE, choices = LANGUAGE_CHOICES, label="Chosen language short code", multiselect=False)
 
 
 
 
 
623
 
624
  return chosen_language_drop
625
-
 
1
  import os
2
  import re
3
+ import unicodedata
4
+ from math import ceil
5
+ from typing import List
6
+
7
  import boto3
 
8
  import gradio as gr
 
9
  import numpy as np
10
+ import pandas as pd
11
+ from botocore.exceptions import ClientError
 
12
  from gradio_image_annotation import image_annotator
13
+
14
+ from tools.config import (
15
+ AWS_PII_OPTION,
16
+ AWS_USER_POOL_ID,
17
+ CUSTOM_HEADER,
18
+ CUSTOM_HEADER_VALUE,
19
+ DEFAULT_LANGUAGE,
20
+ INPUT_FOLDER,
21
+ LANGUAGE_CHOICES,
22
+ MAPPED_LANGUAGE_CHOICES,
23
+ NO_REDACTION_PII_OPTION,
24
+ OUTPUT_FOLDER,
25
+ SELECTABLE_TEXT_EXTRACT_OPTION,
26
+ SESSION_OUTPUT_FOLDER,
27
+ TESSERACT_TEXT_EXTRACT_OPTION,
28
+ TEXTRACT_JOBS_LOCAL_LOC,
29
+ TEXTRACT_JOBS_S3_LOC,
30
+ TEXTRACT_TEXT_EXTRACT_OPTION,
31
+ TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER,
32
+ TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER,
33
+ aws_comprehend_language_choices,
34
+ textract_language_choices,
35
+ )
36
+
37
 
38
  def _get_env_list(env_var_name: str) -> List[str]:
39
  """Parses a comma-separated environment variable into a list of strings."""
40
+ value = env_var_name[1:-1].strip().replace('"', "").replace("'", "")
41
  if not value:
42
  return []
43
  # Split by comma and filter out any empty strings that might result from extra commas
44
+ return [s.strip() for s in value.split(",") if s.strip()]
45
+
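As a quick illustration of the parsing above, _get_env_list expects the whole bracketed string from the config (the sample value below is hypothetical, not taken from the project config):

    _get_env_list("['en', 'fr', 'de']")  # strips the brackets and quotes, then splits -> ['en', 'fr', 'de']
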
46
 
47
+ if textract_language_choices:
48
+ textract_language_choices = _get_env_list(textract_language_choices)
49
+ if aws_comprehend_language_choices:
50
+ aws_comprehend_language_choices = _get_env_list(aws_comprehend_language_choices)
51
 
52
+ if MAPPED_LANGUAGE_CHOICES:
53
+ MAPPED_LANGUAGE_CHOICES = _get_env_list(MAPPED_LANGUAGE_CHOICES)
54
+ if LANGUAGE_CHOICES:
55
+ LANGUAGE_CHOICES = _get_env_list(LANGUAGE_CHOICES)
56
 
57
  LANGUAGE_MAP = dict(zip(MAPPED_LANGUAGE_CHOICES, LANGUAGE_CHOICES))
58
 
59
+
60
  def reset_state_vars():
61
+ return (
62
+ [],
63
+ pd.DataFrame(),
64
+ pd.DataFrame(),
65
+ 0,
66
+ "",
67
+ image_annotator(
68
  label="Modify redaction boxes",
69
  label_list=["Redaction"],
70
  label_colors=[(0, 0, 0)],
71
  show_label=False,
72
+ sources=None, # ["upload"],
73
  show_clear_button=False,
74
  show_share_button=False,
75
  show_remove_button=False,
76
+ interactive=False,
77
+ ),
78
+ [],
79
+ [],
80
+ pd.DataFrame(),
81
+ pd.DataFrame(),
82
+ [],
83
+ [],
84
+ "",
85
+ False,
86
+ 0,
87
+ [],
88
+ )
89
+
90
 
91
  def reset_ocr_results_state():
92
  return pd.DataFrame(), pd.DataFrame(), []
93
 
94
+
95
  def reset_review_vars():
96
  return pd.DataFrame(), pd.DataFrame()
97
 
98
+
99
  def reset_data_vars():
100
  return 0, [], 0
101
 
102
+
103
  def reset_aws_call_vars():
104
  return 0, 0
105
 
106
+
107
  def load_in_default_allow_list(allow_list_file_path):
108
  if isinstance(allow_list_file_path, str):
109
  allow_list_file_path = [allow_list_file_path]
110
  return allow_list_file_path
111
 
112
+
113
+ def load_in_default_cost_codes(cost_codes_path: str, default_cost_code: str = ""):
114
+ """
115
  Load in the cost codes list from file.
116
+ """
117
  cost_codes_df = pd.read_csv(cost_codes_path)
118
  dropdown_choices = cost_codes_df.iloc[:, 0].astype(str).tolist()
119
 
 
129
  value=default_cost_code if default_cost_code in dropdown_choices else "",
130
  label="Choose cost code for analysis",
131
  choices=dropdown_choices,
132
+ allow_custom_value=False,
133
  )
134
+
135
  return cost_codes_df, cost_codes_df, out_dropdown
136
 
137
+
138
+ def enforce_cost_codes(
139
+ enforce_cost_code_textbox: str,
140
+ cost_code_choice: str,
141
+ cost_code_df: pd.DataFrame,
142
+ verify_cost_codes: bool = True,
143
+ ):
144
+ """
145
  Check if the enforce cost codes variable is set to true, and then check that a cost code has been chosen. If not, raise an error. Then, check against the values in the cost code dataframe to ensure that the cost code exists.
146
+ """
147
 
148
  if enforce_cost_code_textbox == "True":
149
  if not cost_code_choice:
150
  raise Exception("Please choose a cost code before continuing")
151
+
152
+ if verify_cost_codes is True:
153
  if cost_code_df.empty:
154
  raise Exception("No cost codes present in dataframe for verification")
155
  else:
156
+ valid_cost_codes_list = list(cost_code_df.iloc[:, 0].unique())
157
 
158
+ if cost_code_choice not in valid_cost_codes_list:
159
+ raise Exception(
160
+ "Selected cost code not found in list. Please contact Finance if you cannot find the correct cost code from the given list of suggestions."
161
+ )
162
  return
163
 
164
+
165
+ def update_cost_code_dataframe_from_dropdown_select(
166
+ cost_dropdown_selection: str, cost_code_df: pd.DataFrame
167
+ ):
168
+ cost_code_df = cost_code_df.loc[
169
+ cost_code_df.iloc[:, 0] == cost_dropdown_selection, :
170
+ ]
171
  return cost_code_df
172
 
173
+
174
+ def ensure_folder_exists(output_folder: str):
175
+ """Checks if the specified folder exists, creates it if not."""
176
 
177
  if not os.path.exists(output_folder):
178
  # Create the folder if it doesn't exist
 
181
  else:
182
  print(f"The {output_folder} folder already exists.")
183
 
184
+
185
+ def update_dataframe(df: pd.DataFrame):
186
  df_copy = df.copy()
187
  return df_copy
188
 
189
+
190
  def get_file_name_without_type(file_path):
191
  # First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
192
  basename = os.path.basename(file_path)
193
+
194
  # Then, split the basename and its extension and return only the basename without the extension
195
  filename_without_extension, _ = os.path.splitext(basename)
196
 
197
+ # print(filename_without_extension)
198
+
199
  return filename_without_extension
200
 
201
+
202
+ def detect_file_type(filename: str):
203
  """Detect the file type based on its extension."""
204
  if not isinstance(filename, str):
205
  filename = str(filename)
206
 
207
+ if (
208
+ (filename.endswith(".csv"))
209
+ | (filename.endswith(".csv.gz"))
210
+ | (filename.endswith(".zip"))
211
+ ):
212
+ return "csv"
213
+ elif filename.endswith(".xlsx"):
214
+ return "xlsx"
215
+ elif filename.endswith(".xls"):
216
+ return "xls"
217
+ elif filename.endswith(".parquet"):
218
+ return "parquet"
219
+ elif filename.endswith(".pdf"):
220
+ return "pdf"
221
+ elif filename.endswith(".jpg"):
222
+ return "jpg"
223
+ elif filename.endswith(".jpeg"):
224
+ return "jpeg"
225
+ elif filename.endswith(".png"):
226
+ return "png"
227
+ elif filename.endswith(".xfdf"):
228
+ return "xfdf"
229
+ elif filename.endswith(".docx"):
230
+ return "docx"
231
+ else:
232
+ raise ValueError("Unsupported file type.")
233
+
234
+
235
+ def read_file(filename: str, excel_sheet_name: str = ""):
236
  """Read the file based on its detected type."""
237
  file_type = detect_file_type(filename)
238
+
239
+ if file_type == "csv":
240
  return pd.read_csv(filename, low_memory=False)
241
+ elif file_type == "xlsx":
242
  if excel_sheet_name:
243
  try:
244
  return pd.read_excel(filename, sheet_name=excel_sheet_name)
245
  except Exception as e:
246
+ print(
247
+ f"Error reading {filename} with sheet name {excel_sheet_name}: {e}"
248
+ )
249
  return pd.DataFrame()
250
  else:
251
  return pd.read_excel(filename)
252
+ elif file_type == "parquet":
253
  return pd.read_parquet(filename)
254
 
255
+
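A short usage sketch for the two helpers above, with a hypothetical file name (the sheet name is only used for .xlsx files, and unsupported extensions raise ValueError):

    detect_file_type("case_notes.xlsx")  # -> "xlsx"
    df = read_file("case_notes.xlsx", excel_sheet_name="Sheet1")
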
256
+ def ensure_output_folder_exists(output_folder: str):
257
+ """Checks if the specified folder exists, creates it if not."""
258
 
259
  if not os.path.exists(output_folder):
260
  # Create the folder if it doesn't exist
 
264
  print(f"The {output_folder} folder already exists.")
265
 
266
 
267
+ def custom_regex_load(in_file: List[str], file_type: str = "allow_list"):
268
+ """
 
269
  When file is loaded, update the column dropdown choices and write to relevant data states.
270
+ """
271
  custom_regex_df = pd.DataFrame()
272
 
273
  if in_file:
 
276
  regex_file_names = [string for string in file_list if "csv" in string.lower()]
277
  if regex_file_names:
278
  regex_file_name = regex_file_names[0]
279
+ custom_regex_df = pd.read_csv(
280
+ regex_file_name, low_memory=False, header=None
281
+ )
282
+
283
  # Select just first columns
284
+ custom_regex_df = pd.DataFrame(custom_regex_df.iloc[:, [0]])
285
+ custom_regex_df.rename(columns={0: file_type}, inplace=True)
286
 
287
  custom_regex_df.columns = custom_regex_df.columns.astype(str)
288
 
 
290
  print(output_text)
291
  else:
292
  output_text = "No file provided."
293
+ # print(output_text)
294
  return output_text, custom_regex_df
295
+
296
  return output_text, custom_regex_df
297
 
298
+
299
+ def put_columns_in_df(in_file: List[str]):
300
  new_choices = []
301
  concat_choices = []
302
  all_sheet_names = []
303
  number_of_excel_files = 0
304
+
305
  for file in in_file:
306
  file_name = file.name
307
  file_type = detect_file_type(file_name)
308
  print("File type is:", file_type)
309
 
310
+ if (file_type == "xlsx") | (file_type == "xls"):
311
  number_of_excel_files += 1
312
  new_choices = []
313
  print("Running through all xlsx sheets")
 
331
  new_choices = []
332
 
333
  concat_choices.extend(new_choices)
334
+
335
  # Drop duplicate columns
336
  concat_choices = list(set(concat_choices))
337
 
338
+ if number_of_excel_files > 0:
339
+ return gr.Dropdown(choices=concat_choices, value=concat_choices), gr.Dropdown(
340
+ choices=all_sheet_names, value=all_sheet_names, visible=True
341
+ )
342
  else:
343
+ return gr.Dropdown(choices=concat_choices, value=concat_choices), gr.Dropdown(
344
+ visible=False
345
+ )
346
 
347
+
348
+ def check_for_existing_textract_file(
349
+ doc_file_name_no_extension_textbox: str, output_folder: str = OUTPUT_FOLDER
350
+ ):
351
+ textract_output_path = os.path.join(
352
+ output_folder, doc_file_name_no_extension_textbox + "_textract.json"
353
+ )
354
 
355
  if os.path.exists(textract_output_path):
356
+ print("Existing Textract analysis output file found.")
357
  return True
358
+
359
  else:
360
  return False
361
+
362
+
363
+ def check_for_relevant_ocr_output_with_words(
364
+ doc_file_name_no_extension_textbox: str,
365
+ text_extraction_method: str,
366
+ output_folder: str = OUTPUT_FOLDER,
367
+ ):
368
+ if text_extraction_method == SELECTABLE_TEXT_EXTRACT_OPTION:
369
+ file_ending = "_ocr_results_with_words_local_text.json"
370
+ elif text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION:
371
+ file_ending = "_ocr_results_with_words_local_ocr.json"
372
+ elif text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
373
+ file_ending = "_ocr_results_with_words_textract.json"
374
  else:
375
  print("No valid text extraction method found. Returning False")
376
  return False
377
+
378
  doc_file_with_ending = doc_file_name_no_extension_textbox + file_ending
379
 
380
  local_ocr_output_path = os.path.join(output_folder, doc_file_with_ending)
381
 
382
  if os.path.exists(local_ocr_output_path):
383
+ print("Existing OCR with words analysis output file found.")
384
+ return True
385
  else:
386
  return False
387
 
388
+
389
  def add_folder_to_path(folder_path: str):
390
+ """
391
  Check if a folder exists on your system. If so, get the absolute path and then add it to the system Path variable if it doesn't already exist. This function is only relevant for locally-created executable files based on this app (when using pyinstaller it creates an _internal folder that contains tesseract and poppler; these need to be added to the system path to enable the app to run)
392
+ """
393
 
394
  if os.path.exists(folder_path) and os.path.isdir(folder_path):
395
  print(folder_path, "folder exists.")
 
397
  # Resolve relative path to absolute path
398
  absolute_path = os.path.abspath(folder_path)
399
 
400
+ current_path = os.environ["PATH"]
401
  if absolute_path not in current_path.split(os.pathsep):
402
  full_path_extension = absolute_path + os.pathsep + current_path
403
+ os.environ["PATH"] = full_path_extension
404
+ # print(f"Updated PATH with: ", full_path_extension)
405
  else:
406
  print(f"Directory {folder_path} already exists in PATH.")
407
  else:
408
  print(f"Folder not found at {folder_path} - not added to PATH")
409
 
410
+
411
  # Upon running a process, the feedback buttons are revealed
412
  def reveal_feedback_buttons():
413
+ return (
414
+ gr.Radio(
415
+ visible=True,
416
+ label="Please give some feedback about the results of the redaction. A reminder that the app is only expected to identify about 60% of personally identifiable information in a given (typed) document.",
417
+ ),
418
+ gr.Textbox(visible=True),
419
+ gr.Button(visible=True),
420
+ gr.Markdown(visible=True),
421
+ )
422
 
423
+
424
+ def wipe_logs(feedback_logs_loc: str, usage_logs_loc: str):
425
  try:
426
  os.remove(feedback_logs_loc)
427
  except Exception as e:
 
431
  except Exception as e:
432
  print("Could not remove usage logs file", e)
433
 
434
+
435
+ def merge_csv_files(file_list: List[str], output_folder: str = OUTPUT_FOLDER):
436
 
437
  # Initialise an empty list to hold DataFrames
438
  dataframes = []
 
445
  dataframes.append(df)
446
 
447
  # Concatenate all DataFrames into a single DataFrame
448
+ merged_df = pd.concat(dataframes, ignore_index=True)
449
 
450
+ for col in ["xmin", "xmax", "ymin", "ymax"]:
451
  merged_df[col] = np.floor(merged_df[col])
452
 
453
+ merged_df = merged_df.drop_duplicates(
454
+ subset=["page", "label", "color", "xmin", "ymin", "xmax", "ymax"]
455
+ )
456
 
457
+ merged_df = merged_df.sort_values(["page", "ymin", "xmin", "label"])
458
 
459
  file_out_name = os.path.basename(file_list[0])
460
 
 
466
 
467
  return output_files
468
 
 
 
 
 
 
 
 
 
469
 
470
+ async def get_connection_params(
471
+ request: gr.Request,
472
+ output_folder_textbox: str = OUTPUT_FOLDER,
473
+ input_folder_textbox: str = INPUT_FOLDER,
474
+ session_output_folder: str = SESSION_OUTPUT_FOLDER,
475
+ textract_document_upload_input_folder: str = TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER,
476
+ textract_document_upload_output_folder: str = TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER,
477
+ s3_textract_document_logs_subfolder: str = TEXTRACT_JOBS_S3_LOC,
478
+ local_textract_document_logs_subfolder: str = TEXTRACT_JOBS_LOCAL_LOC,
479
+ ):
480
+
481
+ # print("Session hash:", request.session_hash)
482
 
483
  if CUSTOM_HEADER and CUSTOM_HEADER_VALUE:
484
+ if CUSTOM_HEADER in request.headers:
485
+ supplied_custom_header_value = request.headers[CUSTOM_HEADER]
486
+ if supplied_custom_header_value == CUSTOM_HEADER_VALUE:
487
+ print("Custom header supplied and matches CUSTOM_HEADER_VALUE")
 
 
 
488
  else:
489
+ print("Custom header value does not match expected value.")
490
+ raise ValueError("Custom header value does not match expected value.")
491
+ else:
492
+ print("Custom header value not found.")
493
+ raise ValueError("Custom header value not found.")
494
 
495
  # Get output save folder from 1 - username passed in from direct Cognito login, 2 - Cognito ID header passed through a Lambda authenticator, 3 - the session hash.
496
 
497
  if request.username:
498
  out_session_hash = request.username
499
+ # print("Request username found:", out_session_hash)
500
 
501
+ elif "x-cognito-id" in request.headers:
502
+ out_session_hash = request.headers["x-cognito-id"]
503
+ # print("Cognito ID found:", out_session_hash)
504
 
505
+ elif "x-amzn-oidc-identity" in request.headers:
506
+ out_session_hash = request.headers["x-amzn-oidc-identity"]
507
 
508
  # Fetch email address using Cognito client
509
+ cognito_client = boto3.client("cognito-idp")
510
  try:
511
  response = cognito_client.admin_get_user(
512
  UserPoolId=AWS_USER_POOL_ID, # Replace with your User Pool ID
513
+ Username=out_session_hash,
514
  )
515
+ email = next(
516
+ attr["Value"]
517
+ for attr in response["UserAttributes"]
518
+ if attr["Name"] == "email"
519
+ )
520
+ # print("Email address found:", email)
521
 
522
  out_session_hash = email
523
  except ClientError as e:
 
529
  else:
530
  out_session_hash = request.session_hash
531
 
532
+ if session_output_folder == "True":
533
  output_folder = output_folder_textbox + out_session_hash + "/"
534
  input_folder = input_folder_textbox + out_session_hash + "/"
535
 
536
+ textract_document_upload_input_folder = (
537
+ textract_document_upload_input_folder + "/" + out_session_hash
538
+ )
539
+ textract_document_upload_output_folder = (
540
+ textract_document_upload_output_folder + "/" + out_session_hash
541
+ )
542
 
543
+ s3_textract_document_logs_subfolder = (
544
+ s3_textract_document_logs_subfolder + "/" + out_session_hash
545
+ )
546
+ local_textract_document_logs_subfolder = (
547
+ local_textract_document_logs_subfolder + "/" + out_session_hash + "/"
548
+ )
549
 
550
  else:
551
  output_folder = output_folder_textbox
552
  input_folder = input_folder_textbox
553
 
554
+ if not os.path.exists(output_folder):
555
+ os.mkdir(output_folder)
556
+ if not os.path.exists(input_folder):
557
+ os.mkdir(input_folder)
558
+
559
+ return (
560
+ out_session_hash,
561
+ output_folder,
562
+ out_session_hash,
563
+ input_folder,
564
+ textract_document_upload_input_folder,
565
+ textract_document_upload_output_folder,
566
+ s3_textract_document_logs_subfolder,
567
+ local_textract_document_logs_subfolder,
568
+ )
569
 
 
570
 
571
+ def clean_unicode_text(text: str):
572
  # Step 1: Normalise unicode characters to decompose any special forms
573
+ normalized_text = unicodedata.normalize("NFKC", text)
574
 
575
  # Step 2: Replace smart quotes and special punctuation with standard ASCII equivalents
576
  replacements = {
577
+ "‘": "'",
578
+ "’": "'",
579
+ "“": '"',
580
+ "”": '"',
581
+ "–": "-",
582
+ "—": "-",
583
+ "…": "...",
584
+ "•": "*",
585
  }
586
 
587
  # Perform replacements
 
591
  # Step 3: Optionally remove non-ASCII characters if needed
592
  # This regex removes any remaining non-ASCII characters, if desired.
593
  # Comment this line if you want to keep all Unicode characters.
594
+ cleaned_text = re.sub(r"[^\x00-\x7F]+", "", normalized_text)
595
 
596
  return cleaned_text
597
+
598
+
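A rough before/after example for clean_unicode_text (illustrative input; NFKC normalisation already converts the ellipsis, and the final regex drops any remaining non-ASCII characters):

    clean_unicode_text("“Smart quotes” – dashes and ellipses…")
    # -> '"Smart quotes" - dashes and ellipses...'
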
599
+ def load_all_output_files(folder_path: str = OUTPUT_FOLDER) -> List[str]:
600
  """Get the file paths of all files in the given folder."""
601
  file_paths = []
602
+
603
  # List all files in the specified folder
604
  for filename in os.listdir(folder_path):
605
  # Construct full file path
 
607
  # Check if it's a file (not a directory)
608
  if os.path.isfile(full_path):
609
  file_paths.append(full_path)
610
+
611
  return file_paths
612
 
613
+
614
+ def calculate_aws_costs(
615
+ number_of_pages: str,
616
+ text_extract_method_radio: str,
617
+ handwrite_signature_checkbox: List[str],
618
+ pii_identification_method: str,
619
+ textract_output_found_checkbox: bool,
620
+ only_extract_text_radio: bool,
621
+ convert_to_gbp: bool = True,
622
+ usd_gbp_conversion_rate: float = 0.76,
623
+ textract_page_cost: float = 1.5 / 1000,
624
+ textract_signature_cost: float = 2.0 / 1000,
625
+ comprehend_unit_cost: float = 0.0001,
626
+ comprehend_size_unit_average: float = 250,
627
+ average_characters_per_page: float = 2000,
628
+ TEXTRACT_TEXT_EXTRACT_OPTION: str = TEXTRACT_TEXT_EXTRACT_OPTION,
629
+ NO_REDACTION_PII_OPTION: str = NO_REDACTION_PII_OPTION,
630
+ AWS_PII_OPTION: str = AWS_PII_OPTION,
631
+ ):
632
+ """
633
  Calculate the approximate cost of submitting a document to AWS Textract and/or AWS Comprehend, assuming that Textract outputs do not already exist in the output folder.
634
 
635
  - number_of_pages: The number of pages in the uploaded document(s).
 
648
  - TEXTRACT_TEXT_EXTRACT_OPTION (str, optional): String label for the text_extract_method_radio button for AWS Textract.
649
  - NO_REDACTION_PII_OPTION (str, optional): String label for pii_identification_method_drop for no redaction.
650
  - AWS_PII_OPTION (str, optional): String label for pii_identification_method_drop for AWS Comprehend.
651
+ """
652
  text_extraction_cost = 0
653
  pii_identification_cost = 0
654
  calculated_aws_cost = 0
655
  number_of_pages = int(number_of_pages)
656
+
657
+ if textract_output_found_checkbox is not True:
658
  if text_extract_method_radio == TEXTRACT_TEXT_EXTRACT_OPTION:
659
  text_extraction_cost = number_of_pages * textract_page_cost
660
 
661
  if "Extract signatures" in handwrite_signature_checkbox:
662
+ text_extraction_cost += textract_signature_cost * number_of_pages
663
 
664
  if pii_identification_method != NO_REDACTION_PII_OPTION:
665
  if pii_identification_method == AWS_PII_OPTION:
666
+ comprehend_page_cost = (
667
+ ceil(average_characters_per_page / comprehend_size_unit_average)
668
+ * comprehend_unit_cost
669
+ )
670
  pii_identification_cost = comprehend_page_cost * number_of_pages
671
 
672
+ calculated_aws_cost = (
673
+ calculated_aws_cost + text_extraction_cost + pii_identification_cost
674
+ )
675
 
676
+ if convert_to_gbp is True:
677
  calculated_aws_cost *= usd_gbp_conversion_rate
678
 
679
  return calculated_aws_cost
680
 
681
+
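Working through calculate_aws_costs for a hypothetical 10-page document sent to Textract (no signature extraction) with AWS Comprehend PII detection, using the default rates above and converting to GBP:

    from math import ceil

    text_extraction_cost = 10 * (1.5 / 1000)                             # 0.015 USD
    comprehend_page_cost = ceil(2000 / 250) * 0.0001                     # 8 units per page -> 0.0008 USD
    pii_identification_cost = comprehend_page_cost * 10                  # 0.008 USD
    total_gbp = (text_extraction_cost + pii_identification_cost) * 0.76  # ~0.0175 GBP
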
682
+ def calculate_time_taken(
683
+ number_of_pages: str,
684
+ text_extract_method_radio: str,
685
+ pii_identification_method: str,
686
+ textract_output_found_checkbox: bool,
687
+ only_extract_text_radio: bool,
688
+ local_ocr_output_found_checkbox: bool,
689
+ convert_page_time: float = 0.5,
690
+ textract_page_time: float = 1.2,
691
+ comprehend_page_time: float = 1.2,
692
+ local_text_extraction_page_time: float = 0.3,
693
+ local_pii_redaction_page_time: float = 0.5,
694
+ local_ocr_extraction_page_time: float = 1.5,
695
+ TEXTRACT_TEXT_EXTRACT_OPTION: str = TEXTRACT_TEXT_EXTRACT_OPTION,
696
+ SELECTABLE_TEXT_EXTRACT_OPTION: str = SELECTABLE_TEXT_EXTRACT_OPTION,
697
+ local_ocr_option: str = TESSERACT_TEXT_EXTRACT_OPTION,
698
+ NO_REDACTION_PII_OPTION: str = NO_REDACTION_PII_OPTION,
699
+ AWS_PII_OPTION: str = AWS_PII_OPTION,
700
+ ):
701
+ """
702
  Calculate the approximate time to redact a document.
703
 
704
  - number_of_pages: The number of pages in the uploaded document(s).
 
715
  - TEXTRACT_TEXT_EXTRACT_OPTION (str, optional): String label for the text_extract_method_radio button for AWS Textract.
716
  - SELECTABLE_TEXT_EXTRACT_OPTION (str, optional): String label for text_extract_method_radio for text extraction.
717
  - local_ocr_option (str, optional): String label for text_extract_method_radio for local OCR.
718
+ - NO_REDACTION_PII_OPTION (str, optional): String label for pii_identification_method_drop for no redaction.
719
  - AWS_PII_OPTION (str, optional): String label for pii_identification_method_drop for AWS Comprehend.
720
+ """
721
  calculated_time_taken = 0
722
  page_conversion_time_taken = 0
723
  page_extraction_time_taken = 0
 
726
  number_of_pages = int(number_of_pages)
727
 
728
  # Page preparation/conversion to image time
729
+ if (text_extract_method_radio != SELECTABLE_TEXT_EXTRACT_OPTION) and (
730
+ textract_output_found_checkbox is not True
731
+ ):
732
  page_conversion_time_taken = number_of_pages * convert_page_time
733
 
734
  # Page text extraction time
735
  if text_extract_method_radio == TEXTRACT_TEXT_EXTRACT_OPTION:
736
+ if textract_output_found_checkbox is not True:
737
  page_extraction_time_taken = number_of_pages * textract_page_time
738
  elif text_extract_method_radio == local_ocr_option:
739
+ if local_ocr_output_found_checkbox is not True:
740
+ page_extraction_time_taken = (
741
+ number_of_pages * local_ocr_extraction_page_time
742
+ )
743
  elif text_extract_method_radio == SELECTABLE_TEXT_EXTRACT_OPTION:
744
  page_conversion_time_taken = number_of_pages * local_text_extraction_page_time
745
 
 
750
  else:
751
  page_redaction_time_taken = number_of_pages * local_pii_redaction_page_time
752
 
753
+ calculated_time_taken = (
754
+ page_conversion_time_taken
755
+ + page_extraction_time_taken
756
+ + page_redaction_time_taken
757
+ ) / 60
758
 
759
  return calculated_time_taken
760
 
761
+
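A comparable back-of-the-envelope run of calculate_time_taken for the same hypothetical 10-page scan (Textract extraction plus AWS Comprehend redaction, no cached outputs; the elided Comprehend branch is assumed to apply comprehend_page_time):

    page_conversion_time_taken = 10 * 0.5         # 5 seconds
    page_extraction_time_taken = 10 * 1.2         # 12 seconds
    page_redaction_time_taken = 10 * 1.2          # 12 seconds
    calculated_time_taken = (5 + 12 + 12) / 60    # ~0.48 minutes
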
762
+ def reset_base_dataframe(df: pd.DataFrame):
763
  return df
764
 
765
+
766
+ def reset_ocr_base_dataframe(df: pd.DataFrame):
767
  if df.empty:
768
  return pd.DataFrame(columns=["page", "line", "text"])
769
  else:
770
  return df.loc[:, ["page", "line", "text"]]
771
 
772
+
773
+ def reset_ocr_with_words_base_dataframe(
774
+ df: pd.DataFrame, page_entity_dropdown_redaction_value: str
775
+ ):
776
+
777
  df["index"] = df.index
778
  output_df = df.copy()
779
 
780
+ df["page"] = df["page"].astype(str)
781
+
782
+ output_df_filtered = df.loc[
783
+ df["page"] == str(page_entity_dropdown_redaction_value),
784
+ [
785
+ "page",
786
+ "line",
787
+ "word_text",
788
+ "word_x0",
789
+ "word_y0",
790
+ "word_x1",
791
+ "word_y1",
792
+ "index",
793
+ ],
794
+ ]
795
  return output_df_filtered, output_df
796
 
797
+
798
+ def update_language_dropdown(
799
+ chosen_language_full_name_drop,
800
+ textract_language_choices=textract_language_choices,
801
+ aws_comprehend_language_choices=aws_comprehend_language_choices,
802
+ LANGUAGE_MAP=LANGUAGE_MAP,
803
+ ):
804
 
805
  try:
806
  full_language_name = chosen_language_full_name_drop.lower()
807
  matched_language = LANGUAGE_MAP[full_language_name]
808
 
809
+ chosen_language_drop = gr.Dropdown(
810
+ value=matched_language,
811
+ choices=LANGUAGE_CHOICES,
812
+ label="Chosen language short code",
813
+ multiselect=False,
814
+ visible=True,
815
+ )
816
+
817
+ if (
818
+ matched_language not in aws_comprehend_language_choices
819
+ and matched_language not in textract_language_choices
820
+ ):
821
+ gr.Info(
822
+ f"Note that {full_language_name} is not supported by AWS Comprehend or AWS Textract"
823
+ )
824
  elif matched_language not in aws_comprehend_language_choices:
825
+ gr.Info(
826
+ f"Note that {full_language_name} is not supported by AWS Comprehend"
827
+ )
828
  elif matched_language not in textract_language_choices:
829
  gr.Info(f"Note that {full_language_name} is not supported by AWS Textract")
830
  except Exception as e:
831
  print(e)
832
  gr.Info("Could not find language in list")
833
+ chosen_language_drop = gr.Dropdown(
834
+ value=DEFAULT_LANGUAGE,
835
+ choices=LANGUAGE_CHOICES,
836
+ label="Chosen language short code",
837
+ multiselect=False,
838
+ )
839
 
840
  return chosen_language_drop
 
tools/load_spacy_model_custom_recognisers.py CHANGED
@@ -1,36 +1,58 @@
1
  from typing import List
2
- from presidio_analyzer import AnalyzerEngine, PatternRecognizer, EntityRecognizer, Pattern, RecognizerResult
3
- from presidio_analyzer.nlp_engine import SpacyNlpEngine, NlpArtifacts, NerModelConfiguration
4
  import spacy
 
 
 
 
 
 
 
 
 
 
 
 
5
  from spacy.matcher import Matcher
6
  from spaczz.matcher import FuzzyMatcher
 
7
  spacy.prefer_gpu()
8
- from spacy.cli.download import download
9
- import Levenshtein
10
- import re
11
  import os
12
- import requests
 
13
  import gradio as gr
14
- from tools.config import DEFAULT_LANGUAGE, TESSERACT_DATA_FOLDER, CUSTOM_ENTITIES
 
 
 
 
15
  from tools.helper_functions import _get_env_list
16
 
17
  score_threshold = 0.001
18
 
19
- if CUSTOM_ENTITIES: CUSTOM_ENTITIES = _get_env_list(CUSTOM_ENTITIES)
 
20
  custom_entities = CUSTOM_ENTITIES
21
 
 
22
  # Create a class inheriting from SpacyNlpEngine
23
  class LoadedSpacyNlpEngine(SpacyNlpEngine):
24
  def __init__(self, loaded_spacy_model, language_code: str):
25
- super().__init__(ner_model_configuration=NerModelConfiguration(labels_to_ignore=["CARDINAL", "ORDINAL"])) # Ignore non-relevant labels
 
 
 
 
26
  self.nlp = {language_code: loaded_spacy_model}
27
 
 
28
  def _base_language_code(language: str) -> str:
29
  lang = _normalize_language_input(language)
30
  if "_" in lang:
31
  return lang.split("_")[0]
32
  return lang
33
 
 
34
  def load_spacy_model(language: str = DEFAULT_LANGUAGE):
35
  """
36
  Load a spaCy model for the requested language and return it as `nlp`.
@@ -78,32 +100,39 @@ def load_spacy_model(language: str = DEFAULT_LANGUAGE):
78
  "en_trf": ["en_core_web_trf"],
79
  "en_md": ["en_core_web_md"],
80
  "en_sm": ["en_core_web_sm"],
81
-
82
  # Major languages (news pipelines)
83
- "ca": ["ca_core_news_lg", "ca_core_news_md", "ca_core_news_sm"], # Catalan
84
- "da": ["da_core_news_lg", "da_core_news_md", "da_core_news_sm"], # Danish
85
- "de": ["de_core_news_lg", "de_core_news_md", "de_core_news_sm"], # German
86
- "el": ["el_core_news_lg", "el_core_news_md", "el_core_news_sm"], # Greek
87
- "es": ["es_core_news_lg", "es_core_news_md", "es_core_news_sm"], # Spanish
88
- "fi": ["fi_core_news_lg", "fi_core_news_md", "fi_core_news_sm"], # Finnish
89
- "fr": ["fr_core_news_lg", "fr_core_news_md", "fr_core_news_sm"], # French
90
- "hr": ["hr_core_news_lg", "hr_core_news_md", "hr_core_news_sm"], # Croatian
91
- "it": ["it_core_news_lg", "it_core_news_md", "it_core_news_sm"], # Italian
92
- "ja": ["ja_core_news_lg", "ja_core_news_md", "ja_core_news_sm"], # Japanese
93
- "ko": ["ko_core_news_lg", "ko_core_news_md", "ko_core_news_sm"], # Korean
94
- "lt": ["lt_core_news_lg", "lt_core_news_md", "lt_core_news_sm"], # Lithuanian
95
- "mk": ["mk_core_news_lg", "mk_core_news_md", "mk_core_news_sm"], # Macedonian
96
- "nb": ["nb_core_news_lg", "nb_core_news_md", "nb_core_news_sm"], # Norwegian Bokmål
97
- "nl": ["nl_core_news_lg", "nl_core_news_md", "nl_core_news_sm"], # Dutch
98
- "pl": ["pl_core_news_lg", "pl_core_news_md", "pl_core_news_sm"], # Polish
99
- "pt": ["pt_core_news_lg", "pt_core_news_md", "pt_core_news_sm"], # Portuguese
100
- "ro": ["ro_core_news_lg", "ro_core_news_md", "ro_core_news_sm"], # Romanian
101
- "ru": ["ru_core_news_lg", "ru_core_news_md", "ru_core_news_sm"], # Russian
102
- "sl": ["sl_core_news_lg", "sl_core_news_md", "sl_core_news_sm"], # Slovenian
103
- "sv": ["sv_core_news_lg", "sv_core_news_md", "sv_core_news_sm"], # Swedish
104
- "uk": ["uk_core_news_lg", "uk_core_news_md", "uk_core_news_sm"], # Ukrainian
105
- "zh": ["zh_core_web_lg", "zh_core_web_mod", "zh_core_web_sm", "zh_core_web_trf"], # Chinese
106
-
 
 
 
 
 
 
 
 
107
  # Multilingual NER
108
  "xx": ["xx_ent_wiki_sm"],
109
  }
@@ -158,17 +187,22 @@ def load_spacy_model(language: str = DEFAULT_LANGUAGE):
158
  last_error = e
159
  continue
160
 
161
- raise RuntimeError(f"Failed to load spaCy model for language '{language}'. Last error: {last_error}")
 
 
 
162
 
163
  # Language-aware spaCy model loader
164
  def _normalize_language_input(language: str) -> str:
165
  return language.strip().lower().replace("-", "_")
166
 
 
167
  # Update the global variables to use the new function
168
  ACTIVE_LANGUAGE_CODE = _base_language_code(DEFAULT_LANGUAGE)
169
- nlp = None # Placeholder, will be loaded in the create_nlp_analyser function below #load_spacy_model(DEFAULT_LANGUAGE)
 
170
 
171
- def get_tesseract_lang_code(short_code:str):
172
  """
173
  Maps a two-letter language code to the corresponding Tesseract OCR code.
174
 
@@ -200,12 +234,15 @@ def get_tesseract_lang_code(short_code:str):
200
  "ru": "rus",
201
  "sl": "slv",
202
  "sv": "swe",
203
- "uk": "ukr"
204
  }
205
 
206
  return lang_map.get(short_code)
207
 
208
- def download_tesseract_lang_pack(short_lang_code:str, tessdata_dir=TESSERACT_DATA_FOLDER):
 
 
 
209
  """
210
  Downloads a Tesseract language pack to a local directory.
211
 
@@ -214,7 +251,7 @@ def download_tesseract_lang_pack(short_lang_code:str, tessdata_dir=TESSERACT_DAT
214
  tessdata_dir (str, optional): The directory to save the language pack.
215
  Defaults to "tessdata".
216
  """
217
-
218
  # Create the directory if it doesn't exist
219
  if not os.path.exists(tessdata_dir):
220
  os.makedirs(tessdata_dir)
@@ -223,16 +260,18 @@ def download_tesseract_lang_pack(short_lang_code:str, tessdata_dir=TESSERACT_DAT
223
  lang_code = get_tesseract_lang_code(short_lang_code)
224
 
225
  if lang_code is None:
226
- raise ValueError(f"Language code {short_lang_code} not found in Tesseract language map")
227
-
 
  # Set the local file path
229
  file_path = os.path.join(tessdata_dir, f"{lang_code}.traineddata")
230
-
231
  # Check if the file already exists
232
  if os.path.exists(file_path):
233
  print(f"Language pack {lang_code}.traineddata already exists at {file_path}")
234
  return file_path
235
-
236
  # Construct the URL for the language pack
237
  url = f"https://raw.githubusercontent.com/tesseract-ocr/tessdata/main/{lang_code}.traineddata"
238
 
@@ -252,35 +291,61 @@ def download_tesseract_lang_pack(short_lang_code:str, tessdata_dir=TESSERACT_DAT
252
  print(f"Error downloading {lang_code}.traineddata: {e}")
253
  return None
254
 
 
255
  #### Custom recognisers
256
- def custom_word_list_recogniser(custom_list:List[str]=[]):
257
  # Create regex pattern, handling quotes carefully
258
 
259
  quote_str = '"'
260
  replace_str = '(?:"|“|”)'
261
 
262
- custom_regex = '|'.join(
263
- rf'(?<!\w){re.escape(term.strip()).replace(quote_str, replace_str)}(?!\w)'
264
  for term in custom_list
265
  )
266
- #print(custom_regex)
267
 
268
- custom_pattern = Pattern(name="custom_pattern", regex=custom_regex, score = 1)
269
-
270
- custom_recogniser = PatternRecognizer(supported_entity="CUSTOM", name="CUSTOM", patterns = [custom_pattern],
271
- global_regex_flags=re.DOTALL | re.MULTILINE | re.IGNORECASE)
272
 
273
  return custom_recogniser
274
 
 
275
  # Initialise custom recogniser that will be overwritten later
276
  custom_recogniser = custom_word_list_recogniser()
277
 
278
  # Custom title recogniser
279
- titles_list = ["Sir", "Ma'am", "Madam", "Mr", "Mr.", "Mrs", "Mrs.", "Ms", "Ms.", "Miss", "Dr", "Dr.", "Professor"]
280
- titles_regex = '\\b' + '\\b|\\b'.join(rf"{re.escape(title)}" for title in titles_list) + '\\b'
281
- titles_pattern = Pattern(name="titles_pattern",regex=titles_regex, score = 1)
282
- titles_recogniser = PatternRecognizer(supported_entity="TITLES", name="TITLES", patterns = [titles_pattern],
283
- global_regex_flags=re.DOTALL | re.MULTILINE)
284
 
285
  # %%
286
  # Custom postcode recogniser
@@ -289,38 +354,117 @@ titles_recogniser = PatternRecognizer(supported_entity="TITLES", name="TITLES",
289
  ukpostcode_pattern = Pattern(
290
  name="ukpostcode_pattern",
291
  regex=r"\b([A-Z]{1,2}\d[A-Z\d]? ?\d[A-Z]{2}|GIR ?0AA)\b",
292
- score=1
293
  )
294
 
295
  # Define the recognizer with one or more patterns
296
- ukpostcode_recogniser = PatternRecognizer(supported_entity="UKPOSTCODE", name = "UKPOSTCODE", patterns = [ukpostcode_pattern])
297
 
298
  ### Street name
299
 
300
- def extract_street_name(text:str) -> str:
 
301
  """
302
  Extracts the street name and preceding word (that should contain at least one number) from the given text.
303
 
304
- """
305
-
306
  street_types = [
307
- 'Street', 'St', 'Boulevard', 'Blvd', 'Highway', 'Hwy', 'Broadway', 'Freeway',
308
- 'Causeway', 'Cswy', 'Expressway', 'Way', 'Walk', 'Lane', 'Ln', 'Road', 'Rd',
309
- 'Avenue', 'Ave', 'Circle', 'Cir', 'Cove', 'Cv', 'Drive', 'Dr', 'Parkway', 'Pkwy',
310
- 'Park', 'Court', 'Ct', 'Square', 'Sq', 'Loop', 'Place', 'Pl', 'Parade', 'Estate',
311
- 'Alley', 'Arcade', 'Avenue', 'Ave', 'Bay', 'Bend', 'Brae', 'Byway', 'Close', 'Corner', 'Cove',
312
- 'Crescent', 'Cres', 'Cul-de-sac', 'Dell', 'Drive', 'Dr', 'Esplanade', 'Glen', 'Green', 'Grove', 'Heights', 'Hts',
313
- 'Mews', 'Parade', 'Path', 'Piazza', 'Promenade', 'Quay', 'Ridge', 'Row', 'Terrace', 'Ter', 'Track', 'Trail', 'View', 'Villas',
314
- 'Marsh', 'Embankment', 'Cut', 'Hill', 'Passage', 'Rise', 'Vale', 'Side'
315
  ]
316
 
317
  # Dynamically construct the regex pattern with all possible street types
318
- street_types_pattern = '|'.join(rf"{re.escape(street_type)}" for street_type in street_types)
319
 
320
  # The overall regex pattern to capture the street name and preceding word(s)
321
 
322
- pattern = rf'(?P<preceding_word>\w*\d\w*)\s*'
323
- pattern += rf'(?P<street_name>\w+\s*\b(?:{street_types_pattern})\b)'
324
 
325
  # Find all matches in text
326
  matches = re.finditer(pattern, text, re.DOTALL | re.MULTILINE | re.IGNORECASE)
@@ -329,26 +473,29 @@ def extract_street_name(text:str) -> str:
329
  end_positions = []
330
 
331
  for match in matches:
332
- preceding_word = match.group('preceding_word').strip()
333
- street_name = match.group('street_name').strip()
334
  start_pos = match.start()
335
  end_pos = match.end()
336
- #print(f"Start: {start_pos}, End: {end_pos}")
337
- #print(f"Preceding words: {preceding_word}")
338
- #print(f"Street name: {street_name}")
339
 
340
  start_positions.append(start_pos)
341
  end_positions.append(end_pos)
342
 
343
  return start_positions, end_positions
344
 
 
345
  class StreetNameRecognizer(EntityRecognizer):
346
 
347
  def load(self) -> None:
348
  """No loading is required."""
349
  pass
350
 
351
- def analyze(self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts) -> List[RecognizerResult]:
352
  """
353
  Logic for detecting a specific PII
354
  """
@@ -360,32 +507,33 @@ class StreetNameRecognizer(EntityRecognizer):
360
  for i in range(0, len(start_pos)):
361
 
362
  result = RecognizerResult(
363
- entity_type="STREETNAME",
364
- start = start_pos[i],
365
- end = end_pos[i],
366
- score= 1
367
- )
368
-
369
  results.append(result)
370
-
371
  return results
372
-
 
373
  street_recogniser = StreetNameRecognizer(supported_entities=["STREETNAME"])
374
 
 
375
  ## Custom fuzzy match recogniser for list of strings
376
- def custom_fuzzy_word_list_regex(text:str, custom_list:List[str]=[]):
377
  # Create regex pattern, handling quotes carefully
378
 
379
  quote_str = '"'
380
  replace_str = '(?:"|“|”)'
381
 
382
- custom_regex_pattern = '|'.join(
383
- rf'(?<!\w){re.escape(term.strip()).replace(quote_str, replace_str)}(?!\w)'
384
  for term in custom_list
385
  )
386
 
387
  # Find all matches in text
388
- matches = re.finditer(custom_regex_pattern, text, re.DOTALL | re.MULTILINE | re.IGNORECASE)
389
 
390
  start_positions = []
391
  end_positions = []
@@ -401,46 +549,69 @@ def custom_fuzzy_word_list_regex(text:str, custom_list:List[str]=[]):
401
 
402
 
403
  class CustomWordFuzzyRecognizer(EntityRecognizer):
404
- def __init__(self, supported_entities: List[str], custom_list: List[str] = [], spelling_mistakes_max: int = 1, search_whole_phrase: bool = True):
405
  super().__init__(supported_entities=supported_entities)
406
  self.custom_list = custom_list # Store the custom_list as an instance attribute
407
- self.spelling_mistakes_max = spelling_mistakes_max # Store the max spelling mistakes
408
- self.search_whole_phrase = search_whole_phrase # Store the search whole phrase flag
409
 
410
  def load(self) -> None:
411
  """No loading is required."""
412
  pass
413
 
414
- def analyze(self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts) -> List[RecognizerResult]:
415
  """
416
  Logic for detecting a specific PII
417
  """
418
- start_pos, end_pos = spacy_fuzzy_search(text, self.custom_list, self.spelling_mistakes_max, self.search_whole_phrase) # Pass new parameters
419
 
420
  results = []
421
 
422
  for i in range(0, len(start_pos)):
423
  result = RecognizerResult(
424
- entity_type="CUSTOM_FUZZY",
425
- start=start_pos[i],
426
- end=end_pos[i],
427
- score=1
428
  )
429
  results.append(result)
430
 
431
  return results
432
-
 
433
  custom_list_default = []
434
- custom_word_fuzzy_recognizer = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_list_default)
435
 
436
  # Pass the loaded model to the new LoadedSpacyNlpEngine
437
- loaded_nlp_engine = LoadedSpacyNlpEngine(loaded_spacy_model = nlp, language_code = ACTIVE_LANGUAGE_CODE)
 
439
- def create_nlp_analyser(language: str = DEFAULT_LANGUAGE, custom_list: List[str] = None,
440
- spelling_mistakes_max: int = 1, search_whole_phrase: bool = True, existing_nlp_analyser: AnalyzerEngine = None, return_also_model: bool = False):
441
  """
442
  Create an nlp_analyser object based on the specified language input.
443
-
444
  Args:
445
  language (str): Language code (e.g., "en", "de", "fr", "es", etc.)
446
  custom_list (List[str], optional): List of custom words to recognize. Defaults to None.
@@ -448,12 +619,12 @@ def create_nlp_analyser(language: str = DEFAULT_LANGUAGE, custom_list: List[str]
448
  search_whole_phrase (bool, optional): Whether to search for whole phrases or individual words. Defaults to True.
449
  existing_nlp_analyser (AnalyzerEngine, optional): Existing nlp_analyser object to use. Defaults to None.
450
  return_also_model (bool, optional): Whether to return the nlp_model object as well. Defaults to False.
451
-
452
  Returns:
453
  AnalyzerEngine: Configured nlp_analyser object with custom recognizers
454
  """
455
 
456
- if existing_nlp_analyser is None:
457
  pass
458
  else:
459
  if existing_nlp_analyser.supported_languages[0] == language:
@@ -463,28 +634,27 @@ def create_nlp_analyser(language: str = DEFAULT_LANGUAGE, custom_list: List[str]
463
 
464
  # Load spaCy model for the specified language
465
  nlp_model = load_spacy_model(language)
466
-
467
  # Get base language code
468
  base_lang_code = _base_language_code(language)
469
-
470
  # Create custom recognizers
471
  if custom_list is None:
472
  custom_list = []
473
-
474
  custom_recogniser = custom_word_list_recogniser(custom_list)
475
  custom_word_fuzzy_recognizer = CustomWordFuzzyRecognizer(
476
- supported_entities=["CUSTOM_FUZZY"],
477
  custom_list=custom_list,
478
  spelling_mistakes_max=spelling_mistakes_max,
479
- search_whole_phrase=search_whole_phrase
480
  )
481
-
482
  # Create NLP engine with loaded model
483
  loaded_nlp_engine = LoadedSpacyNlpEngine(
484
- loaded_spacy_model=nlp_model,
485
- language_code=base_lang_code
486
  )
487
-
488
  # Create analyzer engine
489
  nlp_analyser = AnalyzerEngine(
490
  nlp_engine=loaded_nlp_engine,
@@ -492,11 +662,11 @@ def create_nlp_analyser(language: str = DEFAULT_LANGUAGE, custom_list: List[str]
492
  supported_languages=[base_lang_code],
493
  log_decision_process=False,
494
  )
495
-
496
  # Add custom recognizers to nlp_analyser
497
  nlp_analyser.registry.add_recognizer(custom_recogniser)
498
  nlp_analyser.registry.add_recognizer(custom_word_fuzzy_recognizer)
499
-
500
  # Add language-specific recognizers for English
501
  if base_lang_code == "en":
502
  nlp_analyser.registry.add_recognizer(street_recogniser)
@@ -505,21 +675,30 @@ def create_nlp_analyser(language: str = DEFAULT_LANGUAGE, custom_list: List[str]
505
 
506
  if return_also_model:
507
  return nlp_analyser, nlp_model
508
-
509
  return nlp_analyser
510
 
 
511
  # Create the default nlp_analyser using the new function
512
  nlp_analyser, nlp = create_nlp_analyser(DEFAULT_LANGUAGE, return_also_model=True)
513
 
514
- def spacy_fuzzy_search(text: str, custom_query_list:List[str]=[], spelling_mistakes_max:int = 1, search_whole_phrase:bool=True, nlp=nlp, progress=gr.Progress(track_tqdm=True)):
515
516
 
517
  all_matches = []
518
  all_start_positions = []
519
  all_end_positions = []
520
  all_ratios = []
521
 
522
- #print("custom_query_list:", custom_query_list)
523
 
524
  if not text:
525
  out_message = "No text data found. Skipping page."
@@ -530,23 +709,31 @@ def spacy_fuzzy_search(text: str, custom_query_list:List[str]=[], spelling_mista
530
 
531
  query = nlp(string_query)
532
 
533
- if search_whole_phrase == False:
534
  # Keep only words that are not stop words
535
- token_query = [token.text for token in query if not token.is_space and not token.is_stop and not token.is_punct]
536
 
537
  spelling_mistakes_fuzzy_pattern = "FUZZY" + str(spelling_mistakes_max)
538
 
539
  if len(token_query) > 1:
540
- #pattern_lemma = [{"LEMMA": {"IN": query}}]
541
- pattern_fuzz = [{"TEXT": {spelling_mistakes_fuzzy_pattern: {"IN": token_query}}}]
542
  else:
543
- #pattern_lemma = [{"LEMMA": query[0]}]
544
- pattern_fuzz = [{"TEXT": {spelling_mistakes_fuzzy_pattern: token_query[0]}}]
545
 
546
- matcher = Matcher(nlp.vocab)
547
  matcher.add(string_query, [pattern_fuzz])
548
- #matcher.add(string_query, [pattern_lemma])
549
-
550
  else:
551
  # If matching a whole phrase, use Spacy PhraseMatcher, then consider similarity after using Levenshtein distance.
552
  # If you want to match the whole phrase, use phrase matcher
@@ -558,52 +745,51 @@ def spacy_fuzzy_search(text: str, custom_query_list:List[str]=[], spelling_mista
558
  docs = nlp.pipe([text], batch_size=batch_size)
559
 
560
  # Get number of matches per doc
561
- for doc in docs: #progress.tqdm(docs, desc = "Searching text", unit = "rows"):
562
  matches = matcher(doc)
563
  match_count = len(matches)
564
 
565
  # If considering each sub term individually, append match. If considering together, consider weight of the relevance to that of the whole phrase.
566
- if search_whole_phrase==False:
567
  all_matches.append(match_count)
568
 
569
  for match_id, start, end in matches:
570
  span = str(doc[start:end]).strip()
571
  query_search = str(query).strip()
572
 
573
-
574
  # Convert word positions to character positions
575
  start_char = doc[start].idx # Start character position
576
- end_char = doc[end - 1].idx + len(doc[end - 1]) # End character position
577
 
578
  # The positions here are word position, not character position
579
  all_matches.append(match_count)
580
  all_start_positions.append(start_char)
581
  all_end_positions.append(end_char)
582
-
583
  else:
584
  for match_id, start, end, ratio, pattern in matches:
585
  span = str(doc[start:end]).strip()
586
  query_search = str(query).strip()
587
-
588
  # Calculate Levenshtein distance. Only keep matches with less than specified number of spelling mistakes
589
  distance = Levenshtein.distance(query_search.lower(), span.lower())
590
 
591
- #print("Levenshtein distance:", distance)
592
-
593
- if distance > spelling_mistakes_max:
594
  match_count = match_count - 1
595
  else:
596
  # Convert word positions to character positions
597
  start_char = doc[start].idx # Start character position
598
- end_char = doc[end - 1].idx + len(doc[end - 1]) # End character position
599
 
600
  all_matches.append(match_count)
601
  all_start_positions.append(start_char)
602
  all_end_positions.append(end_char)
603
- all_ratios.append(ratio)
604
-
605
 
606
  return all_start_positions, all_end_positions
607
-
608
-
609
-
 
1
  from typing import List
2
+
 
3
  import spacy
4
+ from presidio_analyzer import (
5
+ AnalyzerEngine,
6
+ EntityRecognizer,
7
+ Pattern,
8
+ PatternRecognizer,
9
+ RecognizerResult,
10
+ )
11
+ from presidio_analyzer.nlp_engine import (
12
+ NerModelConfiguration,
13
+ NlpArtifacts,
14
+ SpacyNlpEngine,
15
+ )
16
  from spacy.matcher import Matcher
17
  from spaczz.matcher import FuzzyMatcher
18
+
19
  spacy.prefer_gpu()
20
  import os
21
+ import re
22
+
23
  import gradio as gr
24
+ import Levenshtein
25
+ import requests
26
+ from spacy.cli.download import download
27
+
28
+ from tools.config import CUSTOM_ENTITIES, DEFAULT_LANGUAGE, TESSERACT_DATA_FOLDER
29
  from tools.helper_functions import _get_env_list
30
 
31
  score_threshold = 0.001
32
 
33
+ if CUSTOM_ENTITIES:
34
+ CUSTOM_ENTITIES = _get_env_list(CUSTOM_ENTITIES)
35
  custom_entities = CUSTOM_ENTITIES
36
 
37
+
38
  # Create a class inheriting from SpacyNlpEngine
39
  class LoadedSpacyNlpEngine(SpacyNlpEngine):
40
  def __init__(self, loaded_spacy_model, language_code: str):
41
+ super().__init__(
42
+ ner_model_configuration=NerModelConfiguration(
43
+ labels_to_ignore=["CARDINAL", "ORDINAL"]
44
+ )
45
+ ) # Ignore non-relevant labels
46
  self.nlp = {language_code: loaded_spacy_model}
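For orientation, this wrapper lets Presidio reuse a pipeline that has already been loaded rather than re-loading it by name. A hedged usage sketch (it assumes en_core_web_sm is installed locally; the example text is arbitrary):

    import spacy
    from presidio_analyzer import AnalyzerEngine

    nlp_model = spacy.load("en_core_web_sm")  # assumption: this model is installed
    engine = LoadedSpacyNlpEngine(loaded_spacy_model=nlp_model, language_code="en")
    analyser = AnalyzerEngine(nlp_engine=engine, supported_languages=["en"])
    print(analyser.analyze(text="Contact Jane Doe on 123-456-7890", language="en"))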
47
 
48
+
49
  def _base_language_code(language: str) -> str:
50
  lang = _normalize_language_input(language)
51
  if "_" in lang:
52
  return lang.split("_")[0]
53
  return lang
54
 
55
+
56
  def load_spacy_model(language: str = DEFAULT_LANGUAGE):
57
  """
58
  Load a spaCy model for the requested language and return it as `nlp`.
 
100
  "en_trf": ["en_core_web_trf"],
101
  "en_md": ["en_core_web_md"],
102
  "en_sm": ["en_core_web_sm"],
 
103
  # Major languages (news pipelines)
104
+ "ca": ["ca_core_news_lg", "ca_core_news_md", "ca_core_news_sm"], # Catalan
105
+ "da": ["da_core_news_lg", "da_core_news_md", "da_core_news_sm"], # Danish
106
+ "de": ["de_core_news_lg", "de_core_news_md", "de_core_news_sm"], # German
107
+ "el": ["el_core_news_lg", "el_core_news_md", "el_core_news_sm"], # Greek
108
+ "es": ["es_core_news_lg", "es_core_news_md", "es_core_news_sm"], # Spanish
109
+ "fi": ["fi_core_news_lg", "fi_core_news_md", "fi_core_news_sm"], # Finnish
110
+ "fr": ["fr_core_news_lg", "fr_core_news_md", "fr_core_news_sm"], # French
111
+ "hr": ["hr_core_news_lg", "hr_core_news_md", "hr_core_news_sm"], # Croatian
112
+ "it": ["it_core_news_lg", "it_core_news_md", "it_core_news_sm"], # Italian
113
+ "ja": ["ja_core_news_lg", "ja_core_news_md", "ja_core_news_sm"], # Japanese
114
+ "ko": ["ko_core_news_lg", "ko_core_news_md", "ko_core_news_sm"], # Korean
115
+ "lt": ["lt_core_news_lg", "lt_core_news_md", "lt_core_news_sm"], # Lithuanian
116
+ "mk": ["mk_core_news_lg", "mk_core_news_md", "mk_core_news_sm"], # Macedonian
117
+ "nb": [
118
+ "nb_core_news_lg",
119
+ "nb_core_news_md",
120
+ "nb_core_news_sm",
121
+ ], # Norwegian Bokmål
122
+ "nl": ["nl_core_news_lg", "nl_core_news_md", "nl_core_news_sm"], # Dutch
123
+ "pl": ["pl_core_news_lg", "pl_core_news_md", "pl_core_news_sm"], # Polish
124
+ "pt": ["pt_core_news_lg", "pt_core_news_md", "pt_core_news_sm"], # Portuguese
125
+ "ro": ["ro_core_news_lg", "ro_core_news_md", "ro_core_news_sm"], # Romanian
126
+ "ru": ["ru_core_news_lg", "ru_core_news_md", "ru_core_news_sm"], # Russian
127
+ "sl": ["sl_core_news_lg", "sl_core_news_md", "sl_core_news_sm"], # Slovenian
128
+ "sv": ["sv_core_news_lg", "sv_core_news_md", "sv_core_news_sm"], # Swedish
129
+ "uk": ["uk_core_news_lg", "uk_core_news_md", "uk_core_news_sm"], # Ukrainian
130
+ "zh": [
131
+ "zh_core_web_lg",
132
+ "zh_core_web_mod",
133
+ "zh_core_web_sm",
134
+ "zh_core_web_trf",
135
+ ], # Chinese
136
  # Multilingual NER
137
  "xx": ["xx_ent_wiki_sm"],
138
  }
 
187
  last_error = e
188
  continue
189
 
190
+ raise RuntimeError(
191
+ f"Failed to load spaCy model for language '{language}'. Last error: {last_error}"
192
+ )
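The body of load_spacy_model between the candidate table and this RuntimeError is outside the hunks shown here. A minimal sketch of the assumed load-or-download fallback over the candidates (the helper name is illustrative; only spacy.load, download and the final RuntimeError message are taken from this file):

    import spacy
    from spacy.cli.download import download

    def _try_load_candidates(candidates, language):
        # Sketch: try each candidate model, downloading it on a missing-model error.
        last_error = None
        for model_name in candidates:
            try:
                return spacy.load(model_name)
            except OSError:
                try:
                    download(model_name)
                    return spacy.load(model_name)
                except Exception as e:  # keep the last failure for the error message
                    last_error = e
                    continue
        raise RuntimeError(
            f"Failed to load spaCy model for language '{language}'. Last error: {last_error}"
        )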
193
+
194
 
195
  # Language-aware spaCy model loader
196
  def _normalize_language_input(language: str) -> str:
197
  return language.strip().lower().replace("-", "_")
198
 
199
+
200
  # Update the global variables to use the new function
201
  ACTIVE_LANGUAGE_CODE = _base_language_code(DEFAULT_LANGUAGE)
202
+ nlp = None # Placeholder, will be loaded in the create_nlp_analyser function below #load_spacy_model(DEFAULT_LANGUAGE)
203
+
204
 
205
+ def get_tesseract_lang_code(short_code: str):
206
  """
207
  Maps a two-letter language code to the corresponding Tesseract OCR code.
208
 
 
234
  "ru": "rus",
235
  "sl": "slv",
236
  "sv": "swe",
237
+ "uk": "ukr",
238
  }
239
 
240
  return lang_map.get(short_code)
241
 
242
+
243
+ def download_tesseract_lang_pack(
244
+ short_lang_code: str, tessdata_dir=TESSERACT_DATA_FOLDER
245
+ ):
246
  """
247
  Downloads a Tesseract language pack to a local directory.
248
 
 
251
  tessdata_dir (str, optional): The directory to save the language pack.
252
  Defaults to "tessdata".
253
  """
254
+
255
  # Create the directory if it doesn't exist
256
  if not os.path.exists(tessdata_dir):
257
  os.makedirs(tessdata_dir)
 
260
  lang_code = get_tesseract_lang_code(short_lang_code)
261
 
262
  if lang_code is None:
263
+ raise ValueError(
264
+ f"Language code {short_lang_code} not found in Tesseract language map"
265
+ )
266
+
267
  # Set the local file path
268
  file_path = os.path.join(tessdata_dir, f"{lang_code}.traineddata")
269
+
270
  # Check if the file already exists
271
  if os.path.exists(file_path):
272
  print(f"Language pack {lang_code}.traineddata already exists at {file_path}")
273
  return file_path
274
+
275
  # Construct the URL for the language pack
276
  url = f"https://raw.githubusercontent.com/tesseract-ocr/tessdata/main/{lang_code}.traineddata"
277
 
 
291
  print(f"Error downloading {lang_code}.traineddata: {e}")
292
  return None
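A brief usage sketch for the two helpers above, using a language code that is visible in the map ("uk" maps to "ukr"); the tessdata directory is a placeholder:

    lang_code = get_tesseract_lang_code("uk")  # -> "ukr", per the map above
    traineddata_path = download_tesseract_lang_pack("uk", tessdata_dir="tessdata")
    print(lang_code, traineddata_path)
    # Pointing Tesseract at this folder afterwards (e.g. via TESSDATA_PREFIX) is an
    # assumption about the surrounding setup, not something this file does.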
293
 
294
+
295
  #### Custom recognisers
296
+ def custom_word_list_recogniser(custom_list: List[str] = []):
297
  # Create regex pattern, handling quotes carefully
298
 
299
  quote_str = '"'
300
  replace_str = '(?:"|“|”)'
301
 
302
+ custom_regex = "|".join(
303
+ rf"(?<!\w){re.escape(term.strip()).replace(quote_str, replace_str)}(?!\w)"
304
  for term in custom_list
305
  )
306
+ # print(custom_regex)
307
 
308
+ custom_pattern = Pattern(name="custom_pattern", regex=custom_regex, score=1)
309
+
310
+ custom_recogniser = PatternRecognizer(
311
+ supported_entity="CUSTOM",
312
+ name="CUSTOM",
313
+ patterns=[custom_pattern],
314
+ global_regex_flags=re.DOTALL | re.MULTILINE | re.IGNORECASE,
315
+ )
316
 
317
  return custom_recogniser
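To make the role of this factory concrete, here is a hedged sketch of registering a deny-list recogniser and running it; nlp_analyser refers to the module-level analyser created further down in this file, and the deny-list terms are invented for the example:

    deny_list_recogniser = custom_word_list_recogniser(["Project Falcon", "Case 12345"])
    nlp_analyser.registry.add_recognizer(deny_list_recogniser)
    results = nlp_analyser.analyze(
        text="Notes on project falcon and Case 12345.",
        language="en",
        entities=["CUSTOM"],
    )
    print(results)  # matches scored 1, case-insensitive because of re.IGNORECASE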
318
 
319
+
320
  # Initialise custom recogniser that will be overwritten later
321
  custom_recogniser = custom_word_list_recogniser()
322
 
323
  # Custom title recogniser
324
+ titles_list = [
325
+ "Sir",
326
+ "Ma'am",
327
+ "Madam",
328
+ "Mr",
329
+ "Mr.",
330
+ "Mrs",
331
+ "Mrs.",
332
+ "Ms",
333
+ "Ms.",
334
+ "Miss",
335
+ "Dr",
336
+ "Dr.",
337
+ "Professor",
338
+ ]
339
+ titles_regex = (
340
+ "\\b" + "\\b|\\b".join(rf"{re.escape(title)}" for title in titles_list) + "\\b"
341
+ )
342
+ titles_pattern = Pattern(name="titles_pattern", regex=titles_regex, score=1)
343
+ titles_recogniser = PatternRecognizer(
344
+ supported_entity="TITLES",
345
+ name="TITLES",
346
+ patterns=[titles_pattern],
347
+ global_regex_flags=re.DOTALL | re.MULTILINE,
348
+ )
349
 
350
  # %%
351
  # Custom postcode recogniser
 
354
  ukpostcode_pattern = Pattern(
355
  name="ukpostcode_pattern",
356
  regex=r"\b([A-Z]{1,2}\d[A-Z\d]? ?\d[A-Z]{2}|GIR ?0AA)\b",
357
+ score=1,
358
  )
359
 
360
  # Define the recognizer with one or more patterns
361
+ ukpostcode_recogniser = PatternRecognizer(
362
+ supported_entity="UKPOSTCODE", name="UKPOSTCODE", patterns=[ukpostcode_pattern]
363
+ )
364
 
365
  ### Street name
366
 
367
+
368
+ def extract_street_name(text: str) -> str:
369
  """
370
  Extracts the street name and preceding word (that should contain at least one number) from the given text.
371
 
372
+ """
373
+
374
  street_types = [
375
+ "Street",
376
+ "St",
377
+ "Boulevard",
378
+ "Blvd",
379
+ "Highway",
380
+ "Hwy",
381
+ "Broadway",
382
+ "Freeway",
383
+ "Causeway",
384
+ "Cswy",
385
+ "Expressway",
386
+ "Way",
387
+ "Walk",
388
+ "Lane",
389
+ "Ln",
390
+ "Road",
391
+ "Rd",
392
+ "Avenue",
393
+ "Ave",
394
+ "Circle",
395
+ "Cir",
396
+ "Cove",
397
+ "Cv",
398
+ "Drive",
399
+ "Dr",
400
+ "Parkway",
401
+ "Pkwy",
402
+ "Park",
403
+ "Court",
404
+ "Ct",
405
+ "Square",
406
+ "Sq",
407
+ "Loop",
408
+ "Place",
409
+ "Pl",
410
+ "Parade",
411
+ "Estate",
412
+ "Alley",
413
+ "Arcade",
414
+ "Avenue",
415
+ "Ave",
416
+ "Bay",
417
+ "Bend",
418
+ "Brae",
419
+ "Byway",
420
+ "Close",
421
+ "Corner",
422
+ "Cove",
423
+ "Crescent",
424
+ "Cres",
425
+ "Cul-de-sac",
426
+ "Dell",
427
+ "Drive",
428
+ "Dr",
429
+ "Esplanade",
430
+ "Glen",
431
+ "Green",
432
+ "Grove",
433
+ "Heights",
434
+ "Hts",
435
+ "Mews",
436
+ "Parade",
437
+ "Path",
438
+ "Piazza",
439
+ "Promenade",
440
+ "Quay",
441
+ "Ridge",
442
+ "Row",
443
+ "Terrace",
444
+ "Ter",
445
+ "Track",
446
+ "Trail",
447
+ "View",
448
+ "Villas",
449
+ "Marsh",
450
+ "Embankment",
451
+ "Cut",
452
+ "Hill",
453
+ "Passage",
454
+ "Rise",
455
+ "Vale",
456
+ "Side",
457
  ]
458
 
459
  # Dynamically construct the regex pattern with all possible street types
460
+ street_types_pattern = "|".join(
461
+ rf"{re.escape(street_type)}" for street_type in street_types
462
+ )
463
 
464
  # The overall regex pattern to capture the street name and preceding word(s)
465
 
466
+ pattern = r"(?P<preceding_word>\w*\d\w*)\s*"
467
+ pattern += rf"(?P<street_name>\w+\s*\b(?:{street_types_pattern})\b)"
468
 
469
  # Find all matches in text
470
  matches = re.finditer(pattern, text, re.DOTALL | re.MULTILINE | re.IGNORECASE)
 
473
  end_positions = []
474
 
475
  for match in matches:
476
+ match.group("preceding_word").strip()
477
+ match.group("street_name").strip()
478
  start_pos = match.start()
479
  end_pos = match.end()
480
+ # print(f"Start: {start_pos}, End: {end_pos}")
481
+ # print(f"Preceding words: {preceding_word}")
482
+ # print(f"Street name: {street_name}")
483
 
484
  start_positions.append(start_pos)
485
  end_positions.append(end_pos)
486
 
487
  return start_positions, end_positions
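For orientation, the spans this helper returns are character offsets into the input text, covering the numbered token plus the street-type phrase; the address below is invented:

    text = "The parcel was left at 123 Example Street by the courier."
    starts, ends = extract_street_name(text)
    for s, e in zip(starts, ends):
        print(text[s:e])  # expected: "123 Example Street"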
488
 
489
+
490
  class StreetNameRecognizer(EntityRecognizer):
491
 
492
  def load(self) -> None:
493
  """No loading is required."""
494
  pass
495
 
496
+ def analyze(
497
+ self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts
498
+ ) -> List[RecognizerResult]:
499
  """
500
  Logic for detecting a specific PII
501
  """
 
507
  for i in range(0, len(start_pos)):
508
 
509
  result = RecognizerResult(
510
+ entity_type="STREETNAME", start=start_pos[i], end=end_pos[i], score=1
511
+ )
512
+
513
  results.append(result)
514
+
515
  return results
516
+
517
+
518
  street_recogniser = StreetNameRecognizer(supported_entities=["STREETNAME"])
519
 
520
+
521
  ## Custom fuzzy match recogniser for list of strings
522
+ def custom_fuzzy_word_list_regex(text: str, custom_list: List[str] = []):
523
  # Create regex pattern, handling quotes carefully
524
 
525
  quote_str = '"'
526
  replace_str = '(?:"|“|”)'
527
 
528
+ custom_regex_pattern = "|".join(
529
+ rf"(?<!\w){re.escape(term.strip()).replace(quote_str, replace_str)}(?!\w)"
530
  for term in custom_list
531
  )
532
 
533
  # Find all matches in text
534
+ matches = re.finditer(
535
+ custom_regex_pattern, text, re.DOTALL | re.MULTILINE | re.IGNORECASE
536
+ )
537
 
538
  start_positions = []
539
  end_positions = []
 
549
 
550
 
551
  class CustomWordFuzzyRecognizer(EntityRecognizer):
552
+ def __init__(
553
+ self,
554
+ supported_entities: List[str],
555
+ custom_list: List[str] = [],
556
+ spelling_mistakes_max: int = 1,
557
+ search_whole_phrase: bool = True,
558
+ ):
559
  super().__init__(supported_entities=supported_entities)
560
  self.custom_list = custom_list # Store the custom_list as an instance attribute
561
+ self.spelling_mistakes_max = (
562
+ spelling_mistakes_max # Store the max spelling mistakes
563
+ )
564
+ self.search_whole_phrase = (
565
+ search_whole_phrase # Store the search whole phrase flag
566
+ )
567
 
568
  def load(self) -> None:
569
  """No loading is required."""
570
  pass
571
 
572
+ def analyze(
573
+ self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts
574
+ ) -> List[RecognizerResult]:
575
  """
576
  Logic for detecting a specific PII
577
  """
578
+ start_pos, end_pos = spacy_fuzzy_search(
579
+ text, self.custom_list, self.spelling_mistakes_max, self.search_whole_phrase
580
+ ) # Pass new parameters
581
 
582
  results = []
583
 
584
  for i in range(0, len(start_pos)):
585
  result = RecognizerResult(
586
+ entity_type="CUSTOM_FUZZY", start=start_pos[i], end=end_pos[i], score=1
587
  )
588
  results.append(result)
589
 
590
  return results
591
+
592
+
593
  custom_list_default = []
594
+ custom_word_fuzzy_recognizer = CustomWordFuzzyRecognizer(
595
+ supported_entities=["CUSTOM_FUZZY"], custom_list=custom_list_default
596
+ )
597
 
598
  # Pass the loaded model to the new LoadedSpacyNlpEngine
599
+ loaded_nlp_engine = LoadedSpacyNlpEngine(
600
+ loaded_spacy_model=nlp, language_code=ACTIVE_LANGUAGE_CODE
601
+ )
602
+
603
 
604
+ def create_nlp_analyser(
605
+ language: str = DEFAULT_LANGUAGE,
606
+ custom_list: List[str] = None,
607
+ spelling_mistakes_max: int = 1,
608
+ search_whole_phrase: bool = True,
609
+ existing_nlp_analyser: AnalyzerEngine = None,
610
+ return_also_model: bool = False,
611
+ ):
612
  """
613
  Create an nlp_analyser object based on the specified language input.
614
+
615
  Args:
616
  language (str): Language code (e.g., "en", "de", "fr", "es", etc.)
617
  custom_list (List[str], optional): List of custom words to recognize. Defaults to None.
 
619
  search_whole_phrase (bool, optional): Whether to search for whole phrases or individual words. Defaults to True.
620
  existing_nlp_analyser (AnalyzerEngine, optional): Existing nlp_analyser object to use. Defaults to None.
621
  return_also_model (bool, optional): Whether to return the nlp_model object as well. Defaults to False.
622
+
623
  Returns:
624
  AnalyzerEngine: Configured nlp_analyser object with custom recognizers
625
  """
626
 
627
+ if existing_nlp_analyser is None:
628
  pass
629
  else:
630
  if existing_nlp_analyser.supported_languages[0] == language:
 
634
 
635
  # Load spaCy model for the specified language
636
  nlp_model = load_spacy_model(language)
637
+
638
  # Get base language code
639
  base_lang_code = _base_language_code(language)
640
+
641
  # Create custom recognizers
642
  if custom_list is None:
643
  custom_list = []
644
+
645
  custom_recogniser = custom_word_list_recogniser(custom_list)
646
  custom_word_fuzzy_recognizer = CustomWordFuzzyRecognizer(
647
+ supported_entities=["CUSTOM_FUZZY"],
648
  custom_list=custom_list,
649
  spelling_mistakes_max=spelling_mistakes_max,
650
+ search_whole_phrase=search_whole_phrase,
651
  )
652
+
653
  # Create NLP engine with loaded model
654
  loaded_nlp_engine = LoadedSpacyNlpEngine(
655
+ loaded_spacy_model=nlp_model, language_code=base_lang_code
 
656
  )
657
+
658
  # Create analyzer engine
659
  nlp_analyser = AnalyzerEngine(
660
  nlp_engine=loaded_nlp_engine,
 
662
  supported_languages=[base_lang_code],
663
  log_decision_process=False,
664
  )
665
+
666
  # Add custom recognizers to nlp_analyser
667
  nlp_analyser.registry.add_recognizer(custom_recogniser)
668
  nlp_analyser.registry.add_recognizer(custom_word_fuzzy_recognizer)
669
+
670
  # Add language-specific recognizers for English
671
  if base_lang_code == "en":
672
  nlp_analyser.registry.add_recognizer(street_recogniser)
 
675
 
676
  if return_also_model:
677
  return nlp_analyser, nlp_model
678
+
679
  return nlp_analyser
680
 
681
+
682
  # Create the default nlp_analyser using the new function
683
  nlp_analyser, nlp = create_nlp_analyser(DEFAULT_LANGUAGE, return_also_model=True)
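A usage sketch of the factory above with a non-default language and a deny list; the first run may trigger a spaCy model download, and the German example strings are arbitrary:

    de_analyser = create_nlp_analyser(
        "de",
        custom_list=["Aktenzeichen XYZ"],
        spelling_mistakes_max=1,
        search_whole_phrase=True,
    )
    print(de_analyser.analyze(text="Das Aktenzeichen XYZ betrifft Max Mustermann.", language="de"))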
684
 
685
+
686
+ def spacy_fuzzy_search(
687
+ text: str,
688
+ custom_query_list: List[str] = [],
689
+ spelling_mistakes_max: int = 1,
690
+ search_whole_phrase: bool = True,
691
+ nlp=nlp,
692
+ progress=gr.Progress(track_tqdm=True),
693
+ ):
694
+ """Conduct fuzzy match on a list of text data."""
695
 
696
  all_matches = []
697
  all_start_positions = []
698
  all_end_positions = []
699
  all_ratios = []
700
 
701
+ # print("custom_query_list:", custom_query_list)
702
 
703
  if not text:
704
  out_message = "No text data found. Skipping page."
 
709
 
710
  query = nlp(string_query)
711
 
712
+ if search_whole_phrase is False:
713
  # Keep only words that are not stop words
714
+ token_query = [
715
+ token.text
716
+ for token in query
717
+ if not token.is_space and not token.is_stop and not token.is_punct
718
+ ]
719
 
720
  spelling_mistakes_fuzzy_pattern = "FUZZY" + str(spelling_mistakes_max)
721
 
722
  if len(token_query) > 1:
723
+ # pattern_lemma = [{"LEMMA": {"IN": query}}]
724
+ pattern_fuzz = [
725
+ {"TEXT": {spelling_mistakes_fuzzy_pattern: {"IN": token_query}}}
726
+ ]
727
  else:
728
+ # pattern_lemma = [{"LEMMA": query[0]}]
729
+ pattern_fuzz = [
730
+ {"TEXT": {spelling_mistakes_fuzzy_pattern: token_query[0]}}
731
+ ]
732
 
733
+ matcher = Matcher(nlp.vocab)
734
  matcher.add(string_query, [pattern_fuzz])
735
+ # matcher.add(string_query, [pattern_lemma])
736
+
737
  else:
738
  # If matching a whole phrase, use Spacy PhraseMatcher, then consider similarity after using Levenshtein distance.
739
  # If you want to match the whole phrase, use phrase matcher
 
745
  docs = nlp.pipe([text], batch_size=batch_size)
746
 
747
  # Get number of matches per doc
748
+ for doc in docs: # progress.tqdm(docs, desc = "Searching text", unit = "rows"):
749
  matches = matcher(doc)
750
  match_count = len(matches)
751
 
752
  # If considering each sub term individually, append match. If considering together, consider weight of the relevance to that of the whole phrase.
753
+ if search_whole_phrase is False:
754
  all_matches.append(match_count)
755
 
756
  for match_id, start, end in matches:
757
  span = str(doc[start:end]).strip()
758
  query_search = str(query).strip()
759
 
 
760
  # Convert word positions to character positions
761
  start_char = doc[start].idx # Start character position
762
+ end_char = doc[end - 1].idx + len(
763
+ doc[end - 1]
764
+ ) # End character position
765
 
766
  # The positions here are word position, not character position
767
  all_matches.append(match_count)
768
  all_start_positions.append(start_char)
769
  all_end_positions.append(end_char)
770
+
771
  else:
772
  for match_id, start, end, ratio, pattern in matches:
773
  span = str(doc[start:end]).strip()
774
  query_search = str(query).strip()
775
+
776
  # Calculate Levenshtein distance. Only keep matches with less than specified number of spelling mistakes
777
  distance = Levenshtein.distance(query_search.lower(), span.lower())
778
 
779
+ # print("Levenshtein distance:", distance)
780
+
781
+ if distance > spelling_mistakes_max:
782
  match_count = match_count - 1
783
  else:
784
  # Convert word positions to character positions
785
  start_char = doc[start].idx # Start character position
786
+ end_char = doc[end - 1].idx + len(
787
+ doc[end - 1]
788
+ ) # End character position
789
 
790
  all_matches.append(match_count)
791
  all_start_positions.append(start_char)
792
  all_end_positions.append(end_char)
793
+ all_ratios.append(ratio)
 
794
 
795
  return all_start_positions, all_end_positions
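A small illustration of the whole-phrase branch; the misspelling in the text is one edit away from the query, so it should fall within spelling_mistakes_max=1:

    starts, ends = spacy_fuzzy_search(
        text="Please contact Jon Smith about the invoice.",
        custom_query_list=["John Smith"],
        spelling_mistakes_max=1,
        search_whole_phrase=True,
    )
    print(starts, ends)  # character offsets of the fuzzy match, if one is found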
tools/presidio_analyzer_custom.py CHANGED
@@ -1,10 +1,12 @@
 
 
1
  import gradio as gr
2
- from typing import List, Iterable, Dict, Union, Any, Optional, Iterator, Tuple
3
- #from tqdm import tqdm
4
 
 
5
  from presidio_analyzer import DictAnalyzerResult, RecognizerResult
6
  from presidio_analyzer.nlp_engine import NlpArtifacts
7
 
 
8
  def recognizer_result_from_dict(data: Dict) -> RecognizerResult:
9
  """
10
  Create RecognizerResult from a dictionary.
@@ -25,116 +27,116 @@ def recognizer_result_from_dict(data: Dict) -> RecognizerResult:
25
  score = data.get("Score")
26
  analysis_explanation = None
27
  recognition_metadata = None
28
-
29
- return RecognizerResult(entity_type, start, end, score, analysis_explanation, recognition_metadata)
30
 
31
  def analyze_iterator_custom(
32
- self,
33
- texts: Iterable[Union[str, bool, float, int]],
34
- language: str,
35
- list_length:int,
36
- progress=gr.Progress(),
37
- **kwargs,
38
- ) -> List[List[RecognizerResult]]:
39
- """
40
- Analyze an iterable of strings.
41
-
42
- :param texts: A list containing strings to be analyzed.
43
- :param language: Input language
44
- :param list_length: Length of the input list.
45
- :param kwargs: Additional parameters for the `AnalyzerEngine.analyze` method.
46
- """
47
-
48
- # validate types
49
- texts = self._validate_types(texts)
50
-
51
- # Process the texts as batch for improved performance
52
- nlp_artifacts_batch: Iterator[
53
- Tuple[str, NlpArtifacts]
54
- ] = self.analyzer_engine.nlp_engine.process_batch(
55
- texts=texts, language=language
56
- )
57
 
58
-
59
 
60
- list_results = []
 
61
 
62
- # Uncomment this if you want to show progress within a file
63
- #for text, nlp_artifacts in progress.tqdm(nlp_artifacts_batch, total = list_length, desc = "Analysing text for personal information", unit = "rows"):
64
- for text, nlp_artifacts in nlp_artifacts_batch:
65
- results = self.analyzer_engine.analyze(
66
- text=str(text), nlp_artifacts=nlp_artifacts, language=language, **kwargs
67
- )
68
 
69
- list_results.append(results)
70
 
71
- return list_results
72
 
73
  def analyze_dict(
74
- self,
75
- input_dict: Dict[str, Union[Any, Iterable[Any]]],
76
- language: str,
77
- keys_to_skip: Optional[List[str]] = None,
78
- **kwargs,
79
- ) -> Iterator[DictAnalyzerResult]:
80
- """
81
- Analyze a dictionary of keys (strings) and values/iterable of values.
82
-
83
- Non-string values are returned as is.
84
-
85
- :param input_dict: The input dictionary for analysis
86
- :param language: Input language
87
- :param keys_to_skip: Keys to ignore during analysis
88
- :param kwargs: Additional keyword arguments
89
- for the `AnalyzerEngine.analyze` method.
90
- Use this to pass arguments to the analyze method,
91
- such as `ad_hoc_recognizers`, `context`, `return_decision_process`.
92
- See `AnalyzerEngine.analyze` for the full list.
93
- """
94
-
95
- context = []
96
- if "context" in kwargs:
97
- context = kwargs["context"]
98
- del kwargs["context"]
99
-
100
- if not keys_to_skip:
101
- keys_to_skip = []
102
-
103
-
104
- for key, value in input_dict.items():
105
- if not value or key in keys_to_skip:
106
- yield DictAnalyzerResult(key=key, value=value, recognizer_results=[])
107
- continue # skip this key as requested
108
-
109
- # Add the key as an additional context
110
- specific_context = context[:]
111
- specific_context.append(key)
112
-
113
- if type(value) in (str, int, bool, float):
114
- results: List[RecognizerResult] = self.analyzer_engine.analyze(
115
- text=str(value), language=language, context=[key], **kwargs
116
- )
117
- elif isinstance(value, dict):
118
- new_keys_to_skip = self._get_nested_keys_to_skip(key, keys_to_skip)
119
- results = self.analyze_dict(
120
- input_dict=value,
121
- language=language,
122
- context=specific_context,
123
- keys_to_skip=new_keys_to_skip,
124
- **kwargs,
125
- )
126
- elif isinstance(value, Iterable):
127
- # Recursively iterate nested dicts
128
- list_length = len(value)
129
-
130
- results: List[List[RecognizerResult]] = analyze_iterator_custom(self,
131
- texts=value,
132
- language=language,
133
- context=specific_context,
134
- list_length=list_length,
135
- **kwargs,
136
- )
137
- else:
138
- raise ValueError(f"type {type(value)} is unsupported.")
139
-
140
- yield DictAnalyzerResult(key=key, value=value, recognizer_results=results)
 
1
+ from typing import Any, Dict, Iterable, Iterator, List, Optional, Tuple, Union
2
+
3
  import gradio as gr
4
 
5
+ # from tqdm import tqdm
6
  from presidio_analyzer import DictAnalyzerResult, RecognizerResult
7
  from presidio_analyzer.nlp_engine import NlpArtifacts
8
 
9
+
10
  def recognizer_result_from_dict(data: Dict) -> RecognizerResult:
11
  """
12
  Create RecognizerResult from a dictionary.
 
27
  score = data.get("Score")
28
  analysis_explanation = None
29
  recognition_metadata = None
30
+
31
+ return RecognizerResult(
32
+ entity_type, start, end, score, analysis_explanation, recognition_metadata
33
+ )
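An input shaped the way this helper expects; "Score" is the only key visible in the body shown here, and the other capitalised key names are an assumption made for the example:

    row = {"Type": "PERSON", "Start": 8, "End": 18, "Score": 0.85}
    result = recognizer_result_from_dict(row)
    print(result.entity_type, result.start, result.end, result.score)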
34
+
35
 
36
  def analyze_iterator_custom(
37
+ self,
38
+ texts: Iterable[Union[str, bool, float, int]],
39
+ language: str,
40
+ list_length: int,
41
+ progress=gr.Progress(),
42
+ **kwargs,
43
+ ) -> List[List[RecognizerResult]]:
44
+ """
45
46
 
47
+ :param texts: A list containing strings to be analyzed.
48
+ :param language: Input language
49
+ :param list_length: Length of the input list.
50
+ :param kwargs: Additional parameters for the `AnalyzerEngine.analyze` method.
51
+ """
52
 
53
+ # validate types
54
+ texts = self._validate_types(texts)
55
 
56
+ # Process the texts as batch for improved performance
57
+ nlp_artifacts_batch: Iterator[Tuple[str, NlpArtifacts]] = (
58
+ self.analyzer_engine.nlp_engine.process_batch(texts=texts, language=language)
59
+ )
60
+
61
+ list_results = []
62
+
63
+ # Uncomment this if you want to show progress within a file
64
+ # for text, nlp_artifacts in progress.tqdm(nlp_artifacts_batch, total = list_length, desc = "Analysing text for personal information", unit = "rows"):
65
+ for text, nlp_artifacts in nlp_artifacts_batch:
66
+ results = self.analyzer_engine.analyze(
67
+ text=str(text), nlp_artifacts=nlp_artifacts, language=language, **kwargs
68
+ )
69
+
70
+ list_results.append(results)
71
 
72
+ return list_results
73
 
 
74
 
75
  def analyze_dict(
76
+ self,
77
+ input_dict: Dict[str, Union[Any, Iterable[Any]]],
78
+ language: str,
79
+ keys_to_skip: Optional[List[str]] = None,
80
+ **kwargs,
81
+ ) -> Iterator[DictAnalyzerResult]:
82
+ """
83
+ Analyze a dictionary of keys (strings) and values/iterable of values.
84
+
85
+ Non-string values are returned as is.
86
+
87
+ :param input_dict: The input dictionary for analysis
88
+ :param language: Input language
89
+ :param keys_to_skip: Keys to ignore during analysis
90
+ :param kwargs: Additional keyword arguments
91
+ for the `AnalyzerEngine.analyze` method.
92
+ Use this to pass arguments to the analyze method,
93
+ such as `ad_hoc_recognizers`, `context`, `return_decision_process`.
94
+ See `AnalyzerEngine.analyze` for the full list.
95
+ """
96
+
97
+ context = []
98
+ if "context" in kwargs:
99
+ context = kwargs["context"]
100
+ del kwargs["context"]
101
+
102
+ if not keys_to_skip:
103
+ keys_to_skip = []
104
+
105
+ for key, value in input_dict.items():
106
+ if not value or key in keys_to_skip:
107
+ yield DictAnalyzerResult(key=key, value=value, recognizer_results=[])
108
+ continue # skip this key as requested
109
+
110
+ # Add the key as an additional context
111
+ specific_context = context[:]
112
+ specific_context.append(key)
113
+
114
+ if type(value) in (str, int, bool, float):
115
+ results: List[RecognizerResult] = self.analyzer_engine.analyze(
116
+ text=str(value), language=language, context=[key], **kwargs
117
+ )
118
+ elif isinstance(value, dict):
119
+ new_keys_to_skip = self._get_nested_keys_to_skip(key, keys_to_skip)
120
+ results = self.analyze_dict(
121
+ input_dict=value,
122
+ language=language,
123
+ context=specific_context,
124
+ keys_to_skip=new_keys_to_skip,
125
+ **kwargs,
126
+ )
127
+ elif isinstance(value, Iterable):
128
+ # Recursively iterate nested dicts
129
+ list_length = len(value)
130
+
131
+ results: List[List[RecognizerResult]] = analyze_iterator_custom(
132
+ self,
133
+ texts=value,
134
+ language=language,
135
+ context=specific_context,
136
+ list_length=list_length,
137
+ **kwargs,
138
+ )
139
+ else:
140
+ raise ValueError(f"type {type(value)} is unsupported.")
141
+
142
+ yield DictAnalyzerResult(key=key, value=value, recognizer_results=results)
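Both functions are written as unbound helpers that expect an object exposing analyzer_engine (and the private batch helpers), for example Presidio's BatchAnalyzerEngine; treating them that way is an assumption of this sketch rather than something the diff states:

    from presidio_analyzer import BatchAnalyzerEngine

    batch_analyzer = BatchAnalyzerEngine(analyzer_engine=nlp_analyser)  # analyser from the previous module
    data = {"Case Note": ["Call John Smith on 123-456-7890"], "Reference": ["A-42"]}
    for dict_result in analyze_dict(batch_analyzer, input_dict=data, language="en"):
        print(dict_result.key, dict_result.recognizer_results)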
tools/redaction_review.py CHANGED
The diff for this file is too large to render. See raw diff
 
tools/textract_batch_call.py CHANGED
@@ -1,36 +1,54 @@
1
- import boto3
2
- import os
3
- import pandas as pd
4
  import json
5
  import logging
6
- import datetime
7
- import pymupdf
8
  import gradio as gr
9
  from gradio import FileData
10
- from typing import List
11
- from io import StringIO
12
- from urllib.parse import urlparse
13
- from botocore.exceptions import ClientError, NoCredentialsError, PartialCredentialsError, TokenRetrievalError
14
- from tools.config import TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, OUTPUT_FOLDER, AWS_REGION, DOCUMENT_REDACTION_BUCKET, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, RUN_AWS_FUNCTIONS, INPUT_FOLDER, DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS
15
  from tools.aws_functions import download_file_from_s3
16
  from tools.file_conversion import get_input_file_names
17
  from tools.helper_functions import get_file_name_without_type
18
 
19
  DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS = int(DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS)
20
 
 
21
  def analyse_document_with_textract_api(
22
  local_pdf_path: str,
23
  s3_input_prefix: str,
24
  s3_output_prefix: str,
25
- job_df:pd.DataFrame,
26
  s3_bucket_name: str = TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET,
27
- local_output_dir: str = OUTPUT_FOLDER,
28
- handwrite_signature_checkbox:List[str] = list(),
29
- successful_job_number:int=0,
30
- total_document_page_count:int=1,
31
  general_s3_bucket_name: str = DOCUMENT_REDACTION_BUCKET,
32
- aws_region: str = AWS_REGION # Optional: specify region if not default
33
- ):
34
  """
35
  Uploads a local PDF to S3, starts a Textract analysis job (detecting text & signatures),
36
  waits for completion, and downloads the output JSON from S3 to a local directory.
@@ -42,7 +60,7 @@ def analyse_document_with_textract_api(
42
  s3_output_prefix (str): S3 prefix (folder) where Textract should write output.
43
  job_df (pd.DataFrame): Dataframe containing information from previous Textract API calls.
44
  s3_bucket_name (str, optional): S3 bucket in which to save API call outputs.
45
- local_output_dir (str, optional): Local directory to save the downloaded JSON results.
46
  handwrite_signature_checkbox (List[str], optional): List of feature types to extract from the document.
47
  successful_job_number (int): The number of successful jobs that have been submitted in this session.
48
  total_document_page_count (int): The number of pages in the document
@@ -71,7 +89,7 @@ def analyse_document_with_textract_api(
71
  file_extension = os.path.splitext(local_pdf_path)[1].lower()
72
 
73
  # Load pdf to get page count if not provided
74
- if not total_document_page_count and file_extension in ['.pdf']:
75
  print("Page count not provided. Loading PDF to get page count")
76
  try:
77
  pymupdf_doc = pymupdf.open(local_pdf_path)
@@ -81,7 +99,7 @@ def analyse_document_with_textract_api(
81
  except Exception as e:
82
  print("Failed to load PDF to get page count:", e, "setting page count to 1")
83
  total_document_page_count = 1
84
- #raise Exception(f"Failed to load PDF to get page count: {e}")
85
  else:
86
  total_document_page_count = 1
87
 
@@ -89,50 +107,92 @@ def analyse_document_with_textract_api(
89
  os.makedirs(local_output_dir)
90
  log_message = f"Created local output directory: {local_output_dir}"
91
  print(log_message)
92
- #logging.info(log_message)
93
 
94
  # Initialize boto3 clients
95
  session = boto3.Session(region_name=aws_region)
96
- s3_client = session.client('s3')
97
- textract_client = session.client('textract')
98
 
99
  # --- 1. Upload PDF to S3 ---
100
  pdf_filename = os.path.basename(local_pdf_path)
101
- s3_input_key = os.path.join(s3_input_prefix, pdf_filename).replace("\\", "/") # Ensure forward slashes for S3
102
 
103
- log_message = f"Uploading '{local_pdf_path}' to 's3://{s3_bucket_name}/{s3_input_key}'..."
104
  print(log_message)
105
- #logging.info(log_message)
106
  try:
107
  s3_client.upload_file(local_pdf_path, s3_bucket_name, s3_input_key)
108
  log_message = "Upload successful."
109
  print(log_message)
110
- #logging.info(log_message)
111
  except Exception as e:
112
  log_message = f"Failed to upload PDF to S3: {e}"
113
  print(log_message)
114
- #logging.error(log_message)
115
  raise
116
 
117
  # Filter job_df to include rows only where the analysis date is after the current date - DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS
118
  if not job_df.empty:
119
120
 
121
  # If job_df is not empty
122
- if not job_df.empty:
123
 
124
  if "file_name" in job_df.columns:
125
- matching_job_id_file_names = job_df.loc[(job_df["file_name"] == pdf_filename) & (job_df["signature_extraction"].astype(str) == str(handwrite_signature_checkbox)), "file_name"]
126
- matching_job_id_file_names_dates = job_df.loc[(job_df["file_name"] == pdf_filename) & (job_df["signature_extraction"].astype(str) == str(handwrite_signature_checkbox)), "job_date_time"]
127
- matching_job_id = job_df.loc[(job_df["file_name"] == pdf_filename) & (job_df["signature_extraction"].astype(str) == str(handwrite_signature_checkbox)), "job_id"]
128
- matching_handwrite_signature = job_df.loc[(job_df["file_name"] == pdf_filename) & (job_df["signature_extraction"].astype(str) == str(handwrite_signature_checkbox)), "signature_extraction"]
129
 
130
  if len(matching_job_id) > 0:
131
  pass
132
  else:
133
  matching_job_id = "unknown_job_id"
134
 
135
- if len(matching_job_id_file_names) > 0 and len(matching_handwrite_signature) > 0:
136
  out_message = f"Existing Textract outputs found for file {pdf_filename} from date {matching_job_id_file_names_dates.iloc[0]}. No need to re-analyse. Please download existing results from the list with job ID {matching_job_id.iloc[0]}"
137
  gr.Warning(out_message)
138
  raise Exception(out_message)
@@ -142,104 +202,121 @@ def analyse_document_with_textract_api(
142
  print(message)
143
 
144
  try:
145
- if "Extract signatures" in handwrite_signature_checkbox or "Extract forms" in handwrite_signature_checkbox or "Extract layout" in handwrite_signature_checkbox or "Extract tables" in handwrite_signature_checkbox:
146
  feature_types = list()
147
- if 'Extract signatures' in handwrite_signature_checkbox:
148
- feature_types.append('SIGNATURES')
149
  if "Extract forms" in handwrite_signature_checkbox:
150
- feature_types.append('FORMS')
151
  if "Extract layout" in handwrite_signature_checkbox:
152
- feature_types.append('LAYOUT')
153
  if "Extract tables" in handwrite_signature_checkbox:
154
- feature_types.append('TABLES')
155
  response = textract_client.start_document_analysis(
156
  DocumentLocation={
157
- 'S3Object': {
158
- 'Bucket': s3_bucket_name,
159
- 'Name': s3_input_key
160
- }
161
  },
162
- FeatureTypes=feature_types, # Analyze for signatures, forms, and tables
163
- OutputConfig={
164
- 'S3Bucket': s3_bucket_name,
165
- 'S3Prefix': s3_output_prefix
166
- }
167
  )
168
- job_type="document_analysis"
169
-
170
- if not "Extract signatures" in handwrite_signature_checkbox and not "Extract forms" in handwrite_signature_checkbox and not "Extract layout" in handwrite_signature_checkbox and not "Extract tables" in handwrite_signature_checkbox:
171
  response = textract_client.start_document_text_detection(
172
  DocumentLocation={
173
- 'S3Object': {
174
- 'Bucket': s3_bucket_name,
175
- 'Name': s3_input_key
176
- }
177
  },
178
- OutputConfig={
179
- 'S3Bucket': s3_bucket_name,
180
- 'S3Prefix': s3_output_prefix
181
- }
182
  )
183
- job_type="document_text_detection"
184
 
185
- job_id = response['JobId']
186
  print(f"Textract job started with JobId: {job_id}")
187
 
188
  # Prepare CSV in memory
189
  log_csv_key_location = f"{s3_output_prefix}/textract_document_jobs.csv"
190
- job_location_full = f"s3://{s3_bucket_name}/{s3_output_prefix}/{job_id}/"
191
 
192
- csv_buffer = StringIO()
193
- log_df = pd.DataFrame([{
194
- 'job_id': job_id,
195
- 'file_name': pdf_filename,
196
- 'job_type': job_type,
197
- 'signature_extraction':handwrite_signature_checkbox,
198
- 'job_date_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
199
- }])
200
 
201
  # File path
202
  log_file_path = os.path.join(local_output_dir, "textract_document_jobs.csv")
203
- log_file_path_job_id = os.path.join(local_output_dir, pdf_filename + "_textract_document_jobs_job_id.txt")
204
 
205
  # Write latest job ID to local text file
206
- with open(log_file_path_job_id, 'w') as f:
207
  f.write(job_id)
208
 
209
  # Check if file exists
210
  file_exists = os.path.exists(log_file_path)
211
 
212
  # Append to CSV if it exists, otherwise write with header
213
- log_df.to_csv(log_file_path, mode='a', index=False, header=not file_exists)
214
-
215
- #log_df.to_csv(csv_buffer)
216
 
217
  # Upload the file
218
- s3_client.upload_file(log_file_path, general_s3_bucket_name, log_csv_key_location)
219
 
220
  # Upload to S3 (overwrite existing file)
221
- #s3_client.put_object(Bucket=general_s3_bucket_name, Key=log_csv_key_location, Body=csv_buffer.getvalue())
222
  print(f"Job ID written to {log_csv_key_location}")
223
- #logging.info(f"Job ID written to s3://{s3_bucket_name}/{s3_output_prefix}/textract_document_jobs.csv")
224
 
225
  except Exception as e:
226
  error = f"Failed to start Textract job: {e}"
227
  print(error)
228
- #logging.error(error)
229
  raise
230
 
231
  successful_job_number += 1
232
  total_number_of_textract_page_calls = total_document_page_count
233
 
234
- return f"Textract analysis job submitted, job ID:{job_id}", job_id, job_type, successful_job_number, is_a_textract_api_call, total_number_of_textract_page_calls, task_textbox
235
 
236
- def return_job_status(job_id:str,
237
- response:dict,
238
- attempts:int,
239
- poll_interval_seconds: int = 0,
240
- max_polling_attempts: int = 1 # ~10 minutes total wait time
241
- ):
242
- '''
 
243
  Polls the AWS Textract service to retrieve the current status of an asynchronous document analysis job.
244
  This function checks the job status from the provided response and logs relevant information or errors.
245
 
@@ -255,87 +332,103 @@ def return_job_status(job_id:str,
255
 
256
  Raises:
257
  Exception: If the Textract job status is 'FAILED' or 'PARTIAL_SUCCESS', or if an unexpected status is encountered.
258
- '''
259
 
260
- job_status = response['JobStatus']
261
- logging.info(f"Polling attempt {attempts}/{max_polling_attempts}. Job status: {job_status}")
262
 
263
- if job_status == 'IN_PROGRESS':
264
  pass
265
- #time.sleep(poll_interval_seconds)
266
- elif job_status == 'SUCCEEDED':
267
  logging.info("Textract job succeeded.")
268
- elif job_status in ['FAILED', 'PARTIAL_SUCCESS']:
269
- status_message = response.get('StatusMessage', 'No status message provided.')
270
- warnings = response.get('Warnings', [])
271
- logging.error(f"Textract job ended with status: {job_status}. Message: {status_message}")
272
  if warnings:
273
  logging.warning(f"Warnings: {warnings}")
274
  # Decide if PARTIAL_SUCCESS should proceed or raise error
275
  # For simplicity here, we raise for both FAILED and PARTIAL_SUCCESS
276
- raise Exception(f"Textract job {job_id} failed or partially failed. Status: {job_status}. Message: {status_message}")
277
  else:
278
  # Should not happen based on documentation, but handle defensively
279
  raise Exception(f"Unexpected Textract job status: {job_status}")
280
-
281
  return job_status
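A minimal polling loop built around this helper; textract_client and job_id are assumed to exist in the caller, and the five-second sleep is illustrative:

    import time

    attempts = 0
    job_status = "IN_PROGRESS"
    while job_status == "IN_PROGRESS" and attempts < 60:
        attempts += 1
        response = textract_client.get_document_analysis(JobId=job_id, MaxResults=1)
        job_status = return_job_status(
            job_id, response, attempts, poll_interval_seconds=5, max_polling_attempts=60
        )
        if job_status == "IN_PROGRESS":
            time.sleep(5)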
282
 
283
- def download_textract_job_files(s3_client:str,
284
- s3_bucket_name:str,
285
- s3_output_key_prefix:str,
286
- pdf_filename:str,
287
- job_id:str,
288
- local_output_dir:str):
289
- '''
290
  Download and combine selected job files from the AWS Textract service.
291
- '''
292
 
293
- #print("s3_output_key_prefix at download:", s3_output_key_prefix)
294
 
295
  list_response = s3_client.list_objects_v2(
296
- Bucket=s3_bucket_name,
297
- Prefix=s3_output_key_prefix
298
  )
299
 
300
- output_files = list_response.get('Contents', [])
301
  if not output_files:
302
  # Sometimes Textract might take a moment longer to write the output after SUCCEEDED status
303
- #logging.warning("No output files found immediately after job success. Waiting briefly and retrying list...")
304
- #time.sleep(5)
305
  list_response = s3_client.list_objects_v2(
306
- Bucket=s3_bucket_name,
307
- Prefix=s3_output_key_prefix
308
  )
309
- output_files = list_response.get('Contents', [])
310
 
311
  if not output_files:
312
- logging.error(f"No output files found in s3://{s3_bucket_name}/{s3_output_key_prefix}")
 
 
313
  # You could alternatively try getting results via get_document_analysis pagination here
314
  # but sticking to the request to download from S3 output path.
315
- raise FileNotFoundError(f"Textract output files not found in S3 path: s3://{s3_bucket_name}/{s3_output_key_prefix}")
316
 
317
  # Usually, we only need the first/main JSON output file(s)
318
  # For simplicity, download the first one found. A more complex scenario might merge multiple files.
319
  # Filter out potential directory markers if any key ends with '/'
320
  json_files_to_download = [
321
- f for f in output_files
322
- if f['Key'] != s3_output_key_prefix and not f['Key'].endswith('/') and 'access_check' not in f['Key']
323
- ]
 
 
 
324
 
325
- #print("json_files_to_download:", json_files_to_download)
326
 
327
  if not json_files_to_download:
328
  error = f"No JSON files found (only prefix marker?) in s3://{s3_bucket_name}/{s3_output_key_prefix}"
329
  print(error)
330
- #logging.error(error)
331
  raise FileNotFoundError(error)
332
 
333
  combined_blocks = []
334
 
335
- for f in sorted(json_files_to_download, key=lambda x: x['Key']): # Optional: sort to ensure consistent order
336
- obj = s3_client.get_object(Bucket=s3_bucket_name, Key=f['Key'])
337
- data = json.loads(obj['Body'].read())
338
-
 
 
339
  # Assuming Textract-style output with a "Blocks" key
340
  if "Blocks" in data:
341
  combined_blocks.extend(data["Blocks"])
@@ -345,10 +438,10 @@ def download_textract_job_files(s3_client:str,
345
  # Build final combined JSON structure
346
  combined_output = {
347
  "DocumentMetadata": {
348
- "Pages": len(set(block.get('Page', 1) for block in combined_blocks))
349
  },
350
  "Blocks": combined_blocks,
351
- "JobStatus": "SUCCEEDED"
352
  }
353
 
354
  output_filename_base = os.path.basename(pdf_filename)
@@ -356,7 +449,7 @@ def download_textract_job_files(s3_client:str,
356
  local_output_filename = f"{output_filename_base_no_ext}_textract.json"
357
  local_output_path = os.path.join(local_output_dir, local_output_filename)
358
 
359
- with open(local_output_path, 'w') as f:
360
  json.dump(combined_output, f)
361
 
362
  print(f"Combined Textract output written to {local_output_path}")
@@ -367,133 +460,201 @@ def download_textract_job_files(s3_client:str,
367
  downloaded_file_path = local_output_path
368
 
369
  # Log if multiple files were found, as user might need to handle them
370
- #if len(json_files_to_download) > 1:
371
  # logging.warning(f"Multiple output files found in S3 output location. Downloaded the first: '{s3_output_key}'. Other files exist.")
372
 
373
  return downloaded_file_path
374
 
375
- def check_for_provided_job_id(job_id:str):
 
376
  if not job_id:
377
- raise Exception("Please provide a job ID.")
378
  return
379
 
 
380
  def load_pdf_job_file_from_s3(
381
  load_s3_jobs_input_loc,
382
  pdf_filename,
383
  local_output_dir,
384
  s3_bucket_name,
385
- RUN_AWS_FUNCTIONS=RUN_AWS_FUNCTIONS):
 
386
 
387
  try:
388
- pdf_file_location = ''
389
- doc_file_name_no_extension_textbox = ''
390
 
391
- s3_input_key_prefix = os.path.join(load_s3_jobs_input_loc, pdf_filename).replace("\\", "/")
 
 
392
  s3_input_key_prefix = s3_input_key_prefix + ".pdf"
393
-
394
  local_input_file_path = os.path.join(local_output_dir, pdf_filename)
395
  local_input_file_path = local_input_file_path + ".pdf"
396
 
397
- download_file_from_s3(s3_bucket_name, s3_input_key_prefix, local_input_file_path, RUN_AWS_FUNCTIONS= RUN_AWS_FUNCTIONS)
398
-
399
  pdf_file_location = [local_input_file_path]
400
  doc_file_name_no_extension_textbox = get_file_name_without_type(pdf_filename)
401
  except Exception as e:
402
- print("Could not download PDF job file from S3 due to:", e)
403
 
404
  return pdf_file_location, doc_file_name_no_extension_textbox
405
 
406
- def replace_existing_pdf_input_for_whole_document_outputs(
407
- load_s3_jobs_input_loc:str,
408
- pdf_filename:str,
409
- local_output_dir:str,
410
- s3_bucket_name:str,
411
- in_doc_files:FileData=[],
412
- input_folder:str=INPUT_FOLDER,
 
413
  RUN_AWS_FUNCTIONS=RUN_AWS_FUNCTIONS,
414
- progress = gr.Progress(track_tqdm=True)):
 
415
 
416
  progress(0.1, "Loading PDF from s3")
417
 
418
  if in_doc_files:
419
- doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count = get_input_file_names(in_doc_files)
420
 
421
  if pdf_filename == doc_file_name_no_extension_textbox:
422
  print("Existing loaded PDF file has same name as file from S3")
423
  doc_file_name_no_extension_textbox = pdf_filename
424
  downloaded_pdf_file_location = in_doc_files
425
  else:
426
- downloaded_pdf_file_location, doc_file_name_no_extension_textbox = load_pdf_job_file_from_s3(load_s3_jobs_input_loc, pdf_filename, local_output_dir, s3_bucket_name, RUN_AWS_FUNCTIONS=RUN_AWS_FUNCTIONS)
427
 
428
- doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count = get_input_file_names(downloaded_pdf_file_location)
429
- else:
430
- downloaded_pdf_file_location, doc_file_name_no_extension_textbox = load_pdf_job_file_from_s3(load_s3_jobs_input_loc, pdf_filename, local_output_dir, s3_bucket_name, RUN_AWS_FUNCTIONS=RUN_AWS_FUNCTIONS)
431
 
432
- doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count = get_input_file_names(downloaded_pdf_file_location)
433
 
434
- return downloaded_pdf_file_location, doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count
435
 
436
  def poll_whole_document_textract_analysis_progress_and_download(
437
- job_id:str,
438
- job_type_dropdown:str,
439
  s3_output_prefix: str,
440
- pdf_filename:str,
441
- job_df:pd.DataFrame,
442
  s3_bucket_name: str = TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET,
443
  local_output_dir: str = OUTPUT_FOLDER,
444
- load_s3_jobs_loc:str=TEXTRACT_JOBS_S3_LOC,
445
- load_local_jobs_loc:str=TEXTRACT_JOBS_LOCAL_LOC,
446
- aws_region: str = AWS_REGION, # Optional: specify region if not default
447
- load_jobs_from_s3:str = LOAD_PREVIOUS_TEXTRACT_JOBS_S3,
448
  poll_interval_seconds: int = 1,
449
- max_polling_attempts: int = 1, # ~10 minutes total wait time):
450
  DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS: int = DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS,
451
- progress = gr.Progress(track_tqdm=True)
452
- ):
453
- '''
454
  Poll AWS for the status of a Textract API job. Return status, and if finished, combine and download results into a locally-stored json file for further processing by the app.
455
- '''
456
 
457
  progress(0.1, "Querying AWS Textract for status of document analysis job")
458
 
459
  if job_id:
460
  # Initialize boto3 clients
461
  session = boto3.Session(region_name=aws_region)
462
- s3_client = session.client('s3')
463
- textract_client = session.client('textract')
464
 
465
  # --- 3. Poll for Job Completion ---
466
- job_status = 'IN_PROGRESS'
467
  attempts = 0
468
 
469
  message = "Polling Textract for job completion status..."
470
  print(message)
471
- #logging.info("Polling Textract for job completion status...")
472
 
473
  # Update Textract document history df
474
  try:
475
- job_df = load_in_textract_job_details(load_s3_jobs=load_jobs_from_s3,
476
- load_s3_jobs_loc=load_s3_jobs_loc,
477
- load_local_jobs_loc=load_local_jobs_loc)
 
 
478
  except Exception as e:
479
- #logging.error(f"Failed to update job details dataframe: {e}")
480
  print(f"Failed to update job details dataframe: {e}")
481
- #raise
482
 
483
- while job_status == 'IN_PROGRESS' and attempts <= max_polling_attempts:
484
  attempts += 1
485
  try:
486
- if job_type_dropdown=="document_analysis":
487
  response = textract_client.get_document_analysis(JobId=job_id)
488
- job_status = return_job_status(job_id, response, attempts, poll_interval_seconds, max_polling_attempts)
489
- elif job_type_dropdown=="document_text_detection":
490
  response = textract_client.get_document_text_detection(JobId=job_id)
491
- job_status = return_job_status(job_id, response, attempts, poll_interval_seconds, max_polling_attempts)
492
  else:
493
- error = f"Unknown job type, cannot poll job"
494
  print(error)
495
- logging.error(f"Invalid JobId: {job_id}. This might happen if the job expired (older than {DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS} days) or never existed.")
496
- raise Exception(error_message)
497
 
498
  except textract_client.exceptions.InvalidJobIdException:
499
  error_message = f"Invalid JobId: {job_id}. This might happen if the job expired (older than {DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS} days) or never existed."
@@ -501,17 +662,19 @@ def poll_whole_document_textract_analysis_progress_and_download(
501
  logging.error(error_message)
502
  raise Exception(error_message)
503
  except Exception as e:
504
- error_message = f"Error while polling Textract status for job {job_id}: {e}"
 
 
505
  print(error_message)
506
  logging.error(error_message)
507
  raise Exception(error_message)
508
 
509
  downloaded_file_path = None
510
- if job_status == 'SUCCEEDED':
511
- #raise TimeoutError(f"Textract job {job_id} did not complete successfully within the polling limit.")
512
- # 3b - Replace PDF file name if it exists in the job dataframe
513
 
514
- progress(0.5, "Document analysis task outputs found. Downloading from S3")
515
 
516
  # If job_df is not empty
517
 
@@ -520,7 +683,9 @@ def poll_whole_document_textract_analysis_progress_and_download(
520
 
521
  if not job_df.empty:
522
  if "file_name" in job_df.columns:
523
- matching_job_id_file_names = job_df.loc[job_df["job_id"] == job_id, "file_name"]
 
 
524
 
525
  if pdf_filename and not matching_job_id_file_names.empty:
526
  if pdf_filename == matching_job_id_file_names.iloc[0]:
@@ -537,58 +702,77 @@ def poll_whole_document_textract_analysis_progress_and_download(
537
  # Textract typically creates output under s3_output_prefix/job_id/
538
  # There might be multiple JSON files if pagination occurred during writing.
539
  # Usually, for smaller docs, there's one file, often named '1'.
540
- # For robust handling, list objects and find the JSON(s).
541
 
542
- s3_output_key_prefix = os.path.join(s3_output_prefix, job_id).replace("\\", "/") + "/"
543
- logging.info(f"Searching for output files in s3://{s3_bucket_name}/{s3_output_key_prefix}")
544
 
545
  try:
546
- downloaded_file_path = download_textract_job_files(s3_client,
547
- s3_bucket_name,
548
- s3_output_key_prefix,
549
- pdf_filename,
550
- job_id,
551
- local_output_dir)
 
 
552
 
553
  except Exception as e:
554
- #logging.error(f"Failed to download or process Textract output from S3: {e}")
555
  print(f"Failed to download or process Textract output from S3: {e}")
556
  raise
557
 
558
  else:
559
- raise Exception("No Job ID provided.")
560
-
561
  output_pdf_filename = get_file_name_without_type(pdf_filename)
562
 
563
  return downloaded_file_path, job_status, job_df, output_pdf_filename
564
 
565
- def load_in_textract_job_details(load_s3_jobs:str=LOAD_PREVIOUS_TEXTRACT_JOBS_S3,
566
- load_s3_jobs_loc:str=TEXTRACT_JOBS_S3_LOC,
567
- load_local_jobs_loc:str=TEXTRACT_JOBS_LOCAL_LOC,
568
- document_redaction_bucket:str=DOCUMENT_REDACTION_BUCKET,
569
- aws_region:str=AWS_REGION,
570
- DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS:int=DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS):
571
- '''
 
 
 
572
  Load in a dataframe of jobs previously submitted to the Textract API service.
573
- '''
574
- job_df = pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','job_date_time'])
575
 
576
  # Initialize boto3 clients
577
  session = boto3.Session(region_name=aws_region)
578
- s3_client = session.client('s3')
 
 
579
 
580
- local_output_path = f'{load_local_jobs_loc}/textract_document_jobs.csv'
 
581
 
582
- if load_s3_jobs == 'True':
583
- s3_output_key = f'{load_s3_jobs_loc}/textract_document_jobs.csv'
584
-
585
  try:
586
  s3_client.head_object(Bucket=document_redaction_bucket, Key=s3_output_key)
587
- #print(f"File exists. Downloading from '{s3_output_key}' to '{local_output_path}'...")
588
- s3_client.download_file(document_redaction_bucket, s3_output_key, local_output_path)
589
- #print("Download successful.")
 
 
590
  except ClientError as e:
591
- if e.response['Error']['Code'] == '404':
592
  print("Log file does not exist in S3.")
593
  else:
594
  print(f"Unexpected error occurred: {e}")
@@ -602,22 +786,37 @@ def load_in_textract_job_details(load_s3_jobs:str=LOAD_PREVIOUS_TEXTRACT_JOBS_S3
602
  job_df = pd.read_csv(local_output_path)
603
 
604
  if "job_date_time" in job_df.columns:
605
- job_df["job_date_time"] = pd.to_datetime(job_df["job_date_time"], errors='coerce')
 
 
606
  # Keep only jobs that have been completed in the last 'DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS' days
607
- cutoff_time = pd.Timestamp.now() - pd.Timedelta(days=DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS)
608
- job_df = job_df.loc[job_df["job_date_time"] > cutoff_time,:]
 
 
609
 
610
  try:
611
- job_df = job_df[['job_id','file_name','job_type','signature_extraction','job_date_time']]
612
  except Exception as e:
613
- print("Could not find one or more columns in Textract whole document list dataframe:", e)
 
 
 
614
 
615
  return job_df
616
 
617
- def download_textract_output(job_id:str,
618
- output_bucket:str,
619
- output_prefix:str,
620
- local_folder:str):
621
  """
622
  Checks the status of a Textract job and downloads the output ZIP file if the job is complete.
623
 
@@ -626,23 +825,26 @@ def download_textract_output(job_id:str,
626
  :param output_prefix: The prefix (folder path) in S3 where the output file is stored.
627
  :param local_folder: The local directory where the ZIP file should be saved.
628
  """
629
- textract_client = boto3.client('textract')
630
- s3_client = boto3.client('s3')
631
 
632
  # Check job status
633
  while True:
634
  response = textract_client.get_document_analysis(JobId=job_id)
635
- status = response['JobStatus']
636
-
637
- if status == 'SUCCEEDED':
638
  print("Job completed successfully.")
639
  break
640
- elif status == 'FAILED':
641
- print("Job failed:", response.get("StatusMessage", "No error message provided."))
 
 
 
642
  return
643
  else:
644
  print(f"Job is still {status}.")
645
- #time.sleep(10) # Wait before checking again
646
 
647
  # Find output ZIP file in S3
648
  output_file_key = f"{output_prefix}/{job_id}.zip"
@@ -655,8 +857,12 @@ def download_textract_output(job_id:str,
655
  except Exception as e:
656
  print(f"Error downloading file: {e}")
657
 
 
658
  def check_textract_outputs_exist(textract_output_found_checkbox):
659
- if textract_output_found_checkbox == True:
660
- print("Textract outputs found")
661
- return
662
- else: raise Exception("Relevant Textract outputs not found. Please ensure you have selected to correct results output and you have uploaded the relevant document file in 'Choose document or image file...' above")
1
+ import datetime
 
 
2
  import json
3
  import logging
4
+ import os
+ import time
5
+ from io import StringIO
6
+ from typing import List
7
+
8
+ import boto3
9
  import gradio as gr
10
+ import pandas as pd
11
+ import pymupdf
12
+ from botocore.exceptions import (
13
+ ClientError,
14
+ NoCredentialsError,
15
+ PartialCredentialsError,
16
+ TokenRetrievalError,
17
+ )
18
  from gradio import FileData
19
+
20
  from tools.aws_functions import download_file_from_s3
21
+ from tools.config import (
22
+ AWS_REGION,
23
+ DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS,
24
+ DOCUMENT_REDACTION_BUCKET,
25
+ INPUT_FOLDER,
26
+ LOAD_PREVIOUS_TEXTRACT_JOBS_S3,
27
+ OUTPUT_FOLDER,
28
+ RUN_AWS_FUNCTIONS,
29
+ TEXTRACT_JOBS_LOCAL_LOC,
30
+ TEXTRACT_JOBS_S3_LOC,
31
+ TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET,
32
+ )
33
  from tools.file_conversion import get_input_file_names
34
  from tools.helper_functions import get_file_name_without_type
35
 
36
  DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS = int(DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS)
37
 
38
+
39
  def analyse_document_with_textract_api(
40
  local_pdf_path: str,
41
  s3_input_prefix: str,
42
  s3_output_prefix: str,
43
+ job_df: pd.DataFrame,
44
  s3_bucket_name: str = TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET,
45
+ local_output_dir: str = OUTPUT_FOLDER,
46
+ handwrite_signature_checkbox: List[str] = list(),
47
+ successful_job_number: int = 0,
48
+ total_document_page_count: int = 1,
49
  general_s3_bucket_name: str = DOCUMENT_REDACTION_BUCKET,
50
+ aws_region: str = AWS_REGION, # Optional: specify region if not default
51
+ ):
52
  """
53
  Uploads a local PDF to S3 and starts an asynchronous Textract analysis job (text detection plus any selected feature types).
54
  Polling for completion and downloading the output JSON from S3 are handled by the separate helpers later in this module.
 
60
  s3_output_prefix (str): S3 prefix (folder) where Textract should write output.
61
  job_df (pd.DataFrame): Dataframe containing information from previous Textract API calls.
62
  s3_bucket_name (str, optional): S3 bucket in which to save API call outputs.
63
+ local_output_dir (str, optional): Local directory to save the downloaded JSON results.
64
  handwrite_signature_checkbox (List[str], optional): List of feature types to extract from the document.
65
  successful_job_number (int): The number of successful jobs that have been submitted in this session.
66
  total_document_page_count (int): The number of pages in the document
 
89
  file_extension = os.path.splitext(local_pdf_path)[1].lower()
90
 
91
  # Load pdf to get page count if not provided
92
+ if not total_document_page_count and file_extension in [".pdf"]:
93
  print("Page count not provided. Loading PDF to get page count")
94
  try:
95
  pymupdf_doc = pymupdf.open(local_pdf_path)
 
99
  except Exception as e:
100
  print("Failed to load PDF to get page count:", e, "setting page count to 1")
101
  total_document_page_count = 1
102
+ # raise Exception(f"Failed to load PDF to get page count: {e}")
103
  else:
104
  total_document_page_count = 1
105
 
 
107
  os.makedirs(local_output_dir)
108
  log_message = f"Created local output directory: {local_output_dir}"
109
  print(log_message)
110
+ # logging.info(log_message)
111
 
112
  # Initialize boto3 clients
113
  session = boto3.Session(region_name=aws_region)
114
+ s3_client = session.client("s3")
115
+ textract_client = session.client("textract")
116
 
117
  # --- 1. Upload PDF to S3 ---
118
  pdf_filename = os.path.basename(local_pdf_path)
119
+ s3_input_key = os.path.join(s3_input_prefix, pdf_filename).replace(
120
+ "\\", "/"
121
+ ) # Ensure forward slashes for S3
122
 
123
+ log_message = (
124
+ f"Uploading '{local_pdf_path}' to 's3://{s3_bucket_name}/{s3_input_key}'..."
125
+ )
126
  print(log_message)
127
+ # logging.info(log_message)
128
  try:
129
  s3_client.upload_file(local_pdf_path, s3_bucket_name, s3_input_key)
130
  log_message = "Upload successful."
131
  print(log_message)
132
+ # logging.info(log_message)
133
  except Exception as e:
134
  log_message = f"Failed to upload PDF to S3: {e}"
135
  print(log_message)
136
+ # logging.error(log_message)
137
  raise
138
 
139
  # Filter job_df to include rows only where the analysis date is after the current date - DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS
140
  if not job_df.empty:
141
+ job_df = job_df.loc[
142
+ job_df["job_date_time"]
143
+ > (
144
+ datetime.datetime.now()
145
+ - datetime.timedelta(days=DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS)
146
+ ),
147
+ :,
148
+ ]
149
 
150
  # If job_df is not empty
151
+ if not job_df.empty:
152
 
153
  if "file_name" in job_df.columns:
154
+ matching_job_id_file_names = job_df.loc[
155
+ (job_df["file_name"] == pdf_filename)
156
+ & (
157
+ job_df["signature_extraction"].astype(str)
158
+ == str(handwrite_signature_checkbox)
159
+ ),
160
+ "file_name",
161
+ ]
162
+ matching_job_id_file_names_dates = job_df.loc[
163
+ (job_df["file_name"] == pdf_filename)
164
+ & (
165
+ job_df["signature_extraction"].astype(str)
166
+ == str(handwrite_signature_checkbox)
167
+ ),
168
+ "job_date_time",
169
+ ]
170
+ matching_job_id = job_df.loc[
171
+ (job_df["file_name"] == pdf_filename)
172
+ & (
173
+ job_df["signature_extraction"].astype(str)
174
+ == str(handwrite_signature_checkbox)
175
+ ),
176
+ "job_id",
177
+ ]
178
+ matching_handwrite_signature = job_df.loc[
179
+ (job_df["file_name"] == pdf_filename)
180
+ & (
181
+ job_df["signature_extraction"].astype(str)
182
+ == str(handwrite_signature_checkbox)
183
+ ),
184
+ "signature_extraction",
185
+ ]
186
 
187
  if len(matching_job_id) > 0:
188
  pass
189
  else:
190
  matching_job_id = "unknown_job_id"
191
 
192
+ if (
193
+ len(matching_job_id_file_names) > 0
194
+ and len(matching_handwrite_signature) > 0
195
+ ):
196
  out_message = f"Existing Textract outputs found for file {pdf_filename} from date {matching_job_id_file_names_dates.iloc[0]}. No need to re-analyse. Please download existing results from the list with job ID {matching_job_id.iloc[0]}"
197
  gr.Warning(out_message)
198
  raise Exception(out_message)
 
202
  print(message)
203
 
204
  try:
205
+ if (
206
+ "Extract signatures" in handwrite_signature_checkbox
207
+ or "Extract forms" in handwrite_signature_checkbox
208
+ or "Extract layout" in handwrite_signature_checkbox
209
+ or "Extract tables" in handwrite_signature_checkbox
210
+ ):
211
  feature_types = list()
212
+ if "Extract signatures" in handwrite_signature_checkbox:
213
+ feature_types.append("SIGNATURES")
214
  if "Extract forms" in handwrite_signature_checkbox:
215
+ feature_types.append("FORMS")
216
  if "Extract layout" in handwrite_signature_checkbox:
217
+ feature_types.append("LAYOUT")
218
  if "Extract tables" in handwrite_signature_checkbox:
219
+ feature_types.append("TABLES")
220
  response = textract_client.start_document_analysis(
221
  DocumentLocation={
222
+ "S3Object": {"Bucket": s3_bucket_name, "Name": s3_input_key}
 
 
 
223
  },
224
+ FeatureTypes=feature_types,  # Analyze for whichever of signatures, forms, layout and tables were selected
225
+ OutputConfig={"S3Bucket": s3_bucket_name, "S3Prefix": s3_output_prefix},
 
 
 
226
  )
227
+ job_type = "document_analysis"
228
+
229
+ if (
230
+ "Extract signatures" not in handwrite_signature_checkbox
231
+ and "Extract forms" not in handwrite_signature_checkbox
232
+ and "Extract layout" not in handwrite_signature_checkbox
233
+ and "Extract tables" not in handwrite_signature_checkbox
234
+ ):
235
  response = textract_client.start_document_text_detection(
236
  DocumentLocation={
237
+ "S3Object": {"Bucket": s3_bucket_name, "Name": s3_input_key}
 
 
 
238
  },
239
+ OutputConfig={"S3Bucket": s3_bucket_name, "S3Prefix": s3_output_prefix},
 
 
 
240
  )
241
+ job_type = "document_text_detection"
242
 
243
+ job_id = response["JobId"]
244
  print(f"Textract job started with JobId: {job_id}")
245
 
246
  # Prepare CSV in memory
247
  log_csv_key_location = f"{s3_output_prefix}/textract_document_jobs.csv"
 
248
 
249
+ # csv_buffer = StringIO()  # in-memory buffer no longer needed; the log CSV is appended to a local file and uploaded below
250
+ log_df = pd.DataFrame(
251
+ [
252
+ {
253
+ "job_id": job_id,
254
+ "file_name": pdf_filename,
255
+ "job_type": job_type,
256
+ "signature_extraction": handwrite_signature_checkbox,
257
+ "job_date_time": datetime.datetime.now().strftime(
258
+ "%Y-%m-%d %H:%M:%S"
259
+ ),
260
+ }
261
+ ]
262
+ )
263
 
264
  # File path
265
  log_file_path = os.path.join(local_output_dir, "textract_document_jobs.csv")
266
+ log_file_path_job_id = os.path.join(
267
+ local_output_dir, pdf_filename + "_textract_document_jobs_job_id.txt"
268
+ )
269
 
270
  # Write latest job ID to local text file
271
+ with open(log_file_path_job_id, "w") as f:
272
  f.write(job_id)
273
 
274
  # Check if file exists
275
  file_exists = os.path.exists(log_file_path)
276
 
277
  # Append to CSV if it exists, otherwise write with header
278
+ log_df.to_csv(log_file_path, mode="a", index=False, header=not file_exists)
279
+
280
+ # log_df.to_csv(csv_buffer)
281
 
282
  # Upload the file
283
+ s3_client.upload_file(
284
+ log_file_path, general_s3_bucket_name, log_csv_key_location
285
+ )
286
 
287
  # Upload to S3 (overwrite existing file)
288
+ # s3_client.put_object(Bucket=general_s3_bucket_name, Key=log_csv_key_location, Body=csv_buffer.getvalue())
289
  print(f"Job ID written to {log_csv_key_location}")
290
+ # logging.info(f"Job ID written to s3://{s3_bucket_name}/{s3_output_prefix}/textract_document_jobs.csv")
291
 
292
  except Exception as e:
293
  error = f"Failed to start Textract job: {e}"
294
  print(error)
295
+ # logging.error(error)
296
  raise
297
 
298
  successful_job_number += 1
299
  total_number_of_textract_page_calls = total_document_page_count
300
 
301
+ return (
302
+ f"Textract analysis job submitted, job ID:{job_id}",
303
+ job_id,
304
+ job_type,
305
+ successful_job_number,
306
+ is_a_textract_api_call,
307
+ total_number_of_textract_page_calls,
308
+ task_textbox,
309
+ )
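For orientation, a minimal sketch of how this submission helper might be driven outside the Gradio app. The import path, bucket, prefixes and file names below are hypothetical placeholders, and AWS credentials with S3 and Textract permissions are assumed to be configured.

import pandas as pd

# Hypothetical import path; adjust to wherever this module lives in the repository.
from tools.textract_batch_call import analyse_document_with_textract_api

# Start with an empty job history, using the same columns the module keeps in its job log.
job_df = pd.DataFrame(
    columns=["job_id", "file_name", "job_type", "signature_extraction", "job_date_time"]
)

(
    message,
    job_id,
    job_type,
    successful_job_number,
    is_a_textract_api_call,
    page_calls,
    task,
) = analyse_document_with_textract_api(
    local_pdf_path="example_data/example.pdf",  # hypothetical input file
    s3_input_prefix="textract/input",  # hypothetical prefix
    s3_output_prefix="textract/output",  # hypothetical prefix
    job_df=job_df,
    s3_bucket_name="my-textract-bucket",  # hypothetical bucket
    handwrite_signature_checkbox=["Extract signatures"],
)
print(message)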
310
+
311
 
312
+ def return_job_status(
313
+ job_id: str,
314
+ response: dict,
315
+ attempts: int,
316
+ poll_interval_seconds: int = 0,
317
+ max_polling_attempts: int = 1, # ~10 minutes total wait time
318
+ ):
319
+ """
320
  Interprets the status of an asynchronous Textract document analysis job from a response already fetched by the caller.
321
  This function checks the job status from the provided response and logs relevant information or errors.
322
 
 
332
 
333
  Raises:
334
  Exception: If the Textract job status is 'FAILED' or 'PARTIAL_SUCCESS', or if an unexpected status is encountered.
335
+ """
336
 
337
+ job_status = response["JobStatus"]
338
+ logging.info(
339
+ f"Polling attempt {attempts}/{max_polling_attempts}. Job status: {job_status}"
340
+ )
341
 
342
+ if job_status == "IN_PROGRESS":
343
  pass
344
+ # time.sleep(poll_interval_seconds)
345
+ elif job_status == "SUCCEEDED":
346
  logging.info("Textract job succeeded.")
347
+ elif job_status in ["FAILED", "PARTIAL_SUCCESS"]:
348
+ status_message = response.get("StatusMessage", "No status message provided.")
349
+ warnings = response.get("Warnings", [])
350
+ logging.error(
351
+ f"Textract job ended with status: {job_status}. Message: {status_message}"
352
+ )
353
  if warnings:
354
  logging.warning(f"Warnings: {warnings}")
355
  # Decide if PARTIAL_SUCCESS should proceed or raise error
356
  # For simplicity here, we raise for both FAILED and PARTIAL_SUCCESS
357
+ raise Exception(
358
+ f"Textract job {job_id} failed or partially failed. Status: {job_status}. Message: {status_message}"
359
+ )
360
  else:
361
  # Should not happen based on documentation, but handle defensively
362
  raise Exception(f"Unexpected Textract job status: {job_status}")
363
+
364
  return job_status
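To make the branching above concrete, a small illustration of how this helper reacts to the possible JobStatus values; the response dicts are hand-built stand-ins for what get_document_analysis would return, and the import path is hypothetical.

from tools.textract_batch_call import return_job_status  # hypothetical module path

# A successful poll simply returns the status string.
status = return_job_status(
    job_id="job-123",  # hypothetical job ID
    response={"JobStatus": "SUCCEEDED"},
    attempts=1,
)
assert status == "SUCCEEDED"

# FAILED (or PARTIAL_SUCCESS) raises, carrying the StatusMessage through to the caller.
try:
    return_job_status(
        job_id="job-123",
        response={"JobStatus": "FAILED", "StatusMessage": "Document too large"},
        attempts=1,
    )
except Exception as e:
    print(e)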
365
 
366
+
367
+ def download_textract_job_files(
368
+ s3_client: str,
369
+ s3_bucket_name: str,
370
+ s3_output_key_prefix: str,
371
+ pdf_filename: str,
372
+ job_id: str,
373
+ local_output_dir: str,
374
+ ):
375
+ """
376
  Download and combine selected job files from the AWS Textract service.
377
+ """
378
 
379
+ # print("s3_output_key_prefix at download:", s3_output_key_prefix)
380
 
381
  list_response = s3_client.list_objects_v2(
382
+ Bucket=s3_bucket_name, Prefix=s3_output_key_prefix
 
383
  )
384
 
385
+ output_files = list_response.get("Contents", [])
386
  if not output_files:
387
  # Sometimes Textract might take a moment longer to write the output after SUCCEEDED status
388
+ # logging.warning("No output files found immediately after job success. Waiting briefly and retrying list...")
389
+ # time.sleep(5)
390
  list_response = s3_client.list_objects_v2(
391
+ Bucket=s3_bucket_name, Prefix=s3_output_key_prefix
 
392
  )
393
+ output_files = list_response.get("Contents", [])
394
 
395
  if not output_files:
396
+ logging.error(
397
+ f"No output files found in s3://{s3_bucket_name}/{s3_output_key_prefix}"
398
+ )
399
  # You could alternatively try getting results via get_document_analysis pagination here
400
  # but sticking to the request to download from S3 output path.
401
+ raise FileNotFoundError(
402
+ f"Textract output files not found in S3 path: s3://{s3_bucket_name}/{s3_output_key_prefix}"
403
+ )
404
 
405
  # Usually, we only need the first/main JSON output file(s)
406
  # For simplicity, download the first one found. A more complex scenario might merge multiple files.
407
  # Filter out potential directory markers if any key ends with '/'
408
  json_files_to_download = [
409
+ f
410
+ for f in output_files
411
+ if f["Key"] != s3_output_key_prefix
412
+ and not f["Key"].endswith("/")
413
+ and "access_check" not in f["Key"]
414
+ ]
415
 
416
+ # print("json_files_to_download:", json_files_to_download)
417
 
418
  if not json_files_to_download:
419
  error = f"No JSON files found (only prefix marker?) in s3://{s3_bucket_name}/{s3_output_key_prefix}"
420
  print(error)
421
+ # logging.error(error)
422
  raise FileNotFoundError(error)
423
 
424
  combined_blocks = []
425
 
426
+ for f in sorted(
427
+ json_files_to_download, key=lambda x: x["Key"]
428
+ ): # Optional: sort to ensure consistent order
429
+ obj = s3_client.get_object(Bucket=s3_bucket_name, Key=f["Key"])
430
+ data = json.loads(obj["Body"].read())
431
+
432
  # Assuming Textract-style output with a "Blocks" key
433
  if "Blocks" in data:
434
  combined_blocks.extend(data["Blocks"])
 
438
  # Build final combined JSON structure
439
  combined_output = {
440
  "DocumentMetadata": {
441
+ "Pages": len(set(block.get("Page", 1) for block in combined_blocks))
442
  },
443
  "Blocks": combined_blocks,
444
+ "JobStatus": "SUCCEEDED",
445
  }
446
 
447
  output_filename_base = os.path.basename(pdf_filename)
 
449
  local_output_filename = f"{output_filename_base_no_ext}_textract.json"
450
  local_output_path = os.path.join(local_output_dir, local_output_filename)
451
 
452
+ with open(local_output_path, "w") as f:
453
  json.dump(combined_output, f)
454
 
455
  print(f"Combined Textract output written to {local_output_path}")
 
460
  downloaded_file_path = local_output_path
461
 
462
  # Log if multiple files were found, as user might need to handle them
463
+ # if len(json_files_to_download) > 1:
464
  # logging.warning(f"Multiple output files found in S3 output location. Downloaded the first: '{s3_output_key}'. Other files exist.")
465
 
466
  return downloaded_file_path
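The combined file written above follows the usual Textract response shape, so it can be inspected with nothing but the standard library. A small sketch, assuming a <name>_textract.json file produced by this helper exists at the (hypothetical) path shown.

import json
from collections import Counter

# Hypothetical local path written by download_textract_job_files.
with open("output/example_textract.json") as f:
    combined = json.load(f)

print("Pages:", combined["DocumentMetadata"]["Pages"])

# Tally block types per page as a quick sanity check on the merge.
per_page = Counter(
    (block.get("Page", 1), block.get("BlockType", "UNKNOWN"))
    for block in combined["Blocks"]
)
for (page, block_type), count in sorted(per_page.items()):
    print(f"page {page}: {count} x {block_type}")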
467
 
468
+
469
+ def check_for_provided_job_id(job_id: str):
470
  if not job_id:
471
+ raise Exception("Please provide a job ID.")
472
  return
473
 
474
+
475
  def load_pdf_job_file_from_s3(
476
  load_s3_jobs_input_loc,
477
  pdf_filename,
478
  local_output_dir,
479
  s3_bucket_name,
480
+ RUN_AWS_FUNCTIONS=RUN_AWS_FUNCTIONS,
481
+ ):
482
 
483
  try:
484
+ pdf_file_location = ""
485
+ doc_file_name_no_extension_textbox = ""
486
 
487
+ s3_input_key_prefix = os.path.join(
488
+ load_s3_jobs_input_loc, pdf_filename
489
+ ).replace("\\", "/")
490
  s3_input_key_prefix = s3_input_key_prefix + ".pdf"
491
+
492
  local_input_file_path = os.path.join(local_output_dir, pdf_filename)
493
  local_input_file_path = local_input_file_path + ".pdf"
494
 
495
+ download_file_from_s3(
496
+ s3_bucket_name,
497
+ s3_input_key_prefix,
498
+ local_input_file_path,
499
+ RUN_AWS_FUNCTIONS=RUN_AWS_FUNCTIONS,
500
+ )
501
+
502
  pdf_file_location = [local_input_file_path]
503
  doc_file_name_no_extension_textbox = get_file_name_without_type(pdf_filename)
504
  except Exception as e:
505
+ print("Could not download PDF job file from S3 due to:", e)
506
 
507
  return pdf_file_location, doc_file_name_no_extension_textbox
508
 
509
+
510
+ def replace_existing_pdf_input_for_whole_document_outputs(
511
+ load_s3_jobs_input_loc: str,
512
+ pdf_filename: str,
513
+ local_output_dir: str,
514
+ s3_bucket_name: str,
515
+ in_doc_files: FileData = [],
516
+ input_folder: str = INPUT_FOLDER,
517
  RUN_AWS_FUNCTIONS=RUN_AWS_FUNCTIONS,
518
+ progress=gr.Progress(track_tqdm=True),
519
+ ):
520
 
521
  progress(0.1, "Loading PDF from s3")
522
 
523
  if in_doc_files:
524
+ (
525
+ doc_file_name_no_extension_textbox,
526
+ doc_file_name_with_extension_textbox,
527
+ doc_full_file_name_textbox,
528
+ doc_file_name_textbox_list,
529
+ total_pdf_page_count,
530
+ ) = get_input_file_names(in_doc_files)
531
 
532
  if pdf_filename == doc_file_name_no_extension_textbox:
533
  print("Existing loaded PDF file has same name as file from S3")
534
  doc_file_name_no_extension_textbox = pdf_filename
535
  downloaded_pdf_file_location = in_doc_files
536
  else:
537
+ downloaded_pdf_file_location, doc_file_name_no_extension_textbox = (
538
+ load_pdf_job_file_from_s3(
539
+ load_s3_jobs_input_loc,
540
+ pdf_filename,
541
+ local_output_dir,
542
+ s3_bucket_name,
543
+ RUN_AWS_FUNCTIONS=RUN_AWS_FUNCTIONS,
544
+ )
545
+ )
546
 
547
+ (
548
+ doc_file_name_no_extension_textbox,
549
+ doc_file_name_with_extension_textbox,
550
+ doc_full_file_name_textbox,
551
+ doc_file_name_textbox_list,
552
+ total_pdf_page_count,
553
+ ) = get_input_file_names(downloaded_pdf_file_location)
554
+ else:
555
+ downloaded_pdf_file_location, doc_file_name_no_extension_textbox = (
556
+ load_pdf_job_file_from_s3(
557
+ load_s3_jobs_input_loc,
558
+ pdf_filename,
559
+ local_output_dir,
560
+ s3_bucket_name,
561
+ RUN_AWS_FUNCTIONS=RUN_AWS_FUNCTIONS,
562
+ )
563
+ )
564
 
565
+ (
566
+ doc_file_name_no_extension_textbox,
567
+ doc_file_name_with_extension_textbox,
568
+ doc_full_file_name_textbox,
569
+ doc_file_name_textbox_list,
570
+ total_pdf_page_count,
571
+ ) = get_input_file_names(downloaded_pdf_file_location)
572
+
573
+ return (
574
+ downloaded_pdf_file_location,
575
+ doc_file_name_no_extension_textbox,
576
+ doc_file_name_with_extension_textbox,
577
+ doc_full_file_name_textbox,
578
+ doc_file_name_textbox_list,
579
+ total_pdf_page_count,
580
+ )
581
 
 
582
 
583
  def poll_whole_document_textract_analysis_progress_and_download(
584
+ job_id: str,
585
+ job_type_dropdown: str,
586
  s3_output_prefix: str,
587
+ pdf_filename: str,
588
+ job_df: pd.DataFrame,
589
  s3_bucket_name: str = TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET,
590
  local_output_dir: str = OUTPUT_FOLDER,
591
+ load_s3_jobs_loc: str = TEXTRACT_JOBS_S3_LOC,
592
+ load_local_jobs_loc: str = TEXTRACT_JOBS_LOCAL_LOC,
593
+ aws_region: str = AWS_REGION, # Optional: specify region if not default
594
+ load_jobs_from_s3: str = LOAD_PREVIOUS_TEXTRACT_JOBS_S3,
595
  poll_interval_seconds: int = 1,
596
  max_polling_attempts: int = 1,  # ~10 minutes total wait time
597
  DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS: int = DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS,
598
+ progress=gr.Progress(track_tqdm=True),
599
+ ):
600
+ """
601
  Poll AWS for the status of a Textract API job. Return status, and if finished, combine and download results into a locally-stored json file for further processing by the app.
602
+ """
603
 
604
  progress(0.1, "Querying AWS Textract for status of document analysis job")
605
 
606
  if job_id:
607
  # Initialize boto3 clients
608
  session = boto3.Session(region_name=aws_region)
609
+ s3_client = session.client("s3")
610
+ textract_client = session.client("textract")
611
 
612
  # --- 3. Poll for Job Completion ---
613
+ job_status = "IN_PROGRESS"
614
  attempts = 0
615
 
616
  message = "Polling Textract for job completion status..."
617
  print(message)
618
+ # logging.info("Polling Textract for job completion status...")
619
 
620
  # Update Textract document history df
621
  try:
622
+ job_df = load_in_textract_job_details(
623
+ load_s3_jobs=load_jobs_from_s3,
624
+ load_s3_jobs_loc=load_s3_jobs_loc,
625
+ load_local_jobs_loc=load_local_jobs_loc,
626
+ )
627
  except Exception as e:
628
+ # logging.error(f"Failed to update job details dataframe: {e}")
629
  print(f"Failed to update job details dataframe: {e}")
630
+ # raise
631
 
632
+ while job_status == "IN_PROGRESS" and attempts <= max_polling_attempts:
633
  attempts += 1
634
  try:
635
+ if job_type_dropdown == "document_analysis":
636
  response = textract_client.get_document_analysis(JobId=job_id)
637
+ job_status = return_job_status(
638
+ job_id,
639
+ response,
640
+ attempts,
641
+ poll_interval_seconds,
642
+ max_polling_attempts,
643
+ )
644
+ elif job_type_dropdown == "document_text_detection":
645
  response = textract_client.get_document_text_detection(JobId=job_id)
646
+ job_status = return_job_status(
647
+ job_id,
648
+ response,
649
+ attempts,
650
+ poll_interval_seconds,
651
+ max_polling_attempts,
652
+ )
653
  else:
654
+ error = "Unknown job type, cannot poll job"
655
  print(error)
656
+ logging.error(error)
657
+ raise Exception(error)
658
 
659
  except textract_client.exceptions.InvalidJobIdException:
660
  error_message = f"Invalid JobId: {job_id}. This might happen if the job expired (older than {DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS} days) or never existed."
 
662
  logging.error(error_message)
663
  raise Exception(error_message)
664
  except Exception as e:
665
+ error_message = (
666
+ f"Error while polling Textract status for job {job_id}: {e}"
667
+ )
668
  print(error_message)
669
  logging.error(error_message)
670
  raise Exception(error_message)
671
 
672
  downloaded_file_path = None
673
+ if job_status == "SUCCEEDED":
674
+ # raise TimeoutError(f"Textract job {job_id} did not complete successfully within the polling limit.")
675
+ # 3b - Replace PDF file name if it exists in the job dataframe
676
 
677
+ progress(0.5, "Document analysis task outputs found. Downloading from S3")
678
 
679
  # If job_df is not empty
680
 
 
683
 
684
  if not job_df.empty:
685
  if "file_name" in job_df.columns:
686
+ matching_job_id_file_names = job_df.loc[
687
+ job_df["job_id"] == job_id, "file_name"
688
+ ]
689
 
690
  if pdf_filename and not matching_job_id_file_names.empty:
691
  if pdf_filename == matching_job_id_file_names.iloc[0]:
 
702
  # Textract typically creates output under s3_output_prefix/job_id/
703
  # There might be multiple JSON files if pagination occurred during writing.
704
  # Usually, for smaller docs, there's one file, often named '1'.
705
+ # For robust handling, list objects and find the JSON(s).
706
 
707
+ s3_output_key_prefix = (
708
+ os.path.join(s3_output_prefix, job_id).replace("\\", "/") + "/"
709
+ )
710
+ logging.info(
711
+ f"Searching for output files in s3://{s3_bucket_name}/{s3_output_key_prefix}"
712
+ )
713
 
714
  try:
715
+ downloaded_file_path = download_textract_job_files(
716
+ s3_client,
717
+ s3_bucket_name,
718
+ s3_output_key_prefix,
719
+ pdf_filename,
720
+ job_id,
721
+ local_output_dir,
722
+ )
723
 
724
  except Exception as e:
725
+ # logging.error(f"Failed to download or process Textract output from S3: {e}")
726
  print(f"Failed to download or process Textract output from S3: {e}")
727
  raise
728
 
729
  else:
730
+ raise Exception("No Job ID provided.")
731
+
732
  output_pdf_filename = get_file_name_without_type(pdf_filename)
733
 
734
  return downloaded_file_path, job_status, job_df, output_pdf_filename
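This wrapper deliberately polls only max_polling_attempts times so the UI stays responsive. For contexts where a blocking wait is acceptable, here is a stand-alone sketch of a simple poll loop built directly on boto3; the region, job ID and interval are placeholders.

import time

import boto3


def wait_for_textract_job(job_id, region="eu-west-2", interval_seconds=10, max_attempts=60):
    # Block until the Textract job leaves IN_PROGRESS, then return its final status.
    textract = boto3.Session(region_name=region).client("textract")
    for attempt in range(1, max_attempts + 1):
        response = textract.get_document_analysis(JobId=job_id)
        status = response["JobStatus"]
        print(f"Attempt {attempt}: {status}")
        if status != "IN_PROGRESS":
            return status
        time.sleep(interval_seconds)
    raise TimeoutError(f"Job {job_id} still IN_PROGRESS after {max_attempts} attempts")


# Example call (hypothetical job ID):
# wait_for_textract_job("0123456789abcdef0123456789abcdef")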
735
 
736
+
737
+ def load_in_textract_job_details(
738
+ load_s3_jobs: str = LOAD_PREVIOUS_TEXTRACT_JOBS_S3,
739
+ load_s3_jobs_loc: str = TEXTRACT_JOBS_S3_LOC,
740
+ load_local_jobs_loc: str = TEXTRACT_JOBS_LOCAL_LOC,
741
+ document_redaction_bucket: str = DOCUMENT_REDACTION_BUCKET,
742
+ aws_region: str = AWS_REGION,
743
+ DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS: int = DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS,
744
+ ):
745
+ """
746
  Load in a dataframe of jobs previously submitted to the Textract API service.
747
+ """
748
+ job_df = pd.DataFrame(
749
+ columns=[
750
+ "job_id",
751
+ "file_name",
752
+ "job_type",
753
+ "signature_extraction",
754
+ "job_date_time",
755
+ ]
756
+ )
757
 
758
  # Initialize boto3 clients
759
  session = boto3.Session(region_name=aws_region)
760
+ s3_client = session.client("s3")
761
+
762
+ local_output_path = f"{load_local_jobs_loc}/textract_document_jobs.csv"
763
 
764
+ if load_s3_jobs == "True":
765
+ s3_output_key = f"{load_s3_jobs_loc}/textract_document_jobs.csv"
766
767
  try:
768
  s3_client.head_object(Bucket=document_redaction_bucket, Key=s3_output_key)
769
+ # print(f"File exists. Downloading from '{s3_output_key}' to '{local_output_path}'...")
770
+ s3_client.download_file(
771
+ document_redaction_bucket, s3_output_key, local_output_path
772
+ )
773
+ # print("Download successful.")
774
  except ClientError as e:
775
+ if e.response["Error"]["Code"] == "404":
776
  print("Log file does not exist in S3.")
777
  else:
778
  print(f"Unexpected error occurred: {e}")
 
786
  job_df = pd.read_csv(local_output_path)
787
 
788
  if "job_date_time" in job_df.columns:
789
+ job_df["job_date_time"] = pd.to_datetime(
790
+ job_df["job_date_time"], errors="coerce"
791
+ )
792
  # Keep only jobs that have been completed in the last 'DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS' days
793
+ cutoff_time = pd.Timestamp.now() - pd.Timedelta(
794
+ days=DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS
795
+ )
796
+ job_df = job_df.loc[job_df["job_date_time"] > cutoff_time, :]
797
 
798
  try:
799
+ job_df = job_df[
800
+ [
801
+ "job_id",
802
+ "file_name",
803
+ "job_type",
804
+ "signature_extraction",
805
+ "job_date_time",
806
+ ]
807
+ ]
808
  except Exception as e:
809
+ print(
810
+ "Could not find one or more columns in Textract whole document list dataframe:",
811
+ e,
812
+ )
813
 
814
  return job_df
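The job-history CSV consumed here has a small fixed schema. A sketch of a minimal file that this loader would accept, handy for local testing with load_s3_jobs set to "False"; all values and the output path are made up.

import pandas as pd

# Columns expected by load_in_textract_job_details.
sample_jobs = pd.DataFrame(
    [
        {
            "job_id": "0123456789abcdef",
            "file_name": "example.pdf",
            "job_type": "document_analysis",
            "signature_extraction": "['Extract signatures']",
            "job_date_time": "2024-01-15 09:30:00",
        }
    ]
)

# Hypothetical local location; point load_local_jobs_loc at the same folder.
sample_jobs.to_csv("output/textract_document_jobs.csv", index=False)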
815
 
816
+
817
+ def download_textract_output(
818
+ job_id: str, output_bucket: str, output_prefix: str, local_folder: str
819
+ ):
820
  """
821
  Checks the status of a Textract job and downloads the output ZIP file if the job is complete.
822
 
 
825
  :param output_prefix: The prefix (folder path) in S3 where the output file is stored.
826
  :param local_folder: The local directory where the ZIP file should be saved.
827
  """
828
+ textract_client = boto3.client("textract")
829
+ s3_client = boto3.client("s3")
830
 
831
  # Check job status
832
  while True:
833
  response = textract_client.get_document_analysis(JobId=job_id)
834
+ status = response["JobStatus"]
835
+
836
+ if status == "SUCCEEDED":
837
  print("Job completed successfully.")
838
  break
839
+ elif status == "FAILED":
840
+ print(
841
+ "Job failed:",
842
+ response.get("StatusMessage", "No error message provided."),
843
+ )
844
  return
845
  else:
846
  print(f"Job is still {status}.")
847
+ time.sleep(10)  # Wait before checking again, so the loop does not spin while the job is IN_PROGRESS
848
 
849
  # Find output ZIP file in S3
850
  output_file_key = f"{output_prefix}/{job_id}.zip"
 
857
  except Exception as e:
858
  print(f"Error downloading file: {e}")
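A minimal call sketch for this ZIP download helper, with placeholder names; the job should already have finished, and the output is expected at <output_prefix>/<job_id>.zip in the given bucket.

from tools.textract_batch_call import download_textract_output  # hypothetical module path

download_textract_output(
    job_id="0123456789abcdef",  # hypothetical Textract job ID
    output_bucket="my-textract-bucket",  # hypothetical bucket
    output_prefix="textract/output",
    local_folder="output",
)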
859
 
860
+
861
  def check_textract_outputs_exist(textract_output_found_checkbox):
862
+ if textract_output_found_checkbox is True:
863
+ print("Textract outputs found")
864
+ return
865
+ else:
866
+ raise Exception(
867
+ "Relevant Textract outputs not found. Please ensure you have selected the correct results output and you have uploaded the relevant document file in 'Choose document or image file...' above"
868
+ )