Commit f957846
1 Parent(s): bafcf39
General code changes and reformatting to address code vulnerabilities highlighted by CodeQL scan, with black/ruff reapplied to the code. Fixes/optimisation of GitHub Actions.
- .dockerignore +8 -0
- .github/README.md +1 -1
- .github/scripts/setup_test_data.py +6 -3
- .github/workflows/ci.yml +14 -8
- .github/workflows/multi-os-test.yml +10 -6
- .github/workflows/simple-test.yml +4 -0
- .github/workflows/test.yml +5 -0
- .gitignore +8 -0
- cdk/cdk_functions.py +2 -2
- test/run_tests.py +1 -1
- test/test.py +50 -41
- test/test_gui_only.py +44 -35
- tools/aws_functions.py +4 -3
- tools/aws_textract.py +3 -2
- tools/config.py +1 -1
- tools/custom_csvlogger.py +9 -3
- tools/custom_image_analyser_engine.py +2 -5
- tools/data_anonymise.py +17 -20
- tools/file_conversion.py +23 -16
- tools/file_redaction.py +22 -20
- tools/find_duplicate_pages.py +8 -5
- tools/find_duplicate_tabular.py +11 -8
- tools/helper_functions.py +7 -5
- tools/redaction_review.py +20 -19
- tools/secure_path_utils.py +267 -0
- tools/secure_regex_utils.py +292 -0
- tools/textract_batch_call.py +18 -15
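
Most of the security fixes funnel through the two new modules, tools/secure_path_utils.py (+267) and tools/secure_regex_utils.py (+292), whose contents this page does not display; the per-file diffs below simply swap ad-hoc os.path.join, open() and re calls for their helpers. As a rough sketch only, assuming the usual CodeQL-style guard against path traversal rather than the actual implementation, secure_join likely behaves along these lines:

import os


def secure_join(base_dir: str, *paths: str) -> str:
    """Join paths and refuse any result that escapes base_dir (path traversal guard)."""
    # NOTE: hypothetical sketch; the real helper lives in tools/secure_path_utils.py.
    candidate = os.path.normpath(os.path.join(base_dir, *paths))
    base_abs = os.path.abspath(base_dir)
    candidate_abs = os.path.abspath(candidate)
    # The joined path must still sit underneath the base directory.
    if os.path.commonpath([base_abs, candidate_abs]) != base_abs:
        raise ValueError(f"Unsafe path outside {base_dir!r}: {candidate!r}")
    return candidate

With a guard like this, a crafted filename such as "../../etc/passwd" raises instead of silently resolving outside the intended output folder.
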
.dockerignore
CHANGED
@@ -26,3 +26,11 @@ input/
 feedback/
 config/
 usage/
+test/config/*
+test/feedback/*
+test/input/*
+test/logs/*
+test/output/*
+test/tmp/*
+test/usage/*
+.ruff_cache/*

.github/README.md
CHANGED
@@ -27,7 +27,7 @@ This directory contains GitHub Actions workflows for automated testing of the CL…
 
 ### 3. **Multi-OS Testing** (`.github/workflows/multi-os-test.yml`)
 - **Purpose**: Cross-platform testing
-- **OS**: Ubuntu, Windows
+- **OS**: Ubuntu, macOS (Windows not included currently but may be reintroduced)
 - **Python**: 3.10, 3.11, 3.12
 - **Features**: Tests compatibility across different operating systems

.github/scripts/setup_test_data.py
CHANGED
@@ -142,14 +142,17 @@ def create_allow_deny_lists():
 def create_ocr_output():
     """Create dummy OCR output CSV."""
     ocr_data = {
-        "…
-        "page_number": [1, 2, 3],
+        "page": [1, 2, 3],
         "text": [
             "This is page 1 content with some text",
             "This is page 2 content with different text",
             "This is page 3 content with more text",
         ],
-        "…
+        "left": [0.1, 0.3, 0.5],
+        "top": [0.95, 0.92, 0.88],
+        "width": [0.05, 0.02, 0.02],
+        "height": [0.01, 0.02, 0.02],
+        "line": [1, 2, 3],
     }
     df = pd.DataFrame(ocr_data)
     df.to_csv(

.github/workflows/ci.yml
CHANGED
@@ -2,12 +2,18 @@ name: CI/CD Pipeline
 
 on:
   push:
-    branches: [ main…
+    branches: [ main ]
   pull_request:
-    branches: [ main…
+    branches: [ main ]
-  schedule:…
-    …
-    …
+  #schedule:
+    # Run tests daily at 2 AM UTC
+    # - cron: '0 2 * * *'
+
+permissions:
+  contents: read
+  actions: read
+  pull-requests: write
+  issues: write
 
 env:
   PYTHON_VERSION: "3.11"
@@ -38,7 +44,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [3.…
+        python-version: [3.11, 3.12, 3.13]
 
     steps:
     - uses: actions/checkout@v4
@@ -180,9 +186,9 @@ jobs:
         python -m pip install --upgrade pip
         pip install safety bandit
 
-    - name: Run safety…
+    - name: Run safety scan
      run: |
-        safety…
+        safety scan -r requirements.txt
 
     - name: Run bandit security check
       run: |

.github/workflows/multi-os-test.yml
CHANGED
@@ -2,23 +2,27 @@ name: Multi-OS Test
 
 on:
   push:
-    branches: [ main…
+    branches: [ main ]
   pull_request:
-    branches: [ main…
+    branches: [ main ]
+
+permissions:
+  contents: read
+  actions: read
 
 jobs:
   test:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        os: [ubuntu-latest,…
+        os: [ubuntu-latest, macos-latest] # windows-latest removed for now as I have not been able to install tesseract on Windows using this method
         python-version: ["3.10", "3.11", "3.12"]
         exclude:
           # Exclude some combinations to reduce CI time
-          …
-          …
+          #- os: windows-latest
+          #  python-version: "3.10"
           - os: macos-latest
-            python-version: "3.…
+            python-version: "3.11"
 
     steps:
     - uses: actions/checkout@v4

.github/workflows/simple-test.yml
CHANGED
@@ -6,6 +6,10 @@ on:
   pull_request:
     branches: [ main, dev ]
 
+permissions:
+  contents: read
+  actions: read
+
 jobs:
   test:
     runs-on: ubuntu-latest

.github/workflows/test.yml
CHANGED
@@ -6,6 +6,11 @@ on:
   pull_request:
     branches: [ main, dev ]
 
+permissions:
+  contents: read
+  actions: read
+  pull-requests: write
+
 jobs:
   test:
     runs-on: ubuntu-latest

.gitignore
CHANGED
@@ -29,3 +29,11 @@ cdk.context.json
 .quarto/*
 /.quarto/
 /_site/
+test/config/*
+test/feedback/*
+test/input/*
+test/logs/*
+test/output/*
+test/tmp/*
+test/usage/*
+.ruff_cache/*

cdk/cdk_functions.py
CHANGED
@@ -856,14 +856,14 @@ def check_for_secret(secret_name: str, secret_value: dict = ""):
     try:
         # Try to get the secret. If it doesn't exist, a ResourceNotFoundException will be raised.
         secret_value = secretsmanager_client.get_secret_value(SecretId=secret_name)
-        print(…
+        print("Secret already exists.")
         return True, secret_value
     except secretsmanager_client.exceptions.ResourceNotFoundException:
         print("Secret not found")
         return False, {}
     except Exception as e:
         # Handle other potential exceptions during the get operation
-        print(f"Error checking for secret…
+        print(f"Error checking for secret: {e}")
         return False, {}
 
 

test/run_tests.py
CHANGED
@@ -12,7 +12,7 @@ import sys
 # Add the parent directory to the path so we can import the test module
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
-from test…
+from test import run_all_tests
 
 if __name__ == "__main__":
     print("Starting CLI Redaction Test Suite...")

test/test.py
CHANGED
@@ -1,11 +1,10 @@
 import os
 import shutil
 import subprocess
-import tempfile
-import unittest
 import sys
+import tempfile
 import threading
-import …
+import unittest
 from typing import List, Optional
 
 
@@ -893,35 +892,40 @@ class TestGUIApp(unittest.TestCase):
         cls.app_path = os.path.join(
             os.path.dirname(os.path.dirname(__file__)), "app.py"
         )
 
         # Verify app.py exists
         if not os.path.isfile(cls.app_path):
             raise FileNotFoundError(f"App file not found: {cls.app_path}")
 
         print(f"GUI test setup complete. App: {cls.app_path}")
 
     def test_app_import_and_initialization(self):
         """Test: Import app.py and check if the Gradio app object is created successfully."""
         print("\n=== Testing GUI app import and initialization ===")
 
         try:
             # Add the parent directory to the path so we can import app
             parent_dir = os.path.dirname(os.path.dirname(__file__))
             if parent_dir not in sys.path:
                 sys.path.insert(0, parent_dir)
 
             # Import the app module
             import app
 
             # Check if the app object exists and is a Gradio Blocks object
-            self.assertTrue(…
-            …
+            self.assertTrue(
+                hasattr(app, "app"), "App object should exist in the module"
+            )
 
             # Check if it's a Gradio Blocks instance
             import gradio as gr
 
-            …
+            self.assertIsInstance(
+                app.app, gr.Blocks, "App should be a Gradio Blocks instance"
+            )
 
             print("✅ GUI app import and initialization passed")
 
         except ImportError as e:
             error_msg = f"Failed to import app module: {e}"
             if "gradio_image_annotation" in str(e):
@@ -935,41 +939,40 @@ class TestGUIApp(unittest.TestCase):
     def test_app_launch_headless(self):
         """Test: Launch the app in headless mode to verify it starts without errors."""
         print("\n=== Testing GUI app launch in headless mode ===")
 
         try:
             # Add the parent directory to the path
             parent_dir = os.path.dirname(os.path.dirname(__file__))
             if parent_dir not in sys.path:
                 sys.path.insert(0, parent_dir)
 
             # Import the app module
 
             import app
 
             # Set up a flag to track if the app launched successfully
             app_launched = threading.Event()
             launch_error = None
 
             def launch_app():
                 try:
                     # Launch the app in headless mode with a short timeout
                     app.app.launch(
                         show_error=True,
                         inbrowser=False,  # Don't open browser
-                        server_port=0,
-                        quiet=True,
-                        prevent_thread_lock=True # Don't block the main thread
+                        server_port=0,  # Use any available port
+                        quiet=True,  # Suppress output
+                        prevent_thread_lock=True,  # Don't block the main thread
                     )
                     app_launched.set()
-                except Exception…
-                    launch_error = e
+                except Exception:
                     app_launched.set()
 
             # Start the app in a separate thread
             launch_thread = threading.Thread(target=launch_app)
             launch_thread.daemon = True
             launch_thread.start()
 
             # Wait for the app to launch (with timeout)
             if app_launched.wait(timeout=10):  # 10 second timeout
                 if launch_error:
@@ -978,7 +981,7 @@ class TestGUIApp(unittest.TestCase):
                 print("✅ GUI app launch in headless mode passed")
             else:
                 self.fail("App launch timed out after 10 seconds")
 
         except Exception as e:
             error_msg = f"Unexpected error during app launch test: {e}"
             if "gradio_image_annotation" in str(e):
@@ -990,33 +993,39 @@ class TestGUIApp(unittest.TestCase):
     def test_app_configuration_loading(self):
         """Test: Verify that the app can load its configuration without errors."""
         print("\n=== Testing GUI app configuration loading ===")
 
         try:
             # Add the parent directory to the path
             parent_dir = os.path.dirname(os.path.dirname(__file__))
             if parent_dir not in sys.path:
                 sys.path.insert(0, parent_dir)
 
-            # Import the app module
-            import app
+            # Import the app module (not needed?)
+            # import app
 
             # Check if key configuration variables are accessible
             # These should be imported from tools.config
             from tools.config import (
+                DEFAULT_LANGUAGE,
                 GRADIO_SERVER_PORT,
                 MAX_FILE_SIZE,
-                …
-                PII_DETECTION_MODELS
+                PII_DETECTION_MODELS,
             )
 
             # Verify these are not None/empty
-            self.assertIsNotNone(…
+            self.assertIsNotNone(
+                GRADIO_SERVER_PORT, "GRADIO_SERVER_PORT should be configured"
+            )
             self.assertIsNotNone(MAX_FILE_SIZE, "MAX_FILE_SIZE should be configured")
-            self.assertIsNotNone(…
-            …
-            …
+            self.assertIsNotNone(
+                DEFAULT_LANGUAGE, "DEFAULT_LANGUAGE should be configured"
+            )
+            self.assertIsNotNone(
+                PII_DETECTION_MODELS, "PII_DETECTION_MODELS should be configured"
+            )
 
             print("✅ GUI app configuration loading passed")
 
         except ImportError as e:
             error_msg = f"Failed to import configuration: {e}"
             if "gradio_image_annotation" in str(e):
@@ -1048,11 +1057,11 @@ def run_all_tests():
     # Create test suite
     loader = unittest.TestLoader()
     suite = unittest.TestSuite()
 
     # Add CLI tests
     cli_suite = loader.loadTestsFromTestCase(TestCLIRedactExamples)
     suite.addTests(cli_suite)
 
     # Add GUI tests
     gui_suite = loader.loadTestsFromTestCase(TestGUIApp)
     suite.addTests(gui_suite)

test/test_gui_only.py
CHANGED
@@ -8,9 +8,8 @@ Run this script to verify that the Gradio interface can be imported and initiali…
 
 import os
 import sys
-import unittest
 import threading
-import …
+import unittest
 
 # Add the parent directory to the path so we can import the app
 parent_dir = os.path.dirname(os.path.dirname(__file__))
@@ -25,30 +24,35 @@ class TestGUIAppOnly(unittest.TestCase):
     def setUpClass(cls):
         """Set up test environment for GUI tests."""
         cls.app_path = os.path.join(parent_dir, "app.py")
 
         # Verify app.py exists
         if not os.path.isfile(cls.app_path):
             raise FileNotFoundError(f"App file not found: {cls.app_path}")
 
         print(f"GUI test setup complete. App: {cls.app_path}")
 
     def test_app_import_and_initialization(self):
         """Test: Import app.py and check if the Gradio app object is created successfully."""
         print("\n=== Testing GUI app import and initialization ===")
 
         try:
             # Import the app module
             import app
 
             # Check if the app object exists and is a Gradio Blocks object
-            self.assertTrue(…
-            …
+            self.assertTrue(
+                hasattr(app, "app"), "App object should exist in the module"
+            )
 
             # Check if it's a Gradio Blocks instance
             import gradio as gr
 
-            …
+            self.assertIsInstance(
+                app.app, gr.Blocks, "App should be a Gradio Blocks instance"
+            )
 
             print("✅ GUI app import and initialization passed")
 
         except ImportError as e:
             error_msg = f"Failed to import app module: {e}"
             if "gradio_image_annotation" in str(e):
@@ -62,36 +66,35 @@ class TestGUIAppOnly(unittest.TestCase):
     def test_app_launch_headless(self):
         """Test: Launch the app in headless mode to verify it starts without errors."""
         print("\n=== Testing GUI app launch in headless mode ===")
 
         try:
             # Import the app module
 
             import app
 
             # Set up a flag to track if the app launched successfully
             app_launched = threading.Event()
             launch_error = None
 
             def launch_app():
                 try:
                     # Launch the app in headless mode with a short timeout
                     app.app.launch(
                         show_error=True,
                         inbrowser=False,  # Don't open browser
-                        server_port=0,
-                        quiet=True,
-                        prevent_thread_lock=True # Don't block the main thread
+                        server_port=0,  # Use any available port
+                        quiet=True,  # Suppress output
+                        prevent_thread_lock=True,  # Don't block the main thread
                     )
                     app_launched.set()
-                except Exception…
-                    launch_error = e
+                except Exception:
                     app_launched.set()
 
             # Start the app in a separate thread
             launch_thread = threading.Thread(target=launch_app)
             launch_thread.daemon = True
             launch_thread.start()
 
             # Wait for the app to launch (with timeout)
             if app_launched.wait(timeout=10):  # 10 second timeout
                 if launch_error:
@@ -100,7 +103,7 @@ class TestGUIAppOnly(unittest.TestCase):
                 print("✅ GUI app launch in headless mode passed")
             else:
                 self.fail("App launch timed out after 10 seconds")
 
         except Exception as e:
             error_msg = f"Unexpected error during app launch test: {e}"
             if "gradio_image_annotation" in str(e):
@@ -112,28 +115,34 @@ class TestGUIAppOnly(unittest.TestCase):
     def test_app_configuration_loading(self):
         """Test: Verify that the app can load its configuration without errors."""
         print("\n=== Testing GUI app configuration loading ===")
 
         try:
-            # Import the app module
-            import app
+            # Import the app module (not necessary here?)
+            # import app
 
             # Check if key configuration variables are accessible
             # These should be imported from tools.config
             from tools.config import (
+                DEFAULT_LANGUAGE,
                 GRADIO_SERVER_PORT,
                 MAX_FILE_SIZE,
-                …
-                PII_DETECTION_MODELS
+                PII_DETECTION_MODELS,
             )
 
             # Verify these are not None/empty
-            self.assertIsNotNone(…
+            self.assertIsNotNone(
+                GRADIO_SERVER_PORT, "GRADIO_SERVER_PORT should be configured"
+            )
             self.assertIsNotNone(MAX_FILE_SIZE, "MAX_FILE_SIZE should be configured")
-            self.assertIsNotNone(…
-            …
-            …
+            self.assertIsNotNone(
+                DEFAULT_LANGUAGE, "DEFAULT_LANGUAGE should be configured"
+            )
+            self.assertIsNotNone(
+                PII_DETECTION_MODELS, "PII_DETECTION_MODELS should be configured"
+            )
 
             print("✅ GUI app configuration loading passed")
 
         except ImportError as e:
             error_msg = f"Failed to import configuration: {e}"
             if "gradio_image_annotation" in str(e):

tools/aws_functions.py
CHANGED
@@ -10,6 +10,7 @@ from tools.config import (
     RUN_AWS_FUNCTIONS,
     SAVE_LOGS_TO_CSV,
 )
+from tools.secure_path_utils import secure_join
 
 PandasDataFrame = Type[pd.DataFrame]
 
@@ -90,7 +91,7 @@ def download_folder_from_s3(
     for obj in response.get("Contents", []):
         # Extract object key and construct local file path
         object_key = obj["Key"]
-        local_file_path = …
+        local_file_path = secure_join(
             local_folder, os.path.relpath(object_key, s3_folder)
         )
 
@@ -143,8 +144,8 @@ def download_files_from_s3(
     print("Found filenames in AWS folder: ", filenames)
 
     for filename in filenames:
-        object_key = …
-        local_file_path = …
+        object_key = secure_join(s3_folder, filename)
+        local_file_path = secure_join(local_folder, filename)
 
         # Create directories if necessary
         os.makedirs(os.path.dirname(local_file_path), exist_ok=True)

tools/aws_textract.py
CHANGED
@@ -16,6 +16,7 @@ from tools.config import (
     RUN_AWS_FUNCTIONS,
 )
 from tools.custom_image_analyser_engine import CustomImageRecognizerResult, OCRResult
+from tools.secure_path_utils import secure_file_read
 
 
 def extract_textract_metadata(response: object):
@@ -478,8 +479,8 @@ def load_and_convert_textract_json(
         log_files_output_paths.append(textract_json_file_path)
 
     try:
-        …
-        …
+        json_content = secure_file_read(textract_json_file_path, encoding="utf-8")
+        textract_data = json.loads(json_content)
     except json.JSONDecodeError:
         print("Error: Failed to parse Textract JSON file. Returning empty data.")
         return {}, True, log_files_output_paths  # Indicate failure

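load_and_convert_textract_json now reads the JSON through secure_file_read before json.loads. The helper's body is in tools/secure_path_utils.py and is not shown here; a minimal sketch, assuming it validates the resolved path before opening (hypothetical, not the committed code):

import os


def secure_file_read(file_path: str, encoding: str = "utf-8") -> str:
    """Read a text file after resolving and checking the path."""
    # NOTE: assumed behaviour; the real implementation is not shown in this diff.
    real_path = os.path.realpath(file_path)  # collapse symlinks and ".." segments
    if not os.path.isfile(real_path):
        raise FileNotFoundError(f"Not a regular file: {file_path!r}")
    with open(real_path, encoding=encoding) as f:
        return f.read()
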
tools/config.py
CHANGED
@@ -382,7 +382,7 @@ CHOSEN_LOCAL_OCR_MODEL = get_or_create_env_var(
 )  # Choose between "tesseract", "hybrid", and "paddle". "paddle" will only return whole line text extraction, and so will only work for OCR, not redaction. "hybrid" is a combination of the two - first pass through the redactions will be done with Tesseract, and then a second pass will be done with PaddleOCR on words with low confidence.
 
 PREPROCESS_LOCAL_OCR_IMAGES = get_or_create_env_var(
-    "PREPROCESS_LOCAL_OCR_IMAGES", "…
+    "PREPROCESS_LOCAL_OCR_IMAGES", "False"
 )  # Whether to try and preprocess images before extracting text. NOTE: I have found in testing that this doesn't necessarily imporove results, and greatly slows down extraction.
 
 # Entities for redaction

|
tools/custom_csvlogger.py
CHANGED
@@ -2,7 +2,6 @@ from __future__ import annotations
|
|
2 |
|
3 |
import csv
|
4 |
import os
|
5 |
-
import re
|
6 |
import time
|
7 |
import uuid
|
8 |
from collections.abc import Sequence
|
@@ -105,10 +104,17 @@ class CSVLogger_custom(FlaggingCallback):
|
|
105 |
self.dataset_filepath = self.flagging_dir / self.dataset_file_name
|
106 |
elif dataset_files:
|
107 |
try:
|
|
|
|
|
|
|
|
|
108 |
latest_file = max(
|
109 |
-
dataset_files,
|
|
|
|
|
|
|
|
|
110 |
)
|
111 |
-
latest_num = int(re.findall(r"\d+", latest_file.stem)[0])
|
112 |
|
113 |
with open(latest_file, newline="", encoding="utf-8") as csvfile:
|
114 |
reader = csv.reader(csvfile)
|
|
|
2 |
|
3 |
import csv
|
4 |
import os
|
|
|
5 |
import time
|
6 |
import uuid
|
7 |
from collections.abc import Sequence
|
|
|
104 |
self.dataset_filepath = self.flagging_dir / self.dataset_file_name
|
105 |
elif dataset_files:
|
106 |
try:
|
107 |
+
from tools.secure_regex_utils import (
|
108 |
+
safe_extract_latest_number_from_filename,
|
109 |
+
)
|
110 |
+
|
111 |
latest_file = max(
|
112 |
+
dataset_files,
|
113 |
+
key=lambda f: safe_extract_latest_number_from_filename(f.stem) or 0,
|
114 |
+
)
|
115 |
+
latest_num = (
|
116 |
+
safe_extract_latest_number_from_filename(latest_file.stem) or 0
|
117 |
)
|
|
|
118 |
|
119 |
with open(latest_file, newline="", encoding="utf-8") as csvfile:
|
120 |
reader = csv.reader(csvfile)
|
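The logger previously ran int(re.findall(r"\d+", latest_file.stem)[0]) on an arbitrary filename; the replacement returns None when no number is present, which is why the call sites above fall back with "or 0". A hypothetical sketch of the helper (the real one in tools/secure_regex_utils.py is not shown):

import re

# A plain digit run cannot backtrack catastrophically; the extra safety here is
# the input-length cap and the None return instead of an IndexError.
_NUMBER_RE = re.compile(r"\d+")


def safe_extract_latest_number_from_filename(stem: str, max_len: int = 255):
    """Return the first number embedded in a filename stem, or None."""
    match = _NUMBER_RE.search(stem[:max_len])
    return int(match.group(0)) if match else None
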
tools/custom_image_analyser_engine.py
CHANGED
@@ -524,12 +524,9 @@ class CustomImageAnalyzerEngine:
         # Remove or replace invalid filename characters
         # Windows: < > : " | ? * \ /
         # Unix: / (forward slash)
-        …
-        invalid_chars = r'[<>:"|?*\\/\x00-\x1f\x7f-\x9f]'
-        sanitized = re.sub(invalid_chars, "_", text)
+        from tools.secure_regex_utils import safe_sanitize_text
 
-        …
-        sanitized = re.sub(r"_+", "_", sanitized)
+        sanitized = safe_sanitize_text(text)
 
         # Remove leading/trailing underscores and spaces
         sanitized = sanitized.strip("_ ")

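Here the replaced inline code pins down the intended behaviour fairly well: substitute filename-invalid characters with underscores, then collapse runs of underscores. A sketch of safe_sanitize_text on that assumption (the actual function in tools/secure_regex_utils.py is not shown):

import re

# Character class taken from the code this commit removed; the collapse step
# mirrors the old re.sub(r"_+", "_", ...) call.
_INVALID_CHARS_RE = re.compile(r'[<>:"|?*\\/\x00-\x1f\x7f-\x9f]')
_UNDERSCORE_RUNS_RE = re.compile(r"_+")


def safe_sanitize_text(text: str, max_len: int = 1000) -> str:
    """Replace filename-invalid characters with '_' and collapse repeats."""
    sanitized = _INVALID_CHARS_RE.sub("_", text[:max_len])
    return _UNDERSCORE_RUNS_RE.sub("_", sanitized)
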
tools/data_anonymise.py
CHANGED
@@ -1,6 +1,5 @@
 import base64
 import os
-import re
 import secrets
 import time
 import unicodedata
@@ -20,7 +19,7 @@ from presidio_analyzer import (
     AnalyzerEngine,
     BatchAnalyzerEngine,
     DictAnalyzerResult,
-    RecognizerResult
+    RecognizerResult,
 )
 from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
 from presidio_anonymizer.entities import OperatorConfig
@@ -57,6 +56,7 @@ from tools.load_spacy_model_custom_recognisers import (
 
 # Use custom version of analyze_dict to be able to track progress
 from tools.presidio_analyzer_custom import analyze_dict
+from tools.secure_path_utils import secure_file_write, secure_join
 
 if DO_INITIAL_TABULAR_DATA_CLEAN == "True":
     DO_INITIAL_TABULAR_DATA_CLEAN = True
@@ -406,22 +406,21 @@ def handle_docx_anonymisation(
     base_name = os.path.basename(file_path)
     file_name_without_ext = os.path.splitext(base_name)[0]
 
-    output_docx_path = …
+    output_docx_path = secure_join(
         output_folder, f"{file_name_without_ext}_redacted.docx"
     )
-    log_file_path = …
+    log_file_path = secure_join(
         output_folder, f"{file_name_without_ext}_redacted_log.txt"
     )
 
-    output_xlsx_path = …
+    output_xlsx_path = secure_join(
         output_folder, f"{file_name_without_ext}_redacted.csv"
     )
 
     anonymised_df.to_csv(output_xlsx_path, encoding="utf-8-sig", index=None)
     doc.save(output_docx_path)
 
-    …
-        f.write(decision_log)
+    secure_file_write(log_file_path, decision_log, encoding="utf-8-sig")
 
     return output_docx_path, log_file_path, output_xlsx_path, comprehend_query_number
 
@@ -542,8 +541,6 @@ def anonymise_files_with_open_text(
             print(
                 "Connecting to Comprehend using AWS access key and secret keys from textboxes."
             )
-            print("aws_access_key_textbox:", aws_access_key_textbox)
-            print("aws_secret_access_key:", aws_secret_key_textbox)
             comprehend_client = boto3.client(
                 "comprehend",
                 aws_access_key_id=aws_access_key_textbox,
@@ -801,7 +798,10 @@ def anonymise_files_with_open_text(
         + "\n\nGo to to the Redaction settings tab to see redaction logs. Please give feedback on the results below to help improve this app."
     )
 
-    …
+    from tools.secure_regex_utils import safe_remove_leading_newlines
+
+    out_message_out = safe_remove_leading_newlines(out_message_out)
+    out_message_out = out_message_out.lstrip(". ")
 
     return (
         out_message_out,
@@ -1004,8 +1004,7 @@ def tabular_anonymise_wrapper_func(
                 + excel_sheet_name
                 + "_decision_process_output.txt"
             )
-            …
-                f.write(decision_process_output_str)
+            secure_file_write(decision_process_log_output_file, decision_process_output_str)
 
         else:
             anon_export_file_name = (
@@ -1016,8 +1015,7 @@ def tabular_anonymise_wrapper_func(
             decision_process_log_output_file = (
                 anon_export_file_name + "_decision_process_output.txt"
             )
-            …
-                f.write(decision_process_output_str)
+            secure_file_write(decision_process_log_output_file, decision_process_output_str)
 
         out_file_paths.append(anon_export_file_name)
         log_files_output_paths.append(decision_process_log_output_file)
@@ -1296,11 +1294,9 @@ def anonymise_script(
     redact_config = {"DEFAULT": OperatorConfig("redact")}
     hash_config = {"DEFAULT": OperatorConfig("hash")}
     mask_config = {
-        "DEFAULT": OperatorConfig(…
-            "masking_char": "*",
-            …
-            "from_end": True
-        })
+        "DEFAULT": OperatorConfig(
+            "mask", {"masking_char": "*", "chars_to_mask": 100, "from_end": True}
+        )
     }
     people_encrypt_config = {
         "PERSON": OperatorConfig("encrypt", {"key": key_string})
@@ -1343,7 +1339,8 @@ def anonymise_script(
     combined_config = {**chosen_mask_config}
 
     anonymizer_results = batch_anonymizer.anonymize_dict(
-        analyzer_results, operators=combined_config…
+        analyzer_results, operators=combined_config
+    )
 
     scrubbed_df = pd.DataFrame(anonymizer_results)

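handle_docx_anonymisation and tabular_anonymise_wrapper_func now write their logs through secure_file_write rather than a bare open(...)/f.write(...). A sketch of the assumed behaviour (validated path, parent directories created); again, the real implementation is the unshown tools/secure_path_utils.py:

import os


def secure_file_write(file_path: str, content: str, encoding: str = "utf-8") -> None:
    """Write text to a resolved path, creating parent directories as needed."""
    # NOTE: hypothetical sketch, not the committed code.
    real_dir = os.path.realpath(os.path.dirname(file_path) or ".")
    os.makedirs(real_dir, exist_ok=True)
    target = os.path.join(real_dir, os.path.basename(file_path))
    with open(target, "w", encoding=encoding) as f:
        f.write(content)

A call such as secure_file_write(log_file_path, decision_log, encoding="utf-8-sig") then matches the call site in the diff above.
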
tools/file_conversion.py
CHANGED
@@ -34,6 +34,7 @@ from tools.config import (
     TEXTRACT_TEXT_EXTRACT_OPTION,
 )
 from tools.helper_functions import get_file_name_without_type, read_file
+from tools.secure_path_utils import secure_file_read, secure_join
 
 # from tools.aws_textract import load_and_convert_textract_json
 
@@ -143,8 +144,8 @@ def process_single_page_for_image_conversion(
     if create_images is True:
         try:
             # Construct the full output directory path
-            image_output_dir = …
-            out_path = …
+            image_output_dir = secure_join(os.getcwd(), input_folder)
+            out_path = secure_join(
                 image_output_dir, f"{os.path.basename(pdf_path)}_{page_num}.png"
             )
             os.makedirs(os.path.dirname(out_path), exist_ok=True)
@@ -914,8 +915,8 @@ def prepare_image_or_pdf(
 
     if (file_extension in [".json"]) & (prepare_for_review is True):
         if isinstance(file_path, str):
-            …
-            …
+            json_content = secure_file_read(file_path)
+            all_annotations_object = json.loads(json_content)
         else:
             # Assuming file_path is a NamedString or similar
             all_annotations_object = json.loads(
@@ -936,7 +937,7 @@ def prepare_image_or_pdf(
         else:
             output_textract_json_file_name = file_path_without_ext + ".json"
 
-        out_textract_path = …
+        out_textract_path = secure_join(
             output_folder, output_textract_json_file_name
         )
 
@@ -956,7 +957,7 @@ def prepare_image_or_pdf(
         # if not file_path.endswith("_ocr_results_with_words.json"): output_ocr_results_with_words_json_file_name = file_path_without_ext + "_ocr_results_with_words.json"
         # else: output_ocr_results_with_words_json_file_name = file_path_without_ext + ".json"
 
-        out_ocr_results_with_words_path = …
+        out_ocr_results_with_words_path = secure_join(
             output_folder, output_ocr_results_with_words_json_file_name
         )
 
@@ -1026,10 +1027,12 @@ def prepare_image_or_pdf(
     if all_annotations_object:
 
         # Get list of page numbers
+        from tools.secure_regex_utils import safe_extract_page_number_from_path
+
         image_file_paths_pages = [
-            …
+            safe_extract_page_number_from_path(s)
             for s in image_file_paths
-            if …
+            if safe_extract_page_number_from_path(s) is not None
         ]
         image_file_paths_pages = [int(i) for i in image_file_paths_pages]
 
@@ -1046,15 +1049,19 @@ def prepare_image_or_pdf(
             try:
                 if not annotation:
                     annotation = {"image": "", "boxes": []}
-                    annotation_page_number = …
-                        …
+                    annotation_page_number = (
+                        safe_extract_page_number_from_path(image_file_path)
                     )
+                    if annotation_page_number is None:
+                        continue
                 else:
-                    annotation_page_number = …
-                        …
-                        …
-                    )
+                    annotation_page_number = (
+                        safe_extract_page_number_from_path(
+                            annotation["image"]
+                        )
                     )
+                    if annotation_page_number is None:
+                        continue
             except Exception as e:
                 print("Extracting page number from image failed due to:", e)
                 annotation_page_number = 0
@@ -1110,7 +1117,7 @@ def prepare_image_or_pdf(
     if file_extension in [".zip"]:
 
         # Assume it's a Textract response object. Copy it to the output folder so it can be used later.
-        out_folder = …
+        out_folder = secure_join(
             output_folder, file_path_without_ext + "_textract.json"
         )
 
@@ -1125,7 +1132,7 @@ def prepare_image_or_pdf(
         json_filename = json_files[0]
 
         # Extract the JSON file to the same directory as the ZIP file
-        extracted_path = …
+        extracted_path = secure_join(
             os.path.dirname(file_path), json_filename
         )
         zip_ref.extract(json_filename, os.path.dirname(file_path))

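Page images are written as f"{os.path.basename(pdf_path)}_{page_num}.png" (see process_single_page_for_image_conversion above), so safe_extract_page_number_from_path presumably pulls that trailing number back out and returns None when it is absent. A hypothetical sketch, not the committed implementation:

import os
import re

# Matches the "_<page>.<ext>" tail produced by the image-conversion naming above.
_PAGE_RE = re.compile(r"_(\d+)\.\w+$")


def safe_extract_page_number_from_path(path: str):
    """Return the page number embedded in an image path, or None."""
    match = _PAGE_RE.search(os.path.basename(str(path))[:255])
    return int(match.group(1)) if match else None
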
tools/file_redaction.py
CHANGED
@@ -2,7 +2,6 @@ import copy
 import io
 import json
 import os
-import re
 import time
 from collections import defaultdict  # For efficient grouping
 from typing import Any, Dict, List, Optional, Tuple
@@ -94,6 +93,7 @@ from tools.load_spacy_model_custom_recognisers import (
     nlp_analyser,
     score_threshold,
 )
+from tools.secure_path_utils import secure_file_write
 
 ImageFile.LOAD_TRUNCATED_IMAGES = LOAD_TRUNCATED_IMAGES.lower() == "true"
 if not MAX_IMAGE_PIXELS:
@@ -130,11 +130,10 @@ def sum_numbers_before_seconds(string: str):
     The sum of all numbers before 'seconds' in the string.
     """
 
-    # Extract numbers before 'seconds' using …
-    …
+    # Extract numbers before 'seconds' using secure regex
+    from tools.secure_regex_utils import safe_extract_numbers_with_seconds
 
-    …
-    numbers = [float(num.split()[0]) for num in numbers]
+    numbers = safe_extract_numbers_with_seconds(string)
 
     # Sum up the extracted numbers
     sum_of_numbers = round(sum(numbers), 1)
@@ -445,7 +444,9 @@ def choose_and_run_redactor(
     elif out_message:
         combined_out_message = combined_out_message + "\n" + out_message
 
-    …
+    from tools.secure_regex_utils import safe_remove_leading_newlines
+
+    combined_out_message = safe_remove_leading_newlines(combined_out_message)
 
     end_message = "\n\nPlease review and modify the suggested redaction outputs on the 'Review redactions' tab of the app (you can find this under the introduction text at the top of the page)."
 
@@ -1304,8 +1305,9 @@ def choose_and_run_redactor(
             output_folder + pdf_file_name_without_ext + "_textract_metadata.txt"
         )
 
-        …
-        …
+        secure_file_write(
+            all_textract_request_metadata_file_path, all_request_metadata_str
+        )
 
         # Add the request metadata to the log outputs if not there already
         if all_textract_request_metadata_file_path not in log_files_output_paths:
@@ -2785,10 +2787,10 @@ def redact_image_pdf(
         if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
             if original_textract_data != textract_data:
                 # Write the updated existing textract data back to the JSON file
-                …
-                …
-                …
-                …
+                secure_file_write(
+                    textract_json_file_path,
+                    json.dumps(textract_data, separators=(",", ":")),
+                )
 
                 if textract_json_file_path not in log_files_output_paths:
                     log_files_output_paths.append(textract_json_file_path)
@@ -2848,10 +2850,10 @@ def redact_image_pdf(
         if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
             # Write the updated existing textract data back to the JSON file
             if original_textract_data != textract_data:
-                …
-                …
-                …
-                …
+                secure_file_write(
+                    textract_json_file_path,
+                    json.dumps(textract_data, separators=(",", ":")),
+                )
 
                 if textract_json_file_path not in log_files_output_paths:
                     log_files_output_paths.append(textract_json_file_path)
@@ -2907,10 +2909,10 @@ def redact_image_pdf(
         # Write the updated existing textract data back to the JSON file
 
         if original_textract_data != textract_data:
-            …
-            …
-            …
-            …
+            secure_file_write(
+                textract_json_file_path,
+                json.dumps(textract_data, separators=(",", ":")),
+            )
 
             if textract_json_file_path not in log_files_output_paths:
                 log_files_output_paths.append(textract_json_file_path)

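sum_numbers_before_seconds sums whatever the helper returns, and the deleted line numbers = [float(num.split()[0]) for num in numbers] shows the old matches looked like "12.5 seconds". On that basis, a hypothetical sketch of safe_extract_numbers_with_seconds:

import re

# Bounded pattern for "<number> seconds"; the slice caps input length so a
# hostile string cannot make the scan expensive.
_SECONDS_RE = re.compile(r"(\d+(?:\.\d+)?)\s+seconds")


def safe_extract_numbers_with_seconds(text: str, max_len: int = 10000):
    """Return every number that appears directly before the word 'seconds'."""
    return [float(num) for num in _SECONDS_RE.findall(text[:max_len])]

For example, "step one took 1.5 seconds, step two took 2 seconds" would yield [1.5, 2.0], which sum_numbers_before_seconds rounds to 3.5.
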
tools/find_duplicate_pages.py
CHANGED
@@ -521,8 +521,9 @@ def clean_and_stem_text_series(df: pd.DataFrame, column: str):
     """
 
     def _clean_text(raw_text):
-        …
-        …
+        from tools.secure_regex_utils import safe_clean_text
+
+        clean = safe_clean_text(raw_text, remove_html=True)
         clean = " ".join(clean.split())
         # Join the cleaned words back into a string
         return clean
@@ -1271,9 +1272,11 @@ def apply_whole_page_redactions_from_list(
 
     list_whole_pages_to_redact = []
     for annotation in new_annotations_with_bounding_boxes:
-        …
-        …
-        …
+        from tools.secure_regex_utils import safe_extract_page_number_from_path
+
+        page_num = safe_extract_page_number_from_path(annotation["image"])
+        if page_num is not None:
+            page = page_num + 1
             list_whole_pages_to_redact.append(page)
         else:
             print(

tools/find_duplicate_tabular.py
CHANGED
@@ -1,5 +1,4 @@
|
|
1 |
import os
|
2 |
-
import re
|
3 |
import time
|
4 |
from pathlib import Path
|
5 |
from typing import Dict, List, Tuple
|
@@ -19,6 +18,7 @@ from tools.config import (
|
|
19 |
from tools.data_anonymise import initial_clean
|
20 |
from tools.helper_functions import OUTPUT_FOLDER, read_file
|
21 |
from tools.load_spacy_model_custom_recognisers import nlp
|
|
|
22 |
|
23 |
if REMOVE_DUPLICATE_ROWS == "True":
|
24 |
REMOVE_DUPLICATE_ROWS = True
|
@@ -345,9 +345,12 @@ def save_tabular_duplicate_results(
|
|
345 |
original_file_extension = os.path.splitext(original_file)[-1]
|
346 |
if original_file_extension in [".xlsx", ".xls"]:
|
347 |
|
348 |
-
# Split the string using
|
349 |
-
|
350 |
-
|
|
|
|
|
|
|
351 |
# The sheet name is the last part after splitting
|
352 |
file_sheet_name = parts[-1]
|
353 |
|
@@ -430,12 +433,12 @@ def save_tabular_duplicate_results(
|
|
430 |
file_ext = os.path.splitext(file_name)[-1]
|
431 |
|
432 |
if file_ext in [".parquet"]:
|
433 |
-
output_path =
|
434 |
output_folder, f"{file_base_name}_deduplicated.parquet"
|
435 |
)
|
436 |
df_cleaned.to_parquet(output_path, index=False)
|
437 |
else:
|
438 |
-
output_path =
|
439 |
output_folder, f"{file_base_name}_deduplicated.csv"
|
440 |
)
|
441 |
df_cleaned.to_csv(
|
@@ -451,7 +454,7 @@ def save_tabular_duplicate_results(
|
|
451 |
# Create output filename
|
452 |
file_base_name = os.path.splitext(os.path.basename(file_path))[0]
|
453 |
file_ext = os.path.splitext(file_path)[-1]
|
454 |
-
output_path = os.path.join(
|
455 |
output_folder, f"{file_base_name}_deduplicated{file_ext}"
|
456 |
)
|
457 |
|
@@ -513,7 +516,7 @@ def remove_duplicate_rows_from_tabular_data(
|
|
513 |
file_stem = os.path.splitext(file_name)[0]
|
514 |
file_ext = os.path.splitext(file_name)[-1]
|
515 |
|
516 |
-
output_path = os.path.join(output_folder, f"{file_stem}_deduplicated{file_ext}")
|
517 |
|
518 |
if file_ext in [".xlsx", ".xls"]:
|
519 |
df_cleaned.to_excel(
|
|
|
1 |
import os
|
|
|
2 |
import time
|
3 |
from pathlib import Path
|
4 |
from typing import Dict, List, Tuple
|
|
|
18 |
from tools.data_anonymise import initial_clean
|
19 |
from tools.helper_functions import OUTPUT_FOLDER, read_file
|
20 |
from tools.load_spacy_model_custom_recognisers import nlp
|
21 |
+
from tools.secure_path_utils import secure_join
|
22 |
|
23 |
if REMOVE_DUPLICATE_ROWS == "True":
|
24 |
REMOVE_DUPLICATE_ROWS = True
|
|
|
345 |
original_file_extension = os.path.splitext(original_file)[-1]
|
346 |
if original_file_extension in [".xlsx", ".xls"]:
|
347 |
|
348 |
+
# Split the string using secure regex to handle both .xlsx_ and .xls_ delimiters
|
349 |
+
from tools.secure_regex_utils import safe_split_filename
|
350 |
+
|
351 |
+
parts = safe_split_filename(
|
352 |
+
os.path.basename(file_name), [".xlsx_", ".xls_"]
|
353 |
+
)
|
354 |
# The sheet name is the last part after splitting
|
355 |
file_sheet_name = parts[-1]
|
356 |
|
|
|
433 |
file_ext = os.path.splitext(file_name)[-1]
|
434 |
|
435 |
if file_ext in [".parquet"]:
|
436 |
+
output_path = secure_join(
|
437 |
output_folder, f"{file_base_name}_deduplicated.parquet"
|
438 |
)
|
439 |
df_cleaned.to_parquet(output_path, index=False)
|
440 |
else:
|
441 |
+
output_path = secure_join(
|
442 |
output_folder, f"{file_base_name}_deduplicated.csv"
|
443 |
)
|
444 |
df_cleaned.to_csv(
|
|
|
454 |
# Create output filename
|
455 |
file_base_name = os.path.splitext(os.path.basename(file_path))[0]
|
456 |
file_ext = os.path.splitext(file_path)[-1]
|
457 |
+
output_path = secure_join(
|
458 |
output_folder, f"{file_base_name}_deduplicated{file_ext}"
|
459 |
)
|
460 |
|
|
|
516 |
file_stem = os.path.splitext(file_name)[0]
|
517 |
file_ext = os.path.splitext(file_name)[-1]
|
518 |
|
519 |
+
output_path = secure_join(output_folder, f"{file_stem}_deduplicated{file_ext}")
|
520 |
|
521 |
if file_ext in [".xlsx", ".xls"]:
|
522 |
df_cleaned.to_excel(
|
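A short usage sketch of the two replacements made in this file, per the helper definitions added in this commit (file names are illustrative):

from tools.secure_path_utils import secure_join
from tools.secure_regex_utils import safe_split_filename

parts = safe_split_filename("survey.xlsx_Responses", [".xlsx_", ".xls_"])
file_sheet_name = parts[-1]                       # 'Responses'
secure_join("output", "survey_deduplicated.csv")  # 'output/survey_deduplicated.csv' on POSIX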
tools/helper_functions.py
CHANGED
@@ -1,5 +1,4 @@
|
|
1 |
import os
|
2 |
-
import re
|
3 |
import unicodedata
|
4 |
from math import ceil
|
5 |
from typing import List
|
@@ -33,6 +32,7 @@ from tools.config import (
|
|
33 |
aws_comprehend_language_choices,
|
34 |
textract_language_choices,
|
35 |
)
|
|
|
36 |
|
37 |
|
38 |
def _get_env_list(env_var_name: str) -> List[str]:
|
@@ -348,7 +348,7 @@ def put_columns_in_df(in_file: List[str]):
|
|
348 |
def check_for_existing_textract_file(
|
349 |
doc_file_name_no_extension_textbox: str, output_folder: str = OUTPUT_FOLDER
|
350 |
):
|
351 |
-
textract_output_path = os.path.join(
|
352 |
output_folder, doc_file_name_no_extension_textbox + "_textract.json"
|
353 |
)
|
354 |
|
@@ -377,7 +377,7 @@ def check_for_relevant_ocr_output_with_words(
|
|
377 |
|
378 |
doc_file_with_ending = doc_file_name_no_extension_textbox + file_ending
|
379 |
|
380 |
-
local_ocr_output_path = os.path.join(output_folder, doc_file_with_ending)
|
381 |
|
382 |
if os.path.exists(local_ocr_output_path):
|
383 |
print("Existing OCR with words analysis output file found.")
|
@@ -591,7 +591,9 @@ def clean_unicode_text(text: str):
|
|
591 |
# Step 3: Optionally remove non-ASCII characters if needed
|
592 |
# This regex removes any remaining non-ASCII characters, if desired.
|
593 |
# Comment this line if you want to keep all Unicode characters.
|
594 |
-
cleaned_text = re.sub(r"[^\x00-\x7F]", "", normalized_text)
|
|
|
|
|
595 |
|
596 |
return cleaned_text
|
597 |
|
@@ -603,7 +605,7 @@ def load_all_output_files(folder_path: str = OUTPUT_FOLDER) -> List[str]:
|
|
603 |
# List all files in the specified folder
|
604 |
for filename in os.listdir(folder_path):
|
605 |
# Construct full file path
|
606 |
-
full_path = os.path.join(folder_path, filename)
|
607 |
# Check if it's a file (not a directory)
|
608 |
if os.path.isfile(full_path):
|
609 |
file_paths.append(full_path)
|
|
|
1 |
import os
|
|
|
2 |
import unicodedata
|
3 |
from math import ceil
|
4 |
from typing import List
|
|
|
32 |
aws_comprehend_language_choices,
|
33 |
textract_language_choices,
|
34 |
)
|
35 |
+
from tools.secure_path_utils import secure_join
|
36 |
|
37 |
|
38 |
def _get_env_list(env_var_name: str) -> List[str]:
|
|
|
348 |
def check_for_existing_textract_file(
|
349 |
doc_file_name_no_extension_textbox: str, output_folder: str = OUTPUT_FOLDER
|
350 |
):
|
351 |
+
textract_output_path = secure_join(
|
352 |
output_folder, doc_file_name_no_extension_textbox + "_textract.json"
|
353 |
)
|
354 |
|
|
|
377 |
|
378 |
doc_file_with_ending = doc_file_name_no_extension_textbox + file_ending
|
379 |
|
380 |
+
local_ocr_output_path = secure_join(output_folder, doc_file_with_ending)
|
381 |
|
382 |
if os.path.exists(local_ocr_output_path):
|
383 |
print("Existing OCR with words analysis output file found.")
|
|
|
591 |
# Step 3: Optionally remove non-ASCII characters if needed
|
592 |
# This regex removes any remaining non-ASCII characters, if desired.
|
593 |
# Comment this line if you want to keep all Unicode characters.
|
594 |
+
from tools.secure_regex_utils import safe_remove_non_ascii
|
595 |
+
|
596 |
+
cleaned_text = safe_remove_non_ascii(normalized_text)
|
597 |
|
598 |
return cleaned_text
|
599 |
|
|
|
605 |
# List all files in the specified folder
|
606 |
for filename in os.listdir(folder_path):
|
607 |
# Construct full file path
|
608 |
+
full_path = secure_join(folder_path, filename)
|
609 |
# Check if it's a file (not a directory)
|
610 |
if os.path.isfile(full_path):
|
611 |
file_paths.append(full_path)
|
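As a sketch, the swapped-in helper strips anything outside the 7-bit ASCII range; in clean_unicode_text it runs on already-normalised text, so any characters still outside that range after normalisation are simply dropped (example input illustrative):

from tools.secure_regex_utils import safe_remove_non_ascii

safe_remove_non_ascii("Café naïve résumé")  # -> 'Caf nave rsum'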
tools/redaction_review.py
CHANGED
@@ -1,6 +1,5 @@
|
|
1 |
import os
|
2 |
import random
|
3 |
-
import re
|
4 |
import string
|
5 |
import uuid
|
6 |
from datetime import datetime, timedelta, timezone
|
@@ -37,6 +36,9 @@ from tools.file_conversion import (
|
|
37 |
)
|
38 |
from tools.file_redaction import redact_page_with_pymupdf
|
39 |
from tools.helper_functions import detect_file_type, get_file_name_without_type
|
|
|
|
|
|
|
40 |
|
41 |
if not MAX_IMAGE_PIXELS:
|
42 |
Image.MAX_IMAGE_PIXELS = None
|
@@ -535,10 +537,14 @@ def update_annotator_page_from_review_df(
|
|
535 |
for i, page_state_entry in enumerate(out_image_annotations_state):
|
536 |
# Assuming page_state_entry has a 'page' key (1-based)
|
537 |
|
538 |
-
|
539 |
-
|
540 |
-
|
541 |
-
|
|
|
|
|
|
|
|
|
542 |
page_no = 0
|
543 |
|
544 |
if (
|
@@ -834,15 +840,11 @@ def create_annotation_objects_from_filtered_ocr_results_with_words(
|
|
834 |
valid = False
|
835 |
if isinstance(colour_label, str):
|
836 |
label_str = colour_label.strip()
|
837 |
-
|
838 |
-
|
839 |
-
)
|
840 |
-
if match:
|
841 |
-
r_val, g_val, b_val = (
|
842 |
-
int(match.group(1)),
|
843 |
-
int(match.group(2)),
|
844 |
-
int(match.group(3)),
|
845 |
-
)
|
846 |
if 0 <= r_val <= 255 and 0 <= g_val <= 255 and 0 <= b_val <= 255:
|
847 |
valid = True
|
848 |
elif isinstance(colour_label, (tuple, list)) and len(colour_label) == 3:
|
@@ -2568,9 +2570,9 @@ def create_xfdf(
|
|
2568 |
pymupdf_page = pymupdf_doc.load_page(page_python_format)
|
2569 |
|
2570 |
if document_cropboxes and page_python_format < len(document_cropboxes):
|
2571 |
-
|
2572 |
-
|
2573 |
-
)
|
2574 |
if match and len(match) == 4:
|
2575 |
rect_values = list(map(float, match))
|
2576 |
pymupdf_page.set_cropbox(Rect(*rect_values))
|
@@ -2722,8 +2724,7 @@ def convert_df_to_xfdf(
|
|
2722 |
|
2723 |
output_path = output_folder + file_path_name + "_adobe.xfdf"
|
2724 |
|
2725 |
-
with open(output_path, "w", encoding="utf-8") as f:
|
2726 |
-
f.write(xfdf_content)
|
2727 |
|
2728 |
output_paths.append(output_path)
|
2729 |
|
|
|
1 |
import os
|
2 |
import random
|
|
|
3 |
import string
|
4 |
import uuid
|
5 |
from datetime import datetime, timedelta, timezone
|
|
|
36 |
)
|
37 |
from tools.file_redaction import redact_page_with_pymupdf
|
38 |
from tools.helper_functions import detect_file_type, get_file_name_without_type
|
39 |
+
from tools.secure_path_utils import (
|
40 |
+
secure_file_write,
|
41 |
+
)
|
42 |
|
43 |
if not MAX_IMAGE_PIXELS:
|
44 |
Image.MAX_IMAGE_PIXELS = None
|
|
|
537 |
for i, page_state_entry in enumerate(out_image_annotations_state):
|
538 |
# Assuming page_state_entry has a 'page' key (1-based)
|
539 |
|
540 |
+
from tools.secure_regex_utils import (
|
541 |
+
safe_extract_page_number_from_filename,
|
542 |
+
)
|
543 |
+
|
544 |
+
page_no = safe_extract_page_number_from_filename(
|
545 |
+
page_state_entry["image"]
|
546 |
+
)
|
547 |
+
if page_no is None:
|
548 |
page_no = 0
|
549 |
|
550 |
if (
|
|
|
840 |
valid = False
|
841 |
if isinstance(colour_label, str):
|
842 |
label_str = colour_label.strip()
|
843 |
+
from tools.secure_regex_utils import safe_extract_rgb_values
|
844 |
+
|
845 |
+
rgb_values = safe_extract_rgb_values(label_str)
|
846 |
+
if rgb_values:
|
847 |
+
r_val, g_val, b_val = rgb_values
|
|
|
|
|
|
|
|
|
848 |
if 0 <= r_val <= 255 and 0 <= g_val <= 255 and 0 <= b_val <= 255:
|
849 |
valid = True
|
850 |
elif isinstance(colour_label, (tuple, list)) and len(colour_label) == 3:
|
|
|
2570 |
pymupdf_page = pymupdf_doc.load_page(page_python_format)
|
2571 |
|
2572 |
if document_cropboxes and page_python_format < len(document_cropboxes):
|
2573 |
+
from tools.secure_regex_utils import safe_extract_numbers
|
2574 |
+
|
2575 |
+
match = safe_extract_numbers(document_cropboxes[page_python_format])
|
2576 |
if match and len(match) == 4:
|
2577 |
rect_values = list(map(float, match))
|
2578 |
pymupdf_page.set_cropbox(Rect(*rect_values))
|
|
|
2724 |
|
2725 |
output_path = output_folder + file_path_name + "_adobe.xfdf"
|
2726 |
|
2727 |
+
secure_file_write(output_path, xfdf_content, encoding="utf-8")
|
|
|
2728 |
|
2729 |
output_paths.append(output_path)
|
2730 |
|
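The two extraction helpers used above validate as they parse, so malformed labels or filenames come back as None rather than raising; a brief sketch (values illustrative):

from tools.secure_regex_utils import (
    safe_extract_page_number_from_filename,
    safe_extract_rgb_values,
)

safe_extract_page_number_from_filename("placeholder_image_3.png")  # -> 3
safe_extract_rgb_values("(0, 0, 255)")   # -> (0, 0, 255)
safe_extract_rgb_values("(999, 0, 0)")   # -> None (each component must be 0-255)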
tools/secure_path_utils.py
ADDED
@@ -0,0 +1,267 @@
1 |
+
"""
|
2 |
+
Secure path utilities to prevent path injection attacks.
|
3 |
+
|
4 |
+
This module provides secure alternatives to os.path operations that validate
|
5 |
+
and sanitize file paths to prevent directory traversal and other path-based attacks.
|
6 |
+
"""
|
7 |
+
|
8 |
+
import logging
|
9 |
+
import os
|
10 |
+
import re
|
11 |
+
from pathlib import Path
|
12 |
+
from typing import Optional, Union
|
13 |
+
|
14 |
+
logger = logging.getLogger(__name__)
|
15 |
+
|
16 |
+
|
17 |
+
def sanitize_filename(filename: str, max_length: int = 255) -> str:
|
18 |
+
"""
|
19 |
+
Sanitize a filename to prevent path injection attacks.
|
20 |
+
|
21 |
+
Args:
|
22 |
+
filename: The filename to sanitize
|
23 |
+
max_length: Maximum length of the sanitized filename
|
24 |
+
|
25 |
+
Returns:
|
26 |
+
A sanitized filename safe for use in file operations
|
27 |
+
|
28 |
+
Raises:
|
29 |
+
ValueError: If the filename cannot be sanitized safely
|
30 |
+
"""
|
31 |
+
if not filename or not isinstance(filename, str):
|
32 |
+
raise ValueError("Filename must be a non-empty string")
|
33 |
+
|
34 |
+
# Remove any path separators and normalize
|
35 |
+
filename = os.path.basename(filename)
|
36 |
+
|
37 |
+
# Remove or replace dangerous characters
|
38 |
+
# Keep alphanumeric, dots, hyphens, underscores, spaces, parentheses, brackets, and other safe chars
|
39 |
+
# Only remove truly dangerous characters like path separators and control chars
|
40 |
+
sanitized = re.sub(r'[<>:"|?*\x00-\x1f]', "_", filename)
|
41 |
+
|
42 |
+
# Remove multiple consecutive dots (except for file extensions)
|
43 |
+
sanitized = re.sub(r"\.{2,}", ".", sanitized)
|
44 |
+
|
45 |
+
# Remove leading/trailing dots and spaces
|
46 |
+
sanitized = sanitized.strip(". ")
|
47 |
+
|
48 |
+
# Ensure it's not empty after sanitization
|
49 |
+
if not sanitized:
|
50 |
+
sanitized = "sanitized_file"
|
51 |
+
|
52 |
+
# Truncate if too long, preserving extension
|
53 |
+
if len(sanitized) > max_length:
|
54 |
+
name, ext = os.path.splitext(sanitized)
|
55 |
+
max_name_length = max_length - len(ext)
|
56 |
+
sanitized = name[:max_name_length] + ext
|
57 |
+
|
58 |
+
return sanitized
|
59 |
+
|
60 |
+
|
61 |
+
def secure_path_join(base_path: Union[str, Path], *path_parts: str) -> Path:
|
62 |
+
"""
|
63 |
+
Safely join paths while preventing directory traversal attacks.
|
64 |
+
|
65 |
+
Args:
|
66 |
+
base_path: The base directory path
|
67 |
+
*path_parts: Additional path components to join
|
68 |
+
|
69 |
+
Returns:
|
70 |
+
A Path object representing the safe joined path
|
71 |
+
|
72 |
+
Raises:
|
73 |
+
ValueError: If any path component contains dangerous characters
|
74 |
+
PermissionError: If the resulting path would escape the base directory
|
75 |
+
"""
|
76 |
+
base_path = Path(base_path).resolve()
|
77 |
+
|
78 |
+
# Sanitize each path part - only sanitize if it contains dangerous patterns
|
79 |
+
sanitized_parts = []
|
80 |
+
for part in path_parts:
|
81 |
+
if not part:
|
82 |
+
continue
|
83 |
+
# Only sanitize if the part contains dangerous patterns
|
84 |
+
if re.search(r'[<>:"|?*\x00-\x1f]|\.{2,}', part):
|
85 |
+
sanitized_part = sanitize_filename(part)
|
86 |
+
else:
|
87 |
+
sanitized_part = part
|
88 |
+
sanitized_parts.append(sanitized_part)
|
89 |
+
|
90 |
+
# Join the paths
|
91 |
+
result_path = base_path
|
92 |
+
for part in sanitized_parts:
|
93 |
+
result_path = result_path / part
|
94 |
+
|
95 |
+
# Resolve the final path
|
96 |
+
result_path = result_path.resolve()
|
97 |
+
|
98 |
+
# Security check: ensure the result is within the base directory
|
99 |
+
try:
|
100 |
+
result_path.relative_to(base_path)
|
101 |
+
except ValueError:
|
102 |
+
raise PermissionError(f"Path would escape base directory: {result_path}")
|
103 |
+
|
104 |
+
return result_path
|
105 |
+
|
106 |
+
|
107 |
+
def secure_file_write(
|
108 |
+
file_path: Union[str, Path],
|
109 |
+
content: str,
|
110 |
+
mode: str = "w",
|
111 |
+
encoding: Optional[str] = None,
|
112 |
+
**kwargs,
|
113 |
+
) -> None:
|
114 |
+
"""
|
115 |
+
Safely write content to a file with path validation.
|
116 |
+
|
117 |
+
Args:
|
118 |
+
file_path: The file path to write to
|
119 |
+
content: The content to write
|
120 |
+
mode: File open mode (default: 'w')
|
121 |
+
encoding: Text encoding (default: None for binary mode)
|
122 |
+
**kwargs: Additional arguments for open()
|
123 |
+
"""
|
124 |
+
file_path = Path(file_path)
|
125 |
+
|
126 |
+
# Ensure the parent directory exists
|
127 |
+
file_path.parent.mkdir(parents=True, exist_ok=True)
|
128 |
+
|
129 |
+
# Validate the path is safe
|
130 |
+
if not file_path.is_absolute():
|
131 |
+
file_path = file_path.resolve()
|
132 |
+
|
133 |
+
# Write the file
|
134 |
+
open_kwargs = {"mode": mode}
|
135 |
+
if encoding:
|
136 |
+
open_kwargs["encoding"] = encoding
|
137 |
+
open_kwargs.update(kwargs)
|
138 |
+
|
139 |
+
with open(file_path, **open_kwargs) as f:
|
140 |
+
f.write(content)
|
141 |
+
|
142 |
+
|
143 |
+
def secure_file_read(
|
144 |
+
file_path: Union[str, Path],
|
145 |
+
mode: str = "r",
|
146 |
+
encoding: Optional[str] = None,
|
147 |
+
**kwargs,
|
148 |
+
) -> str:
|
149 |
+
"""
|
150 |
+
Safely read content from a file with path validation.
|
151 |
+
|
152 |
+
Args:
|
153 |
+
file_path: The file path to read from
|
154 |
+
mode: File open mode (default: 'r')
|
155 |
+
encoding: Text encoding (default: None for binary mode)
|
156 |
+
**kwargs: Additional arguments for open()
|
157 |
+
|
158 |
+
Returns:
|
159 |
+
The file content
|
160 |
+
"""
|
161 |
+
file_path = Path(file_path)
|
162 |
+
|
163 |
+
# Validate the path exists and is a file
|
164 |
+
if not file_path.exists():
|
165 |
+
raise FileNotFoundError(f"File not found: {file_path}")
|
166 |
+
|
167 |
+
if not file_path.is_file():
|
168 |
+
raise ValueError(f"Path is not a file: {file_path}")
|
169 |
+
|
170 |
+
# Read the file
|
171 |
+
open_kwargs = {"mode": mode}
|
172 |
+
if encoding:
|
173 |
+
open_kwargs["encoding"] = encoding
|
174 |
+
open_kwargs.update(kwargs)
|
175 |
+
|
176 |
+
with open(file_path, **open_kwargs) as f:
|
177 |
+
return f.read()
|
178 |
+
|
179 |
+
|
180 |
+
def validate_path_safety(
|
181 |
+
path: Union[str, Path], base_path: Optional[Union[str, Path]] = None
|
182 |
+
) -> bool:
|
183 |
+
"""
|
184 |
+
Validate that a path is safe and doesn't contain dangerous patterns.
|
185 |
+
|
186 |
+
Args:
|
187 |
+
path: The path to validate
|
188 |
+
base_path: Optional base path to check against
|
189 |
+
|
190 |
+
Returns:
|
191 |
+
True if the path is safe, False otherwise
|
192 |
+
"""
|
193 |
+
try:
|
194 |
+
path = Path(path)
|
195 |
+
|
196 |
+
# Check for dangerous patterns
|
197 |
+
path_str = str(path)
|
198 |
+
|
199 |
+
# Check for directory traversal patterns
|
200 |
+
dangerous_patterns = [
|
201 |
+
"..", # Parent directory
|
202 |
+
"//", # Double slashes
|
203 |
+
"\\", # Backslashes (on Unix systems)
|
204 |
+
]
|
205 |
+
|
206 |
+
for pattern in dangerous_patterns:
|
207 |
+
if pattern in path_str:
|
208 |
+
return False
|
209 |
+
|
210 |
+
# If base path is provided, ensure the path is within it
|
211 |
+
if base_path:
|
212 |
+
base_path = Path(base_path).resolve()
|
213 |
+
path = path.resolve()
|
214 |
+
try:
|
215 |
+
path.relative_to(base_path)
|
216 |
+
except ValueError:
|
217 |
+
return False
|
218 |
+
|
219 |
+
return True
|
220 |
+
|
221 |
+
except Exception:
|
222 |
+
return False
|
223 |
+
|
224 |
+
|
225 |
+
# Backward compatibility functions that maintain the same interface as os.path
|
226 |
+
def secure_join(*paths: str) -> str:
|
227 |
+
"""
|
228 |
+
Secure alternative to os.path.join that prevents path injection.
|
229 |
+
|
230 |
+
Args:
|
231 |
+
*paths: Path components to join
|
232 |
+
|
233 |
+
Returns:
|
234 |
+
A safe joined path string
|
235 |
+
"""
|
236 |
+
if not paths:
|
237 |
+
return ""
|
238 |
+
|
239 |
+
# Use the first path as base, others as components
|
240 |
+
base_path = Path(paths[0])
|
241 |
+
path_parts = paths[1:]
|
242 |
+
|
243 |
+
# Only use secure_path_join if there are potentially dangerous patterns
|
244 |
+
if any(re.search(r'[<>:"|?*\x00-\x1f]|\.{2,}', part) for part in path_parts):
|
245 |
+
result_path = secure_path_join(base_path, *path_parts)
|
246 |
+
return str(result_path)
|
247 |
+
else:
|
248 |
+
# Use normal path joining for safe paths
|
249 |
+
return str(Path(*paths))
|
250 |
+
|
251 |
+
|
252 |
+
def secure_basename(path: str) -> str:
|
253 |
+
"""
|
254 |
+
Secure alternative to os.path.basename that sanitizes the result.
|
255 |
+
|
256 |
+
Args:
|
257 |
+
path: The path to get the basename from
|
258 |
+
|
259 |
+
Returns:
|
260 |
+
A sanitized basename
|
261 |
+
"""
|
262 |
+
basename = os.path.basename(path)
|
263 |
+
# Only sanitize if the basename contains dangerous patterns
|
264 |
+
if re.search(r'[<>:"|?*\x00-\x1f]|\.{2,}', basename):
|
265 |
+
return sanitize_filename(basename)
|
266 |
+
else:
|
267 |
+
return basename
|
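Taken together, the path helpers behave roughly as follows under the definitions above (POSIX path semantics assumed, values illustrative):

from tools.secure_path_utils import sanitize_filename, secure_join, secure_path_join

sanitize_filename('report<v2>:final.pdf')    # 'report_v2__final.pdf' (dangerous characters become '_')
secure_join("output", "file.csv")            # 'output/file.csv' (plain join when the parts look safe)
secure_path_join("output", "../secret.txt")  # resolves to <cwd>/output/secret.txt; the '..' component
                                             # is reduced to its basename, so the result cannot escape
                                             # the base directory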
tools/secure_regex_utils.py
ADDED
@@ -0,0 +1,292 @@
1 |
+
"""
|
2 |
+
Secure regex utilities to prevent ReDoS (Regular Expression Denial of Service) attacks.
|
3 |
+
|
4 |
+
This module provides safe alternatives to common regex patterns that can cause
|
5 |
+
catastrophic backtracking and performance issues.
|
6 |
+
"""
|
7 |
+
|
8 |
+
import re
|
9 |
+
from typing import List, Optional
|
10 |
+
|
11 |
+
|
12 |
+
def safe_extract_numbers_with_seconds(text: str) -> List[float]:
|
13 |
+
"""
|
14 |
+
Safely extract numbers before 'seconds' from text without ReDoS vulnerability.
|
15 |
+
|
16 |
+
Args:
|
17 |
+
text: The text to search for numbers followed by 'seconds'
|
18 |
+
|
19 |
+
Returns:
|
20 |
+
List of float numbers found before 'seconds'
|
21 |
+
"""
|
22 |
+
if not text or not isinstance(text, str):
|
23 |
+
return []
|
24 |
+
|
25 |
+
# Use a more specific pattern that avoids catastrophic backtracking
|
26 |
+
# Look for digits, optional decimal part, optional whitespace, then 'seconds'
|
27 |
+
pattern = r"\b(\d+(?:\.\d+)?)\s*seconds\b"
|
28 |
+
|
29 |
+
matches = re.findall(pattern, text)
|
30 |
+
try:
|
31 |
+
return [float(match) for match in matches]
|
32 |
+
except (ValueError, TypeError):
|
33 |
+
return []
|
34 |
+
|
35 |
+
|
36 |
+
def safe_extract_numbers(text: str) -> List[float]:
|
37 |
+
"""
|
38 |
+
Safely extract all numbers from text without ReDoS vulnerability.
|
39 |
+
|
40 |
+
Args:
|
41 |
+
text: The text to extract numbers from
|
42 |
+
|
43 |
+
Returns:
|
44 |
+
List of float numbers found in the text
|
45 |
+
"""
|
46 |
+
if not text or not isinstance(text, str):
|
47 |
+
return []
|
48 |
+
|
49 |
+
# Use a simple, safe pattern that doesn't cause backtracking
|
50 |
+
# Match digits, optional decimal point and more digits
|
51 |
+
pattern = r"\b\d+(?:\.\d+)?\b"
|
52 |
+
|
53 |
+
matches = re.findall(pattern, text)
|
54 |
+
try:
|
55 |
+
return [float(match) for match in matches]
|
56 |
+
except (ValueError, TypeError):
|
57 |
+
return []
|
58 |
+
|
59 |
+
|
60 |
+
def safe_extract_page_number_from_filename(filename: str) -> Optional[int]:
|
61 |
+
"""
|
62 |
+
Safely extract page number from filename ending with .png.
|
63 |
+
|
64 |
+
Args:
|
65 |
+
filename: The filename to extract page number from
|
66 |
+
|
67 |
+
Returns:
|
68 |
+
Page number if found, None otherwise
|
69 |
+
"""
|
70 |
+
if not filename or not isinstance(filename, str):
|
71 |
+
return None
|
72 |
+
|
73 |
+
# Use a simple, safe pattern
|
74 |
+
pattern = r"(\d+)\.png$"
|
75 |
+
match = re.search(pattern, filename)
|
76 |
+
|
77 |
+
if match:
|
78 |
+
try:
|
79 |
+
return int(match.group(1))
|
80 |
+
except (ValueError, TypeError):
|
81 |
+
return None
|
82 |
+
|
83 |
+
return None
|
84 |
+
|
85 |
+
|
86 |
+
def safe_extract_page_number_from_path(path: str) -> Optional[int]:
|
87 |
+
"""
|
88 |
+
Safely extract a page number from a path ending in an '_<digits>.png' suffix.
|
89 |
+
|
90 |
+
Args:
|
91 |
+
path: The path to extract page number from
|
92 |
+
|
93 |
+
Returns:
|
94 |
+
Page number if found, None otherwise
|
95 |
+
"""
|
96 |
+
if not path or not isinstance(path, str):
|
97 |
+
return None
|
98 |
+
|
99 |
+
# Use a simple, safe pattern
|
100 |
+
pattern = r"_(\d+)\.png$"
|
101 |
+
match = re.search(pattern, path)
|
102 |
+
|
103 |
+
if match:
|
104 |
+
try:
|
105 |
+
return int(match.group(1))
|
106 |
+
except (ValueError, TypeError):
|
107 |
+
return None
|
108 |
+
|
109 |
+
return None
|
110 |
+
|
111 |
+
|
112 |
+
def safe_clean_text(text: str, remove_html: bool = True) -> str:
|
113 |
+
"""
|
114 |
+
Safely clean text without ReDoS vulnerability.
|
115 |
+
|
116 |
+
Args:
|
117 |
+
text: The text to clean
|
118 |
+
remove_html: Whether to remove HTML tags
|
119 |
+
|
120 |
+
Returns:
|
121 |
+
Cleaned text
|
122 |
+
"""
|
123 |
+
if not text or not isinstance(text, str):
|
124 |
+
return ""
|
125 |
+
|
126 |
+
cleaned = text
|
127 |
+
|
128 |
+
if remove_html:
|
129 |
+
# Use a simple pattern that doesn't cause backtracking
|
130 |
+
cleaned = re.sub(r"<[^>]*>", "", cleaned)
|
131 |
+
|
132 |
+
# Clean up whitespace
|
133 |
+
cleaned = re.sub(r"\s+", " ", cleaned).strip()
|
134 |
+
|
135 |
+
return cleaned
|
136 |
+
|
137 |
+
|
138 |
+
def safe_extract_rgb_values(text: str) -> Optional[tuple]:
|
139 |
+
"""
|
140 |
+
Safely extract RGB values from text like "(255, 255, 255)".
|
141 |
+
|
142 |
+
Args:
|
143 |
+
text: The text to extract RGB values from
|
144 |
+
|
145 |
+
Returns:
|
146 |
+
Tuple of (r, g, b) values if found, None otherwise
|
147 |
+
"""
|
148 |
+
if not text or not isinstance(text, str):
|
149 |
+
return None
|
150 |
+
|
151 |
+
# Use a simple, safe pattern
|
152 |
+
pattern = r"\(\s*(\d{1,3})\s*,\s*(\d{1,3})\s*,\s*(\d{1,3})\s*\)"
|
153 |
+
match = re.match(pattern, text.strip())
|
154 |
+
|
155 |
+
if match:
|
156 |
+
try:
|
157 |
+
r = int(match.group(1))
|
158 |
+
g = int(match.group(2))
|
159 |
+
b = int(match.group(3))
|
160 |
+
|
161 |
+
# Validate RGB values
|
162 |
+
if 0 <= r <= 255 and 0 <= g <= 255 and 0 <= b <= 255:
|
163 |
+
return (r, g, b)
|
164 |
+
except (ValueError, TypeError):
|
165 |
+
pass
|
166 |
+
|
167 |
+
return None
|
168 |
+
|
169 |
+
|
170 |
+
def safe_split_filename(filename: str, delimiters: List[str]) -> List[str]:
|
171 |
+
"""
|
172 |
+
Safely split filename by delimiters without ReDoS vulnerability.
|
173 |
+
|
174 |
+
Args:
|
175 |
+
filename: The filename to split
|
176 |
+
delimiters: List of delimiter patterns to split on
|
177 |
+
|
178 |
+
Returns:
|
179 |
+
List of filename parts
|
180 |
+
"""
|
181 |
+
if not filename or not isinstance(filename, str):
|
182 |
+
return []
|
183 |
+
|
184 |
+
if not delimiters:
|
185 |
+
return [filename]
|
186 |
+
|
187 |
+
# Escape special regex characters in delimiters
|
188 |
+
escaped_delimiters = [re.escape(delim) for delim in delimiters]
|
189 |
+
|
190 |
+
# Create a safe pattern
|
191 |
+
pattern = "|".join(escaped_delimiters)
|
192 |
+
|
193 |
+
try:
|
194 |
+
return re.split(pattern, filename)
|
195 |
+
except re.error:
|
196 |
+
# Fallback to simple string operations if regex fails
|
197 |
+
result = [filename]
|
198 |
+
for delim in delimiters:
|
199 |
+
new_result = []
|
200 |
+
for part in result:
|
201 |
+
new_result.extend(part.split(delim))
|
202 |
+
result = new_result
|
203 |
+
return result
|
204 |
+
|
205 |
+
|
206 |
+
def safe_remove_leading_newlines(text: str) -> str:
|
207 |
+
"""
|
208 |
+
Safely remove leading newlines without ReDoS vulnerability.
|
209 |
+
|
210 |
+
Args:
|
211 |
+
text: The text to clean
|
212 |
+
|
213 |
+
Returns:
|
214 |
+
Text with leading newlines removed (surrounding whitespace is stripped as well)
|
215 |
+
"""
|
216 |
+
if not text or not isinstance(text, str):
|
217 |
+
return ""
|
218 |
+
|
219 |
+
# Use a simple pattern
|
220 |
+
return re.sub(r"^\n+", "", text).strip()
|
221 |
+
|
222 |
+
|
223 |
+
def safe_remove_non_ascii(text: str) -> str:
|
224 |
+
"""
|
225 |
+
Safely remove non-ASCII characters without ReDoS vulnerability.
|
226 |
+
|
227 |
+
Args:
|
228 |
+
text: The text to clean
|
229 |
+
|
230 |
+
Returns:
|
231 |
+
Text with non-ASCII characters removed
|
232 |
+
"""
|
233 |
+
if not text or not isinstance(text, str):
|
234 |
+
return ""
|
235 |
+
|
236 |
+
# Use a simple pattern
|
237 |
+
return re.sub(r"[^\x00-\x7F]", "", text)
|
238 |
+
|
239 |
+
|
240 |
+
def safe_extract_latest_number_from_filename(filename: str) -> Optional[int]:
|
241 |
+
"""
|
242 |
+
Safely extract the latest/largest number from filename without ReDoS vulnerability.
|
243 |
+
|
244 |
+
Args:
|
245 |
+
filename: The filename to extract number from
|
246 |
+
|
247 |
+
Returns:
|
248 |
+
The largest number found, or None if no numbers found
|
249 |
+
"""
|
250 |
+
if not filename or not isinstance(filename, str):
|
251 |
+
return None
|
252 |
+
|
253 |
+
# Use a simple pattern to find all numbers
|
254 |
+
pattern = r"\d+"
|
255 |
+
matches = re.findall(pattern, filename)
|
256 |
+
|
257 |
+
if not matches:
|
258 |
+
return None
|
259 |
+
|
260 |
+
try:
|
261 |
+
# Convert to integers and return the maximum
|
262 |
+
numbers = [int(match) for match in matches]
|
263 |
+
return max(numbers)
|
264 |
+
except (ValueError, TypeError):
|
265 |
+
return None
|
266 |
+
|
267 |
+
|
268 |
+
def safe_sanitize_text(text: str, replacement: str = "_") -> str:
|
269 |
+
"""
|
270 |
+
Safely sanitize text by removing dangerous characters without ReDoS vulnerability.
|
271 |
+
|
272 |
+
Args:
|
273 |
+
text: The text to sanitize
|
274 |
+
replacement: Character to replace dangerous characters with
|
275 |
+
|
276 |
+
Returns:
|
277 |
+
Sanitized text
|
278 |
+
"""
|
279 |
+
if not text or not isinstance(text, str):
|
280 |
+
return ""
|
281 |
+
|
282 |
+
# Use a simple pattern for dangerous characters
|
283 |
+
dangerous_chars = r'[<>:"|?*\\/\x00-\x1f\x7f-\x9f]'
|
284 |
+
sanitized = re.sub(dangerous_chars, replacement, text)
|
285 |
+
|
286 |
+
# Remove multiple consecutive replacements
|
287 |
+
sanitized = re.sub(f"{re.escape(replacement)}+", replacement, sanitized)
|
288 |
+
|
289 |
+
# Remove leading/trailing replacements
|
290 |
+
sanitized = sanitized.strip(replacement)
|
291 |
+
|
292 |
+
return sanitized
|
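Two behavioural notes on the extractors, shown as a sketch (inputs illustrative): the 'seconds' parser returns floats in order of appearance, while safe_extract_latest_number_from_filename returns the largest number in the name, not the last one.

from tools.secure_regex_utils import (
    safe_extract_latest_number_from_filename,
    safe_extract_numbers_with_seconds,
)

safe_extract_numbers_with_seconds("OCR took 12.5 seconds, export took 3 seconds")  # [12.5, 3.0]
safe_extract_latest_number_from_filename("scan_2023_page_47.json")                 # 2023 (max, not last)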
tools/textract_batch_call.py
CHANGED
@@ -32,6 +32,11 @@ from tools.config import (
|
|
32 |
)
|
33 |
from tools.file_conversion import get_input_file_names
|
34 |
from tools.helper_functions import get_file_name_without_type
|
|
|
|
|
|
|
|
|
|
|
35 |
|
36 |
DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS = int(DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS)
|
37 |
|
@@ -115,8 +120,8 @@ def analyse_document_with_textract_api(
|
|
115 |
textract_client = session.client("textract")
|
116 |
|
117 |
# --- 1. Upload PDF to S3 ---
|
118 |
-
pdf_filename = os.path.basename(local_pdf_path)
|
119 |
-
s3_input_key = os.path.join(s3_input_prefix, pdf_filename).replace(
|
120 |
"\\", "/"
|
121 |
) # Ensure forward slashes for S3
|
122 |
|
@@ -262,14 +267,13 @@ def analyse_document_with_textract_api(
|
|
262 |
)
|
263 |
|
264 |
# File path
|
265 |
-
log_file_path = os.path.join(local_output_dir, "textract_document_jobs.csv")
|
266 |
-
log_file_path_job_id = os.path.join(
|
267 |
local_output_dir, pdf_filename + "_textract_document_jobs_job_id.txt"
|
268 |
)
|
269 |
|
270 |
# Write latest job ID to local text file
|
271 |
-
with open(log_file_path_job_id, "w") as f:
|
272 |
-
f.write(job_id)
|
273 |
|
274 |
# Check if file exists
|
275 |
file_exists = os.path.exists(log_file_path)
|
@@ -447,10 +451,9 @@ def download_textract_job_files(
|
|
447 |
output_filename_base = os.path.basename(pdf_filename)
|
448 |
output_filename_base_no_ext = os.path.splitext(output_filename_base)[0]
|
449 |
local_output_filename = f"{output_filename_base_no_ext}_textract.json"
|
450 |
-
local_output_path = os.path.join(local_output_dir, local_output_filename)
|
451 |
|
452 |
-
with open(local_output_path, "w") as f:
|
453 |
-
json.dump(combined_output, f)
|
454 |
|
455 |
print(f"Combined Textract output written to {local_output_path}")
|
456 |
|
@@ -484,12 +487,12 @@ def load_pdf_job_file_from_s3(
|
|
484 |
pdf_file_location = ""
|
485 |
doc_file_name_no_extension_textbox = ""
|
486 |
|
487 |
-
s3_input_key_prefix = os.path.join(load_s3_jobs_input_loc, pdf_filename).replace(
|
488 |
-
"\\", "/"
|
489 |
-
)
|
490 |
s3_input_key_prefix = s3_input_key_prefix + ".pdf"
|
491 |
|
492 |
-
local_input_file_path = os.path.join(local_output_dir, pdf_filename)
|
493 |
local_input_file_path = local_input_file_path + ".pdf"
|
494 |
|
495 |
download_file_from_s3(
|
@@ -705,7 +708,7 @@ def poll_whole_document_textract_analysis_progress_and_download(
|
|
705 |
# For robust handling, list objects and find the JSON(s).
|
706 |
|
707 |
s3_output_key_prefix = (
|
708 |
-
os.path.join(s3_output_prefix, job_id).replace("\\", "/") + "/"
|
709 |
)
|
710 |
logging.info(
|
711 |
f"Searching for output files in s3://{s3_bucket_name}/{s3_output_key_prefix}"
|
@@ -848,7 +851,7 @@ def download_textract_output(
|
|
848 |
|
849 |
# Find output ZIP file in S3
|
850 |
output_file_key = f"{output_prefix}/{job_id}.zip"
|
851 |
-
local_file_path = os.path.join(local_folder, f"{job_id}.zip")
|
852 |
|
853 |
# Download file
|
854 |
try:
|
|
|
32 |
)
|
33 |
from tools.file_conversion import get_input_file_names
|
34 |
from tools.helper_functions import get_file_name_without_type
|
35 |
+
from tools.secure_path_utils import (
|
36 |
+
secure_basename,
|
37 |
+
secure_file_write,
|
38 |
+
secure_join,
|
39 |
+
)
|
40 |
|
41 |
DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS = int(DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS)
|
42 |
|
|
|
120 |
textract_client = session.client("textract")
|
121 |
|
122 |
# --- 1. Upload PDF to S3 ---
|
123 |
+
pdf_filename = secure_basename(local_pdf_path)
|
124 |
+
s3_input_key = secure_join(s3_input_prefix, pdf_filename).replace(
|
125 |
"\\", "/"
|
126 |
) # Ensure forward slashes for S3
|
127 |
|
|
|
267 |
)
|
268 |
|
269 |
# File path
|
270 |
+
log_file_path = secure_join(local_output_dir, "textract_document_jobs.csv")
|
271 |
+
log_file_path_job_id = secure_join(
|
272 |
local_output_dir, pdf_filename + "_textract_document_jobs_job_id.txt"
|
273 |
)
|
274 |
|
275 |
# Write latest job ID to local text file
|
276 |
+
secure_file_write(log_file_path_job_id, job_id)
|
|
|
277 |
|
278 |
# Check if file exists
|
279 |
file_exists = os.path.exists(log_file_path)
|
|
|
451 |
output_filename_base = os.path.basename(pdf_filename)
|
452 |
output_filename_base_no_ext = os.path.splitext(output_filename_base)[0]
|
453 |
local_output_filename = f"{output_filename_base_no_ext}_textract.json"
|
454 |
+
local_output_path = secure_join(local_output_dir, local_output_filename)
|
455 |
|
456 |
+
secure_file_write(local_output_path, json.dumps(combined_output))
|
|
|
457 |
|
458 |
print(f"Combined Textract output written to {local_output_path}")
|
459 |
|
|
|
487 |
pdf_file_location = ""
|
488 |
doc_file_name_no_extension_textbox = ""
|
489 |
|
490 |
+
s3_input_key_prefix = secure_join(load_s3_jobs_input_loc, pdf_filename).replace(
|
491 |
+
"\\", "/"
|
492 |
+
)
|
493 |
s3_input_key_prefix = s3_input_key_prefix + ".pdf"
|
494 |
|
495 |
+
local_input_file_path = secure_join(local_output_dir, pdf_filename)
|
496 |
local_input_file_path = local_input_file_path + ".pdf"
|
497 |
|
498 |
download_file_from_s3(
|
|
|
708 |
# For robust handling, list objects and find the JSON(s).
|
709 |
|
710 |
s3_output_key_prefix = (
|
711 |
+
secure_join(s3_output_prefix, job_id).replace("\\", "/") + "/"
|
712 |
)
|
713 |
logging.info(
|
714 |
f"Searching for output files in s3://{s3_bucket_name}/{s3_output_key_prefix}"
|
|
|
851 |
|
852 |
# Find output ZIP file in S3
|
853 |
output_file_key = f"{output_prefix}/{job_id}.zip"
|
854 |
+
local_file_path = secure_join(local_folder, f"{job_id}.zip")
|
855 |
|
856 |
# Download file
|
857 |
try:
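For reference, the S3-key pattern used throughout this file combines secure_join with a backslash replacement, since secure_join delegates to pathlib and would yield backslashes on Windows (illustrative values):

from tools.secure_path_utils import secure_basename, secure_join

pdf_filename = secure_basename("uploads/annual report.pdf")            # 'annual report.pdf'
s3_input_key = secure_join("input", pdf_filename).replace("\\", "/")   # 'input/annual report.pdf'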
|