Commit f957846
Parent(s): bafcf39

General code changes and reformatting to address code vulnerabilities highlighted by CodeQL scan, and black/ruff reapplied to code. Fixes/optimisation of GitHub Actions.
Files changed:
- .dockerignore +8 -0
- .github/README.md +1 -1
- .github/scripts/setup_test_data.py +6 -3
- .github/workflows/ci.yml +14 -8
- .github/workflows/multi-os-test.yml +10 -6
- .github/workflows/simple-test.yml +4 -0
- .github/workflows/test.yml +5 -0
- .gitignore +8 -0
- cdk/cdk_functions.py +2 -2
- test/run_tests.py +1 -1
- test/test.py +50 -41
- test/test_gui_only.py +44 -35
- tools/aws_functions.py +4 -3
- tools/aws_textract.py +3 -2
- tools/config.py +1 -1
- tools/custom_csvlogger.py +9 -3
- tools/custom_image_analyser_engine.py +2 -5
- tools/data_anonymise.py +17 -20
- tools/file_conversion.py +23 -16
- tools/file_redaction.py +22 -20
- tools/find_duplicate_pages.py +8 -5
- tools/find_duplicate_tabular.py +11 -8
- tools/helper_functions.py +7 -5
- tools/redaction_review.py +20 -19
- tools/secure_path_utils.py +267 -0
- tools/secure_regex_utils.py +292 -0
- tools/textract_batch_call.py +18 -15
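Note on the two new modules: tools/secure_path_utils.py and tools/secure_regex_utils.py are added by this commit but their contents are not displayed in this view. Judging only from the call sites in the diffs below, secure_join, secure_file_read and secure_file_write look like path-traversal-safe wrappers around os.path.join and open. A minimal sketch of what such helpers might look like follows; the names come from the diff, but the bodies are assumptions, not the actual 267-line implementation.

import os


def secure_join(base_dir: str, *paths: str) -> str:
    """Join paths and refuse any result that escapes base_dir (path traversal guard)."""
    candidate = os.path.abspath(os.path.join(base_dir, *paths))
    base = os.path.abspath(base_dir)
    if os.path.commonpath([base, candidate]) != base:
        raise ValueError(f"Unsafe path outside {base}: {candidate}")
    return candidate


def secure_file_read(file_path: str, encoding: str = "utf-8") -> str:
    """Read a file after normalising the path (assumed behaviour)."""
    with open(os.path.abspath(file_path), encoding=encoding) as f:
        return f.read()


def secure_file_write(file_path: str, content: str, encoding: str = "utf-8") -> None:
    """Write content to a file after normalising the path (assumed behaviour)."""
    with open(os.path.abspath(file_path), "w", encoding=encoding) as f:
        f.write(content)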
.dockerignore
CHANGED
@@ -26,3 +26,11 @@ input/
 feedback/
 config/
 usage/
+test/config/*
+test/feedback/*
+test/input/*
+test/logs/*
+test/output/*
+test/tmp/*
+test/usage/*
+.ruff_cache/*
.github/README.md
CHANGED
@@ -27,7 +27,7 @@ This directory contains GitHub Actions workflows for automated testing of the CL
 
 ### 3. **Multi-OS Testing** (`.github/workflows/multi-os-test.yml`)
 - **Purpose**: Cross-platform testing
-- **OS**: Ubuntu, Windows
+- **OS**: Ubuntu, macOS (Windows not included currently but may be reintroduced)
 - **Python**: 3.10, 3.11, 3.12
 - **Features**: Tests compatibility across different operating systems
.github/scripts/setup_test_data.py
CHANGED
@@ -142,14 +142,17 @@ def create_allow_deny_lists():
 def create_ocr_output():
     """Create dummy OCR output CSV."""
     ocr_data = {
-        "
-        "page_number": [1, 2, 3],
+        "page": [1, 2, 3],
         "text": [
             "This is page 1 content with some text",
             "This is page 2 content with different text",
             "This is page 3 content with more text",
         ],
-        "
+        "left": [0.1, 0.3, 0.5],
+        "top": [0.95, 0.92, 0.88],
+        "width": [0.05, 0.02, 0.02],
+        "height": [0.01, 0.02, 0.02],
+        "line": [1, 2, 3],
     }
     df = pd.DataFrame(ocr_data)
     df.to_csv(
.github/workflows/ci.yml
CHANGED
@@ -2,12 +2,18 @@ name: CI/CD Pipeline
 
 on:
   push:
-    branches: [ main
+    branches: [ main ]
   pull_request:
-    branches: [ main
+    branches: [ main ]
-  schedule:
-    # Run tests daily at 2 AM UTC
-    - cron: '0 2 * * *'
+  #schedule:
+    # Run tests daily at 2 AM UTC
+    # - cron: '0 2 * * *'
+
+permissions:
+  contents: read
+  actions: read
+  pull-requests: write
+  issues: write
 
 env:
   PYTHON_VERSION: "3.11"

@@ -38,7 +44,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [3.
+        python-version: [3.11, 3.12, 3.13]
 
     steps:
     - uses: actions/checkout@v4

@@ -180,9 +186,9 @@ jobs:
         python -m pip install --upgrade pip
         pip install safety bandit
 
-    - name: Run safety
+    - name: Run safety scan
       run: |
-        safety
+        safety scan -r requirements.txt
 
     - name: Run bandit security check
       run: |
.github/workflows/multi-os-test.yml
CHANGED
@@ -2,23 +2,27 @@ name: Multi-OS Test
 
 on:
   push:
-    branches: [ main
+    branches: [ main ]
   pull_request:
-    branches: [ main
+    branches: [ main ]
+
+permissions:
+  contents: read
+  actions: read
 
 jobs:
   test:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        os: [ubuntu-latest,
+        os: [ubuntu-latest, macos-latest] # windows-latest removed for now as I have not been able to install tesseract on Windows using this method
         python-version: ["3.10", "3.11", "3.12"]
         exclude:
           # Exclude some combinations to reduce CI time
-          - os: windows-latest
-            python-version: "3.10"
+          #- os: windows-latest
+          #  python-version: "3.10"
           - os: macos-latest
-            python-version: "3.
+            python-version: "3.11"
 
     steps:
     - uses: actions/checkout@v4
.github/workflows/simple-test.yml
CHANGED
@@ -6,6 +6,10 @@ on:
   pull_request:
     branches: [ main, dev ]
 
+permissions:
+  contents: read
+  actions: read
+
 jobs:
   test:
     runs-on: ubuntu-latest
.github/workflows/test.yml
CHANGED
@@ -6,6 +6,11 @@ on:
   pull_request:
     branches: [ main, dev ]
 
+permissions:
+  contents: read
+  actions: read
+  pull-requests: write
+
 jobs:
   test:
     runs-on: ubuntu-latest
.gitignore
CHANGED
@@ -29,3 +29,11 @@ cdk.context.json
 .quarto/*
 /.quarto/
 /_site/
+test/config/*
+test/feedback/*
+test/input/*
+test/logs/*
+test/output/*
+test/tmp/*
+test/usage/*
+.ruff_cache/*
cdk/cdk_functions.py
CHANGED
@@ -856,14 +856,14 @@ def check_for_secret(secret_name: str, secret_value: dict = ""):
     try:
         # Try to get the secret. If it doesn't exist, a ResourceNotFoundException will be raised.
        secret_value = secretsmanager_client.get_secret_value(SecretId=secret_name)
-        print(
+        print("Secret already exists.")
        return True, secret_value
     except secretsmanager_client.exceptions.ResourceNotFoundException:
         print("Secret not found")
         return False, {}
     except Exception as e:
         # Handle other potential exceptions during the get operation
-        print(f"Error checking for secret
+        print(f"Error checking for secret: {e}")
         return False, {}
test/run_tests.py
CHANGED
@@ -12,7 +12,7 @@ import sys
 # Add the parent directory to the path so we can import the test module
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
-from test
+from test import run_all_tests
 
 if __name__ == "__main__":
     print("Starting CLI Redaction Test Suite...")
test/test.py
CHANGED
@@ -1,11 +1,10 @@
 import os
 import shutil
 import subprocess
-import tempfile
-import unittest
 import sys
+import tempfile
 import threading
-import
+import unittest
 from typing import List, Optional

@@ -893,35 +892,40 @@ class TestGUIApp(unittest.TestCase):
         cls.app_path = os.path.join(
             os.path.dirname(os.path.dirname(__file__)), "app.py"
         )
 
         # Verify app.py exists
         if not os.path.isfile(cls.app_path):
             raise FileNotFoundError(f"App file not found: {cls.app_path}")
 
         print(f"GUI test setup complete. App: {cls.app_path}")
 
     def test_app_import_and_initialization(self):
         """Test: Import app.py and check if the Gradio app object is created successfully."""
         print("\n=== Testing GUI app import and initialization ===")
 
         try:
             # Add the parent directory to the path so we can import app
             parent_dir = os.path.dirname(os.path.dirname(__file__))
             if parent_dir not in sys.path:
                 sys.path.insert(0, parent_dir)
 
             # Import the app module
             import app
 
             # Check if the app object exists and is a Gradio Blocks object
-            self.assertTrue(
+            self.assertTrue(
+                hasattr(app, "app"), "App object should exist in the module"
+            )
 
             # Check if it's a Gradio Blocks instance
             import gradio as gr
 
+            self.assertIsInstance(
+                app.app, gr.Blocks, "App should be a Gradio Blocks instance"
+            )
 
             print("✅ GUI app import and initialization passed")
 
         except ImportError as e:
             error_msg = f"Failed to import app module: {e}"
             if "gradio_image_annotation" in str(e):

@@ -935,41 +939,40 @@ class TestGUIApp(unittest.TestCase):
     def test_app_launch_headless(self):
         """Test: Launch the app in headless mode to verify it starts without errors."""
         print("\n=== Testing GUI app launch in headless mode ===")
 
         try:
             # Add the parent directory to the path
             parent_dir = os.path.dirname(os.path.dirname(__file__))
             if parent_dir not in sys.path:
                 sys.path.insert(0, parent_dir)
 
             # Import the app module
 
             import app
 
             # Set up a flag to track if the app launched successfully
             app_launched = threading.Event()
             launch_error = None
 
             def launch_app():
                 try:
                     # Launch the app in headless mode with a short timeout
                     app.app.launch(
                         show_error=True,
                         inbrowser=False,  # Don't open browser
-                        server_port=0,
-                        quiet=True,
-                        prevent_thread_lock=True  # Don't block the main thread
+                        server_port=0,  # Use any available port
+                        quiet=True,  # Suppress output
+                        prevent_thread_lock=True,  # Don't block the main thread
                     )
                     app_launched.set()
-                except Exception
-                    launch_error = e
+                except Exception:
                     app_launched.set()
 
             # Start the app in a separate thread
             launch_thread = threading.Thread(target=launch_app)
             launch_thread.daemon = True
             launch_thread.start()
 
             # Wait for the app to launch (with timeout)
             if app_launched.wait(timeout=10):  # 10 second timeout
                 if launch_error:

@@ -978,7 +981,7 @@ class TestGUIApp(unittest.TestCase):
                 print("✅ GUI app launch in headless mode passed")
             else:
                 self.fail("App launch timed out after 10 seconds")
 
         except Exception as e:
             error_msg = f"Unexpected error during app launch test: {e}"
             if "gradio_image_annotation" in str(e):

@@ -990,33 +993,39 @@ class TestGUIApp(unittest.TestCase):
     def test_app_configuration_loading(self):
         """Test: Verify that the app can load its configuration without errors."""
         print("\n=== Testing GUI app configuration loading ===")
 
         try:
             # Add the parent directory to the path
             parent_dir = os.path.dirname(os.path.dirname(__file__))
             if parent_dir not in sys.path:
                 sys.path.insert(0, parent_dir)
 
-            # Import the app module
-            import app
+            # Import the app module (not needed?)
+            # import app
 
             # Check if key configuration variables are accessible
             # These should be imported from tools.config
             from tools.config import (
+                DEFAULT_LANGUAGE,
                 GRADIO_SERVER_PORT,
                 MAX_FILE_SIZE,
-                PII_DETECTION_MODELS
+                PII_DETECTION_MODELS,
             )
 
             # Verify these are not None/empty
-            self.assertIsNotNone(
+            self.assertIsNotNone(
+                GRADIO_SERVER_PORT, "GRADIO_SERVER_PORT should be configured"
+            )
             self.assertIsNotNone(MAX_FILE_SIZE, "MAX_FILE_SIZE should be configured")
-            self.assertIsNotNone(
+            self.assertIsNotNone(
+                DEFAULT_LANGUAGE, "DEFAULT_LANGUAGE should be configured"
+            )
+            self.assertIsNotNone(
+                PII_DETECTION_MODELS, "PII_DETECTION_MODELS should be configured"
+            )
 
             print("✅ GUI app configuration loading passed")
 
         except ImportError as e:
             error_msg = f"Failed to import configuration: {e}"
             if "gradio_image_annotation" in str(e):

@@ -1048,11 +1057,11 @@ def run_all_tests():
     # Create test suite
     loader = unittest.TestLoader()
     suite = unittest.TestSuite()
 
     # Add CLI tests
     cli_suite = loader.loadTestsFromTestCase(TestCLIRedactExamples)
     suite.addTests(cli_suite)
 
     # Add GUI tests
     gui_suite = loader.loadTestsFromTestCase(TestGUIApp)
     suite.addTests(gui_suite)
test/test_gui_only.py
CHANGED
@@ -8,9 +8,8 @@ Run this script to verify that the Gradio interface can be imported and initiali
 
 import os
 import sys
-import unittest
 import threading
-import
+import unittest
 
 # Add the parent directory to the path so we can import the app
 parent_dir = os.path.dirname(os.path.dirname(__file__))

@@ -25,30 +24,35 @@ class TestGUIAppOnly(unittest.TestCase):
     def setUpClass(cls):
         """Set up test environment for GUI tests."""
         cls.app_path = os.path.join(parent_dir, "app.py")
 
         # Verify app.py exists
         if not os.path.isfile(cls.app_path):
             raise FileNotFoundError(f"App file not found: {cls.app_path}")
 
         print(f"GUI test setup complete. App: {cls.app_path}")
 
     def test_app_import_and_initialization(self):
         """Test: Import app.py and check if the Gradio app object is created successfully."""
         print("\n=== Testing GUI app import and initialization ===")
 
         try:
             # Import the app module
             import app
 
             # Check if the app object exists and is a Gradio Blocks object
-            self.assertTrue(
+            self.assertTrue(
+                hasattr(app, "app"), "App object should exist in the module"
+            )
 
             # Check if it's a Gradio Blocks instance
             import gradio as gr
 
+            self.assertIsInstance(
+                app.app, gr.Blocks, "App should be a Gradio Blocks instance"
+            )
 
             print("✅ GUI app import and initialization passed")
 
         except ImportError as e:
             error_msg = f"Failed to import app module: {e}"
             if "gradio_image_annotation" in str(e):

@@ -62,36 +66,35 @@ class TestGUIAppOnly(unittest.TestCase):
     def test_app_launch_headless(self):
         """Test: Launch the app in headless mode to verify it starts without errors."""
         print("\n=== Testing GUI app launch in headless mode ===")
 
         try:
             # Import the app module
 
             import app
 
             # Set up a flag to track if the app launched successfully
             app_launched = threading.Event()
             launch_error = None
 
             def launch_app():
                 try:
                     # Launch the app in headless mode with a short timeout
                     app.app.launch(
                         show_error=True,
                         inbrowser=False,  # Don't open browser
-                        server_port=0,
-                        quiet=True,
-                        prevent_thread_lock=True  # Don't block the main thread
+                        server_port=0,  # Use any available port
+                        quiet=True,  # Suppress output
+                        prevent_thread_lock=True,  # Don't block the main thread
                     )
                     app_launched.set()
-                except Exception
-                    launch_error = e
+                except Exception:
                     app_launched.set()
 
             # Start the app in a separate thread
             launch_thread = threading.Thread(target=launch_app)
             launch_thread.daemon = True
             launch_thread.start()
 
             # Wait for the app to launch (with timeout)
             if app_launched.wait(timeout=10):  # 10 second timeout
                 if launch_error:

@@ -100,7 +103,7 @@ class TestGUIAppOnly(unittest.TestCase):
                 print("✅ GUI app launch in headless mode passed")
             else:
                 self.fail("App launch timed out after 10 seconds")
 
         except Exception as e:
             error_msg = f"Unexpected error during app launch test: {e}"
             if "gradio_image_annotation" in str(e):

@@ -112,28 +115,34 @@ class TestGUIAppOnly(unittest.TestCase):
     def test_app_configuration_loading(self):
         """Test: Verify that the app can load its configuration without errors."""
         print("\n=== Testing GUI app configuration loading ===")
 
         try:
-            # Import the app module
-            import app
+            # Import the app module (not necessary here?)
+            # import app
 
             # Check if key configuration variables are accessible
             # These should be imported from tools.config
             from tools.config import (
+                DEFAULT_LANGUAGE,
                 GRADIO_SERVER_PORT,
                 MAX_FILE_SIZE,
-                PII_DETECTION_MODELS
+                PII_DETECTION_MODELS,
             )
 
             # Verify these are not None/empty
-            self.assertIsNotNone(
+            self.assertIsNotNone(
+                GRADIO_SERVER_PORT, "GRADIO_SERVER_PORT should be configured"
+            )
             self.assertIsNotNone(MAX_FILE_SIZE, "MAX_FILE_SIZE should be configured")
-            self.assertIsNotNone(
+            self.assertIsNotNone(
+                DEFAULT_LANGUAGE, "DEFAULT_LANGUAGE should be configured"
+            )
+            self.assertIsNotNone(
+                PII_DETECTION_MODELS, "PII_DETECTION_MODELS should be configured"
+            )
 
             print("✅ GUI app configuration loading passed")
 
         except ImportError as e:
             error_msg = f"Failed to import configuration: {e}"
             if "gradio_image_annotation" in str(e):
tools/aws_functions.py
CHANGED
@@ -10,6 +10,7 @@ from tools.config import (
     RUN_AWS_FUNCTIONS,
     SAVE_LOGS_TO_CSV,
 )
+from tools.secure_path_utils import secure_join
 
 PandasDataFrame = Type[pd.DataFrame]

@@ -90,7 +91,7 @@ def download_folder_from_s3(
     for obj in response.get("Contents", []):
         # Extract object key and construct local file path
         object_key = obj["Key"]
-        local_file_path = os.path.join(
+        local_file_path = secure_join(
             local_folder, os.path.relpath(object_key, s3_folder)
         )

@@ -143,8 +144,8 @@ def download_files_from_s3(
     print("Found filenames in AWS folder: ", filenames)
 
     for filename in filenames:
-        object_key = os.path.join(s3_folder, filename)
-        local_file_path = os.path.join(local_folder, filename)
+        object_key = secure_join(s3_folder, filename)
+        local_file_path = secure_join(local_folder, filename)
 
         # Create directories if necessary
         os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
tools/aws_textract.py
CHANGED
@@ -16,6 +16,7 @@ from tools.config import (
     RUN_AWS_FUNCTIONS,
 )
 from tools.custom_image_analyser_engine import CustomImageRecognizerResult, OCRResult
+from tools.secure_path_utils import secure_file_read
 
 
 def extract_textract_metadata(response: object):

@@ -478,8 +479,8 @@ def load_and_convert_textract_json(
         log_files_output_paths.append(textract_json_file_path)
 
     try:
+        json_content = secure_file_read(textract_json_file_path, encoding="utf-8")
+        textract_data = json.loads(json_content)
     except json.JSONDecodeError:
         print("Error: Failed to parse Textract JSON file. Returning empty data.")
         return {}, True, log_files_output_paths  # Indicate failure
tools/config.py
CHANGED
@@ -382,7 +382,7 @@ CHOSEN_LOCAL_OCR_MODEL = get_or_create_env_var(
 )  # Choose between "tesseract", "hybrid", and "paddle". "paddle" will only return whole line text extraction, and so will only work for OCR, not redaction. "hybrid" is a combination of the two - first pass through the redactions will be done with Tesseract, and then a second pass will be done with PaddleOCR on words with low confidence.
 
 PREPROCESS_LOCAL_OCR_IMAGES = get_or_create_env_var(
-    "PREPROCESS_LOCAL_OCR_IMAGES", "
+    "PREPROCESS_LOCAL_OCR_IMAGES", "False"
 )  # Whether to try and preprocess images before extracting text. NOTE: I have found in testing that this doesn't necessarily imporove results, and greatly slows down extraction.
 
 # Entities for redaction
tools/custom_csvlogger.py
CHANGED
@@ -2,7 +2,6 @@ from __future__ import annotations
 
 import csv
 import os
-import re
 import time
 import uuid
 from collections.abc import Sequence

@@ -105,10 +104,17 @@ class CSVLogger_custom(FlaggingCallback):
             self.dataset_filepath = self.flagging_dir / self.dataset_file_name
         elif dataset_files:
             try:
-                latest_file = max(
-                    dataset_files,
-                )
-                latest_num = int(re.findall(r"\d+", latest_file.stem)[0])
+                from tools.secure_regex_utils import (
+                    safe_extract_latest_number_from_filename,
+                )
+
+                latest_file = max(
+                    dataset_files,
+                    key=lambda f: safe_extract_latest_number_from_filename(f.stem) or 0,
+                )
+                latest_num = (
+                    safe_extract_latest_number_from_filename(latest_file.stem) or 0
+                )
 
                 with open(latest_file, newline="", encoding="utf-8") as csvfile:
                     reader = csv.reader(csvfile)
tools/custom_image_analyser_engine.py
CHANGED
@@ -524,12 +524,9 @@ class CustomImageAnalyzerEngine:
         # Remove or replace invalid filename characters
         # Windows: < > : " | ? * \ /
         # Unix: / (forward slash)
-
-        invalid_chars = r'[<>:"|?*\\/\x00-\x1f\x7f-\x9f]'
-        sanitized = re.sub(invalid_chars, "_", text)
+        from tools.secure_regex_utils import safe_sanitize_text
 
-
-        sanitized = re.sub(r"_+", "_", sanitized)
+        sanitized = safe_sanitize_text(text)
 
         # Remove leading/trailing underscores and spaces
         sanitized = sanitized.strip("_ ")
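Note: tools/secure_regex_utils.py is likewise not shown in this view. From call sites such as safe_sanitize_text here, safe_extract_latest_number_from_filename in custom_csvlogger.py, and safe_extract_page_number_from_path in file_conversion.py, it appears to centralise the regex handling that CodeQL flags (unbounded patterns applied to untrusted input). A rough sketch of two of the helpers under that assumption — the bounded patterns and signatures are inferred from usage, not copied from the real module:

import re
from typing import Optional

# Pre-compiled, length-bounded patterns avoid rebuilding regexes per call
# and cap how much input a single match can consume (hypothetical patterns).
_NUMBER_PATTERN = re.compile(r"\d{1,9}")
_PAGE_PATTERN = re.compile(r"_(\d{1,9})\.png$")


def safe_extract_latest_number_from_filename(stem: str) -> Optional[int]:
    """Return the first run of digits in a filename stem, or None."""
    match = _NUMBER_PATTERN.search(stem)
    return int(match.group()) if match else None


def safe_extract_page_number_from_path(path: str) -> Optional[int]:
    """Return the trailing page number from image paths like 'doc.pdf_12.png'."""
    match = _PAGE_PATTERN.search(path)
    return int(match.group(1)) if match else None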
tools/data_anonymise.py
CHANGED
@@ -1,6 +1,5 @@
 import base64
 import os
-import re
 import secrets
 import time
 import unicodedata

@@ -20,7 +19,7 @@ from presidio_analyzer import (
     AnalyzerEngine,
     BatchAnalyzerEngine,
     DictAnalyzerResult,
-    RecognizerResult
+    RecognizerResult,
 )
 from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
 from presidio_anonymizer.entities import OperatorConfig

@@ -57,6 +56,7 @@ from tools.load_spacy_model_custom_recognisers import (
 
 # Use custom version of analyze_dict to be able to track progress
 from tools.presidio_analyzer_custom import analyze_dict
+from tools.secure_path_utils import secure_file_write, secure_join
 
 if DO_INITIAL_TABULAR_DATA_CLEAN == "True":
     DO_INITIAL_TABULAR_DATA_CLEAN = True

@@ -406,22 +406,21 @@ def handle_docx_anonymisation(
     base_name = os.path.basename(file_path)
     file_name_without_ext = os.path.splitext(base_name)[0]
 
-    output_docx_path = os.path.join(
+    output_docx_path = secure_join(
         output_folder, f"{file_name_without_ext}_redacted.docx"
     )
-    log_file_path = os.path.join(
+    log_file_path = secure_join(
         output_folder, f"{file_name_without_ext}_redacted_log.txt"
     )
 
-    output_xlsx_path = os.path.join(
+    output_xlsx_path = secure_join(
         output_folder, f"{file_name_without_ext}_redacted.csv"
     )
 
     anonymised_df.to_csv(output_xlsx_path, encoding="utf-8-sig", index=None)
     doc.save(output_docx_path)
 
-        f.write(decision_log)
+    secure_file_write(log_file_path, decision_log, encoding="utf-8-sig")
 
     return output_docx_path, log_file_path, output_xlsx_path, comprehend_query_number

@@ -542,8 +541,6 @@ def anonymise_files_with_open_text(
         print(
             "Connecting to Comprehend using AWS access key and secret keys from textboxes."
         )
-        print("aws_access_key_textbox:", aws_access_key_textbox)
-        print("aws_secret_access_key:", aws_secret_key_textbox)
         comprehend_client = boto3.client(
             "comprehend",
             aws_access_key_id=aws_access_key_textbox,

@@ -801,7 +798,10 @@ def anonymise_files_with_open_text(
         + "\n\nGo to to the Redaction settings tab to see redaction logs. Please give feedback on the results below to help improve this app."
     )
 
+    from tools.secure_regex_utils import safe_remove_leading_newlines
+
+    out_message_out = safe_remove_leading_newlines(out_message_out)
+    out_message_out = out_message_out.lstrip(". ")
 
     return (
         out_message_out,

@@ -1004,8 +1004,7 @@ def tabular_anonymise_wrapper_func(
                 + excel_sheet_name
                 + "_decision_process_output.txt"
             )
-                f.write(decision_process_output_str)
+            secure_file_write(decision_process_log_output_file, decision_process_output_str)
 
         else:
             anon_export_file_name = (

@@ -1016,8 +1015,7 @@ def tabular_anonymise_wrapper_func(
             decision_process_log_output_file = (
                 anon_export_file_name + "_decision_process_output.txt"
             )
-                f.write(decision_process_output_str)
+            secure_file_write(decision_process_log_output_file, decision_process_output_str)
 
         out_file_paths.append(anon_export_file_name)
         log_files_output_paths.append(decision_process_log_output_file)

@@ -1296,11 +1294,9 @@ def anonymise_script(
     redact_config = {"DEFAULT": OperatorConfig("redact")}
     hash_config = {"DEFAULT": OperatorConfig("hash")}
     mask_config = {
-        "DEFAULT": OperatorConfig(
-            "masking_char": "*",
-            "from_end": True
-        })
+        "DEFAULT": OperatorConfig(
+            "mask", {"masking_char": "*", "chars_to_mask": 100, "from_end": True}
+        )
     }
     people_encrypt_config = {
         "PERSON": OperatorConfig("encrypt", {"key": key_string})

@@ -1343,7 +1339,8 @@ def anonymise_script(
     combined_config = {**chosen_mask_config}
 
     anonymizer_results = batch_anonymizer.anonymize_dict(
-        analyzer_results, operators=combined_config
+        analyzer_results, operators=combined_config
+    )
 
     scrubbed_df = pd.DataFrame(anonymizer_results)
tools/file_conversion.py
CHANGED
@@ -34,6 +34,7 @@ from tools.config import (
     TEXTRACT_TEXT_EXTRACT_OPTION,
 )
 from tools.helper_functions import get_file_name_without_type, read_file
+from tools.secure_path_utils import secure_file_read, secure_join
 
 # from tools.aws_textract import load_and_convert_textract_json

@@ -143,8 +144,8 @@ def process_single_page_for_image_conversion(
     if create_images is True:
         try:
             # Construct the full output directory path
-            image_output_dir = os.path.join(os.getcwd(), input_folder)
-            out_path = os.path.join(
+            image_output_dir = secure_join(os.getcwd(), input_folder)
+            out_path = secure_join(
                 image_output_dir, f"{os.path.basename(pdf_path)}_{page_num}.png"
             )
             os.makedirs(os.path.dirname(out_path), exist_ok=True)

@@ -914,8 +915,8 @@ def prepare_image_or_pdf(
     if (file_extension in [".json"]) & (prepare_for_review is True):
         if isinstance(file_path, str):
+            json_content = secure_file_read(file_path)
+            all_annotations_object = json.loads(json_content)
         else:
             # Assuming file_path is a NamedString or similar
             all_annotations_object = json.loads(

@@ -936,7 +937,7 @@ def prepare_image_or_pdf(
     else:
         output_textract_json_file_name = file_path_without_ext + ".json"
 
-    out_textract_path = os.path.join(
+    out_textract_path = secure_join(
         output_folder, output_textract_json_file_name
     )

@@ -956,7 +957,7 @@ def prepare_image_or_pdf(
     # if not file_path.endswith("_ocr_results_with_words.json"): output_ocr_results_with_words_json_file_name = file_path_without_ext + "_ocr_results_with_words.json"
     # else: output_ocr_results_with_words_json_file_name = file_path_without_ext + ".json"
 
-    out_ocr_results_with_words_path = os.path.join(
+    out_ocr_results_with_words_path = secure_join(
         output_folder, output_ocr_results_with_words_json_file_name
     )

@@ -1026,10 +1027,12 @@ def prepare_image_or_pdf(
     if all_annotations_object:
 
         # Get list of page numbers
+        from tools.secure_regex_utils import safe_extract_page_number_from_path
+
         image_file_paths_pages = [
+            safe_extract_page_number_from_path(s)
             for s in image_file_paths
-            if
+            if safe_extract_page_number_from_path(s) is not None
         ]
         image_file_paths_pages = [int(i) for i in image_file_paths_pages]

@@ -1046,15 +1049,19 @@ def prepare_image_or_pdf(
             try:
                 if not annotation:
                     annotation = {"image": "", "boxes": []}
-                    annotation_page_number =
-                    )
+                    annotation_page_number = (
+                        safe_extract_page_number_from_path(image_file_path)
+                    )
+                    if annotation_page_number is None:
+                        continue
                 else:
-                    annotation_page_number =
-                    )
+                    annotation_page_number = (
+                        safe_extract_page_number_from_path(
+                            annotation["image"]
+                        )
+                    )
+                    if annotation_page_number is None:
+                        continue
             except Exception as e:
                 print("Extracting page number from image failed due to:", e)
                 annotation_page_number = 0

@@ -1110,7 +1117,7 @@ def prepare_image_or_pdf(
     if file_extension in [".zip"]:
 
         # Assume it's a Textract response object. Copy it to the output folder so it can be used later.
-        out_folder = os.path.join(
+        out_folder = secure_join(
             output_folder, file_path_without_ext + "_textract.json"
         )

@@ -1125,7 +1132,7 @@ def prepare_image_or_pdf(
         json_filename = json_files[0]
 
         # Extract the JSON file to the same directory as the ZIP file
-        extracted_path = os.path.join(
+        extracted_path = secure_join(
             os.path.dirname(file_path), json_filename
         )
         zip_ref.extract(json_filename, os.path.dirname(file_path))
tools/file_redaction.py
CHANGED
|
@@ -2,7 +2,6 @@ import copy
|
|
| 2 |
import io
|
| 3 |
import json
|
| 4 |
import os
|
| 5 |
-
import re
|
| 6 |
import time
|
| 7 |
from collections import defaultdict # For efficient grouping
|
| 8 |
from typing import Any, Dict, List, Optional, Tuple
|
|
@@ -94,6 +93,7 @@ from tools.load_spacy_model_custom_recognisers import (
|
|
| 94 |
nlp_analyser,
|
| 95 |
score_threshold,
|
| 96 |
)
|
|
|
|
| 97 |
|
| 98 |
ImageFile.LOAD_TRUNCATED_IMAGES = LOAD_TRUNCATED_IMAGES.lower() == "true"
|
| 99 |
if not MAX_IMAGE_PIXELS:
|
|
@@ -130,11 +130,10 @@ def sum_numbers_before_seconds(string: str):
|
|
| 130 |
The sum of all numbers before 'seconds' in the string.
|
| 131 |
"""
|
| 132 |
|
| 133 |
-
# Extract numbers before 'seconds' using
|
| 134 |
-
|
| 135 |
|
| 136 |
-
|
| 137 |
-
numbers = [float(num.split()[0]) for num in numbers]
|
| 138 |
|
| 139 |
# Sum up the extracted numbers
|
| 140 |
sum_of_numbers = round(sum(numbers), 1)
|
|
@@ -445,7 +444,9 @@ def choose_and_run_redactor(
|
|
| 445 |
elif out_message:
|
| 446 |
combined_out_message = combined_out_message + "\n" + out_message
|
| 447 |
|
| 448 |
-
|
|
|
|
|
|
|
| 449 |
|
| 450 |
end_message = "\n\nPlease review and modify the suggested redaction outputs on the 'Review redactions' tab of the app (you can find this under the introduction text at the top of the page)."
|
| 451 |
|
|
@@ -1304,8 +1305,9 @@ def choose_and_run_redactor(
|
|
| 1304 |
output_folder + pdf_file_name_without_ext + "_textract_metadata.txt"
|
| 1305 |
)
|
| 1306 |
|
| 1307 |
-
|
| 1308 |
-
|
|
|
|
| 1309 |
|
| 1310 |
# Add the request metadata to the log outputs if not there already
|
| 1311 |
if all_textract_request_metadata_file_path not in log_files_output_paths:
|
|
@@ -2785,10 +2787,10 @@ def redact_image_pdf(
|
|
| 2785 |
if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
|
| 2786 |
if original_textract_data != textract_data:
|
| 2787 |
# Write the updated existing textract data back to the JSON file
|
| 2788 |
-
|
| 2789 |
-
|
| 2790 |
-
|
| 2791 |
-
|
| 2792 |
|
| 2793 |
if textract_json_file_path not in log_files_output_paths:
| 2794 |     log_files_output_paths.append(textract_json_file_path)

@@ -2848,10 +2850,10 @@ def redact_image_pdf(
| 2848 | if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
| 2849 |     # Write the updated existing textract data back to the JSON file
| 2850 |     if original_textract_data != textract_data:
| 2851 | -
| 2852 | -
| 2853 | -
| 2854 | -
| 2855 |
| 2856 |     if textract_json_file_path not in log_files_output_paths:
| 2857 |         log_files_output_paths.append(textract_json_file_path)

@@ -2907,10 +2909,10 @@ def redact_image_pdf(
| 2907 | # Write the updated existing textract data back to the JSON file
| 2908 |
| 2909 | if original_textract_data != textract_data:
| 2910 | -
| 2911 | -
| 2912 | -
| 2913 | -
| 2914 |
| 2915 | if textract_json_file_path not in log_files_output_paths:
| 2916 |     log_files_output_paths.append(textract_json_file_path)

| 2 | import io
| 3 | import json
| 4 | import os
| 5 | import time
| 6 | from collections import defaultdict  # For efficient grouping
| 7 | from typing import Any, Dict, List, Optional, Tuple

| 93 | nlp_analyser,
| 94 | score_threshold,
| 95 | )
| 96 | + from tools.secure_path_utils import secure_file_write
| 97 |
| 98 | ImageFile.LOAD_TRUNCATED_IMAGES = LOAD_TRUNCATED_IMAGES.lower() == "true"
| 99 | if not MAX_IMAGE_PIXELS:

| 130 | The sum of all numbers before 'seconds' in the string.
| 131 | """
| 132 |
| 133 | + # Extract numbers before 'seconds' using secure regex
| 134 | + from tools.secure_regex_utils import safe_extract_numbers_with_seconds
| 135 | +
| 136 | + numbers = safe_extract_numbers_with_seconds(string)
| 137 |
| 138 | # Sum up the extracted numbers
| 139 | sum_of_numbers = round(sum(numbers), 1)

| 444 | elif out_message:
| 445 |     combined_out_message = combined_out_message + "\n" + out_message
| 446 |
| 447 | + from tools.secure_regex_utils import safe_remove_leading_newlines
| 448 | +
| 449 | + combined_out_message = safe_remove_leading_newlines(combined_out_message)
| 450 |
| 451 | end_message = "\n\nPlease review and modify the suggested redaction outputs on the 'Review redactions' tab of the app (you can find this under the introduction text at the top of the page)."
| 452 |

| 1305 |     output_folder + pdf_file_name_without_ext + "_textract_metadata.txt"
| 1306 | )
| 1307 |
| 1308 | + secure_file_write(
| 1309 | +     all_textract_request_metadata_file_path, all_request_metadata_str
| 1310 | + )
| 1311 |
| 1312 | # Add the request metadata to the log outputs if not there already
| 1313 | if all_textract_request_metadata_file_path not in log_files_output_paths:

| 2787 | if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
| 2788 |     if original_textract_data != textract_data:
| 2789 |         # Write the updated existing textract data back to the JSON file
| 2790 | +       secure_file_write(
| 2791 | +           textract_json_file_path,
| 2792 | +           json.dumps(textract_data, separators=(",", ":")),
| 2793 | +       )
| 2794 |
| 2795 | if textract_json_file_path not in log_files_output_paths:
| 2796 |     log_files_output_paths.append(textract_json_file_path)

| 2850 | if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
| 2851 |     # Write the updated existing textract data back to the JSON file
| 2852 |     if original_textract_data != textract_data:
| 2853 | +       secure_file_write(
| 2854 | +           textract_json_file_path,
| 2855 | +           json.dumps(textract_data, separators=(",", ":")),
| 2856 | +       )
| 2857 |
| 2858 |     if textract_json_file_path not in log_files_output_paths:
| 2859 |         log_files_output_paths.append(textract_json_file_path)

| 2909 | # Write the updated existing textract data back to the JSON file
| 2910 |
| 2911 | if original_textract_data != textract_data:
| 2912 | +   secure_file_write(
| 2913 | +       textract_json_file_path,
| 2914 | +       json.dumps(textract_data, separators=(",", ":")),
| 2915 | +   )
| 2916 |
| 2917 | if textract_json_file_path not in log_files_output_paths:
| 2918 |     log_files_output_paths.append(textract_json_file_path)
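Across these hunks the pattern is the same: what appear to have been direct open()/f.write() calls (the removed lines are not recoverable from the diff viewer) are replaced by secure_file_write, which resolves the target path and creates any missing parent directories before writing. A minimal sketch of the Textract JSON write as it now reads, with illustrative values; the helper comes from the new tools/secure_path_utils.py further down:

import json

from tools.secure_path_utils import secure_file_write

# Illustrative stand-ins for the state built up inside redact_image_pdf
textract_data = {"Blocks": [{"BlockType": "PAGE", "Page": 1}]}
textract_json_file_path = "output/example_textract.json"

# Compact separators keep the JSON small; secure_file_write creates
# the output/ directory if it does not already exist.
secure_file_write(
    textract_json_file_path,
    json.dumps(textract_data, separators=(",", ":")),
)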
tools/find_duplicate_pages.py
CHANGED

@@ -521,8 +521,9 @@ def clean_and_stem_text_series(df: pd.DataFrame, column: str):
| 521 | """
| 522 |
| 523 | def _clean_text(raw_text):
| 524 | -
| 525 | -
| 526 |     clean = " ".join(clean.split())
| 527 |     # Join the cleaned words back into a string
| 528 |     return clean

@@ -1271,9 +1272,11 @@ def apply_whole_page_redactions_from_list(
| 1271 |
| 1272 | list_whole_pages_to_redact = []
| 1273 | for annotation in new_annotations_with_bounding_boxes:
| 1274 | -
| 1275 | -
| 1276 | -
| 1277 |     list_whole_pages_to_redact.append(page)
| 1278 | else:
| 1279 |     print(

| 521 | """
| 522 |
| 523 | def _clean_text(raw_text):
| 524 | +   from tools.secure_regex_utils import safe_clean_text
| 525 | +
| 526 | +   clean = safe_clean_text(raw_text, remove_html=True)
| 527 |     clean = " ".join(clean.split())
| 528 |     # Join the cleaned words back into a string
| 529 |     return clean

| 1272 |
| 1273 | list_whole_pages_to_redact = []
| 1274 | for annotation in new_annotations_with_bounding_boxes:
| 1275 | +   from tools.secure_regex_utils import safe_extract_page_number_from_path
| 1276 | +
| 1277 | +   page_num = safe_extract_page_number_from_path(annotation["image"])
| 1278 | +   if page_num is not None:
| 1279 | +       page = page_num + 1
| 1280 |         list_whole_pages_to_redact.append(page)
| 1281 | else:
| 1282 |     print(
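For reference, safe_extract_page_number_from_path (defined in the new tools/secure_regex_utils.py further down) anchors on a trailing _<digits>.png, and the caller adds 1 to convert the extracted index into the 1-based page numbering used by the redaction list. A short sketch with an illustrative path:

from tools.secure_regex_utils import safe_extract_page_number_from_path

# Illustrative entry; real ones come from new_annotations_with_bounding_boxes
annotation = {"image": "input/example_doc_3.png"}

page_num = safe_extract_page_number_from_path(annotation["image"])  # -> 3
if page_num is not None:
    page = page_num + 1  # 1-based page number, as in the hunk above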
tools/find_duplicate_tabular.py
CHANGED

@@ -1,5 +1,4 @@
| 1 | import os
| 2 | - import re
| 3 | import time
| 4 | from pathlib import Path
| 5 | from typing import Dict, List, Tuple

@@ -19,6 +18,7 @@ from tools.config import (
| 19 | from tools.data_anonymise import initial_clean
| 20 | from tools.helper_functions import OUTPUT_FOLDER, read_file
| 21 | from tools.load_spacy_model_custom_recognisers import nlp
| 22 |
| 23 | if REMOVE_DUPLICATE_ROWS == "True":
| 24 |     REMOVE_DUPLICATE_ROWS = True

@@ -345,9 +345,12 @@ def save_tabular_duplicate_results(
| 345 | original_file_extension = os.path.splitext(original_file)[-1]
| 346 | if original_file_extension in [".xlsx", ".xls"]:
| 347 |
| 348 | -   # Split the string using
| 349 | -
| 350 | -
| 351 |     # The sheet name is the last part after splitting
| 352 |     file_sheet_name = parts[-1]
| 353 |

@@ -430,12 +433,12 @@ def save_tabular_duplicate_results(
| 430 | file_ext = os.path.splitext(file_name)[-1]
| 431 |
| 432 | if file_ext in [".parquet"]:
| 433 | -   output_path =
| 434 |         output_folder, f"{file_base_name}_deduplicated.parquet"
| 435 |     )
| 436 |     df_cleaned.to_parquet(output_path, index=False)
| 437 | else:
| 438 | -   output_path =
| 439 |         output_folder, f"{file_base_name}_deduplicated.csv"
| 440 |     )
| 441 |     df_cleaned.to_csv(

@@ -451,7 +454,7 @@ def save_tabular_duplicate_results(
| 451 | # Create output filename
| 452 | file_base_name = os.path.splitext(os.path.basename(file_path))[0]
| 453 | file_ext = os.path.splitext(file_path)[-1]
| 454 | - output_path =
| 455 |     output_folder, f"{file_base_name}_deduplicated{file_ext}"
| 456 | )
| 457 |

@@ -513,7 +516,7 @@ def remove_duplicate_rows_from_tabular_data(
| 513 | file_stem = os.path.splitext(file_name)[0]
| 514 | file_ext = os.path.splitext(file_name)[-1]
| 515 |
| 516 | - output_path =
| 517 |
| 518 | if file_ext in [".xlsx", ".xls"]:
| 519 |     df_cleaned.to_excel(

| 1 | import os
| 2 | import time
| 3 | from pathlib import Path
| 4 | from typing import Dict, List, Tuple

| 18 | from tools.data_anonymise import initial_clean
| 19 | from tools.helper_functions import OUTPUT_FOLDER, read_file
| 20 | from tools.load_spacy_model_custom_recognisers import nlp
| 21 | + from tools.secure_path_utils import secure_join
| 22 |
| 23 | if REMOVE_DUPLICATE_ROWS == "True":
| 24 |     REMOVE_DUPLICATE_ROWS = True

| 345 | original_file_extension = os.path.splitext(original_file)[-1]
| 346 | if original_file_extension in [".xlsx", ".xls"]:
| 347 |
| 348 | +   # Split the string using secure regex to handle both .xlsx_ and .xls_ delimiters
| 349 | +   from tools.secure_regex_utils import safe_split_filename
| 350 | +
| 351 | +   parts = safe_split_filename(
| 352 | +       os.path.basename(file_name), [".xlsx_", ".xls_"]
| 353 | +   )
| 354 |     # The sheet name is the last part after splitting
| 355 |     file_sheet_name = parts[-1]
| 356 |

| 433 | file_ext = os.path.splitext(file_name)[-1]
| 434 |
| 435 | if file_ext in [".parquet"]:
| 436 | +   output_path = secure_join(
| 437 |         output_folder, f"{file_base_name}_deduplicated.parquet"
| 438 |     )
| 439 |     df_cleaned.to_parquet(output_path, index=False)
| 440 | else:
| 441 | +   output_path = secure_join(
| 442 |         output_folder, f"{file_base_name}_deduplicated.csv"
| 443 |     )
| 444 |     df_cleaned.to_csv(

| 454 | # Create output filename
| 455 | file_base_name = os.path.splitext(os.path.basename(file_path))[0]
| 456 | file_ext = os.path.splitext(file_path)[-1]
| 457 | + output_path = secure_join(
| 458 |     output_folder, f"{file_base_name}_deduplicated{file_ext}"
| 459 | )
| 460 |

| 516 | file_stem = os.path.splitext(file_name)[0]
| 517 | file_ext = os.path.splitext(file_name)[-1]
| 518 |
| 519 | + output_path = secure_join(output_folder, f"{file_stem}_deduplicated{file_ext}")
| 520 |
| 521 | if file_ext in [".xlsx", ".xls"]:
| 522 |     df_cleaned.to_excel(
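The sheet-name recovery relies on safe_split_filename escaping each delimiter with re.escape before splitting, so the literal dots in ".xlsx_" and ".xls_" cannot be reinterpreted as regex wildcards. A quick sketch with a hypothetical combined workbook/sheet name:

import os

from tools.secure_path_utils import secure_join
from tools.secure_regex_utils import safe_split_filename

file_name = "quarterly_report.xlsx_Sheet1"  # hypothetical <workbook>.xlsx_<sheet>

parts = safe_split_filename(os.path.basename(file_name), [".xlsx_", ".xls_"])
file_sheet_name = parts[-1]  # -> "Sheet1"

# For benign components secure_join behaves like os.path.join
output_path = secure_join("output", f"{parts[0]}_deduplicated.csv")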
tools/helper_functions.py
CHANGED

@@ -1,5 +1,4 @@
| 1 | import os
| 2 | - import re
| 3 | import unicodedata
| 4 | from math import ceil
| 5 | from typing import List

@@ -33,6 +32,7 @@ from tools.config import (
| 33 | aws_comprehend_language_choices,
| 34 | textract_language_choices,
| 35 | )
| 36 |
| 37 |
| 38 | def _get_env_list(env_var_name: str) -> List[str]:

@@ -348,7 +348,7 @@ def put_columns_in_df(in_file: List[str]):
| 348 | def check_for_existing_textract_file(
| 349 |     doc_file_name_no_extension_textbox: str, output_folder: str = OUTPUT_FOLDER
| 350 | ):
| 351 | -   textract_output_path =
| 352 |         output_folder, doc_file_name_no_extension_textbox + "_textract.json"
| 353 |     )
| 354 |

@@ -377,7 +377,7 @@ def check_for_relevant_ocr_output_with_words(
| 377 |
| 378 | doc_file_with_ending = doc_file_name_no_extension_textbox + file_ending
| 379 |
| 380 | - local_ocr_output_path =
| 381 |
| 382 | if os.path.exists(local_ocr_output_path):
| 383 |     print("Existing OCR with words analysis output file found.")

@@ -591,7 +591,9 @@ def clean_unicode_text(text: str):
| 591 | # Step 3: Optionally remove non-ASCII characters if needed
| 592 | # This regex removes any remaining non-ASCII characters, if desired.
| 593 | # Comment this line if you want to keep all Unicode characters.
| 594 | -
| 595 |
| 596 | return cleaned_text
| 597 |

@@ -603,7 +605,7 @@ def load_all_output_files(folder_path: str = OUTPUT_FOLDER) -> List[str]:
| 603 | # List all files in the specified folder
| 604 | for filename in os.listdir(folder_path):
| 605 |     # Construct full file path
| 606 | -   full_path =
| 607 |     # Check if it's a file (not a directory)
| 608 |     if os.path.isfile(full_path):
| 609 |         file_paths.append(full_path)

| 1 | import os
| 2 | import unicodedata
| 3 | from math import ceil
| 4 | from typing import List

| 32 | aws_comprehend_language_choices,
| 33 | textract_language_choices,
| 34 | )
| 35 | + from tools.secure_path_utils import secure_join
| 36 |
| 37 |
| 38 | def _get_env_list(env_var_name: str) -> List[str]:

| 348 | def check_for_existing_textract_file(
| 349 |     doc_file_name_no_extension_textbox: str, output_folder: str = OUTPUT_FOLDER
| 350 | ):
| 351 | +   textract_output_path = secure_join(
| 352 |         output_folder, doc_file_name_no_extension_textbox + "_textract.json"
| 353 |     )
| 354 |

| 377 |
| 378 | doc_file_with_ending = doc_file_name_no_extension_textbox + file_ending
| 379 |
| 380 | + local_ocr_output_path = secure_join(output_folder, doc_file_with_ending)
| 381 |
| 382 | if os.path.exists(local_ocr_output_path):
| 383 |     print("Existing OCR with words analysis output file found.")

| 591 | # Step 3: Optionally remove non-ASCII characters if needed
| 592 | # This regex removes any remaining non-ASCII characters, if desired.
| 593 | # Comment this line if you want to keep all Unicode characters.
| 594 | + from tools.secure_regex_utils import safe_remove_non_ascii
| 595 | +
| 596 | + cleaned_text = safe_remove_non_ascii(normalized_text)
| 597 |
| 598 | return cleaned_text
| 599 |

| 605 | # List all files in the specified folder
| 606 | for filename in os.listdir(folder_path):
| 607 |     # Construct full file path
| 608 | +   full_path = secure_join(folder_path, filename)
| 609 |     # Check if it's a file (not a directory)
| 610 |     if os.path.isfile(full_path):
| 611 |         file_paths.append(full_path)
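In clean_unicode_text the regex that stripped non-ASCII characters is now delegated to safe_remove_non_ascii. A minimal sketch, assuming the earlier steps of the function normalise the text with unicodedata (the module imports unicodedata, but the exact normalisation form is not shown in this diff):

import unicodedata

from tools.secure_regex_utils import safe_remove_non_ascii

text = "Café résumé"
normalized_text = unicodedata.normalize("NFKD", text)  # assumed normalisation step
cleaned_text = safe_remove_non_ascii(normalized_text)
print(cleaned_text)  # "Cafe resume" - accents decompose, then the combining marks are dropped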
tools/redaction_review.py
CHANGED

@@ -1,6 +1,5 @@
| 1 | import os
| 2 | import random
| 3 | - import re
| 4 | import string
| 5 | import uuid
| 6 | from datetime import datetime, timedelta, timezone

@@ -37,6 +36,9 @@ from tools.file_conversion import (
| 37 | )
| 38 | from tools.file_redaction import redact_page_with_pymupdf
| 39 | from tools.helper_functions import detect_file_type, get_file_name_without_type
| 40 |
| 41 | if not MAX_IMAGE_PIXELS:
| 42 |     Image.MAX_IMAGE_PIXELS = None

@@ -535,10 +537,14 @@ def update_annotator_page_from_review_df(
| 535 | for i, page_state_entry in enumerate(out_image_annotations_state):
| 536 |     # Assuming page_state_entry has a 'page' key (1-based)
| 537 |
| 538 | -
| 539 | -
| 540 | -
| 541 | -
| 542 |     page_no = 0
| 543 |
| 544 |     if (

@@ -834,15 +840,11 @@ def create_annotation_objects_from_filtered_ocr_results_with_words(
| 834 | valid = False
| 835 | if isinstance(colour_label, str):
| 836 |     label_str = colour_label.strip()
| 837 | -
| 838 | -
| 839 | -   )
| 840 | -   if
| 841 | -   r_val, g_val, b_val =
| 842 | -       int(match.group(1)),
| 843 | -       int(match.group(2)),
| 844 | -       int(match.group(3)),
| 845 | -   )
| 846 |     if 0 <= r_val <= 255 and 0 <= g_val <= 255 and 0 <= b_val <= 255:
| 847 |         valid = True
| 848 | elif isinstance(colour_label, (tuple, list)) and len(colour_label) == 3:

@@ -2568,9 +2570,9 @@ def create_xfdf(
| 2568 | pymupdf_page = pymupdf_doc.load_page(page_python_format)
| 2569 |
| 2570 | if document_cropboxes and page_python_format < len(document_cropboxes):
| 2571 | -
| 2572 | -
| 2573 | -   )
| 2574 |     if match and len(match) == 4:
| 2575 |         rect_values = list(map(float, match))
| 2576 |         pymupdf_page.set_cropbox(Rect(*rect_values))

@@ -2722,8 +2724,7 @@ def convert_df_to_xfdf(
| 2722 |
| 2723 | output_path = output_folder + file_path_name + "_adobe.xfdf"
| 2724 |
| 2725 | -
| 2726 | -   f.write(xfdf_content)
| 2727 |
| 2728 | output_paths.append(output_path)
| 2729 |

| 1 | import os
| 2 | import random
| 3 | import string
| 4 | import uuid
| 5 | from datetime import datetime, timedelta, timezone

| 36 | )
| 37 | from tools.file_redaction import redact_page_with_pymupdf
| 38 | from tools.helper_functions import detect_file_type, get_file_name_without_type
| 39 | + from tools.secure_path_utils import (
| 40 | +     secure_file_write,
| 41 | + )
| 42 |
| 43 | if not MAX_IMAGE_PIXELS:
| 44 |     Image.MAX_IMAGE_PIXELS = None

| 537 | for i, page_state_entry in enumerate(out_image_annotations_state):
| 538 |     # Assuming page_state_entry has a 'page' key (1-based)
| 539 |
| 540 | +   from tools.secure_regex_utils import (
| 541 | +       safe_extract_page_number_from_filename,
| 542 | +   )
| 543 | +
| 544 | +   page_no = safe_extract_page_number_from_filename(
| 545 | +       page_state_entry["image"]
| 546 | +   )
| 547 | +   if page_no is None:
| 548 |         page_no = 0
| 549 |
| 550 |     if (

| 840 | valid = False
| 841 | if isinstance(colour_label, str):
| 842 |     label_str = colour_label.strip()
| 843 | +   from tools.secure_regex_utils import safe_extract_rgb_values
| 844 | +
| 845 | +   rgb_values = safe_extract_rgb_values(label_str)
| 846 | +   if rgb_values:
| 847 | +       r_val, g_val, b_val = rgb_values
| 848 |         if 0 <= r_val <= 255 and 0 <= g_val <= 255 and 0 <= b_val <= 255:
| 849 |             valid = True
| 850 | elif isinstance(colour_label, (tuple, list)) and len(colour_label) == 3:

| 2570 | pymupdf_page = pymupdf_doc.load_page(page_python_format)
| 2571 |
| 2572 | if document_cropboxes and page_python_format < len(document_cropboxes):
| 2573 | +   from tools.secure_regex_utils import safe_extract_numbers
| 2574 | +
| 2575 | +   match = safe_extract_numbers(document_cropboxes[page_python_format])
| 2576 |     if match and len(match) == 4:
| 2577 |         rect_values = list(map(float, match))
| 2578 |         pymupdf_page.set_cropbox(Rect(*rect_values))

| 2724 |
| 2725 | output_path = output_folder + file_path_name + "_adobe.xfdf"
| 2726 |
| 2727 | + secure_file_write(output_path, xfdf_content, encoding="utf-8")
| 2728 |
| 2729 | output_paths.append(output_path)
| 2730 |
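safe_extract_rgb_values already validates that each channel parses and falls in 0-255 (returning None otherwise), so the retained range check in the hunk above is a harmless second guard. A short sketch with illustrative colour labels:

from tools.secure_regex_utils import safe_extract_rgb_values

for label_str in ["(0, 0, 0)", "(255, 186, 3)", "not a colour"]:
    rgb_values = safe_extract_rgb_values(label_str)
    valid = False
    if rgb_values:
        r_val, g_val, b_val = rgb_values
        valid = 0 <= r_val <= 255 and 0 <= g_val <= 255 and 0 <= b_val <= 255
    print(label_str, "->", rgb_values, valid)  # last line prints None, False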
tools/secure_path_utils.py
ADDED

@@ -0,0 +1,267 @@
"""
Secure path utilities to prevent path injection attacks.

This module provides secure alternatives to os.path operations that validate
and sanitize file paths to prevent directory traversal and other path-based attacks.
"""

import logging
import os
import re
from pathlib import Path
from typing import Optional, Union

logger = logging.getLogger(__name__)


def sanitize_filename(filename: str, max_length: int = 255) -> str:
    """
    Sanitize a filename to prevent path injection attacks.

    Args:
        filename: The filename to sanitize
        max_length: Maximum length of the sanitized filename

    Returns:
        A sanitized filename safe for use in file operations

    Raises:
        ValueError: If the filename cannot be sanitized safely
    """
    if not filename or not isinstance(filename, str):
        raise ValueError("Filename must be a non-empty string")

    # Remove any path separators and normalize
    filename = os.path.basename(filename)

    # Remove or replace dangerous characters
    # Keep alphanumeric, dots, hyphens, underscores, spaces, parentheses, brackets, and other safe chars
    # Only remove truly dangerous characters like path separators and control chars
    sanitized = re.sub(r'[<>:"|?*\x00-\x1f]', "_", filename)

    # Remove multiple consecutive dots (except for file extensions)
    sanitized = re.sub(r"\.{2,}", ".", sanitized)

    # Remove leading/trailing dots and spaces
    sanitized = sanitized.strip(". ")

    # Ensure it's not empty after sanitization
    if not sanitized:
        sanitized = "sanitized_file"

    # Truncate if too long, preserving extension
    if len(sanitized) > max_length:
        name, ext = os.path.splitext(sanitized)
        max_name_length = max_length - len(ext)
        sanitized = name[:max_name_length] + ext

    return sanitized


def secure_path_join(base_path: Union[str, Path], *path_parts: str) -> Path:
    """
    Safely join paths while preventing directory traversal attacks.

    Args:
        base_path: The base directory path
        *path_parts: Additional path components to join

    Returns:
        A Path object representing the safe joined path

    Raises:
        ValueError: If any path component contains dangerous characters
        PermissionError: If the resulting path would escape the base directory
    """
    base_path = Path(base_path).resolve()

    # Sanitize each path part - only sanitize if it contains dangerous patterns
    sanitized_parts = []
    for part in path_parts:
        if not part:
            continue
        # Only sanitize if the part contains dangerous patterns
        if re.search(r'[<>:"|?*\x00-\x1f]|\.{2,}', part):
            sanitized_part = sanitize_filename(part)
        else:
            sanitized_part = part
        sanitized_parts.append(sanitized_part)

    # Join the paths
    result_path = base_path
    for part in sanitized_parts:
        result_path = result_path / part

    # Resolve the final path
    result_path = result_path.resolve()

    # Security check: ensure the result is within the base directory
    try:
        result_path.relative_to(base_path)
    except ValueError:
        raise PermissionError(f"Path would escape base directory: {result_path}")

    return result_path


def secure_file_write(
    file_path: Union[str, Path],
    content: str,
    mode: str = "w",
    encoding: Optional[str] = None,
    **kwargs,
) -> None:
    """
    Safely write content to a file with path validation.

    Args:
        file_path: The file path to write to
        content: The content to write
        mode: File open mode (default: 'w')
        encoding: Text encoding (default: None for binary mode)
        **kwargs: Additional arguments for open()
    """
    file_path = Path(file_path)

    # Ensure the parent directory exists
    file_path.parent.mkdir(parents=True, exist_ok=True)

    # Validate the path is safe
    if not file_path.is_absolute():
        file_path = file_path.resolve()

    # Write the file
    open_kwargs = {"mode": mode}
    if encoding:
        open_kwargs["encoding"] = encoding
    open_kwargs.update(kwargs)

    with open(file_path, **open_kwargs) as f:
        f.write(content)


def secure_file_read(
    file_path: Union[str, Path],
    mode: str = "r",
    encoding: Optional[str] = None,
    **kwargs,
) -> str:
    """
    Safely read content from a file with path validation.

    Args:
        file_path: The file path to read from
        mode: File open mode (default: 'r')
        encoding: Text encoding (default: None for binary mode)
        **kwargs: Additional arguments for open()

    Returns:
        The file content
    """
    file_path = Path(file_path)

    # Validate the path exists and is a file
    if not file_path.exists():
        raise FileNotFoundError(f"File not found: {file_path}")

    if not file_path.is_file():
        raise ValueError(f"Path is not a file: {file_path}")

    # Read the file
    open_kwargs = {"mode": mode}
    if encoding:
        open_kwargs["encoding"] = encoding
    open_kwargs.update(kwargs)

    with open(file_path, **open_kwargs) as f:
        return f.read()


def validate_path_safety(
    path: Union[str, Path], base_path: Optional[Union[str, Path]] = None
) -> bool:
    """
    Validate that a path is safe and doesn't contain dangerous patterns.

    Args:
        path: The path to validate
        base_path: Optional base path to check against

    Returns:
        True if the path is safe, False otherwise
    """
    try:
        path = Path(path)

        # Check for dangerous patterns
        path_str = str(path)

        # Check for directory traversal patterns
        dangerous_patterns = [
            "..",  # Parent directory
            "//",  # Double slashes
            "\\",  # Backslashes (on Unix systems)
        ]

        for pattern in dangerous_patterns:
            if pattern in path_str:
                return False

        # If base path is provided, ensure the path is within it
        if base_path:
            base_path = Path(base_path).resolve()
            path = path.resolve()
            try:
                path.relative_to(base_path)
            except ValueError:
                return False

        return True

    except Exception:
        return False


# Backward compatibility functions that maintain the same interface as os.path
def secure_join(*paths: str) -> str:
    """
    Secure alternative to os.path.join that prevents path injection.

    Args:
        *paths: Path components to join

    Returns:
        A safe joined path string
    """
    if not paths:
        return ""

    # Use the first path as base, others as components
    base_path = Path(paths[0])
    path_parts = paths[1:]

    # Only use secure_path_join if there are potentially dangerous patterns
    if any(re.search(r'[<>:"|?*\x00-\x1f]|\.{2,}', part) for part in path_parts):
        result_path = secure_path_join(base_path, *path_parts)
        return str(result_path)
    else:
        # Use normal path joining for safe paths
        return str(Path(*paths))


def secure_basename(path: str) -> str:
    """
    Secure alternative to os.path.basename that sanitizes the result.

    Args:
        path: The path to get the basename from

    Returns:
        A sanitized basename
    """
    basename = os.path.basename(path)
    # Only sanitize if the basename contains dangerous patterns
    if re.search(r'[<>:"|?*\x00-\x1f]|\.{2,}', basename):
        return sanitize_filename(basename)
    else:
        return basename
tools/secure_regex_utils.py
ADDED

@@ -0,0 +1,292 @@
"""
Secure regex utilities to prevent ReDoS (Regular Expression Denial of Service) attacks.

This module provides safe alternatives to common regex patterns that can cause
catastrophic backtracking and performance issues.
"""

import re
from typing import List, Optional


def safe_extract_numbers_with_seconds(text: str) -> List[float]:
    """
    Safely extract numbers before 'seconds' from text without ReDoS vulnerability.

    Args:
        text: The text to search for numbers followed by 'seconds'

    Returns:
        List of float numbers found before 'seconds'
    """
    if not text or not isinstance(text, str):
        return []

    # Use a more specific pattern that avoids catastrophic backtracking
    # Look for digits, optional decimal part, optional whitespace, then 'seconds'
    pattern = r"\b(\d+(?:\.\d+)?)\s*seconds\b"

    matches = re.findall(pattern, text)
    try:
        return [float(match) for match in matches]
    except (ValueError, TypeError):
        return []


def safe_extract_numbers(text: str) -> List[float]:
    """
    Safely extract all numbers from text without ReDoS vulnerability.

    Args:
        text: The text to extract numbers from

    Returns:
        List of float numbers found in the text
    """
    if not text or not isinstance(text, str):
        return []

    # Use a simple, safe pattern that doesn't cause backtracking
    # Match digits, optional decimal point and more digits
    pattern = r"\b\d+(?:\.\d+)?\b"

    matches = re.findall(pattern, text)
    try:
        return [float(match) for match in matches]
    except (ValueError, TypeError):
        return []


def safe_extract_page_number_from_filename(filename: str) -> Optional[int]:
    """
    Safely extract page number from filename ending with .png.

    Args:
        filename: The filename to extract page number from

    Returns:
        Page number if found, None otherwise
    """
    if not filename or not isinstance(filename, str):
        return None

    # Use a simple, safe pattern
    pattern = r"(\d+)\.png$"
    match = re.search(pattern, filename)

    if match:
        try:
            return int(match.group(1))
        except (ValueError, TypeError):
            return None

    return None


def safe_extract_page_number_from_path(path: str) -> Optional[int]:
    """
    Safely extract page number from path containing _(\d+).png pattern.

    Args:
        path: The path to extract page number from

    Returns:
        Page number if found, None otherwise
    """
    if not path or not isinstance(path, str):
        return None

    # Use a simple, safe pattern
    pattern = r"_(\d+)\.png$"
    match = re.search(pattern, path)

    if match:
        try:
            return int(match.group(1))
        except (ValueError, TypeError):
            return None

    return None


def safe_clean_text(text: str, remove_html: bool = True) -> str:
    """
    Safely clean text without ReDoS vulnerability.

    Args:
        text: The text to clean
        remove_html: Whether to remove HTML tags

    Returns:
        Cleaned text
    """
    if not text or not isinstance(text, str):
        return ""

    cleaned = text

    if remove_html:
        # Use a simple pattern that doesn't cause backtracking
        cleaned = re.sub(r"<[^>]*>", "", cleaned)

    # Clean up whitespace
    cleaned = re.sub(r"\s+", " ", cleaned).strip()

    return cleaned


def safe_extract_rgb_values(text: str) -> Optional[tuple]:
    """
    Safely extract RGB values from text like "(255, 255, 255)".

    Args:
        text: The text to extract RGB values from

    Returns:
        Tuple of (r, g, b) values if found, None otherwise
    """
    if not text or not isinstance(text, str):
        return None

    # Use a simple, safe pattern
    pattern = r"\(\s*(\d{1,3})\s*,\s*(\d{1,3})\s*,\s*(\d{1,3})\s*\)"
    match = re.match(pattern, text.strip())

    if match:
        try:
            r = int(match.group(1))
            g = int(match.group(2))
            b = int(match.group(3))

            # Validate RGB values
            if 0 <= r <= 255 and 0 <= g <= 255 and 0 <= b <= 255:
                return (r, g, b)
        except (ValueError, TypeError):
            pass

    return None


def safe_split_filename(filename: str, delimiters: List[str]) -> List[str]:
    """
    Safely split filename by delimiters without ReDoS vulnerability.

    Args:
        filename: The filename to split
        delimiters: List of delimiter patterns to split on

    Returns:
        List of filename parts
    """
    if not filename or not isinstance(filename, str):
        return []

    if not delimiters:
        return [filename]

    # Escape special regex characters in delimiters
    escaped_delimiters = [re.escape(delim) for delim in delimiters]

    # Create a safe pattern
    pattern = "|".join(escaped_delimiters)

    try:
        return re.split(pattern, filename)
    except re.error:
        # Fallback to simple string operations if regex fails
        result = [filename]
        for delim in delimiters:
            new_result = []
            for part in result:
                new_result.extend(part.split(delim))
            result = new_result
        return result


def safe_remove_leading_newlines(text: str) -> str:
    """
    Safely remove leading newlines without ReDoS vulnerability.

    Args:
        text: The text to clean

    Returns:
        Text with leading newlines removed
    """
    if not text or not isinstance(text, str):
        return ""

    # Use a simple pattern
    return re.sub(r"^\n+", "", text).strip()


def safe_remove_non_ascii(text: str) -> str:
    """
    Safely remove non-ASCII characters without ReDoS vulnerability.

    Args:
        text: The text to clean

    Returns:
        Text with non-ASCII characters removed
    """
    if not text or not isinstance(text, str):
        return ""

    # Use a simple pattern
    return re.sub(r"[^\x00-\x7F]", "", text)


def safe_extract_latest_number_from_filename(filename: str) -> Optional[int]:
    """
    Safely extract the latest/largest number from filename without ReDoS vulnerability.

    Args:
        filename: The filename to extract number from

    Returns:
        The largest number found, or None if no numbers found
    """
    if not filename or not isinstance(filename, str):
        return None

    # Use a simple pattern to find all numbers
    pattern = r"\d+"
    matches = re.findall(pattern, filename)

    if not matches:
        return None

    try:
        # Convert to integers and return the maximum
        numbers = [int(match) for match in matches]
        return max(numbers)
    except (ValueError, TypeError):
        return None


def safe_sanitize_text(text: str, replacement: str = "_") -> str:
    """
    Safely sanitize text by removing dangerous characters without ReDoS vulnerability.

    Args:
        text: The text to sanitize
        replacement: Character to replace dangerous characters with

    Returns:
        Sanitized text
    """
    if not text or not isinstance(text, str):
        return ""

    # Use a simple pattern for dangerous characters
    dangerous_chars = r'[<>:"|?*\\/\x00-\x1f\x7f-\x9f]'
    sanitized = re.sub(dangerous_chars, replacement, text)

    # Remove multiple consecutive replacements
    sanitized = re.sub(f"{re.escape(replacement)}+", replacement, sanitized)

    # Remove leading/trailing replacements
    sanitized = sanitized.strip(replacement)

    return sanitized
tools/textract_batch_call.py
CHANGED

@@ -32,6 +32,11 @@ from tools.config import (
| 32 | )
| 33 | from tools.file_conversion import get_input_file_names
| 34 | from tools.helper_functions import get_file_name_without_type
| 35 |
| 36 | DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS = int(DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS)
| 37 |

@@ -115,8 +120,8 @@ def analyse_document_with_textract_api(
| 115 | textract_client = session.client("textract")
| 116 |
| 117 | # --- 1. Upload PDF to S3 ---
| 118 | - pdf_filename =
| 119 | - s3_input_key =
| 120 |     "\\", "/"
| 121 | )  # Ensure forward slashes for S3
| 122 |

@@ -262,14 +267,13 @@ def analyse_document_with_textract_api(
| 262 | )
| 263 |
| 264 | # File path
| 265 | - log_file_path =
| 266 | - log_file_path_job_id =
| 267 |     local_output_dir, pdf_filename + "_textract_document_jobs_job_id.txt"
| 268 | )
| 269 |
| 270 | # Write latest job ID to local text file
| 271 | -
| 272 | -   f.write(job_id)
| 273 |
| 274 | # Check if file exists
| 275 | file_exists = os.path.exists(log_file_path)

@@ -447,10 +451,9 @@ def download_textract_job_files(
| 447 | output_filename_base = os.path.basename(pdf_filename)
| 448 | output_filename_base_no_ext = os.path.splitext(output_filename_base)[0]
| 449 | local_output_filename = f"{output_filename_base_no_ext}_textract.json"
| 450 | - local_output_path =
| 451 |
| 452 | -
| 453 | -   json.dump(combined_output, f)
| 454 |
| 455 | print(f"Combined Textract output written to {local_output_path}")
| 456 |

@@ -484,12 +487,12 @@ def load_pdf_job_file_from_s3(
| 484 | pdf_file_location = ""
| 485 | doc_file_name_no_extension_textbox = ""
| 486 |
| 487 | - s3_input_key_prefix =
| 488 | -
| 489 | - )
| 490 | s3_input_key_prefix = s3_input_key_prefix + ".pdf"
| 491 |
| 492 | - local_input_file_path =
| 493 | local_input_file_path = local_input_file_path + ".pdf"
| 494 |
| 495 | download_file_from_s3(

@@ -705,7 +708,7 @@ def poll_whole_document_textract_analysis_progress_and_download(
| 705 | # For robust handling, list objects and find the JSON(s).
| 706 |
| 707 | s3_output_key_prefix = (
| 708 | -
| 709 | )
| 710 | logging.info(
| 711 |     f"Searching for output files in s3://{s3_bucket_name}/{s3_output_key_prefix}"

@@ -848,7 +851,7 @@ def download_textract_output(
| 848 |
| 849 | # Find output ZIP file in S3
| 850 | output_file_key = f"{output_prefix}/{job_id}.zip"
| 851 | - local_file_path =
| 852 |
| 853 | # Download file
| 854 | try:

| 32 | )
| 33 | from tools.file_conversion import get_input_file_names
| 34 | from tools.helper_functions import get_file_name_without_type
| 35 | + from tools.secure_path_utils import (
| 36 | +     secure_basename,
| 37 | +     secure_file_write,
| 38 | +     secure_join,
| 39 | + )
| 40 |
| 41 | DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS = int(DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS)
| 42 |

| 120 | textract_client = session.client("textract")
| 121 |
| 122 | # --- 1. Upload PDF to S3 ---
| 123 | + pdf_filename = secure_basename(local_pdf_path)
| 124 | + s3_input_key = secure_join(s3_input_prefix, pdf_filename).replace(
| 125 |     "\\", "/"
| 126 | )  # Ensure forward slashes for S3
| 127 |

| 267 | )
| 268 |
| 269 | # File path
| 270 | + log_file_path = secure_join(local_output_dir, "textract_document_jobs.csv")
| 271 | + log_file_path_job_id = secure_join(
| 272 |     local_output_dir, pdf_filename + "_textract_document_jobs_job_id.txt"
| 273 | )
| 274 |
| 275 | # Write latest job ID to local text file
| 276 | + secure_file_write(log_file_path_job_id, job_id)
| 277 |
| 278 | # Check if file exists
| 279 | file_exists = os.path.exists(log_file_path)

| 451 | output_filename_base = os.path.basename(pdf_filename)
| 452 | output_filename_base_no_ext = os.path.splitext(output_filename_base)[0]
| 453 | local_output_filename = f"{output_filename_base_no_ext}_textract.json"
| 454 | + local_output_path = secure_join(local_output_dir, local_output_filename)
| 455 |
| 456 | + secure_file_write(local_output_path, json.dumps(combined_output))
| 457 |
| 458 | print(f"Combined Textract output written to {local_output_path}")
| 459 |

| 487 | pdf_file_location = ""
| 488 | doc_file_name_no_extension_textbox = ""
| 489 |
| 490 | + s3_input_key_prefix = secure_join(load_s3_jobs_input_loc, pdf_filename).replace(
| 491 | +     "\\", "/"
| 492 | + )
| 493 | s3_input_key_prefix = s3_input_key_prefix + ".pdf"
| 494 |
| 495 | + local_input_file_path = secure_join(local_output_dir, pdf_filename)
| 496 | local_input_file_path = local_input_file_path + ".pdf"
| 497 |
| 498 | download_file_from_s3(

| 708 | # For robust handling, list objects and find the JSON(s).
| 709 |
| 710 | s3_output_key_prefix = (
| 711 | +     secure_join(s3_output_prefix, job_id).replace("\\", "/") + "/"
| 712 | )
| 713 | logging.info(
| 714 |     f"Searching for output files in s3://{s3_bucket_name}/{s3_output_key_prefix}"

| 851 |
| 852 | # Find output ZIP file in S3
| 853 | output_file_key = f"{output_prefix}/{job_id}.zip"
| 854 | + local_file_path = secure_join(local_folder, f"{job_id}.zip")
| 855 |
| 856 | # Download file
| 857 | try:
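One subtlety in this file: secure_join uses the host OS path separator, so the diff normalises the result with .replace("\\", "/") wherever the value becomes an S3 key, since S3 keys must use forward slashes. A minimal sketch with illustrative names:

from tools.secure_path_utils import secure_basename, secure_join

local_pdf_path = "input/docs/example.pdf"  # illustrative local path
s3_input_prefix = "input"

pdf_filename = secure_basename(local_pdf_path)  # -> example.pdf

# No-op on POSIX, but on Windows secure_join emits backslashes,
# which S3 keys cannot contain.
s3_input_key = secure_join(s3_input_prefix, pdf_filename).replace("\\", "/")
print(s3_input_key)  # input/example.pdf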
|