seanpedrickcase committed · Commit aa5c211 · 1 Parent(s): bcf1a65

Fixed tabular redaction, added tabular deduplication, and updated CLI call capability for both.

Dockerfile CHANGED
@@ -1,5 +1,5 @@
  # Stage 1: Build dependencies and download models
- FROM public.ecr.aws/docker/library/python:3.11.11-slim-bookworm AS builder
+ FROM public.ecr.aws/docker/library/python:3.11.13-slim-bookworm AS builder

  # Install system dependencies
  RUN apt-get update \
@@ -24,7 +24,7 @@ COPY lambda_entrypoint.py .
  COPY entrypoint.sh .

  # Stage 2: Final runtime image
- FROM public.ecr.aws/docker/library/python:3.11.11-slim-bookworm
+ FROM public.ecr.aws/docker/library/python:3.11.13-slim-bookworm

  # Set build-time and runtime environment variable
  ARG APP_MODE=gradio
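The only functional change here is the Python base image bump from 3.11.11 to 3.11.13; the two-stage build and the APP_MODE build argument are untouched. A minimal build-and-run sketch (the lambda value for APP_MODE is an assumption — check entrypoint.sh for the modes it actually accepts):

  # Build the default Gradio image (APP_MODE defaults to gradio)
  docker build -t doc-redaction .

  # Assumed: switch the image over to the Lambda entrypoint at build time
  docker build --build-arg APP_MODE=lambda -t doc-redaction-lambda .

  # Run the Gradio app on the default GRADIO_SERVER_PORT
  docker run -p 7860:7860 doc-redaction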
README.md CHANGED
@@ -10,7 +10,7 @@ license: agpl-3.0
  ---
  # Document redaction

- version: 1.0.0
+ version: 1.1.0

  Redact personally identifiable information (PII) from documents (pdf, images), Word files (.docx), or tabular data (xlsx/csv/parquet). Please see the [User Guide](#user-guide) for a walkthrough on how to use the app. Below is a very brief overview.

app.py CHANGED
The diff for this file is too large to render. See raw diff
 
cli_redact.py ADDED
@@ -0,0 +1,308 @@
1
+ import argparse
2
+ import os
3
+ import pandas as pd
4
+ from tools.config import get_or_create_env_var, LOCAL_PII_OPTION, AWS_PII_OPTION, SELECTABLE_TEXT_EXTRACT_OPTION, TESSERACT_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION, INPUT_FOLDER, OUTPUT_FOLDER, DEFAULT_LANGUAGE, CHOSEN_COMPREHEND_ENTITIES, FULL_COMPREHEND_ENTITY_LIST, CHOSEN_REDACT_ENTITIES, FULL_ENTITY_LIST
5
+ from tools.helper_functions import ensure_output_folder_exists
6
+ from tools.file_conversion import prepare_image_or_pdf
7
+ from tools.file_redaction import choose_and_run_redactor
8
+ from tools.data_anonymise import anonymise_files_with_open_text
9
+ from tools.helper_functions import _get_env_list
10
+ from tools.load_spacy_model_custom_recognisers import custom_entities
11
+ from tools.find_duplicate_pages import run_duplicate_analysis, run_full_search_and_analysis
12
+ from tools.find_duplicate_tabular import run_tabular_duplicate_analysis
13
+
14
+ # --- Constants and Configuration ---
15
+
16
+ if CHOSEN_COMPREHEND_ENTITIES: CHOSEN_COMPREHEND_ENTITIES = _get_env_list(CHOSEN_COMPREHEND_ENTITIES)
17
+ if FULL_COMPREHEND_ENTITY_LIST: FULL_COMPREHEND_ENTITY_LIST = _get_env_list(FULL_COMPREHEND_ENTITY_LIST)
18
+ if CHOSEN_REDACT_ENTITIES: CHOSEN_REDACT_ENTITIES = _get_env_list(CHOSEN_REDACT_ENTITIES)
19
+ if FULL_ENTITY_LIST: FULL_ENTITY_LIST = _get_env_list(FULL_ENTITY_LIST)
20
+
21
+ # Add custom spacy recognisers to the Comprehend list, so that local Spacy model can be used to pick up e.g. titles, streetnames, UK postcodes that are sometimes missed by comprehend
22
+ CHOSEN_COMPREHEND_ENTITIES.extend(custom_entities)
23
+ FULL_COMPREHEND_ENTITY_LIST.extend(custom_entities)
24
+
25
+ chosen_redact_entities = CHOSEN_REDACT_ENTITIES
26
+ full_entity_list = FULL_ENTITY_LIST
27
+ chosen_comprehend_entities = CHOSEN_COMPREHEND_ENTITIES
28
+ full_comprehend_entity_list = FULL_COMPREHEND_ENTITY_LIST
29
+
30
+ # --- Main CLI Function ---
31
+ def main(direct_mode_args=None):
32
+ """
33
+ A unified command-line interface to prepare, redact, and anonymise various document types.
34
+
35
+ Args:
36
+ direct_mode_args (dict, optional): Dictionary of arguments for direct mode execution.
37
+ If provided, uses these instead of parsing command line arguments.
38
+ """
39
+ parser = argparse.ArgumentParser(
40
+ description='A versatile CLI for redacting PII from PDF/image files and anonymising Word/tabular data.',
41
+ formatter_class=argparse.RawTextHelpFormatter,
42
+ epilog='''
43
+ Examples:
44
+ # Redact a PDF with default settings:
45
+ python cli_redact.py --input_file document.pdf
46
+
47
+ # Redact specific pages with custom OCR:
48
+ python cli_redact.py --input_file document.pdf --page_min 1 --page_max 10 --ocr_method "AWS Textract service - all PDF types"
49
+
50
+ # Anonymize Excel file with specific columns:
51
+ python cli_redact.py --input_file data.xlsx --columns "Name" "Email" --anon_strat "replace with 'REDACTED'"
52
+
53
+ # Use AWS services with custom settings:
54
+ python cli_redact.py --input_file document.pdf --pii_detector "AWS Comprehend" --aws_access_key YOUR_KEY --aws_secret_key YOUR_SECRET
55
+
56
+ # Advanced redaction with custom word list:
57
+ python cli_redact.py --input_file document.pdf --in_deny_list "CompanyName" "ProjectCode" --deny_list custom_terms.csv
58
+
59
+ # Find duplicate pages in OCR files:
60
+ python cli_redact.py --task deduplicate --input_file ocr_output.csv --duplicate_type pages --similarity_threshold 0.95
61
+
62
+ # Find duplicate content with search query:
63
+ python cli_redact.py --task deduplicate --input_file ocr_output.csv --duplicate_type pages --search_query "confidential information"
64
+
65
+ # Find duplicate rows in tabular data:
66
+ python cli_redact.py --task deduplicate --input_file data.csv --duplicate_type tabular --text_columns "Name" "Description"
67
+ '''
68
+ )
69
+
70
+ # --- Task Selection ---
71
+ task_group = parser.add_argument_group('Task Selection')
72
+ task_group.add_argument('--task',
73
+ choices=['redact', 'deduplicate'],
74
+ default='redact',
75
+ help='Task to perform: redact (PII redaction/anonymization) or deduplicate (find duplicate content).')
76
+
77
+ # --- General Arguments (apply to all file types) ---
78
+ general_group = parser.add_argument_group('General Options')
79
+ general_group.add_argument('--input_file', required=True, help='Path to the input file to process.')
80
+ general_group.add_argument('--output_dir', default=OUTPUT_FOLDER, help='Directory for all output files.')
81
+ general_group.add_argument('--language', default=DEFAULT_LANGUAGE, help='Language of the document content.')
82
+ general_group.add_argument('--allow_list', help='Path to a CSV file with words to exclude from redaction.')
83
+ general_group.add_argument('--pii_detector',
84
+ choices=[LOCAL_PII_OPTION, AWS_PII_OPTION],
85
+ default=LOCAL_PII_OPTION,
86
+ help='Core PII detection method (Local or AWS).')
87
+ general_group.add_argument('--aws_access_key', default='', help='Your AWS Access Key ID.')
88
+ general_group.add_argument('--aws_secret_key', default='', help='Your AWS Secret Access Key.')
89
+ general_group.add_argument('--aws_region', default='', help='AWS region for cloud services.')
90
+ general_group.add_argument('--s3_bucket', default='', help='S3 bucket name for cloud operations.')
91
+ general_group.add_argument('--do_initial_clean', action='store_true', help='Perform initial text cleaning for tabular data.')
92
+ general_group.add_argument('--save_logs_to_csv', action='store_true', help='Save processing logs to CSV files.')
93
+ general_group.add_argument('--display_file_names_in_logs', action='store_true', help='Include file names in log outputs.')
94
+
95
+ # --- PDF/Image Redaction Arguments ---
96
+ pdf_group = parser.add_argument_group('PDF/Image Redaction Options (.pdf, .png, .jpg)')
97
+ pdf_group.add_argument('--ocr_method',
98
+ choices=[SELECTABLE_TEXT_EXTRACT_OPTION, TESSERACT_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION],
99
+ default=TESSERACT_TEXT_EXTRACT_OPTION,
100
+ help='OCR method for text extraction from images.')
101
+ pdf_group.add_argument('--page_min', type=int, default=0, help='First page to redact.')
102
+ pdf_group.add_argument('--page_max', type=int, default=999, help='Last page to redact.')
103
+ pdf_group.add_argument('--prepare_for_review', action='store_true', help='Prepare files for reviewing redactions.')
104
+ pdf_group.add_argument('--prepare_images', action='store_true', default=True, help='Enable image creation for PDF pages.')
105
+ pdf_group.add_argument('--no_images', action='store_false', dest='prepare_images', help='Disable image creation for PDF pages.')
106
+ pdf_group.add_argument('--images_dpi', type=float, default=300.0, help='DPI for image processing.')
107
+ pdf_group.add_argument('--max_image_pixels', type=int, help='Maximum image pixels for processing.')
108
+ pdf_group.add_argument('--load_truncated_images', action='store_true', help='Load truncated images during processing.')
109
+ pdf_group.add_argument('--chosen_local_ocr_model', choices=['tesseract', 'hybrid', 'paddle'], default='tesseract', help='Local OCR model to use.')
110
+ pdf_group.add_argument('--preprocess_local_ocr_images', action='store_true', help='Preprocess images before OCR.')
111
+ pdf_group.add_argument('--compress_redacted_pdf', action='store_true', help='Compress the final redacted PDF.')
112
+ pdf_group.add_argument('--return_pdf_end_of_redaction', action='store_true', default=True, help='Return PDF at end of redaction process.')
113
+ pdf_group.add_argument('--in_deny_list', nargs='+', default=list(), help='Custom words to recognize for redaction.')
114
+ pdf_group.add_argument('--redact_whole_page_list', nargs='+', default=list(), help='Pages to redact completely.')
115
+ pdf_group.add_argument('--handwrite_signature_checkbox', nargs='+', default=['Extract handwriting', 'Extract signatures'], help='Handwriting and signature extraction options.')
116
+
117
+ # --- Word/Tabular Anonymisation Arguments ---
118
+ tabular_group = parser.add_argument_group('Word/Tabular Anonymisation Options (.docx, .csv, .xlsx)')
119
+ tabular_group.add_argument('--anon_strat', choices=['redact', 'encrypt', 'hash', 'replace with \'REDACTED\'', 'replace with <ENTITY_NAME>', 'redact completely', 'mask', 'fake_first_name'], default='redact', help='The anonymisation strategy to apply.')
120
+ tabular_group.add_argument('--columns', nargs='+', default=list(), help='A list of column names to anonymise in tabular data.')
121
+ tabular_group.add_argument('--excel_sheets', nargs='+', default=list(), help='Specific Excel sheet names to process.')
122
+ tabular_group.add_argument('--deny_list', help='Path to a CSV file with specific terms/phrases to redact.')
123
+ tabular_group.add_argument('--fuzzy_mistakes', type=int, default=1, help='Number of allowed spelling mistakes for fuzzy matching.')
124
+
125
+ # --- Duplicate Detection Arguments ---
126
+ duplicate_group = parser.add_argument_group('Duplicate Detection Options')
127
+ duplicate_group.add_argument('--duplicate_type', choices=['pages', 'tabular'], default='pages', help='Type of duplicate detection: pages (for OCR files) or tabular (for CSV/Excel files).')
128
+ duplicate_group.add_argument('--similarity_threshold', type=float, default=0.95, help='Similarity threshold (0-1) to consider content as duplicates.')
129
+ duplicate_group.add_argument('--min_word_count', type=int, default=3, help='Minimum word count for text to be considered in duplicate analysis.')
130
+ duplicate_group.add_argument('--min_consecutive_pages', type=int, default=1, help='Minimum number of consecutive pages to consider as a match.')
131
+ duplicate_group.add_argument('--greedy_match', action='store_true', default=True, help='Use greedy matching strategy for consecutive pages.')
132
+ duplicate_group.add_argument('--combine_pages', action='store_true', default=True, help='Combine text from the same page number within a file.')
133
+ duplicate_group.add_argument('--search_query', help='Search query text to find specific duplicate content (for page duplicates).')
134
+ duplicate_group.add_argument('--text_columns', nargs='+', default=list(), help='Specific text columns to analyze for duplicates (for tabular data).')
135
+
136
+ # Parse arguments - either from command line or direct mode
137
+ if direct_mode_args:
138
+ # Use direct mode arguments
139
+ args = argparse.Namespace(**direct_mode_args)
140
+ else:
141
+ # Parse command line arguments
142
+ args = parser.parse_args()
143
+
144
+ # --- Initial Setup ---
145
+ ensure_output_folder_exists(args.output_dir)
146
+ _, file_extension = os.path.splitext(args.input_file)
147
+ file_extension = file_extension.lower()
148
+
149
+ # Load allow/deny lists
150
+ allow_list = pd.read_csv(args.allow_list) if args.allow_list else pd.DataFrame()
151
+ deny_list = pd.read_csv(args.deny_list).iloc[:, 0].tolist() if args.deny_list else []
152
+
153
+ # --- Route to the Correct Workflow Based on Task and File Type ---
154
+
155
+ # Task 1: Redaction/Anonymization
156
+ if args.task == 'redact':
157
+ # Workflow 1: PDF/Image Redaction
158
+ if file_extension in ['.pdf', '.png', '.jpg', '.jpeg']:
159
+ print("--- Detected PDF/Image file. Starting Redaction Workflow... ---")
160
+ try:
161
+ # Step 1: Prepare the document
162
+ print("\nStep 1: Preparing document...")
163
+ (
164
+ prep_summary, prepared_pdf_paths, image_file_paths, _, _, pdf_doc,
165
+ image_annotations, _, original_cropboxes, page_sizes, textract_output_found, _, _, _, _
166
+ ) = prepare_image_or_pdf(
167
+ file_paths=[args.input_file], text_extract_method=args.ocr_method,
168
+ all_line_level_ocr_results_df=pd.DataFrame(), all_page_line_level_ocr_results_with_words_df=pd.DataFrame(),
169
+ first_loop_state=True, prepare_for_review=args.prepare_for_review,
170
+ output_folder=args.output_dir, prepare_images=args.prepare_images
171
+ )
172
+ print(f"Preparation complete. {prep_summary}")
173
+
174
+ # Step 2: Redact the prepared document
175
+ print("\nStep 2: Running redaction...")
176
+ (
177
+ output_summary, output_files, _, _, log_files, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _
178
+ ) = choose_and_run_redactor(
179
+ file_paths=[args.input_file], prepared_pdf_file_paths=prepared_pdf_paths,
180
+ pdf_image_file_paths=image_file_paths, chosen_redact_entities=chosen_redact_entities,
181
+ chosen_redact_comprehend_entities=chosen_comprehend_entities, text_extraction_method=args.ocr_method,
182
+ in_allow_list=allow_list, in_deny_list=args.in_deny_list,
183
+ redact_whole_page_list=args.redact_whole_page_list, first_loop_state=True,
184
+ page_min=args.page_min, page_max=args.page_max, handwrite_signature_checkbox=args.handwrite_signature_checkbox,
185
+ pymupdf_doc=pdf_doc, annotations_all_pages=image_annotations, page_sizes=page_sizes,
186
+ document_cropboxes=original_cropboxes, pii_identification_method=args.pii_detector,
187
+ aws_access_key_textbox=args.aws_access_key, aws_secret_key_textbox=args.aws_secret_key,
188
+ language=args.language, output_folder=args.output_dir
189
+ )
190
+
191
+ print("\n--- Redaction Process Complete ---")
192
+ print(f"Summary: {output_summary}")
193
+ print(f"\nOutput files saved to: {args.output_dir}")
194
+ print("Generated Files:", sorted(output_files))
195
+ if log_files: print("Log Files:", sorted(log_files))
196
+
197
+ except Exception as e:
198
+ print(f"\nAn error occurred during the PDF/Image redaction workflow: {e}")
199
+
200
+ # Workflow 2: Word/Tabular Data Anonymisation
201
+ elif file_extension in ['.docx', '.xlsx', '.xls', '.csv', '.parquet']:
202
+ print("--- Detected Word/Tabular file. Starting Anonymisation Workflow... ---")
203
+ try:
204
+ # Run the anonymisation function directly
205
+ output_summary, output_files, _, _, log_files, _, _ = anonymise_files_with_open_text(
206
+ file_paths=[args.input_file],
207
+ in_text="", # Not used for file-based operations
208
+ anon_strat=args.anon_strat,
209
+ chosen_cols=args.columns,
210
+ chosen_redact_entities=chosen_redact_entities,
211
+ in_allow_list=allow_list,
212
+ in_excel_sheets=args.excel_sheets,
213
+ first_loop_state=True,
214
+ output_folder=args.output_dir,
215
+ in_deny_list=deny_list,
216
+ max_fuzzy_spelling_mistakes_num=args.fuzzy_mistakes,
217
+ pii_identification_method=args.pii_detector,
218
+ chosen_redact_comprehend_entities=chosen_comprehend_entities,
219
+ aws_access_key_textbox=args.aws_access_key,
220
+ aws_secret_key_textbox=args.aws_secret_key,
221
+ language=args.language,
222
+ do_initial_clean=args.do_initial_clean
223
+ )
224
+
225
+ print("\n--- Anonymisation Process Complete ---")
226
+ print(f"Summary: {output_summary}")
227
+ print(f"\nOutput files saved to: {args.output_dir}")
228
+ print("Generated Files:", sorted(output_files))
229
+ if log_files: print("Log Files:", sorted(log_files))
230
+
231
+ except Exception as e:
232
+ print(f"\nAn error occurred during the Word/Tabular anonymisation workflow: {e}")
233
+
234
+ else:
235
+ print(f"Error: Unsupported file type '{file_extension}' for redaction.")
236
+ print("Supported types for redaction: .pdf, .png, .jpg, .jpeg")
237
+ print("Supported types for anonymisation: .docx, .xlsx, .xls, .csv, .parquet")
238
+
239
+ # Task 2: Duplicate Detection
240
+ elif args.task == 'deduplicate':
241
+ print("--- Starting Duplicate Detection Workflow... ---")
242
+ try:
243
+ if args.duplicate_type == 'pages':
244
+ # Page duplicate detection
245
+ if file_extension == '.csv':
246
+ print("--- Detected OCR CSV file. Starting Page Duplicate Detection... ---")
247
+
248
+ if args.search_query:
249
+ # Use search-based duplicate detection
250
+ print(f"Searching for duplicates of: '{args.search_query}'")
251
+ # Note: This would require the OCR data to be loaded first
252
+ # For now, we'll use the general duplicate analysis
253
+ print("Note: Search-based duplicate detection requires OCR data preparation.")
254
+ print("Using general duplicate analysis instead.")
255
+
256
+ # Load the CSV file as a list for the duplicate analysis function
257
+ results_df, output_paths, full_data_by_file = run_duplicate_analysis(
258
+ files=[args.input_file],
259
+ threshold=args.similarity_threshold,
260
+ min_words=args.min_word_count,
261
+ min_consecutive=args.min_consecutive_pages,
262
+ greedy_match=args.greedy_match,
263
+ combine_pages=args.combine_pages
264
+ )
265
+
266
+ print("\n--- Page Duplicate Detection Complete ---")
267
+ print(f"Found {len(results_df)} duplicate matches")
268
+ print(f"\nOutput files saved to: {args.output_dir}")
269
+ if output_paths: print("Generated Files:", sorted(output_paths))
270
+
271
+ else:
272
+ print(f"Error: Page duplicate detection requires CSV files with OCR data.")
273
+ print("Please provide a CSV file containing OCR output data.")
274
+
275
+ elif args.duplicate_type == 'tabular':
276
+ # Tabular duplicate detection
277
+ if file_extension in ['.csv', '.xlsx', '.xls', '.parquet']:
278
+ print("--- Detected tabular file. Starting Tabular Duplicate Detection... ---")
279
+
280
+ results_df, output_paths, full_data_by_file = run_tabular_duplicate_analysis(
281
+ files=[args.input_file],
282
+ threshold=args.similarity_threshold,
283
+ min_words=args.min_word_count,
284
+ text_columns=args.text_columns if args.text_columns else None,
285
+ output_folder=args.output_dir
286
+ )
287
+
288
+ print("\n--- Tabular Duplicate Detection Complete ---")
289
+ print(f"Found {len(results_df)} duplicate matches")
290
+ print(f"\nOutput files saved to: {args.output_dir}")
291
+ if output_paths: print("Generated Files:", sorted(output_paths))
292
+
293
+ else:
294
+ print(f"Error: Tabular duplicate detection requires CSV, Excel, or Parquet files.")
295
+ print("Supported types: .csv, .xlsx, .xls, .parquet")
296
+ else:
297
+ print(f"Error: Invalid duplicate type '{args.duplicate_type}'.")
298
+ print("Valid options: 'pages' or 'tabular'")
299
+
300
+ except Exception as e:
301
+ print(f"\nAn error occurred during the duplicate detection workflow: {e}")
302
+
303
+ else:
304
+ print(f"Error: Invalid task '{args.task}'.")
305
+ print("Valid options: 'redact' or 'deduplicate'")
306
+
307
+ if __name__ == "__main__":
308
+ main()
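cli_redact.py can now also be driven programmatically through the direct_mode_args parameter, which is how the rewritten lambda_entrypoint.py below calls it. One caveat: argparse.Namespace(**direct_mode_args) only carries the keys you supply, so every attribute the chosen workflow reads (for example ocr_method, prepare_images and in_deny_list for PDF redaction) must be present in the dictionary — parser defaults are not applied in direct mode. A minimal sketch for the new tabular deduplication task, with illustrative file names:

  from cli_redact import main as cli_main

  # Only the attributes the 'deduplicate' + 'tabular' branch reads are needed here;
  # allow_list/deny_list are included because they are loaded before task routing.
  direct_args = {
      'task': 'deduplicate',
      'duplicate_type': 'tabular',
      'input_file': 'example_data.csv',   # illustrative path
      'output_dir': 'output/',
      'similarity_threshold': 0.95,
      'min_word_count': 3,
      'text_columns': ['Name', 'Description'],
      'allow_list': None,
      'deny_list': None,
  }

  cli_main(direct_mode_args=direct_args)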
index.md DELETED
@@ -1,8 +0,0 @@
- ---
- layout: default
- title: Home
- redirect_from:
- - "/"
- ---
-
- {% include_relative README.md %}
lambda_entrypoint.py CHANGED
@@ -1,120 +1,149 @@
1
  import boto3
2
  import os
3
- import subprocess
4
 
5
- print("In lambda_entrypoint function")
 
6
 
7
- try:
8
- s3_client = boto3.client("s3", region_name="eu-west-2")
9
- print("s3_client is initialized:", s3_client)
10
- except Exception as e:
11
- print(f"Error initializing s3_client: {e}")
12
- raise e
13
 
14
- TMP_DIR = "/tmp/"
 
 
15
 
16
- run_direct_mode = os.getenv("RUN_DIRECT_MODE", "0")
17
-
18
- if run_direct_mode == "0":
19
- # Gradio App execution
20
- from app import app, max_queue_size, max_file_size # Replace with actual import if needed
21
- from tools.auth import authenticate_user
22
-
23
- if os.getenv("COGNITO_AUTH", "0") == "1":
24
- app.queue(max_size=max_queue_size).launch(show_error=True, auth=authenticate_user, max_file_size=max_file_size)
25
- else:
26
- app.queue(max_size=max_queue_size).launch(show_error=True, inbrowser=True, max_file_size=max_file_size)
27
 
28
  def download_file_from_s3(bucket_name, key, download_path):
29
  """Download a file from S3 to the local filesystem."""
30
- s3_client.download_file(bucket_name, key, download_path)
31
- print(f"Downloaded {key} to {download_path}")
32
-
33
- def upload_file_to_s3(file_path, bucket_name, key):
34
- """Upload a file to S3."""
35
- s3_client.upload_file(file_path, bucket_name, key)
36
- print(f"Uploaded {file_path} to {key}")
37
 
38
  def lambda_handler(event, context):
39
-
40
- print("In lambda_handler function")
41
-
42
- # Create necessary directories
43
- os.makedirs(os.path.join(TMP_DIR, "input"), exist_ok=True)
44
- os.makedirs(os.path.join(TMP_DIR, "output"), exist_ok=True)
45
-
46
- print("Got to record loop")
47
- print("Event records is:", event["Records"])
48
-
49
- # Extract S3 bucket and object key from the Records
50
- for record in event.get("Records", [{}]):
51
- bucket_name = record.get("s3", {}).get("bucket", {}).get("name")
52
- input_key = record.get("s3", {}).get("object", {}).get("key")
53
- print(f"Processing file {input_key} from bucket {bucket_name}")
54
-
55
- # Extract additional arguments
56
- arguments = event.get("arguments", {})
57
-
58
- if not input_key:
59
- input_key = arguments.get("input_file", "")
60
-
61
- ocr_method = arguments.get("ocr_method", "Complex image analysis - docs with handwriting/signatures (AWS Textract)")
62
- pii_detector = arguments.get("pii_detector", "AWS Comprehend")
63
- page_min = str(arguments.get("page_min", 0))
64
- page_max = str(arguments.get("page_max", 0))
65
- allow_list = arguments.get("allow_list", None)
66
- output_dir = arguments.get("output_dir", os.path.join(TMP_DIR, "output"))
 
67
 
68
- print(f"OCR Method: {ocr_method}")
69
- print(f"PII Detector: {pii_detector}")
70
- print(f"Page Range: {page_min} - {page_max}")
71
- print(f"Allow List: {allow_list}")
72
- print(f"Output Directory: {output_dir}")
73
-
74
- # Download input file
75
- input_file_path = os.path.join(TMP_DIR, "input", os.path.basename(input_key))
76
- download_file_from_s3(bucket_name, input_key, input_file_path)
77
-
78
- # Construct command
79
- command = [
80
- "python",
81
- "app.py",
82
- "--input_file", input_file_path,
83
- "--ocr_method", ocr_method,
84
- "--pii_detector", pii_detector,
85
- "--page_min", page_min,
86
- "--page_max", page_max,
87
- "--output_dir", output_dir,
88
- ]
89
-
90
- # Add allow_list only if provided
91
- if allow_list:
92
- allow_list_path = os.path.join(TMP_DIR, "allow_list.csv")
93
- download_file_from_s3(bucket_name, allow_list, allow_list_path)
94
- command.extend(["--allow_list", allow_list_path])
95
-
96
- print(f"Running command: {command}")
97
 
98
- try:
99
- result = subprocess.run(command, capture_output=True, text=True, check=True)
100
- print("Processing succeeded.")
101
- print(result.stdout)
102
- except subprocess.CalledProcessError as e:
103
- print("Error during processing:", e.stderr)
104
- raise e
105
- except Exception as e:
106
- print(f"Unexpected error: {str(e)}")
107
- raise e
108
-
109
- print("Now uploading files from:", output_dir)
110
-
111
- # Upload output files back to S3
112
- for root, _, files in os.walk(output_dir):
113
- for file_name in files:
114
- print("file_name:", file_name)
115
- local_file_path = os.path.join(root, file_name)
116
- output_key = f"output/{file_name}"
117
- print("Output location is:", output_key)
118
- upload_file_to_s3(local_file_path, bucket_name, output_key)
119
-
120
- return {"statusCode": 200, "body": "Processing complete."}
 
 
 
 
1
  import boto3
2
  import os
3
+ import json
4
 
5
+ # Import the main function from your CLI script
6
+ from cli_redact import main as cli_main
7
 
8
+ print("Lambda entrypoint loading...")
 
 
 
 
 
9
 
10
+ # Initialize S3 client outside the handler for connection reuse
11
+ s3_client = boto3.client("s3", region_name=os.getenv("AWS_REGION", "eu-west-2"))
12
+ print("S3 client initialized")
13
 
14
+ # Lambda's only writable directory
15
+ TMP_DIR = "/tmp"
16
+ INPUT_DIR = os.path.join(TMP_DIR, "input")
17
+ OUTPUT_DIR = os.path.join(TMP_DIR, "output")
18
 
19
  def download_file_from_s3(bucket_name, key, download_path):
20
  """Download a file from S3 to the local filesystem."""
21
+ try:
22
+ s3_client.download_file(bucket_name, key, download_path)
23
+ print(f"Successfully downloaded s3://{bucket_name}/{key} to {download_path}")
24
+ except Exception as e:
25
+ print(f"Error downloading from S3: {e}")
26
+ raise
27
+
28
+ def upload_directory_to_s3(local_directory, bucket_name, s3_prefix):
29
+ """Upload all files from a local directory to an S3 prefix."""
30
+ for root, _, files in os.walk(local_directory):
31
+ for file_name in files:
32
+ local_file_path = os.path.join(root, file_name)
33
+ # Create a relative path to maintain directory structure if needed
34
+ relative_path = os.path.relpath(local_file_path, local_directory)
35
+ output_key = os.path.join(s3_prefix, relative_path)
36
+
37
+ try:
38
+ s3_client.upload_file(local_file_path, bucket_name, output_key)
39
+ print(f"Successfully uploaded {local_file_path} to s3://{bucket_name}/{output_key}")
40
+ except Exception as e:
41
+ print(f"Error uploading to S3: {e}")
42
+ raise
43
 
44
  def lambda_handler(event, context):
45
+ print(f"Received event: {json.dumps(event)}")
46
+
47
+ # 1. Setup temporary directories
48
+ os.makedirs(INPUT_DIR, exist_ok=True)
49
+ os.makedirs(OUTPUT_DIR, exist_ok=True)
50
+
51
+ # 2. Extract information from the event
52
+ # Assumes the event is triggered by S3 and may contain an 'arguments' payload
53
+ try:
54
+ record = event['Records'][0]
55
+ bucket_name = record['s3']['bucket']['name']
56
+ input_key = record['s3']['object']['key']
57
+
58
+ # The user metadata can be used to pass arguments
59
+ # This is more robust than embedding them in the main event body
60
+ response = s3_client.head_object(Bucket=bucket_name, Key=input_key)
61
+ metadata = response.get('Metadata', {})
62
+ # Arguments can be passed as a JSON string in metadata
63
+ arguments = json.loads(metadata.get('arguments', '{}'))
64
+
65
+ except (KeyError, IndexError) as e:
66
+ print(f"Could not parse S3 event record: {e}. Checking for direct invocation payload.")
67
+ # Fallback for direct invocation (e.g., from Step Functions or manual test)
68
+ bucket_name = event.get('bucket_name')
69
+ input_key = event.get('input_key')
70
+ arguments = event.get('arguments', {})
71
+ if not all([bucket_name, input_key]):
72
+ raise ValueError("Missing 'bucket_name' or 'input_key' in direct invocation event.")
73
+
74
+ print(f"Processing s3://{bucket_name}/{input_key}")
75
+ print(f"With arguments: {arguments}")
76
+
77
+ # 3. Download the main input file
78
+ input_file_path = os.path.join(INPUT_DIR, os.path.basename(input_key))
79
+ download_file_from_s3(bucket_name, input_key, input_file_path)
80
+
81
+ # 4. Prepare arguments for the CLI function
82
+ # This dictionary should mirror the one in your app.py's "direct mode"
83
+ cli_args = {
84
+ 'task': arguments.get('task', 'redact'),
85
+ 'input_file': input_file_path,
86
+ 'output_dir': OUTPUT_DIR,
87
+ 'language': arguments.get('language', 'en_core_web_sm'),
88
+ 'pii_detector': arguments.get('pii_detector', 'Local Spacy model'), # Default to local
89
+ 'ocr_method': arguments.get('ocr_method', 'Tesseract OCR - all PDF types'),
90
+ 'page_min': int(arguments.get('page_min', 0)),
91
+ 'page_max': int(arguments.get('page_max', 999)),
92
+
93
+ # Handle optional files like allow/deny lists
94
+ 'allow_list': None,
95
+ 'deny_list': None,
96
+
97
+ # Deduplication specific arguments
98
+ 'duplicate_type': arguments.get('duplicate_type', 'pages'),
99
+ 'similarity_threshold': float(arguments.get('similarity_threshold', 0.95)),
100
+ 'min_word_count': int(arguments.get('min_word_count', 3)),
101
+ 'search_query': arguments.get('search_query'),
102
+ 'text_columns': arguments.get('text_columns', []),
103
 
104
+ # Add other arguments from your app.py as needed, using .get() for safety
105
+ 'anon_strat': arguments.get('anon_strat', 'redact'),
106
+ 'columns': arguments.get('columns', []),
107
+ 'aws_access_key': None, # Best practice: use IAM Role instead of keys
108
+ 'aws_secret_key': None,
109
+ 'aws_region': os.getenv("AWS_REGION", "eu-west-2"),
110
+ 's3_bucket': bucket_name,
111
+ # Set defaults for boolean flags
112
+ 'prepare_images': True,
113
+ 'compress_redacted_pdf': False,
114
+ 'return_pdf_end_of_redaction': True
115
+ }
116
+
117
+ # Download optional files if they are specified
118
+ allow_list_key = arguments.get('allow_list')
119
+ if allow_list_key:
120
+ allow_list_path = os.path.join(INPUT_DIR, 'allow_list.csv')
121
+ download_file_from_s3(bucket_name, allow_list_key, allow_list_path)
122
+ cli_args['allow_list'] = allow_list_path
123
 
124
+ deny_list_key = arguments.get('deny_list')
125
+ if deny_list_key:
126
+ deny_list_path = os.path.join(INPUT_DIR, 'deny_list.csv')
127
+ download_file_from_s3(bucket_name, deny_list_key, deny_list_path)
128
+ cli_args['deny_list'] = deny_list_path
129
+
130
+ # 5. Execute the main application logic
131
+ try:
132
+ print("--- Starting CLI Redact Main Function ---")
133
+ print(f"Arguments passed to cli_main: {cli_args}")
134
+ cli_main(direct_mode_args=cli_args)
135
+ print("--- CLI Redact Main Function Finished ---")
136
+ except Exception as e:
137
+ print(f"An error occurred during CLI execution: {e}")
138
+ # Optionally, re-raise the exception to make the Lambda fail
139
+ raise
140
+
141
+ # 6. Upload results back to S3
142
+ output_s3_prefix = f"output/{os.path.splitext(os.path.basename(input_key))[0]}"
143
+ print(f"Uploading contents of {OUTPUT_DIR} to s3://{bucket_name}/{output_s3_prefix}/")
144
+ upload_directory_to_s3(OUTPUT_DIR, bucket_name, output_s3_prefix)
145
+
146
+ return {
147
+ "statusCode": 200,
148
+ "body": json.dumps(f"Processing complete for {input_key}. Output saved to s3://{bucket_name}/{output_s3_prefix}/")
149
+ }
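The handler now calls cli_main in-process instead of shelling out to app.py, and accepts either an S3 event record (with optional arguments passed as a JSON string in the object's user metadata) or a direct-invocation payload. A sketch of the direct-invocation shape with placeholder bucket/key names — exercising it for real needs AWS credentials and an object at that key:

  # Placeholder names; the fallback branch above reads bucket_name, input_key and arguments.
  event = {
      "bucket_name": "my-redaction-bucket",
      "input_key": "input/document.pdf",
      "arguments": {
          "task": "redact",
          "ocr_method": "Tesseract OCR - all PDF types",
          "pii_detector": "Local Spacy model",
          "page_min": 0,
          "page_max": 10
      }
  }
  # lambda_handler(event, None) downloads the object, runs the CLI, and uploads the
  # results under an output/<input file name> prefix in the same bucket.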
pyproject.toml CHANGED
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "doc_redaction"
- version = "1.0.0"
+ version = "1.1.0"
  description = "Redact PDF/image-based documents, or CSV/XLSX files using a Gradio-based GUI interface"
  readme = "README.md"
  requires-python = ">=3.10"
@@ -23,7 +23,7 @@ dependencies = [
  "spacy==3.8.7",
  # Direct URL dependency for spacy model
  "en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz",
- "gradio==5.44.0",
+ "gradio==5.45.0",
  "boto3==1.40.10",
  "pyarrow==21.0.0",
  "openpyxl==3.1.5",
@@ -37,7 +37,8 @@ dependencies = [
  "awslambdaric==3.1.1",
  "python-docx==1.2.0",
  "paddlepaddle==3.1.0",
- "paddleocr==3.1.1"
+ "paddleocr==3.1.1",
+ "polars==1.33.1"
  ]

  [project.urls]
requirements.txt CHANGED
@@ -6,11 +6,12 @@ presidio_analyzer==2.2.359
  presidio_anonymizer==2.2.359
  presidio-image-redactor==0.0.57
  pikepdf==9.10.2
- pandas==2.3.1
+ pandas==2.3.2
  scikit-learn==1.7.1
  spacy==3.8.7
  en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
- gradio==5.44.0
+ gradio==5.45.0
+ polars==1.33.1
  boto3==1.40.10
  pyarrow==21.0.0
  openpyxl==3.1.5
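The pins move pandas to 2.3.2 and gradio to 5.45.0 and add polars 1.33.1; refreshing an existing environment is just a reinstall from the updated file:

  python -m pip install -r requirements.txt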
tools/cli_redact.py DELETED
@@ -1,164 +0,0 @@
1
- import argparse
2
- import os
3
- import pandas as pd
4
- from tools.config import get_or_create_env_var, LOCAL_PII_OPTION, AWS_PII_OPTION, SELECTABLE_TEXT_EXTRACT_OPTION, TESSERACT_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION
5
- from tools.helper_functions import ensure_output_folder_exists
6
- from tools.file_conversion import get_input_file_names, prepare_image_or_pdf
7
- from tools.file_redaction import choose_and_run_redactor
8
- from tools.anonymisation import anonymise_files_with_open_text
9
-
10
- # --- Constants and Configuration ---
11
- INPUT_FOLDER = 'input/'
12
- OUTPUT_FOLDER = 'output/'
13
- DEFAULT_LANGUAGE = 'en'
14
-
15
- # Define entities for redaction
16
- chosen_comprehend_entities = [
17
- 'BANK_ACCOUNT_NUMBER', 'BANK_ROUTING', 'CREDIT_DEBIT_NUMBER',
18
- 'CREDIT_DEBIT_CVV', 'CREDIT_DEBIT_EXPIRY', 'PIN', 'EMAIL', 'ADDRESS',
19
- 'NAME', 'PHONE', 'PASSPORT_NUMBER', 'DRIVER_ID', 'USERNAME', 'PASSWORD',
20
- 'IP_ADDRESS', 'MAC_ADDRESS', 'LICENSE_PLATE', 'VEHICLE_IDENTIFICATION_NUMBER',
21
- 'UK_NATIONAL_INSURANCE_NUMBER', 'INTERNATIONAL_BANK_ACCOUNT_NUMBER',
22
- 'SWIFT_CODE', 'UK_NATIONAL_HEALTH_SERVICE_NUMBER'
23
- ]
24
- chosen_redact_entities = [
25
- "TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE"
26
- ]
27
-
28
- # --- Main CLI Function ---
29
- def main():
30
- """
31
- A unified command-line interface to prepare, redact, and anonymise various document types.
32
- """
33
- parser = argparse.ArgumentParser(
34
- description='A versatile CLI for redacting PII from PDF/image files and anonymising Word/tabular data.',
35
- formatter_class=argparse.RawTextHelpFormatter
36
- )
37
-
38
- # --- General Arguments (apply to all file types) ---
39
- general_group = parser.add_argument_group('General Options')
40
- general_group.add_argument('--input_file', required=True, help='Path to the input file to process.')
41
- general_group.add_argument('--output_dir', default=OUTPUT_FOLDER, help='Directory for all output files.')
42
- general_group.add_argument('--language', default=DEFAULT_LANGUAGE, help='Language of the document content.')
43
- general_group.add_argument('--allow_list', help='Path to a CSV file with words to exclude from redaction.')
44
- general_group.add_argument('--pii_detector',
45
- choices=[LOCAL_PII_OPTION, AWS_PII_OPTION],
46
- default=LOCAL_PII_OPTION,
47
- help='Core PII detection method (Local or AWS).')
48
- general_group.add_argument('--aws_access_key', default='', help='Your AWS Access Key ID.')
49
- general_group.add_argument('--aws_secret_key', default='', help='Your AWS Secret Access Key.')
50
-
51
- # --- PDF/Image Redaction Arguments ---
52
- pdf_group = parser.add_argument_group('PDF/Image Redaction Options (.pdf, .png, .jpg)')
53
- pdf_group.add_argument('--ocr_method',
54
- choices=[SELECTABLE_TEXT_EXTRACT_OPTION, TESSERACT_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION],
55
- default=TESSERACT_TEXT_EXTRACT_OPTION,
56
- help='OCR method for text extraction from images.')
57
- pdf_group.add_argument('--page_min', type=int, default=0, help='First page to redact.')
58
- pdf_group.add_argument('--page_max', type=int, default=999, help='Last page to redact.')
59
- pdf_group.add_argument('--prepare_for_review', action='store_true', help='Prepare files for reviewing redactions.')
60
- pdf_group.add_argument('--no_images', action='store_false', dest='prepare_images', help='Disable image creation for PDF pages.')
61
-
62
- # --- Word/Tabular Anonymisation Arguments ---
63
- tabular_group = parser.add_argument_group('Word/Tabular Anonymisation Options (.docx, .csv, .xlsx)')
64
- tabular_group.add_argument('--anon_strat', choices=['redact', 'encrypt', 'hash'], default='redact', help='The anonymisation strategy to apply.')
65
- tabular_group.add_argument('--columns', nargs='+', default=[], help='A list of column names to anonymise in tabular data.')
66
- tabular_group.add_argument('--excel_sheets', nargs='+', default=[], help='Specific Excel sheet names to process.')
67
- tabular_group.add_argument('--deny_list', help='Path to a CSV file with specific terms/phrases to redact.')
68
- tabular_group.add_argument('--fuzzy_mistakes', type=int, default=1, help='Number of allowed spelling mistakes for fuzzy matching.')
69
-
70
- args = parser.parse_args()
71
-
72
- # --- Initial Setup ---
73
- ensure_output_folder_exists(args.output_dir)
74
- _, file_extension = os.path.splitext(args.input_file)
75
- file_extension = file_extension.lower()
76
-
77
- # Load allow/deny lists
78
- allow_list = pd.read_csv(args.allow_list) if args.allow_list else pd.DataFrame()
79
- deny_list = pd.read_csv(args.deny_list).iloc[:, 0].tolist() if args.deny_list else []
80
-
81
-
82
- # --- Route to the Correct Workflow Based on File Type ---
83
-
84
- # Workflow 1: PDF/Image Redaction
85
- if file_extension in ['.pdf', '.png', '.jpg', '.jpeg']:
86
- print("--- Detected PDF/Image file. Starting Redaction Workflow... ---")
87
- try:
88
- # Step 1: Prepare the document
89
- print("\nStep 1: Preparing document...")
90
- (
91
- prep_summary, prepared_pdf_paths, image_file_paths, _, _, pdf_doc,
92
- image_annotations, _, original_cropboxes, page_sizes, textract_output_found, _, _, _, _
93
- ) = prepare_image_or_pdf(
94
- file_paths=[args.input_file], text_extract_method=args.ocr_method,
95
- all_line_level_ocr_results_df=pd.DataFrame(), all_page_line_level_ocr_results_with_words_df=pd.DataFrame(),
96
- first_loop_state=True, prepare_for_review=args.prepare_for_review,
97
- output_folder=args.output_dir, prepare_images=args.prepare_images
98
- )
99
- print(f"Preparation complete. {prep_summary}")
100
-
101
- # Step 2: Redact the prepared document
102
- print("\nStep 2: Running redaction...")
103
- (
104
- output_summary, output_files, _, _, log_files, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _
105
- ) = choose_and_run_redactor(
106
- file_paths=[args.input_file], prepared_pdf_file_paths=prepared_pdf_paths,
107
- pdf_image_file_paths=image_file_paths, chosen_redact_entities=chosen_redact_entities,
108
- chosen_redact_comprehend_entities=chosen_comprehend_entities, text_extraction_method=args.ocr_method,
109
- in_allow_list=allow_list, first_loop_state=True, page_min=args.page_min, page_max=args.page_max,
110
- pymupdf_doc=pdf_doc, annotations_all_pages=image_annotations, page_sizes=page_sizes,
111
- document_cropboxes=original_cropboxes, pii_identification_method=args.pii_detector,
112
- aws_access_key_textbox=args.aws_access_key, aws_secret_key_textbox=args.aws_secret_key,
113
- language=args.language, output_folder=args.output_dir
114
- )
115
-
116
- print("\n--- Redaction Process Complete ---")
117
- print(f"Summary: {output_summary}")
118
- print(f"\nOutput files saved to: {args.output_dir}")
119
- print("Generated Files:", sorted(output_files))
120
- if log_files: print("Log Files:", sorted(log_files))
121
-
122
- except Exception as e:
123
- print(f"\nAn error occurred during the PDF/Image redaction workflow: {e}")
124
-
125
- # Workflow 2: Word/Tabular Data Anonymisation
126
- elif file_extension in ['.docx', '.xlsx', '.xls', '.csv', '.parquet']:
127
- print("--- Detected Word/Tabular file. Starting Anonymisation Workflow... ---")
128
- try:
129
- # Run the anonymisation function directly
130
- output_summary, output_files, _, _, log_files, _, _ = anonymise_files_with_open_text(
131
- file_paths=[args.input_file],
132
- in_text="", # Not used for file-based operations
133
- anon_strat=args.anon_strat,
134
- chosen_cols=args.columns,
135
- chosen_redact_entities=chosen_redact_entities,
136
- in_allow_list=allow_list,
137
- in_excel_sheets=args.excel_sheets,
138
- first_loop_state=True,
139
- output_folder=args.output_dir,
140
- in_deny_list=deny_list,
141
- max_fuzzy_spelling_mistakes_num=args.fuzzy_mistakes,
142
- pii_identification_method=args.pii_detector,
143
- chosen_redact_comprehend_entities=chosen_comprehend_entities,
144
- aws_access_key_textbox=args.aws_access_key,
145
- aws_secret_key_textbox=args.aws_secret_key,
146
- language=args.language
147
- )
148
-
149
- print("\n--- Anonymisation Process Complete ---")
150
- print(f"Summary: {output_summary}")
151
- print(f"\nOutput files saved to: {args.output_dir}")
152
- print("Generated Files:", sorted(output_files))
153
- if log_files: print("Log Files:", sorted(log_files))
154
-
155
- except Exception as e:
156
- print(f"\nAn error occurred during the Word/Tabular anonymisation workflow: {e}")
157
-
158
- else:
159
- print(f"Error: Unsupported file type '{file_extension}'.")
160
- print("Supported types for redaction: .pdf, .png, .jpg, .jpeg")
161
- print("Supported types for anonymisation: .docx, .xlsx, .xls, .csv, .parquet")
162
-
163
- if __name__ == "__main__":
164
- main()
tools/config.py CHANGED
@@ -105,7 +105,7 @@ CUSTOM_HEADER_VALUE = get_or_create_env_var('CUSTOM_HEADER_VALUE', '')
105
  ###
106
  # Image options
107
  ###
108
- IMAGES_DPI = get_or_create_env_var('IMAGES_DPI', '300.0')
109
  LOAD_TRUNCATED_IMAGES = get_or_create_env_var('LOAD_TRUNCATED_IMAGES', 'True')
110
  MAX_IMAGE_PIXELS = get_or_create_env_var('MAX_IMAGE_PIXELS', '') # Changed to None if blank in file_conversion.py
111
 
@@ -232,6 +232,7 @@ if SHOW_AWS_TEXT_EXTRACTION_OPTIONS == 'True':
232
  aws_model_options.append(TEXTRACT_TEXT_EXTRACT_OPTION)
233
 
234
  TEXT_EXTRACTION_MODELS = local_model_options + aws_model_options
 
235
 
236
  SHOW_LOCAL_PII_DETECTION_OPTIONS = get_or_create_env_var('SHOW_LOCAL_PII_DETECTION_OPTIONS', 'True')
237
  SHOW_AWS_PII_DETECTION_OPTIONS = get_or_create_env_var('SHOW_AWS_PII_DETECTION_OPTIONS', 'True')
@@ -266,6 +267,11 @@ TABULAR_PII_DETECTION_MODELS = PII_DETECTION_MODELS.copy()
266
  if NO_REDACTION_PII_OPTION in TABULAR_PII_DETECTION_MODELS:
267
  TABULAR_PII_DETECTION_MODELS.remove(NO_REDACTION_PII_OPTION)
268
 
 
 
 
 
 
269
  ### Local OCR model - Tesseract vs PaddleOCR
270
  CHOSEN_LOCAL_OCR_MODEL = get_or_create_env_var('CHOSEN_LOCAL_OCR_MODEL', "tesseract") # Choose between "tesseract", "hybrid", and "paddle". "paddle" will only return whole line text extraction, and so will only work for OCR, not redaction. "hybrid" is a combination of the two - first pass through the redactions will be done with Tesseract, and then a second pass will be done with PaddleOCR on words with low confidence.
271
 
@@ -281,11 +287,20 @@ CHOSEN_REDACT_ENTITIES = get_or_create_env_var('CHOSEN_REDACT_ENTITIES', "['TITL
281
 
282
  FULL_ENTITY_LIST = get_or_create_env_var('FULL_ENTITY_LIST', "['TITLES', 'PERSON', 'PHONE_NUMBER', 'EMAIL_ADDRESS', 'STREETNAME', 'UKPOSTCODE', 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS', 'CUSTOM', 'CUSTOM_FUZZY']")
283
 
 
285
  # Number of pages to loop through before breaking the function and restarting from the last finished page (not currently activated).
286
- PAGE_BREAK_VALUE = get_or_create_env_var('PAGE_BREAK_VALUE', '99999')
287
 
288
- MAX_TIME_VALUE = get_or_create_env_var('MAX_TIME_VALUE', '999999')
289
 
290
  CUSTOM_BOX_COLOUR = get_or_create_env_var("CUSTOM_BOX_COLOUR", "") # only "grey" is currently supported as a custom box colour
291
 
@@ -306,10 +321,19 @@ aws_comprehend_language_choices = get_or_create_env_var("aws_comprehend_language
306
  MAPPED_LANGUAGE_CHOICES = get_or_create_env_var("MAPPED_LANGUAGE_CHOICES", "['english', 'french', 'german', 'spanish', 'italian', 'dutch', 'portuguese', 'chinese', 'japanese', 'korean', 'lithuanian', 'macedonian', 'norwegian_bokmaal', 'polish', 'romanian', 'russian', 'slovenian', 'swedish', 'catalan', 'ukrainian']")
307
  LANGUAGE_CHOICES = get_or_create_env_var("LANGUAGE_CHOICES", "['en', 'fr', 'de', 'es', 'it', 'nl', 'pt', 'zh', 'ja', 'ko', 'lt', 'mk', 'nb', 'pl', 'ro', 'ru', 'sl', 'sv', 'ca', 'uk']")
308
 
309
 
310
 
311
- ### File output options
312
-
 
313
  RETURN_PDF_END_OF_REDACTION = get_or_create_env_var("RETURN_PDF_END_OF_REDACTION", "True") # Return a redacted PDF at the end of the redaction task. Could be useful to set this to "False" if you want to ensure that the user always goes to the 'Review Redactions' tab before getting the final redacted PDF product.
314
 
315
  COMPRESS_REDACTED_PDF = get_or_create_env_var("COMPRESS_REDACTED_PDF","False") # On low memory systems, the compression options in pymupdf can cause the app to crash if the PDF is longer than 500 pages or so. Setting this to False will save the PDF only with a basic cleaning option enabled
@@ -319,27 +343,35 @@ COMPRESS_REDACTED_PDF = get_or_create_env_var("COMPRESS_REDACTED_PDF","False") #
319
  ###
320
 
321
  TLDEXTRACT_CACHE = get_or_create_env_var('TLDEXTRACT_CACHE', 'tmp/tld/')
322
- try:
323
- extract = TLDExtract(cache_dir=TLDEXTRACT_CACHE)
324
- except:
325
- extract = TLDExtract(cache_dir=None)
326
 
327
  # Get some environment variables and Launch the Gradio app
328
  COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
329
 
330
  RUN_DIRECT_MODE = get_or_create_env_var('RUN_DIRECT_MODE', '0')
331
 
 
 
 
 
 
 
332
  MAX_QUEUE_SIZE = int(get_or_create_env_var('MAX_QUEUE_SIZE', '5'))
333
 
334
- MAX_FILE_SIZE = get_or_create_env_var('MAX_FILE_SIZE', '250mb')
335
 
336
  GRADIO_SERVER_PORT = int(get_or_create_env_var('GRADIO_SERVER_PORT', '7860'))
337
 
338
  ROOT_PATH = get_or_create_env_var('ROOT_PATH', '')
339
 
340
- DEFAULT_CONCURRENCY_LIMIT = get_or_create_env_var('DEFAULT_CONCURRENCY_LIMIT', '3')
341
 
342
- GET_DEFAULT_ALLOW_LIST = get_or_create_env_var('GET_DEFAULT_ALLOW_LIST', '')
 
 
 
 
343
 
344
  ALLOW_LIST_PATH = get_or_create_env_var('ALLOW_LIST_PATH', '') # config/default_allow_list.csv
345
 
@@ -348,8 +380,27 @@ S3_ALLOW_LIST_PATH = get_or_create_env_var('S3_ALLOW_LIST_PATH', '') # default_a
348
  if ALLOW_LIST_PATH: OUTPUT_ALLOW_LIST_PATH = ALLOW_LIST_PATH
349
  else: OUTPUT_ALLOW_LIST_PATH = 'config/default_allow_list.csv'
350
 
351
- FILE_INPUT_HEIGHT = get_or_create_env_var('FILE_INPUT_HEIGHT', '200')
352
 
 
 
353
 
354
  ###
355
  # COST CODE OPTIONS
 
105
  ###
106
  # Image options
107
  ###
108
+ IMAGES_DPI = float(get_or_create_env_var('IMAGES_DPI', '300.0'))
109
  LOAD_TRUNCATED_IMAGES = get_or_create_env_var('LOAD_TRUNCATED_IMAGES', 'True')
110
  MAX_IMAGE_PIXELS = get_or_create_env_var('MAX_IMAGE_PIXELS', '') # Changed to None if blank in file_conversion.py
111
 
 
232
  aws_model_options.append(TEXTRACT_TEXT_EXTRACT_OPTION)
233
 
234
  TEXT_EXTRACTION_MODELS = local_model_options + aws_model_options
235
+ DO_INITIAL_TABULAR_DATA_CLEAN = get_or_create_env_var('DO_INITIAL_TABULAR_DATA_CLEAN', 'True')
236
 
237
  SHOW_LOCAL_PII_DETECTION_OPTIONS = get_or_create_env_var('SHOW_LOCAL_PII_DETECTION_OPTIONS', 'True')
238
  SHOW_AWS_PII_DETECTION_OPTIONS = get_or_create_env_var('SHOW_AWS_PII_DETECTION_OPTIONS', 'True')
 
267
  if NO_REDACTION_PII_OPTION in TABULAR_PII_DETECTION_MODELS:
268
  TABULAR_PII_DETECTION_MODELS.remove(NO_REDACTION_PII_OPTION)
269
 
270
+ DEFAULT_TEXT_COLUMNS = get_or_create_env_var('DEFAULT_TEXT_COLUMNS', "[]")
271
+ DEFAULT_EXCEL_SHEETS = get_or_create_env_var('DEFAULT_EXCEL_SHEETS', "[]")
272
+
273
+ DEFAULT_TABULAR_ANONYMISATION_STRATEGY = get_or_create_env_var('DEFAULT_TABULAR_ANONYMISATION_STRATEGY', "redact completely")
274
+
275
  ### Local OCR model - Tesseract vs PaddleOCR
276
  CHOSEN_LOCAL_OCR_MODEL = get_or_create_env_var('CHOSEN_LOCAL_OCR_MODEL', "tesseract") # Choose between "tesseract", "hybrid", and "paddle". "paddle" will only return whole line text extraction, and so will only work for OCR, not redaction. "hybrid" is a combination of the two - first pass through the redactions will be done with Tesseract, and then a second pass will be done with PaddleOCR on words with low confidence.
277
 
 
287
 
288
  FULL_ENTITY_LIST = get_or_create_env_var('FULL_ENTITY_LIST', "['TITLES', 'PERSON', 'PHONE_NUMBER', 'EMAIL_ADDRESS', 'STREETNAME', 'UKPOSTCODE', 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS', 'CUSTOM', 'CUSTOM_FUZZY']")
289
 
290
+ DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX = get_or_create_env_var('DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX', "['Extract handwriting']")
291
+
292
+ DEFAULT_SEARCH_QUERY = get_or_create_env_var('DEFAULT_SEARCH_QUERY', '')
293
+ DEFAULT_FUZZY_SPELLING_MISTAKES_NUM = int(get_or_create_env_var('DEFAULT_FUZZY_SPELLING_MISTAKES_NUM', '1'))
294
+
295
+ DEFAULT_PAGE_MIN = int(get_or_create_env_var('DEFAULT_PAGE_MIN', '0'))
296
+
297
+ DEFAULT_PAGE_MAX = int(get_or_create_env_var('DEFAULT_PAGE_MAX', '999'))
298
+
299
 
300
  # Number of pages to loop through before breaking the function and restarting from the last finished page (not currently activated).
301
+ PAGE_BREAK_VALUE = int(get_or_create_env_var('PAGE_BREAK_VALUE', '99999'))
302
 
303
+ MAX_TIME_VALUE = int(get_or_create_env_var('MAX_TIME_VALUE', '999999'))
304
 
305
  CUSTOM_BOX_COLOUR = get_or_create_env_var("CUSTOM_BOX_COLOUR", "") # only "grey" is currently supported as a custom box colour
306
 
 
321
  MAPPED_LANGUAGE_CHOICES = get_or_create_env_var("MAPPED_LANGUAGE_CHOICES", "['english', 'french', 'german', 'spanish', 'italian', 'dutch', 'portuguese', 'chinese', 'japanese', 'korean', 'lithuanian', 'macedonian', 'norwegian_bokmaal', 'polish', 'romanian', 'russian', 'slovenian', 'swedish', 'catalan', 'ukrainian']")
322
  LANGUAGE_CHOICES = get_or_create_env_var("LANGUAGE_CHOICES", "['en', 'fr', 'de', 'es', 'it', 'nl', 'pt', 'zh', 'ja', 'ko', 'lt', 'mk', 'nb', 'pl', 'ro', 'ru', 'sl', 'sv', 'ca', 'uk']")
323
 
324
+ ###
325
+ # Duplicate detection settings
326
+ ###
327
+ DEFAULT_DUPLICATE_DETECTION_THRESHOLD = float(get_or_create_env_var("DEFAULT_DUPLICATE_DETECTION_THRESHOLD", "0.95"))
328
+ DEFAULT_MIN_CONSECUTIVE_PAGES = int(get_or_create_env_var("DEFAULT_MIN_CONSECUTIVE_PAGES", "1"))
329
+ USE_GREEDY_DUPLICATE_DETECTION = get_or_create_env_var("USE_GREEDY_DUPLICATE_DETECTION", "True")
330
+ DEFAULT_COMBINE_PAGES = get_or_create_env_var("DEFAULT_COMBINE_PAGES", "True")
331
+ DEFAULT_MIN_WORD_COUNT = int(get_or_create_env_var("DEFAULT_MIN_WORD_COUNT", "10"))
332
 
333
 
334
+ ###
335
+ # File output options
336
+ ###
337
  RETURN_PDF_END_OF_REDACTION = get_or_create_env_var("RETURN_PDF_END_OF_REDACTION", "True") # Return a redacted PDF at the end of the redaction task. Could be useful to set this to "False" if you want to ensure that the user always goes to the 'Review Redactions' tab before getting the final redacted PDF product.
338
 
339
  COMPRESS_REDACTED_PDF = get_or_create_env_var("COMPRESS_REDACTED_PDF","False") # On low memory systems, the compression options in pymupdf can cause the app to crash if the PDF is longer than 500 pages or so. Setting this to False will save the PDF only with a basic cleaning option enabled
 
343
  ###
344
 
345
  TLDEXTRACT_CACHE = get_or_create_env_var('TLDEXTRACT_CACHE', 'tmp/tld/')
346
+ try: extract = TLDExtract(cache_dir=TLDEXTRACT_CACHE)
347
+ except: extract = TLDExtract(cache_dir=None)
 
 
348
 
349
  # Get some environment variables and Launch the Gradio app
350
  COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
351
 
352
  RUN_DIRECT_MODE = get_or_create_env_var('RUN_DIRECT_MODE', '0')
353
 
354
+ # Direct mode configuration options
355
+ DIRECT_MODE_TASK = get_or_create_env_var('DIRECT_MODE_TASK', 'redact') # 'redact' or 'deduplicate'
356
+ DIRECT_MODE_INPUT_FILE = get_or_create_env_var('DIRECT_MODE_INPUT_FILE', '') # Path to input file
357
+ DIRECT_MODE_OUTPUT_DIR = get_or_create_env_var('DIRECT_MODE_OUTPUT_DIR', OUTPUT_FOLDER) # Output directory
358
+ DIRECT_MODE_DUPLICATE_TYPE = get_or_create_env_var('DIRECT_MODE_DUPLICATE_TYPE', 'pages') # 'pages' or 'tabular'
359
+
360
  MAX_QUEUE_SIZE = int(get_or_create_env_var('MAX_QUEUE_SIZE', '5'))
361
 
362
+ MAX_FILE_SIZE = get_or_create_env_var('MAX_FILE_SIZE', '250mb').lower()
363
 
364
  GRADIO_SERVER_PORT = int(get_or_create_env_var('GRADIO_SERVER_PORT', '7860'))
365
 
366
  ROOT_PATH = get_or_create_env_var('ROOT_PATH', '')
367
 
368
+ DEFAULT_CONCURRENCY_LIMIT = int(get_or_create_env_var('DEFAULT_CONCURRENCY_LIMIT', '3'))
369
 
370
+ FILE_INPUT_HEIGHT = get_or_create_env_var('FILE_INPUT_HEIGHT', '200')
371
+
372
+ ### ALLOW LIST
373
+
374
+ GET_DEFAULT_ALLOW_LIST = get_or_create_env_var('GET_DEFAULT_ALLOW_LIST', 'False')
375
 
376
  ALLOW_LIST_PATH = get_or_create_env_var('ALLOW_LIST_PATH', '') # config/default_allow_list.csv
377
 
 
380
  if ALLOW_LIST_PATH: OUTPUT_ALLOW_LIST_PATH = ALLOW_LIST_PATH
381
  else: OUTPUT_ALLOW_LIST_PATH = 'config/default_allow_list.csv'
382
 
383
+ ### DENY LIST
384
+
385
+ GET_DEFAULT_DENY_LIST = get_or_create_env_var('GET_DEFAULT_DENY_LIST', 'False')
386
+
387
+ S3_DENY_LIST_PATH = get_or_create_env_var('S3_DENY_LIST_PATH', '') # default_deny_list.csv # This is a path within the DOCUMENT_REDACTION_BUCKET
388
+
389
+ DENY_LIST_PATH = get_or_create_env_var('DENY_LIST_PATH', '') # config/default_deny_list.csv
390
+
391
+ if DENY_LIST_PATH: OUTPUT_DENY_LIST_PATH = DENY_LIST_PATH
392
+ else: OUTPUT_DENY_LIST_PATH = 'config/default_deny_list.csv'
393
+
394
+ ### WHOLE PAGE REDACTION LIST
395
+
396
+ GET_DEFAULT_WHOLE_PAGE_REDACTION_LIST = get_or_create_env_var('GET_DEFAULT_WHOLE_PAGE_REDACTION_LIST', 'False')
397
+
398
+ S3_WHOLE_PAGE_REDACTION_LIST_PATH = get_or_create_env_var('S3_WHOLE_PAGE_REDACTION_LIST_PATH', '') # default_whole_page_redaction_list.csv # This is a path within the DOCUMENT_REDACTION_BUCKET
399
+
400
+ WHOLE_PAGE_REDACTION_LIST_PATH = get_or_create_env_var('WHOLE_PAGE_REDACTION_LIST_PATH', '') # config/default_whole_page_redaction_list.csv
401
 
402
+ if WHOLE_PAGE_REDACTION_LIST_PATH: OUTPUT_WHOLE_PAGE_REDACTION_LIST_PATH = WHOLE_PAGE_REDACTION_LIST_PATH
403
+ else: OUTPUT_WHOLE_PAGE_REDACTION_LIST_PATH = 'config/default_whole_page_redaction_list.csv'
404
 
405
  ###
406
  # COST CODE OPTIONS
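The new settings are all read through get_or_create_env_var, so they can be supplied as ordinary environment variables before launch. A sketch of driving the new direct mode for tabular deduplication from the shell — this assumes app.py checks RUN_DIRECT_MODE and the DIRECT_MODE_* values to skip the Gradio UI, which is not visible here because the app.py diff is too large to render:

  export RUN_DIRECT_MODE=1
  export DIRECT_MODE_TASK=deduplicate
  export DIRECT_MODE_DUPLICATE_TYPE=tabular
  export DIRECT_MODE_INPUT_FILE=example_data.csv   # illustrative path
  export DIRECT_MODE_OUTPUT_DIR=output/
  export DEFAULT_DUPLICATE_DETECTION_THRESHOLD=0.95
  export DEFAULT_MIN_WORD_COUNT=10
  python app.py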
tools/custom_csvlogger.py CHANGED
@@ -78,7 +78,7 @@ class CSVLogger_custom(FlaggingCallback):
78
 
79
  if replacement_headers:
80
  if additional_headers is None:
81
- additional_headers = []
82
 
83
  if len(replacement_headers) != len(self.components):
84
  raise ValueError(
@@ -143,18 +143,16 @@ class CSVLogger_custom(FlaggingCallback):
143
  replacement_headers: list[str] | None = None
144
  ) -> int:
145
  if self.first_time:
146
- print("First time creating file")
147
- additional_headers = []
148
  if flag_option is not None:
149
  additional_headers.append("flag")
150
  if username is not None:
151
  additional_headers.append("username")
152
  additional_headers.append("id")
153
- #additional_headers.append("timestamp")
154
  self._create_dataset_file(additional_headers=additional_headers, replacement_headers=replacement_headers)
155
  self.first_time = False
156
 
157
- csv_data = []
158
  for idx, (component, sample) in enumerate(
159
  zip(self.components, flag_data, strict=False)
160
  ):
@@ -214,7 +212,6 @@ class CSVLogger_custom(FlaggingCallback):
214
  try:
215
  print("Connecting to DynamoDB via existing SSO connection")
216
  dynamodb = boto3.resource('dynamodb', region_name=AWS_REGION)
217
- #client = boto3.client('dynamodb')
218
 
219
  test_connection = dynamodb.meta.client.list_tables()
220
 
@@ -224,8 +221,6 @@ class CSVLogger_custom(FlaggingCallback):
224
  print("Trying DynamoDB credentials from environment variables")
225
  dynamodb = boto3.resource('dynamodb',aws_access_key_id=AWS_ACCESS_KEY,
226
  aws_secret_access_key=AWS_SECRET_KEY, region_name=AWS_REGION)
227
- # client = boto3.client('dynamodb',aws_access_key_id=AWS_ACCESS_KEY,
228
- # aws_secret_access_key=AWS_SECRET_KEY, region_name=AWS_REGION)
229
  else:
230
  raise Exception("AWS credentials for DynamoDB logging not found")
231
  else:
@@ -234,12 +229,9 @@ class CSVLogger_custom(FlaggingCallback):
234
  if dynamodb_table_name is None:
235
  raise ValueError("You must provide a dynamodb_table_name if save_to_dynamodb is True")
236
 
237
- if dynamodb_headers:
238
- dynamodb_headers = dynamodb_headers
239
- if not dynamodb_headers and replacement_headers:
240
- dynamodb_headers = replacement_headers
241
- elif headers:
242
- dynamodb_headers = headers
243
  elif not dynamodb_headers:
244
  raise ValueError("Headers not found. You must provide dynamodb_headers or replacement_headers to create a new table.")
245
 
@@ -261,9 +253,6 @@ class CSVLogger_custom(FlaggingCallback):
261
  except botocore.exceptions.ClientError as e:
262
  if e.response['Error']['Code'] == 'ResourceNotFoundException':
263
 
264
- #print(f"Creating DynamoDB table '{dynamodb_table_name}'...")
265
- #print("dynamodb_headers:", dynamodb_headers)
266
-
267
  attribute_definitions = [
268
  {'AttributeName': 'id', 'AttributeType': 'S'} # Only define key attributes here
269
  ]
@@ -288,18 +277,12 @@ class CSVLogger_custom(FlaggingCallback):
288
  try:
289
  item = {
290
  'id': str(generated_id), # UUID primary key
291
- #'created_by': username if username else "unknown",
292
  'timestamp': timestamp,
293
  }
294
 
295
- #print("dynamodb_headers:", dynamodb_headers)
296
- #print("csv_data:", csv_data)
297
-
298
  # Map the headers to values
299
  item.update({header: str(value) for header, value in zip(dynamodb_headers, csv_data)})
300
 
301
- #print("item:", item)
302
-
303
  table.put_item(Item=item)
304
 
305
  print("Successfully uploaded log to DynamoDB")
 
78
 
79
  if replacement_headers:
80
  if additional_headers is None:
81
+ additional_headers = list()
82
 
83
  if len(replacement_headers) != len(self.components):
84
  raise ValueError(
 
143
  replacement_headers: list[str] | None = None
144
  ) -> int:
145
  if self.first_time:
146
+ additional_headers = list()
 
147
  if flag_option is not None:
148
  additional_headers.append("flag")
149
  if username is not None:
150
  additional_headers.append("username")
151
  additional_headers.append("id")
 
152
  self._create_dataset_file(additional_headers=additional_headers, replacement_headers=replacement_headers)
153
  self.first_time = False
154
 
155
+ csv_data = list()
156
  for idx, (component, sample) in enumerate(
157
  zip(self.components, flag_data, strict=False)
158
  ):
 
212
  try:
213
  print("Connecting to DynamoDB via existing SSO connection")
214
  dynamodb = boto3.resource('dynamodb', region_name=AWS_REGION)
 
215
 
216
  test_connection = dynamodb.meta.client.list_tables()
217
 
 
221
  print("Trying DynamoDB credentials from environment variables")
222
  dynamodb = boto3.resource('dynamodb',aws_access_key_id=AWS_ACCESS_KEY,
223
  aws_secret_access_key=AWS_SECRET_KEY, region_name=AWS_REGION)
 
 
224
  else:
225
  raise Exception("AWS credentials for DynamoDB logging not found")
226
  else:
 
229
  if dynamodb_table_name is None:
230
  raise ValueError("You must provide a dynamodb_table_name if save_to_dynamodb is True")
231
 
232
+ if dynamodb_headers: dynamodb_headers = dynamodb_headers
233
+ if not dynamodb_headers and replacement_headers: dynamodb_headers = replacement_headers
234
+ elif headers: dynamodb_headers = headers
 
 
 
235
  elif not dynamodb_headers:
236
  raise ValueError("Headers not found. You must provide dynamodb_headers or replacement_headers to create a new table.")
237
 
 
253
  except botocore.exceptions.ClientError as e:
254
  if e.response['Error']['Code'] == 'ResourceNotFoundException':
255
 
 
 
 
256
  attribute_definitions = [
257
  {'AttributeName': 'id', 'AttributeType': 'S'} # Only define key attributes here
258
  ]
 
277
  try:
278
  item = {
279
  'id': str(generated_id), # UUID primary key
 
280
  'timestamp': timestamp,
281
  }
282
 
 
 
 
283
  # Map the headers to values
284
  item.update({header: str(value) for header, value in zip(dynamodb_headers, csv_data)})
285
 
 
 
286
  table.put_item(Item=item)
287
 
288
  print("Successfully uploaded log to DynamoDB")
tools/data_anonymise.py CHANGED
@@ -6,6 +6,8 @@ import time
6
  import boto3
7
  import botocore
8
  import pandas as pd
 
 
9
  import docx
10
  import gradio as gr
11
  from openpyxl import Workbook
@@ -16,37 +18,76 @@ from botocore.client import BaseClient
16
  from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
17
  from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, DictAnalyzerResult, RecognizerResult
18
  from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
19
- from tools.config import RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, OUTPUT_FOLDER, DEFAULT_LANGUAGE, aws_comprehend_language_choices
20
  from tools.helper_functions import get_file_name_without_type, read_file, detect_file_type
21
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_word_list_recogniser, CustomWordFuzzyRecognizer, custom_entities, create_nlp_analyser, load_spacy_model
22
  # Use custom version of analyze_dict to be able to track progress
23
  from tools.presidio_analyzer_custom import analyze_dict
24
 
 
 
25
 
26
  fake = Faker("en_UK")
27
  def fake_first_name(x):
28
  return fake.first_name()
29
 
30
- def initial_clean(text:str) -> str:
31
- #### Some of my cleaning functions
32
- html_pattern_regex = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0|&nbsp;'
33
- html_start_pattern_end_dots_regex = r'<(.*?)\.\.'
34
- non_ascii_pattern = r'[^\x00-\x7F]+'
35
- multiple_spaces_regex = r'\s{2,}'
36
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  # Define a list of patterns and their replacements
38
  patterns = [
 
 
 
39
  (html_pattern_regex, ' '),
40
  (html_start_pattern_end_dots_regex, ' '),
41
  (non_ascii_pattern, ' '),
42
- (multiple_spaces_regex, ' ')
 
 
43
  ]
44
 
45
  # Apply each regex replacement
46
  for pattern, replacement in patterns:
47
- text = re.sub(pattern, replacement, text)
48
 
49
- return text
 
 
 
50
 
51
  def process_recognizer_result(result:RecognizerResult, recognizer_result:RecognizerResult, data_row:int, dictionary_key:int, df_dict:Dict[str, List[Any]], keys_to_keep:List[str]) -> List[str]:
52
  output = list()
@@ -275,7 +316,7 @@ def handle_docx_anonymisation(
275
 
276
  output_xlsx_path = os.path.join(output_folder, f"{file_name_without_ext}_redacted.csv")
277
 
278
- anonymised_df.to_csv(output_xlsx_path, encoding="utf-8-sig")
279
  doc.save(output_docx_path)
280
 
281
  with open(log_file_path, "w", encoding="utf-8-sig") as f:
@@ -304,6 +345,7 @@ def anonymise_files_with_open_text(file_paths: List[str],
304
  aws_access_key_textbox:str='',
305
  aws_secret_key_textbox:str='',
306
  actual_time_taken_number:float=0,
 
307
  language: Optional[str] = None,
308
  progress: Progress = Progress(track_tqdm=True)):
309
  """
@@ -334,6 +376,7 @@ def anonymise_files_with_open_text(file_paths: List[str],
334
  - actual_time_taken_number (float, optional): Time taken to do the redaction.
335
  - language (str, optional): The language of the text to anonymise.
336
  - progress (Progress, optional): A Progress object to track progress. Defaults to a Progress object with track_tqdm=True.
 
337
  """
338
 
339
  tic = time.perf_counter()
@@ -431,7 +474,7 @@ def anonymise_files_with_open_text(file_paths: List[str],
431
  sheet_name = ""
432
  file_type = ""
433
 
434
- out_file_paths, out_message, key_string, log_files_output_paths = tabular_anonymise_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, output_folder=OUTPUT_FOLDER)
435
  else:
436
  # If file is an xlsx, we are going to run through all the Excel sheets to anonymise them separately.
437
  file_type = detect_file_type(anon_file)
@@ -482,14 +525,14 @@ def anonymise_files_with_open_text(file_paths: List[str],
482
 
483
  anon_df = pd.read_excel(anon_file, sheet_name=sheet_name)
484
 
485
- out_file_paths, out_message, key_string, log_files_output_paths = tabular_anonymise_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, anon_xlsx_export_file_name, log_files_output_paths, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, language, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, output_folder=output_folder)
486
 
487
  else:
488
  sheet_name = ""
489
  anon_df = read_file(anon_file)
490
  out_file_part = get_file_name_without_type(anon_file.name)
491
 
492
- out_file_paths, out_message, key_string, log_files_output_paths = tabular_anonymise_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, language, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, output_folder=output_folder)
493
 
494
  # Increase latest file completed count unless we are at the last file
495
  if latest_file_completed != len(file_paths):
@@ -543,7 +586,8 @@ def tabular_anonymise_wrapper_func(
543
  comprehend_query_number:int=0,
544
  comprehend_client:botocore.client.BaseClient="",
545
  nlp_analyser: AnalyzerEngine = nlp_analyser,
546
- output_folder: str = OUTPUT_FOLDER
 
547
  ):
548
  """
549
  This function wraps the anonymisation process for a given dataframe. It filters the dataframe based on chosen columns, applies the specified anonymisation strategy using the anonymise_script function, and exports the anonymised data to a file.
@@ -570,6 +614,7 @@ def tabular_anonymise_wrapper_func(
570
  - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
571
  - comprehend_client (optional): The client object from AWS containing a client connection to AWS Comprehend if that option is chosen on the first tab.
572
  - output_folder: The folder where the anonymized files will be saved. Defaults to the 'output_folder' variable.
 
573
  """
574
  def check_lists(list1, list2):
575
  return any(string in list2 for string in list1)
@@ -610,12 +655,15 @@ def tabular_anonymise_wrapper_func(
610
  # Split dataframe to keep only selected columns
611
  #print("Remaining columns to redact:", chosen_cols_in_anon_df)
612
 
 
 
 
613
  anon_df_part = anon_df[chosen_cols_in_anon_df]
614
  anon_df_remain = anon_df.drop(chosen_cols_in_anon_df, axis = 1)
615
 
616
 
617
  # Anonymise the selected columns
618
- anon_df_part_out, key_string, decision_process_output_str = anonymise_script(anon_df_part, anon_strat, language, chosen_redact_entities, in_allow_list, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, nlp_analyser=nlp_analyser)
619
 
620
  anon_df_part_out.replace("^nan$", "", regex=True, inplace=True)
621
 
@@ -683,20 +731,35 @@ def anonymise_script(df:pd.DataFrame,
683
  comprehend_client:botocore.client.BaseClient="",
684
  custom_entities:List[str]=custom_entities,
685
  nlp_analyser: AnalyzerEngine = nlp_analyser,
686
- progress:Progress=Progress(track_tqdm=False)):
 
687
  '''
688
  Conduct anonymisation of a dataframe using Presidio and/or AWS Comprehend if chosen.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
689
  '''
690
 
691
  print("Identifying personal information")
692
  analyse_tic = time.perf_counter()
693
 
694
  # Initialize analyzer_results as an empty dictionary to store results by column
695
- results_by_column = {}
696
- key_string = ""
697
-
698
- # DataFrame to dict
699
- df_dict = df.to_dict(orient="list")
700
 
701
  if isinstance(in_allow_list, list):
702
  if in_allow_list:
@@ -714,13 +777,14 @@ def anonymise_script(df:pd.DataFrame,
714
  ### Language check - check if selected language packs exist
715
  try:
716
  if language != "en":
717
- progress(0.1, desc=f"Loading SpaCy model for {language}")
718
 
719
  load_spacy_model(language)
720
 
721
  except Exception as e:
722
- print(f"Error downloading language packs for {language}: {e}")
723
- raise Exception(f"Error downloading language packs for {language}: {e}")
 
724
 
725
  # Try updating the supported languages for the spacy analyser
726
  try:
@@ -730,8 +794,9 @@ def anonymise_script(df:pd.DataFrame,
730
  gr.Info(f"Language: {language} only supports the following entity detection: {str(nlp_analyser.registry.get_supported_entities(languages=[language]))}")
731
 
732
  except Exception as e:
733
- print(f"Error creating nlp_analyser for {language}: {e}")
734
- raise Exception(f"Error creating nlp_analyser for {language}: {e}")
 
735
 
736
  if isinstance(in_deny_list, pd.DataFrame):
737
  if not in_deny_list.empty:
@@ -758,6 +823,14 @@ def anonymise_script(df:pd.DataFrame,
758
  batch_anonymizer = BatchAnonymizerEngine(anonymizer_engine = anonymizer)
759
  analyzer_results = list()
760
 
 
 
 
 
 
 
 
 
761
  if pii_identification_method == "Local":
762
 
763
  # Use custom analyzer to be able to track progress with Gradio
 
6
  import boto3
7
  import botocore
8
  import pandas as pd
9
+ import polars as pl
10
+ import unicodedata
11
  import docx
12
  import gradio as gr
13
  from openpyxl import Workbook
 
18
  from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
19
  from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, DictAnalyzerResult, RecognizerResult
20
  from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
21
+ from tools.config import RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, OUTPUT_FOLDER, DEFAULT_LANGUAGE, aws_comprehend_language_choices, DO_INITIAL_TABULAR_DATA_CLEAN
22
  from tools.helper_functions import get_file_name_without_type, read_file, detect_file_type
23
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_word_list_recogniser, CustomWordFuzzyRecognizer, custom_entities, create_nlp_analyser, load_spacy_model
24
  # Use custom version of analyze_dict to be able to track progress
25
  from tools.presidio_analyzer_custom import analyze_dict
26
 
27
+ if DO_INITIAL_TABULAR_DATA_CLEAN == "True": DO_INITIAL_TABULAR_DATA_CLEAN = True
28
+ else: DO_INITIAL_TABULAR_DATA_CLEAN = False
29
 
30
  fake = Faker("en_UK")
31
  def fake_first_name(x):
32
  return fake.first_name()
33
 
34
+ # Text cleaning regex patterns and helper function used before tabular anonymisation
35
+ url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+|(?:www\.)[a-zA-Z0-9._-]+\.[a-zA-Z]{2,}'
36
+ html_pattern_regex = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0|&nbsp;'
37
+ html_start_pattern_end_dots_regex = r'<(.*?)\.\.'
38
+ non_ascii_pattern = r'[^\x00-\x7F]+'
39
+ and_sign_regex = r'&'
40
+ multiple_spaces_regex = r'\s{2,}'
41
+ multiple_new_lines_regex = r'(\r\n|\n)+'
42
+ multiple_punctuation_regex = r"(\p{P})\p{P}+"
43
+
44
+ def initial_clean(texts:pd.Series) -> pd.Series:
45
+ '''
46
+ Clean a series of text values: normalise unicode and smart punctuation, then strip URLs, HTML tags, non-ASCII characters and redundant whitespace.
47
+ '''
48
+ texts = texts.copy()
+ for i, text in texts.items():
49
+ if not text or pd.isnull(text):
50
+ text = ""
51
+
52
+ # Normalize unicode characters to decompose any special forms
53
+ normalized_text = unicodedata.normalize('NFKC', str(text))
54
+
55
+ # Replace smart quotes and special punctuation with standard ASCII equivalents
56
+ replacements = {
57
+ '‘': "'", '’': "'", '“': '"', '”': '"',
58
+ '–': '-', '—': '-', '…': '...', '•': '*',
59
+ }
60
+
61
+ # Perform replacements
62
+ for old_char, new_char in replacements.items():
63
+ normalized_text = normalized_text.replace(old_char, new_char)
64
+
65
+ texts.at[i] = normalized_text
66
+
67
+ # Convert to polars Series
68
+ texts = pl.Series(texts).str.strip_chars()
69
+
70
  # Define a list of patterns and their replacements
71
  patterns = [
72
+ (multiple_new_lines_regex, ' '),
73
+ (r'\r', ''),
74
+ (url_pattern, ' '),
75
  (html_pattern_regex, ' '),
76
  (html_start_pattern_end_dots_regex, ' '),
77
  (non_ascii_pattern, ' '),
78
+ (multiple_spaces_regex, ' '),
79
+ (multiple_punctuation_regex, "${1}"),
80
+ (and_sign_regex, 'and')
81
  ]
82
 
83
  # Apply each regex replacement
84
  for pattern, replacement in patterns:
85
+ texts = texts.str.replace_all(pattern, replacement)
86
 
87
+ # Convert the series back to a list
88
+ texts = texts.to_list()
89
+
90
+ return texts
91
 
92
  def process_recognizer_result(result:RecognizerResult, recognizer_result:RecognizerResult, data_row:int, dictionary_key:int, df_dict:Dict[str, List[Any]], keys_to_keep:List[str]) -> List[str]:
93
  output = list()
 
316
 
317
  output_xlsx_path = os.path.join(output_folder, f"{file_name_without_ext}_redacted.csv")
318
 
319
+ anonymised_df.to_csv(output_xlsx_path, encoding="utf-8-sig", index=None)
320
  doc.save(output_docx_path)
321
 
322
  with open(log_file_path, "w", encoding="utf-8-sig") as f:
 
345
  aws_access_key_textbox:str='',
346
  aws_secret_key_textbox:str='',
347
  actual_time_taken_number:float=0,
348
+ do_initial_clean:bool=DO_INITIAL_TABULAR_DATA_CLEAN,
349
  language: Optional[str] = None,
350
  progress: Progress = Progress(track_tqdm=True)):
351
  """
 
376
  - actual_time_taken_number (float, optional): Time taken to do the redaction.
377
  - language (str, optional): The language of the text to anonymise.
378
  - progress (Progress, optional): A Progress object to track progress. Defaults to a Progress object with track_tqdm=True.
379
+ - do_initial_clean (bool, optional): Whether to perform an initial cleaning of the text. Defaults to the DO_INITIAL_TABULAR_DATA_CLEAN config value.
380
  """
381
 
382
  tic = time.perf_counter()
 
474
  sheet_name = ""
475
  file_type = ""
476
 
477
+ out_file_paths, out_message, key_string, log_files_output_paths = tabular_anonymise_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, output_folder=OUTPUT_FOLDER, do_initial_clean=do_initial_clean)
478
  else:
479
  # If file is an xlsx, we are going to run through all the Excel sheets to anonymise them separately.
480
  file_type = detect_file_type(anon_file)
 
525
 
526
  anon_df = pd.read_excel(anon_file, sheet_name=sheet_name)
527
 
528
+ out_file_paths, out_message, key_string, log_files_output_paths = tabular_anonymise_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, anon_xlsx_export_file_name, log_files_output_paths, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, language, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, output_folder=output_folder, do_initial_clean=do_initial_clean)
529
 
530
  else:
531
  sheet_name = ""
532
  anon_df = read_file(anon_file)
533
  out_file_part = get_file_name_without_type(anon_file.name)
534
 
535
+ out_file_paths, out_message, key_string, log_files_output_paths = tabular_anonymise_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, language, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, output_folder=output_folder, do_initial_clean=do_initial_clean)
536
 
537
  # Increase latest file completed count unless we are at the last file
538
  if latest_file_completed != len(file_paths):
 
586
  comprehend_query_number:int=0,
587
  comprehend_client:botocore.client.BaseClient="",
588
  nlp_analyser: AnalyzerEngine = nlp_analyser,
589
+ output_folder: str = OUTPUT_FOLDER,
590
+ do_initial_clean:bool=DO_INITIAL_TABULAR_DATA_CLEAN
591
  ):
592
  """
593
  This function wraps the anonymisation process for a given dataframe. It filters the dataframe based on chosen columns, applies the specified anonymisation strategy using the anonymise_script function, and exports the anonymised data to a file.
 
614
  - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
615
  - comprehend_client (optional): The client object from AWS containing a client connection to AWS Comprehend if that option is chosen on the first tab.
616
  - output_folder: The folder where the anonymized files will be saved. Defaults to the 'output_folder' variable.
617
+ - do_initial_clean (bool, optional): Whether to perform an initial cleaning of the text. Defaults to the DO_INITIAL_TABULAR_DATA_CLEAN config value.
618
  """
619
  def check_lists(list1, list2):
620
  return any(string in list2 for string in list1)
 
655
  # Split dataframe to keep only selected columns
656
  #print("Remaining columns to redact:", chosen_cols_in_anon_df)
657
 
658
+ if not anon_df.index.is_unique:
659
+ anon_df = anon_df.reset_index(drop=True)
660
+
661
  anon_df_part = anon_df[chosen_cols_in_anon_df]
662
  anon_df_remain = anon_df.drop(chosen_cols_in_anon_df, axis = 1)
663
 
664
 
665
  # Anonymise the selected columns
666
+ anon_df_part_out, key_string, decision_process_output_str = anonymise_script(anon_df_part, anon_strat, language, chosen_redact_entities, in_allow_list, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, nlp_analyser=nlp_analyser, do_initial_clean=do_initial_clean)
667
 
668
  anon_df_part_out.replace("^nan$", "", regex=True, inplace=True)
669
 
 
731
  comprehend_client:botocore.client.BaseClient="",
732
  custom_entities:List[str]=custom_entities,
733
  nlp_analyser: AnalyzerEngine = nlp_analyser,
734
+ do_initial_clean:bool=DO_INITIAL_TABULAR_DATA_CLEAN,
735
+ progress:Progress=Progress(track_tqdm=True)):
736
  '''
737
  Conduct anonymisation of a dataframe using Presidio and/or AWS Comprehend if chosen.
738
+
739
+ Args:
740
+ df (pd.DataFrame): The input DataFrame containing text to be anonymised.
741
+ anon_strat (str): The anonymisation strategy to apply (e.g., "replace with 'REDACTED'", "replace with <ENTITY_NAME>", "redact completely").
742
+ language (str): The language of the text for analysis (e.g., "en", "es").
743
+ chosen_redact_entities (List[str]): A list of entity types to redact using the local (Presidio) method.
744
+ in_allow_list (List[str], optional): A list of terms to explicitly allow and not redact. Defaults to an empty list.
745
+ in_deny_list (List[str], optional): A list of terms to explicitly deny and always redact. Defaults to an empty list.
746
+ max_fuzzy_spelling_mistakes_num (int, optional): The maximum number of fuzzy spelling mistakes to tolerate for custom recognizers. Defaults to 0.
747
+ pii_identification_method (str, optional): The method for PII identification ("Local", "AWS Comprehend", or "Both"). Defaults to "Local".
748
+ chosen_redact_comprehend_entities (List[str], optional): A list of entity types to redact using AWS Comprehend. Defaults to an empty list.
749
+ comprehend_query_number (int, optional): The number of queries to send to AWS Comprehend per batch. Defaults to 0.
750
+ comprehend_client (botocore.client.BaseClient, optional): An initialized AWS Comprehend client. Defaults to an empty string.
751
+ custom_entities (List[str], optional): A list of custom entities to be recognized. Defaults to `custom_entities`.
752
+ nlp_analyser (AnalyzerEngine, optional): The Presidio AnalyzerEngine instance to use. Defaults to `nlp_analyser`.
753
+ do_initial_clean (bool, optional): Whether to perform an initial cleaning of the text. Defaults to the DO_INITIAL_TABULAR_DATA_CLEAN config value.
754
+ progress (Progress, optional): Gradio Progress object for tracking progress. Defaults to Progress(track_tqdm=True).
755
  '''
756
 
757
  print("Identifying personal information")
758
  analyse_tic = time.perf_counter()
759
 
760
  # Initialize analyzer_results as an empty dictionary to store results by column
761
+ results_by_column = dict()
762
+ key_string = ""
 
 
 
763
 
764
  if isinstance(in_allow_list, list):
765
  if in_allow_list:
 
777
  ### Language check - check if selected language packs exist
778
  try:
779
  if language != "en":
780
+ progress(0.1, desc=f"Loading spaCy model for {language}")
781
 
782
  load_spacy_model(language)
783
 
784
  except Exception as e:
785
+ out_message = f"Error downloading language packs for {language}: {e}"
786
+ print(out_message)
787
+ raise Exception(out_message)
788
 
789
  # Try updating the supported languages for the spacy analyser
790
  try:
 
794
  gr.Info(f"Language: {language} only supports the following entity detection: {str(nlp_analyser.registry.get_supported_entities(languages=[language]))}")
795
 
796
  except Exception as e:
797
+ out_message = f"Error creating nlp_analyser for {language}: {e}"
798
+ print(out_message)
799
+ raise Exception(out_message)
800
 
801
  if isinstance(in_deny_list, pd.DataFrame):
802
  if not in_deny_list.empty:
 
823
  batch_anonymizer = BatchAnonymizerEngine(anonymizer_engine = anonymizer)
824
  analyzer_results = list()
825
 
826
+ if do_initial_clean:
827
+ progress(0.2, desc="Cleaning text")
828
+ for col in progress.tqdm(df.columns, desc="Cleaning text", unit = "Columns"):
829
+ df[col] = initial_clean(df[col])
830
+
831
+ # DataFrame to dict
832
+ df_dict = df.to_dict(orient="list")
833
+
834
  if pii_identification_method == "Local":
835
 
836
  # Use custom analyzer to be able to track progress with Gradio
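A small sketch of what the optional initial clean does to a text column before PII analysis; the sample values are invented and the exact output depends on the regex patterns defined earlier in this file.

import pandas as pd

sample = pd.Series(["Visit https://example.com today", "Fish &amp; chips", "Smart “quotes” here"])
cleaned = initial_clean(sample)
# URLs and HTML entities are stripped, remaining '&' characters become 'and',
# smart quotes are normalised to ASCII, and repeated whitespace is collapsed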
tools/example_cli_calls.txt CHANGED
@@ -21,4 +21,10 @@ python your_cli_script.py \
21
  --output_dir "output/anonymised_docs/" \
22
  --anon_strat "encrypt" \
23
  --deny_list "config/codenames.csv" \
24
- --language "en"
 
 
 
 
 
 
 
21
  --output_dir "output/anonymised_docs/" \
22
  --anon_strat "encrypt" \
23
  --deny_list "config/codenames.csv" \
24
+ --language "en"
25
+
26
+ python cli_redact.py --task deduplicate --input_file ocr_output.csv --duplicate_type pages --similarity_threshold 0.95 --min_word_count 5
27
+
28
+ python cli_redact.py --task deduplicate --input_file data.csv --duplicate_type tabular --text_columns "Name" "Email" "Description"
29
+
30
+ python cli_redact.py --task deduplicate --input_file ocr_output.csv --duplicate_type pages --search_query "confidential information"
tools/file_redaction.py CHANGED
@@ -92,7 +92,7 @@ def choose_and_run_redactor(file_paths:List[str],
92
  chosen_redact_comprehend_entities:List[str],
93
  text_extraction_method:str,
94
  in_allow_list:List[List[str]]=list(),
95
- custom_recogniser_word_list:List[str]=list(),
96
  redact_whole_page_list:List[str]=list(),
97
  latest_file_completed:int=0,
98
  combined_out_message:List=list(),
@@ -147,8 +147,8 @@ def choose_and_run_redactor(file_paths:List[str],
147
  - chosen_redact_comprehend_entities (List[str]): A list of entity types to redact from files, chosen from the official list from AWS Comprehend service.
148
  - text_extraction_method (str): The method to use to extract text from documents.
149
  - in_allow_list (List[List[str]], optional): A list of allowed terms for redaction. Defaults to None.
150
- - custom_recogniser_word_list (List[List[str]], optional): A list of allowed terms for redaction. Defaults to None.
151
- - redact_whole_page_list (List[List[str]], optional): A list of allowed terms for redaction. Defaults to None.
152
  - latest_file_completed (int, optional): The index of the last completed file. Defaults to 0.
153
  - combined_out_message (list, optional): A list to store output messages. Defaults to an empty list.
154
  - out_file_paths (list, optional): A list to store paths to the output files. Defaults to an empty list.
@@ -390,11 +390,11 @@ def choose_and_run_redactor(file_paths:List[str],
390
  in_allow_list_flat = list()
391
 
392
  # If string, assume file path
393
- if isinstance(custom_recogniser_word_list, str):
394
- custom_recogniser_word_list = pd.read_csv(custom_recogniser_word_list)
395
- if isinstance(custom_recogniser_word_list, pd.DataFrame):
396
- if not custom_recogniser_word_list.empty:
397
- custom_recogniser_word_list_flat = custom_recogniser_word_list.iloc[:, 0].tolist()
398
  else:
399
  custom_recogniser_word_list_flat = list()
400
 
@@ -1383,7 +1383,7 @@ def redact_image_pdf(file_path:str,
1383
  comprehend_query_number:int=0,
1384
  comprehend_client:str="",
1385
  textract_client:str="",
1386
- custom_recogniser_word_list:List[str]=list(),
1387
  redact_whole_page_list:List[str]=list(),
1388
  max_fuzzy_spelling_mistakes_num:int=1,
1389
  match_fuzzy_whole_phrase_bool:bool=True,
@@ -1423,7 +1423,7 @@ def redact_image_pdf(file_path:str,
1423
  - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
1424
  - comprehend_client (optional): A connection to the AWS Comprehend service via the boto3 package.
1425
  - textract_client (optional): A connection to the AWS Textract service via the boto3 package.
1426
- - custom_recogniser_word_list (optional): A list of custom words that the user has chosen specifically to redact.
1427
  - redact_whole_page_list (optional, List[str]): A list of pages to fully redact.
1428
  - max_fuzzy_spelling_mistakes_num (int, optional): The maximum number of spelling mistakes allowed in a searched phrase for fuzzy matching. Can range from 0-9.
1429
  - match_fuzzy_whole_phrase_bool (bool, optional): A boolean where 'True' means that the whole phrase is fuzzy matched, and 'False' means that each word is fuzzy matched separately (excluding stop words).
@@ -1459,13 +1459,13 @@ def redact_image_pdf(file_path:str,
1459
  raise Exception(f"Error creating nlp_analyser for {language}: {e}")
1460
 
1461
  # Update custom word list analyser object with any new words that have been added to the custom deny list
1462
- if custom_recogniser_word_list:
1463
  nlp_analyser.registry.remove_recognizer("CUSTOM")
1464
- new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
1465
  nlp_analyser.registry.add_recognizer(new_custom_recogniser)
1466
 
1467
  nlp_analyser.registry.remove_recognizer("CustomWordFuzzyRecognizer")
1468
- new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_recogniser_word_list, spelling_mistakes_max=max_fuzzy_spelling_mistakes_num, search_whole_phrase=match_fuzzy_whole_phrase_bool)
1469
  nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
1470
 
1471
  # Only load in PaddleOCR models if not running Textract
@@ -2216,7 +2216,7 @@ def redact_text_pdf(
2216
  pii_identification_method: str = "Local",
2217
  comprehend_query_number:int = 0,
2218
  comprehend_client="",
2219
- custom_recogniser_word_list:List[str]=list(),
2220
  redact_whole_page_list:List[str]=list(),
2221
  max_fuzzy_spelling_mistakes_num:int=1,
2222
  match_fuzzy_whole_phrase_bool:bool=True,
@@ -2250,7 +2250,7 @@ def redact_text_pdf(
2250
  - pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
2251
  - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
2252
  - comprehend_client (optional): A connection to the AWS Comprehend service via the boto3 package.
2253
- - custom_recogniser_word_list (optional, List[str]): A list of custom words that the user has chosen specifically to redact.
2254
  - redact_whole_page_list (optional, List[str]): A list of pages to fully redact.
2255
  - max_fuzzy_spelling_mistakes_num (int, optional): The maximum number of spelling mistakes allowed in a searched phrase for fuzzy matching. Can range from 0-9.
2256
  - match_fuzzy_whole_phrase_bool (bool, optional): A boolean where 'True' means that the whole phrase is fuzzy matched, and 'False' means that each word is fuzzy matched separately (excluding stop words).
@@ -2290,13 +2290,13 @@ def redact_text_pdf(
2290
  raise Exception(f"Error creating nlp_analyser for {language}: {e}")
2291
 
2292
  # Update custom word list analyser object with any new words that have been added to the custom deny list
2293
- if custom_recogniser_word_list:
2294
  nlp_analyser.registry.remove_recognizer("CUSTOM")
2295
- new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
2296
  nlp_analyser.registry.add_recognizer(new_custom_recogniser)
2297
 
2298
  nlp_analyser.registry.remove_recognizer("CustomWordFuzzyRecognizer")
2299
- new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_recogniser_word_list, spelling_mistakes_max=max_fuzzy_spelling_mistakes_num, search_whole_phrase=match_fuzzy_whole_phrase_bool)
2300
  nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
2301
 
2302
  # Open with Pikepdf to get text lines
@@ -2385,9 +2385,7 @@ def redact_text_pdf(
2385
  all_page_line_text_extraction_characters.extend(line_characters)
2386
  all_page_line_level_ocr_results_with_words.append(line_level_ocr_results_with_words)
2387
 
2388
- #print("page_text_ocr_outputs_list:", page_text_ocr_outputs_list)
2389
  page_text_ocr_outputs = pd.concat(page_text_ocr_outputs_list)
2390
- #page_text_ocr_outputs.to_csv("output/page_text_ocr_outputs.csv")
2391
 
2392
  ### REDACTION
2393
  if pii_identification_method != NO_REDACTION_PII_OPTION:
 
92
  chosen_redact_comprehend_entities:List[str],
93
  text_extraction_method:str,
94
  in_allow_list:List[List[str]]=list(),
95
+ in_deny_list:List[str]=list(),
96
  redact_whole_page_list:List[str]=list(),
97
  latest_file_completed:int=0,
98
  combined_out_message:List=list(),
 
147
  - chosen_redact_comprehend_entities (List[str]): A list of entity types to redact from files, chosen from the official list from AWS Comprehend service.
148
  - text_extraction_method (str): The method to use to extract text from documents.
149
  - in_allow_list (List[List[str]], optional): A list of allowed terms for redaction. Defaults to None.
150
+ - in_deny_list (List[List[str]], optional): A list of denied terms for redaction. Defaults to None.
151
+ - redact_whole_page_list (List[List[str]], optional): A list of whole page numbers for redaction. Defaults to None.
152
  - latest_file_completed (int, optional): The index of the last completed file. Defaults to 0.
153
  - combined_out_message (list, optional): A list to store output messages. Defaults to an empty list.
154
  - out_file_paths (list, optional): A list to store paths to the output files. Defaults to an empty list.
 
390
  in_allow_list_flat = list()
391
 
392
  # If string, assume file path
393
+ if isinstance(in_deny_list, str):
394
+ in_deny_list = pd.read_csv(in_deny_list)
395
+ if isinstance(in_deny_list, pd.DataFrame):
396
+ if not in_deny_list.empty:
397
+ custom_recogniser_word_list_flat = in_deny_list.iloc[:, 0].tolist()
398
  else:
399
  custom_recogniser_word_list_flat = list()
400
 
 
1383
  comprehend_query_number:int=0,
1384
  comprehend_client:str="",
1385
  textract_client:str="",
1386
+ in_deny_list:List[str]=list(),
1387
  redact_whole_page_list:List[str]=list(),
1388
  max_fuzzy_spelling_mistakes_num:int=1,
1389
  match_fuzzy_whole_phrase_bool:bool=True,
 
1423
  - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
1424
  - comprehend_client (optional): A connection to the AWS Comprehend service via the boto3 package.
1425
  - textract_client (optional): A connection to the AWS Textract service via the boto3 package.
1426
+ - in_deny_list (optional): A list of custom words that the user has chosen specifically to redact.
1427
  - redact_whole_page_list (optional, List[str]): A list of pages to fully redact.
1428
  - max_fuzzy_spelling_mistakes_num (int, optional): The maximum number of spelling mistakes allowed in a searched phrase for fuzzy matching. Can range from 0-9.
1429
  - match_fuzzy_whole_phrase_bool (bool, optional): A boolean where 'True' means that the whole phrase is fuzzy matched, and 'False' means that each word is fuzzy matched separately (excluding stop words).
 
1459
  raise Exception(f"Error creating nlp_analyser for {language}: {e}")
1460
 
1461
  # Update custom word list analyser object with any new words that have been added to the custom deny list
1462
+ if in_deny_list:
1463
  nlp_analyser.registry.remove_recognizer("CUSTOM")
1464
+ new_custom_recogniser = custom_word_list_recogniser(in_deny_list)
1465
  nlp_analyser.registry.add_recognizer(new_custom_recogniser)
1466
 
1467
  nlp_analyser.registry.remove_recognizer("CustomWordFuzzyRecognizer")
1468
+ new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=in_deny_list, spelling_mistakes_max=max_fuzzy_spelling_mistakes_num, search_whole_phrase=match_fuzzy_whole_phrase_bool)
1469
  nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
1470
 
1471
  # Only load in PaddleOCR models if not running Textract
 
2216
  pii_identification_method: str = "Local",
2217
  comprehend_query_number:int = 0,
2218
  comprehend_client="",
2219
+ in_deny_list:List[str]=list(),
2220
  redact_whole_page_list:List[str]=list(),
2221
  max_fuzzy_spelling_mistakes_num:int=1,
2222
  match_fuzzy_whole_phrase_bool:bool=True,
 
2250
  - pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
2251
  - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
2252
  - comprehend_client (optional): A connection to the AWS Comprehend service via the boto3 package.
2253
+ - in_deny_list (optional, List[str]): A list of custom words that the user has chosen specifically to redact.
2254
  - redact_whole_page_list (optional, List[str]): A list of pages to fully redact.
2255
  - max_fuzzy_spelling_mistakes_num (int, optional): The maximum number of spelling mistakes allowed in a searched phrase for fuzzy matching. Can range from 0-9.
2256
  - match_fuzzy_whole_phrase_bool (bool, optional): A boolean where 'True' means that the whole phrase is fuzzy matched, and 'False' means that each word is fuzzy matched separately (excluding stop words).
 
2290
  raise Exception(f"Error creating nlp_analyser for {language}: {e}")
2291
 
2292
  # Update custom word list analyser object with any new words that have been added to the custom deny list
2293
+ if in_deny_list:
2294
  nlp_analyser.registry.remove_recognizer("CUSTOM")
2295
+ new_custom_recogniser = custom_word_list_recogniser(in_deny_list)
2296
  nlp_analyser.registry.add_recognizer(new_custom_recogniser)
2297
 
2298
  nlp_analyser.registry.remove_recognizer("CustomWordFuzzyRecognizer")
2299
+ new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=in_deny_list, spelling_mistakes_max=max_fuzzy_spelling_mistakes_num, search_whole_phrase=match_fuzzy_whole_phrase_bool)
2300
  nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
2301
 
2302
  # Open with Pikepdf to get text lines
 
2385
  all_page_line_text_extraction_characters.extend(line_characters)
2386
  all_page_line_level_ocr_results_with_words.append(line_level_ocr_results_with_words)
2387
 
 
2388
  page_text_ocr_outputs = pd.concat(page_text_ocr_outputs_list)
 
2389
 
2390
  ### REDACTION
2391
  if pii_identification_method != NO_REDACTION_PII_OPTION:
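A brief sketch of how the renamed in_deny_list argument is consumed by choose_and_run_redactor, based on the code above: a CSV path is read into a DataFrame and its first column becomes the flat custom recogniser word list. The file name below is hypothetical.

import pandas as pd

in_deny_list = "config/default_deny_list.csv"  # hypothetical path
if isinstance(in_deny_list, str):
    in_deny_list = pd.read_csv(in_deny_list)
custom_recogniser_word_list_flat = (
    in_deny_list.iloc[:, 0].tolist() if not in_deny_list.empty else []
)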
tools/find_duplicate_pages.py CHANGED
@@ -1,8 +1,6 @@
1
  import pandas as pd
2
  import os
3
  import re
4
- import itertools
5
- import numpy as np
6
 
7
  from sklearn.feature_extraction.text import TfidfVectorizer
8
  from sklearn.metrics.pairwise import cosine_similarity
@@ -16,9 +14,10 @@ from tools.helper_functions import OUTPUT_FOLDER
16
  from tools.file_conversion import redact_whole_pymupdf_page, convert_annotation_data_to_dataframe, fill_missing_box_ids_each_box
17
  from tools.load_spacy_model_custom_recognisers import nlp
18
 
19
- similarity_threshold = 0.95
20
  number_of_zeros_to_add_to_index = 7 # Number of zeroes to add between page number and line numbers to get a unique page/line index value
21
  ID_MULTIPLIER = 100000
 
 
22
 
23
  def split_text_with_punctuation(text: str) -> List[str]:
24
  """
@@ -604,8 +603,7 @@ def save_results_and_redaction_lists(final_df: pd.DataFrame, output_folder: str,
604
 
605
  return output_paths
606
 
607
- # Define the set of punctuation characters for efficient lookup
608
- PUNCTUATION_TO_STRIP = {'.', ',', '?', '!', ':', ';'}
609
 
610
  def _sequences_match(query_seq: List[str], ref_seq: List[str]) -> bool:
611
  """
 
1
  import pandas as pd
2
  import os
3
  import re
 
 
4
 
5
  from sklearn.feature_extraction.text import TfidfVectorizer
6
  from sklearn.metrics.pairwise import cosine_similarity
 
14
  from tools.file_conversion import redact_whole_pymupdf_page, convert_annotation_data_to_dataframe, fill_missing_box_ids_each_box
15
  from tools.load_spacy_model_custom_recognisers import nlp
16
 
 
17
  number_of_zeros_to_add_to_index = 7 # Number of zeroes to add between page number and line numbers to get a unique page/line index value
18
  ID_MULTIPLIER = 100000
19
+ # Define the set of punctuation characters for efficient lookup
20
+ PUNCTUATION_TO_STRIP = {'.', ',', '?', '!', ':', ';'}
21
 
22
  def split_text_with_punctuation(text: str) -> List[str]:
23
  """
 
603
 
604
  return output_paths
605
 
606
+
 
607
 
608
  def _sequences_match(query_seq: List[str], ref_seq: List[str]) -> bool:
609
  """
tools/find_duplicate_tabular.py ADDED
@@ -0,0 +1,422 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import os
3
+ import re
4
+ from sklearn.feature_extraction.text import TfidfVectorizer
5
+ from sklearn.metrics.pairwise import cosine_similarity
6
+ from typing import List, Tuple, Dict
7
+ import gradio as gr
8
+ from gradio import Progress
9
+ from pathlib import Path
10
+ from tools.helper_functions import OUTPUT_FOLDER, read_file
11
+ from tools.data_anonymise import initial_clean
12
+ from tools.load_spacy_model_custom_recognisers import nlp
13
+ from tools.config import DO_INITIAL_TABULAR_DATA_CLEAN
14
+
15
+ similarity_threshold = 0.95
16
+
17
+ def clean_and_stem_text_series(df: pd.DataFrame, column: str, do_initial_clean_dup: bool = DO_INITIAL_TABULAR_DATA_CLEAN):
18
+ """
19
+ Clean and lemmatise a text column in a tabular DataFrame, writing the result to a 'text_clean' column.
20
+ """
21
+
22
+ # Function to apply lemmatisation and remove stopwords
23
+ def _apply_lemmatization(text):
24
+ doc = nlp(text)
25
+ # Keep only alphabetic tokens and remove stopwords
26
+ lemmatized_words = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]
27
+ return ' '.join(lemmatized_words)
28
+
29
+ if do_initial_clean_dup:
30
+ df['text_clean'] = initial_clean(df[column])
+ else:
+ df['text_clean'] = df[column].fillna('').astype(str)
31
+
32
+ df['text_clean'] = df['text_clean'].apply(_apply_lemmatization)
33
+ df['text_clean'] = df['text_clean'].str.lower()#.str.replace(r'[^\w\s]', '', regex=True)
34
+
35
+ return df
36
+
37
+ def convert_tabular_data_to_analysis_format(
38
+ df: pd.DataFrame,
39
+ file_name: str,
40
+ text_columns: List[str] = None
41
+ ) -> List[Tuple[str, pd.DataFrame]]:
42
+ """
43
+ Convert tabular data (CSV/XLSX) to the format needed for duplicate analysis.
44
+
45
+ Args:
46
+ df (pd.DataFrame): The input DataFrame
47
+ file_name (str): Name of the file
48
+ text_columns (List[str], optional): Columns to analyze for duplicates.
49
+ If None, uses all string columns.
50
+
51
+ Returns:
52
+ List[Tuple[str, pd.DataFrame]]: List containing (file_name, processed_df) tuple
53
+ """
54
+ if text_columns is None:
55
+ # Auto-detect text columns (string type columns)
56
+ text_columns = df.select_dtypes(include=['object', 'string']).columns.tolist()
57
+
58
+ if not text_columns:
59
+ print(f"No text columns found in {file_name}")
60
+ return []
61
+
62
+ # Create a copy to avoid modifying original
63
+ df_copy = df.copy()
64
+
65
+ # Create a combined text column from all text columns
66
+ df_copy['combined_text'] = df_copy[text_columns].fillna('').astype(str).agg(' '.join, axis=1)
67
+
68
+ # Add row identifier
69
+ df_copy['row_id'] = df_copy.index
70
+
71
+ # Create the format expected by the duplicate detection system
72
+ # Using 'page' as row number and 'text' as the combined text
73
+ processed_df = pd.DataFrame({
74
+ 'page': df_copy['row_id'],
75
+ 'text': df_copy['combined_text'],
76
+ 'file': file_name
77
+ })
78
+
79
+ # Add original row data for reference
80
+ for col in text_columns:
81
+ processed_df[f'original_{col}'] = df_copy[col]
82
+
83
+ return [(file_name, processed_df)]
84
+
85
+ def find_duplicate_cells_in_tabular_data(
86
+ input_files: List[str],
87
+ similarity_threshold: float = 0.95,
88
+ min_word_count: int = 3,
89
+ text_columns: List[str] = None,
90
+ output_folder: str = OUTPUT_FOLDER,
91
+ do_initial_clean_dup: bool = DO_INITIAL_TABULAR_DATA_CLEAN,
92
+ progress: Progress = Progress(track_tqdm=True)
93
+ ) -> Tuple[pd.DataFrame, List[str], Dict[str, pd.DataFrame]]:
94
+ """
95
+ Find duplicate cells/text in tabular data files (CSV, XLSX).
96
+
97
+ Args:
98
+ input_files (List[str]): List of file paths to analyze
99
+ similarity_threshold (float): Minimum similarity score to consider duplicates
100
+ min_word_count (int): Minimum word count for text to be considered
101
+ text_columns (List[str], optional): Specific columns to analyze
102
+ output_folder (str, optional): Output folder for results
103
+ do_initial_clean_dup (bool, optional): Whether to do initial clean of text
104
+ progress (Progress): Progress tracking object
105
+
106
+ Returns:
107
+ Tuple containing:
108
+ - results_df: DataFrame with duplicate matches
109
+ - output_paths: List of output file paths
110
+ - full_data_by_file: Dictionary of processed data by file
111
+ """
112
+
113
+ if not input_files:
114
+ raise gr.Error("Please upload files to analyze.")
115
+
116
+ progress(0.1, desc="Loading and processing files...")
117
+
118
+ all_data_to_process = []
119
+ full_data_by_file = {}
120
+ file_paths = []
121
+
122
+ # Process each file
123
+ for file_path in input_files:
124
+ try:
125
+ df = read_file(file_path)
126
+
127
+ file_name = os.path.basename(file_path)
128
+ file_paths.append(file_path)
129
+
130
+ # Convert to analysis format
131
+ processed_data = convert_tabular_data_to_analysis_format(
132
+ df, file_name, text_columns
133
+ )
134
+
135
+ if processed_data:
136
+ all_data_to_process.extend(processed_data)
137
+ full_data_by_file[file_name] = processed_data[0][1]
138
+
139
+ except Exception as e:
140
+ print(f"Error processing {file_path}: {e}")
141
+ continue
142
+
143
+ if not all_data_to_process:
144
+ raise gr.Error("No valid data found in uploaded files.")
145
+
146
+ progress(0.2, desc="Combining data...")
147
+
148
+ # Combine all data
149
+ combined_df = pd.concat([data[1] for data in all_data_to_process], ignore_index=True)
150
+
151
+ progress(0.3, desc="Cleaning and preparing text...")
152
+
153
+ # Clean and prepare text
154
+ combined_df = clean_and_stem_text_series(combined_df, 'text', do_initial_clean_dup=do_initial_clean_dup)
155
+
156
+ # Filter by minimum word count
157
+ combined_df['word_count'] = combined_df['text_clean'].str.split().str.len().fillna(0)
158
+ combined_df = combined_df[combined_df['word_count'] >= min_word_count].copy()
159
+
160
+ if len(combined_df) < 2:
161
+ return pd.DataFrame(), [], full_data_by_file
162
+
163
+ progress(0.4, desc="Calculating similarities...")
164
+
165
+ # Calculate similarities
166
+ vectorizer = TfidfVectorizer()
167
+ tfidf_matrix = vectorizer.fit_transform(combined_df['text_clean'])
168
+ similarity_matrix = cosine_similarity(tfidf_matrix, dense_output=False)
169
+
170
+ # Find similar pairs
171
+ coo_matrix = similarity_matrix.tocoo()
172
+ similar_pairs = [
173
+ (r, c, v) for r, c, v in zip(coo_matrix.row, coo_matrix.col, coo_matrix.data)
174
+ if r < c and v >= similarity_threshold
175
+ ]
176
+
177
+ if not similar_pairs:
178
+ gr.Info("No duplicate cells found.")
179
+ return pd.DataFrame(), [], full_data_by_file
180
+
181
+ progress(0.7, desc="Processing results...")
182
+
183
+ # Create results DataFrame
184
+ results_data = []
185
+ for row1, row2, similarity in similar_pairs:
186
+ row1_data = combined_df.iloc[row1]
187
+ row2_data = combined_df.iloc[row2]
188
+
189
+ results_data.append({
190
+ 'File1': row1_data['file'],
191
+ 'Row1': int(row1_data['page']),
192
+ 'File2': row2_data['file'],
193
+ 'Row2': int(row2_data['page']),
194
+ 'Similarity_Score': round(similarity, 3),
195
+ 'Text1': row1_data['text'][:200] + '...' if len(row1_data['text']) > 200 else row1_data['text'],
196
+ 'Text2': row2_data['text'][:200] + '...' if len(row2_data['text']) > 200 else row2_data['text'],
197
+ 'Original_Index1': row1,
198
+ 'Original_Index2': row2
199
+ })
200
+
201
+ results_df = pd.DataFrame(results_data)
202
+ results_df = results_df.sort_values(['File1', 'Row1', 'File2', 'Row2'])
203
+
204
+ progress(0.9, desc="Saving results...")
205
+
206
+ # Save results
207
+ output_paths = save_tabular_duplicate_results(results_df, output_folder, file_paths, file_replaced_index=0)
208
+
209
+ gr.Info(f"Found {len(results_df)} duplicate cell matches")
210
+
211
+ return results_df, output_paths, full_data_by_file
212
+
213
+ def save_tabular_duplicate_results(results_df: pd.DataFrame, output_folder: str, file_paths: List[str], file_replaced_index: int = 0) -> List[str]:
214
+ """
215
+ Save tabular duplicate detection results to files.
216
+
217
+ Args:
218
+ results_df (pd.DataFrame): Results DataFrame
219
+ output_folder (str): Output folder path
220
+ file_paths (List[str]): List of file paths
221
+ file_replaced_index (int): Index of the file to replace with duplicate rows removed
222
+ (0 is the first file in the list)
223
+ Returns:
224
+ List[str]: List of output file paths
225
+ """
226
+ output_paths = []
227
+ output_folder_path = Path(output_folder)
228
+ output_folder_path.mkdir(exist_ok=True)
229
+
230
+ if results_df.empty:
231
+ print("No duplicate matches to save.")
232
+ return []
233
+
234
+ # Save main results
235
+ results_file = output_folder_path / 'tabular_duplicate_results.csv'
236
+ results_df.to_csv(results_file, index=False, encoding="utf-8-sig")
237
+ output_paths.append(str(results_file))
238
+
239
+ # Save per-file duplicate lists
240
+ for file_name, group in results_df.groupby('File1'):
241
+ file_stem = Path(file_name).stem
242
+ duplicate_rows_file = output_folder_path / f"{file_stem}_duplicate_rows.csv"
243
+
244
+ # Get unique row numbers to remove
245
+ rows_to_remove = sorted(group['Row1'].unique())
246
+ duplicate_df = pd.DataFrame({'Row_to_Remove': rows_to_remove})
247
+ duplicate_df.to_csv(duplicate_rows_file, index=False)
248
+ output_paths.append(str(duplicate_rows_file))
249
+
250
+ # Save also original file (first file in list) with duplicate rows removed
251
+ file_path = file_paths[file_replaced_index]
252
+ file_base_name = os.path.basename(file_path)
253
+ df = read_file(file_path)
254
+ df_cleaned = df.drop(index=rows_to_remove).reset_index(drop=True)
255
+
256
+ output_path = os.path.join(output_folder, f"{os.path.splitext(file_base_name)[0]}_deduplicated.csv")
257
+ df_cleaned.to_csv(output_path, index=False, encoding="utf-8-sig")
258
+
259
+ output_paths.append(str(output_path))
260
+
261
+ return output_paths
262
+
263
+ def remove_duplicate_rows_from_tabular_data(
264
+ file_path: str,
265
+ duplicate_rows: List[int],
266
+ output_folder: str = OUTPUT_FOLDER
267
+ ) -> str:
268
+ """
269
+ Remove duplicate rows from a tabular data file.
270
+
271
+ Args:
272
+ file_path (str): Path to the input file
273
+ duplicate_rows (List[int]): List of row indices to remove
274
+ output_folder (str): Output folder for cleaned file
275
+
276
+ Returns:
277
+ str: Path to the cleaned file
278
+ """
279
+ try:
280
+ # Load the file
281
+ df = read_file(file_path)
282
+
283
+ # Remove duplicate rows (0-indexed)
284
+ df_cleaned = df.drop(index=duplicate_rows).reset_index(drop=True)
285
+
286
+ # Save cleaned file
287
+ file_name = os.path.basename(file_path)
288
+ file_stem = os.path.splitext(file_name)[0]
289
+ file_ext = os.path.splitext(file_name)[1]
290
+
291
+ output_path = os.path.join(output_folder, f"{file_stem}_deduplicated{file_ext}")
292
+
293
+ if file_ext in ['.xlsx', '.xls']:
294
+ df_cleaned.to_excel(output_path, index=False)
295
+ elif file_ext in ['.parquet']:
296
+ df_cleaned.to_parquet(output_path, index=False)
297
+ else:
298
+ df_cleaned.to_csv(output_path, index=False, encoding="utf-8-sig")
299
+
300
+ return output_path
301
+
302
+ except Exception as e:
303
+ print(f"Error removing duplicates from {file_path}: {e}")
304
+ raise
305
+
306
+ def run_tabular_duplicate_analysis(
307
+ files: List[str],
308
+ threshold: float,
309
+ min_words: int,
310
+ text_columns: List[str] = None,
311
+ output_folder: str = OUTPUT_FOLDER,
312
+ do_initial_clean_dup: bool = DO_INITIAL_TABULAR_DATA_CLEAN,
313
+ progress: Progress = Progress(track_tqdm=True)
314
+ ) -> Tuple[pd.DataFrame, List[str], Dict[str, pd.DataFrame]]:
315
+ """
316
+ Main function to run tabular duplicate analysis.
317
+
318
+ Args:
319
+ files (List[str]): List of file paths
320
+ threshold (float): Similarity threshold
321
+ min_words (int): Minimum word count
322
+ text_columns (List[str], optional): Specific columns to analyze
323
+ output_folder (str, optional): Output folder for results
324
+ progress (Progress): Progress tracking
325
+
326
+ Returns:
327
+ Tuple containing results DataFrame, output paths, and full data by file
328
+ """
329
+ return find_duplicate_cells_in_tabular_data(
330
+ input_files=files,
331
+ similarity_threshold=threshold,
332
+ min_word_count=min_words,
333
+ text_columns=text_columns,
334
+ output_folder=output_folder,
335
+ do_initial_clean_dup=do_initial_clean_dup,
336
+ progress=progress
337
+ )
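Similarly, a sketch of calling this top-level wrapper directly, for example from a script or the CLI (file names and parameter values are illustrative):

from tools.find_duplicate_tabular import run_tabular_duplicate_analysis

results_df, output_paths, full_data = run_tabular_duplicate_analysis(
    files=["input/file_a.csv", "input/file_b.xlsx"],  # hypothetical inputs
    threshold=0.95,     # similarity threshold for flagging duplicates
    min_words=3,        # minimum word count per cell
    text_columns=None,  # None means all text-like columns are considered
    output_folder="output/",
)
print(results_df.head())
print(output_paths)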
338
+
339
+
340
+
341
+ # Function to update column choices when files are uploaded
342
+ def update_tabular_column_choices(files):
343
+ if not files:
344
+ return gr.update(choices=[])
345
+
346
+ all_columns = set()
347
+ for file in files:
348
+ try:
349
+ df = read_file(file.name)
350
+
351
+ # Get text columns
352
+ text_cols = df.select_dtypes(include=['object', 'string']).columns.tolist()
353
+ all_columns.update(text_cols)
354
+ except Exception as e:
355
+ print(f"Error reading {file.name}: {e}")
356
+ continue
357
+
358
+ return gr.Dropdown(choices=sorted(list(all_columns)))
359
+
360
+ # Function to handle tabular duplicate detection
361
+ def run_tabular_duplicate_detection(files, threshold, min_words, text_columns, output_folder: str = OUTPUT_FOLDER, do_initial_clean_dup: bool = DO_INITIAL_TABULAR_DATA_CLEAN):
362
+ if not files:
363
+ return pd.DataFrame(), [], gr.Dropdown(choices=[])
364
+
365
+ file_paths = [f.name for f in files]
366
+ results_df, output_paths, full_data = run_tabular_duplicate_analysis(
367
+ files=file_paths,
368
+ threshold=threshold,
369
+ min_words=min_words,
370
+ text_columns=text_columns if text_columns else None,
371
+ output_folder=output_folder,
372
+ do_initial_clean_dup=do_initial_clean_dup
373
+ )
374
+
375
+ print("output_paths:", output_paths)
376
+
377
+ # Update file choices for cleaning
378
+ file_choices = list(set(file_paths))
379
+
380
+ return results_df, output_paths, gr.Dropdown(choices=file_choices)
381
+
382
+ # Function to handle row selection for preview
383
+ def handle_tabular_row_selection(results_df, evt:gr.SelectData):
384
+
385
+ if not evt:
386
+ return None, "", ""
387
+
388
+ if not isinstance(results_df, pd.DataFrame):
389
+ return None, "", ""
390
+ elif results_df.empty:
391
+ return None, "", ""
392
+
393
+ selected_index = evt.index[0]
394
+ if selected_index >= len(results_df):
395
+ return None, "", ""
396
+
397
+ row = results_df.iloc[selected_index]
398
+ return selected_index, row['Text1'], row['Text2']
399
+
400
+ # Function to clean duplicates from selected file
401
+ def clean_tabular_duplicates(file_name, results_df, output_folder):
402
+ if not file_name or results_df.empty:
403
+ return None
404
+
405
+ # Get duplicate rows for this file
406
+ file_duplicates = results_df[results_df['File1'] == file_name]['Row1'].tolist()
407
+
408
+ if not file_duplicates:
409
+ return None
410
+
411
+ try:
412
+ # Use file_name directly as the path to the original file (a simplified approach;
413
+ # in practice you might want to store and look up the full file paths)
414
+ cleaned_file = remove_duplicate_rows_from_tabular_data(
415
+ file_path=file_name,
416
+ duplicate_rows=file_duplicates,
417
+ output_folder=output_folder
418
+ )
419
+ return cleaned_file
420
+ except Exception as e:
421
+ print(f"Error cleaning duplicates: {e}")
422
+ return None
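These helpers are written as Gradio callbacks. Below is a minimal, hypothetical sketch of how they could be wired into a Blocks UI, assuming they are exposed from tools.find_duplicate_tabular; the real app.py layout differs, and component names and default values here are purely illustrative:

import gradio as gr
from tools.find_duplicate_tabular import (
    update_tabular_column_choices,
    run_tabular_duplicate_detection,
    handle_tabular_row_selection,
)

with gr.Blocks() as demo:
    in_files = gr.File(file_count="multiple", label="Tabular files")
    text_columns = gr.Dropdown(multiselect=True, label="Columns to check")
    threshold = gr.Slider(0.5, 1.0, value=0.95, label="Similarity threshold")
    min_words = gr.Number(value=3, label="Minimum words per cell")
    find_btn = gr.Button("Find duplicate cells")
    results = gr.Dataframe(label="Duplicate candidates")
    out_files = gr.File(file_count="multiple", label="Output files")
    selected_row = gr.Number(visible=False)
    text1 = gr.Textbox(label="Text 1")
    text2 = gr.Textbox(label="Text 2")
    file_to_clean = gr.Dropdown(label="File to deduplicate")

    # Refresh the column choices whenever new files are uploaded
    in_files.upload(update_tabular_column_choices, inputs=in_files, outputs=text_columns)
    # Run the duplicate search and populate the results table, output list, and file dropdown
    find_btn.click(
        run_tabular_duplicate_detection,
        inputs=[in_files, threshold, min_words, text_columns],
        outputs=[results, out_files, file_to_clean],
    )
    # Preview the two matched text cells when a result row is selected
    results.select(handle_tabular_row_selection, inputs=results, outputs=[selected_row, text1, text2])

demo.launch()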
tools/helper_functions.py CHANGED
@@ -132,26 +132,20 @@ def get_file_name_without_type(file_path):
132
 
133
  def detect_file_type(filename:str):
134
  """Detect the file type based on its extension."""
135
- if (filename.endswith('.csv')) | (filename.endswith('.csv.gz')) | (filename.endswith('.zip')):
136
- return 'csv'
137
- elif filename.endswith('.xlsx'):
138
- return 'xlsx'
139
- elif filename.endswith('.parquet'):
140
- return 'parquet'
141
- elif filename.endswith('.pdf'):
142
- return 'pdf'
143
- elif filename.endswith('.jpg'):
144
- return 'jpg'
145
- elif filename.endswith('.jpeg'):
146
- return 'jpeg'
147
- elif filename.endswith('.png'):
148
- return 'png'
149
- elif filename.endswith('.xfdf'):
150
- return 'xfdf'
151
- elif filename.endswith('.docx'):
152
- return 'docx'
153
- else:
154
- raise ValueError("Unsupported file type.")
155
 
156
  def read_file(filename:str):
157
  """Read the file based on its detected type."""
 
132
 
133
  def detect_file_type(filename:str):
134
  """Detect the file type based on its extension."""
135
+ if not isinstance(filename, str):
136
+ filename = str(filename)
137
+
138
+ if (filename.endswith('.csv')) | (filename.endswith('.csv.gz')) | (filename.endswith('.zip')): return 'csv'
139
+ elif filename.endswith('.xlsx'): return 'xlsx'
140
+ elif filename.endswith('.xls'): return 'xls'
141
+ elif filename.endswith('.parquet'): return 'parquet'
142
+ elif filename.endswith('.pdf'): return 'pdf'
143
+ elif filename.endswith('.jpg'): return 'jpg'
144
+ elif filename.endswith('.jpeg'): return 'jpeg'
145
+ elif filename.endswith('.png'): return 'png'
146
+ elif filename.endswith('.xfdf'): return 'xfdf'
147
+ elif filename.endswith('.docx'): return 'docx'
148
+ else: raise ValueError("Unsupported file type.")
 
 
 
 
 
 
149
 
150
  def read_file(filename:str):
151
  """Read the file based on its detected type."""