Commit 3bff849
Parent(s): 601fcda
Updated command line redaction script with more options

Files changed:
- Dockerfile +1 -1
- tools/cli_redact.py +149 -69
- tools/custom_image_analyser_engine.py +19 -19
- tools/data_anonymise.py +1 -1
- tools/example_cli_calls.txt +11 -0
- tools/file_conversion.py +29 -55
- tools/file_redaction.py +56 -41
- tools/redaction_review.py +29 -29
Dockerfile CHANGED
@@ -101,7 +101,7 @@ RUN mkdir -p /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache /tmp /var/tmp ${XDG_
     && chmod 755 \
     ${APP_HOME}/.local/share/spacy/data \
     mkdir -p /usr/share/tessdata && \
-    chmod 755 /usr/share/tessdata
+    chmod 755 /usr/share/tessdata
 
 # Copy installed packages from builder stage
 COPY --from=builder /install /usr/local/lib/python3.11/site-packages/
tools/cli_redact.py CHANGED
@@ -1,84 +1,164 @@
 import argparse
 import os
-
-from tools.
+import pandas as pd
+from tools.config import get_or_create_env_var, LOCAL_PII_OPTION, AWS_PII_OPTION, SELECTABLE_TEXT_EXTRACT_OPTION, TESSERACT_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION
+from tools.helper_functions import ensure_output_folder_exists
 from tools.file_conversion import get_input_file_names, prepare_image_or_pdf
 from tools.file_redaction import choose_and_run_redactor
-
-from datetime import datetime
-
-chosen_comprehend_entities = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER', 'CREDIT_DEBIT_CVV', 'CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS',
-                              'NAME','PHONE', 'PASSPORT_NUMBER','DRIVER_ID', 'USERNAME','PASSWORD',
-                              'IP_ADDRESS','MAC_ADDRESS','LICENSE_PLATE',
-                              'VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER',
-                              'INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE',
-                              'UK_NATIONAL_HEALTH_SERVICE_NUMBER']
-chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS",
-                          "STREETNAME", "UKPOSTCODE"]
-
-def main(first_loop_state=True, latest_file_completed=0, output_summary="", output_file_list=None,
-         log_files_list=None, estimated_time=0, textract_metadata="", comprehend_query_num=0,
-         current_loop_page=0, page_break=False, pdf_doc_state=[], all_image_annotations=[], all_line_level_ocr_results=pd.DataFrame(), all_decision_process_table=pd.DataFrame(), chosen_comprehend_entities=chosen_comprehend_entities, chosen_redact_entities=chosen_redact_entities, handwrite_signature_checkbox=["Redact all identified handwriting", "Redact all identified signatures"]):
-
-    if output_file_list is None:
-        output_file_list = []
-    if log_files_list is None:
-        log_files_list = []
-
-    # Optional arguments with defaults matching the GUI app
-    parser.add_argument('--ocr_method', choices=[text_ocr_option, tesseract_ocr_option, textract_option],
-                        default='Quick image analysis', help='OCR method to use')
-    parser.add_argument('--pii_detector', choices=[local_pii_detector, aws_pii_detector],
-                        default='Local', help='PII detection method')
-    parser.add_argument('--page_min', type=int, default=0, help='First page to redact')
-    parser.add_argument('--page_max', type=int, default=0, help='Last page to redact')
-    parser.add_argument('--allow_list', help='Path to allow list CSV file')
-    parser.add_argument('--output_dir', default='output/', help='Output directory')
-
-    file_name_no_ext, file_name_with_ext, full_file_name = get_input_file_names(file_obj)
-
+from tools.anonymisation import anonymise_files_with_open_text
+
+# --- Constants and Configuration ---
+INPUT_FOLDER = 'input/'
+OUTPUT_FOLDER = 'output/'
+DEFAULT_LANGUAGE = 'en'
+
+# Define entities for redaction
+chosen_comprehend_entities = [
+    'BANK_ACCOUNT_NUMBER', 'BANK_ROUTING', 'CREDIT_DEBIT_NUMBER',
+    'CREDIT_DEBIT_CVV', 'CREDIT_DEBIT_EXPIRY', 'PIN', 'EMAIL', 'ADDRESS',
+    'NAME', 'PHONE', 'PASSPORT_NUMBER', 'DRIVER_ID', 'USERNAME', 'PASSWORD',
+    'IP_ADDRESS', 'MAC_ADDRESS', 'LICENSE_PLATE', 'VEHICLE_IDENTIFICATION_NUMBER',
+    'UK_NATIONAL_INSURANCE_NUMBER', 'INTERNATIONAL_BANK_ACCOUNT_NUMBER',
+    'SWIFT_CODE', 'UK_NATIONAL_HEALTH_SERVICE_NUMBER'
+]
+chosen_redact_entities = [
+    "TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE"
+]
+
+# --- Main CLI Function ---
+def main():
+    """
+    A unified command-line interface to prepare, redact, and anonymise various document types.
+    """
+    parser = argparse.ArgumentParser(
+        description='A versatile CLI for redacting PII from PDF/image files and anonymising Word/tabular data.',
+        formatter_class=argparse.RawTextHelpFormatter
+    )
+
+    # --- General Arguments (apply to all file types) ---
+    general_group = parser.add_argument_group('General Options')
+    general_group.add_argument('--input_file', required=True, help='Path to the input file to process.')
+    general_group.add_argument('--output_dir', default=OUTPUT_FOLDER, help='Directory for all output files.')
+    general_group.add_argument('--language', default=DEFAULT_LANGUAGE, help='Language of the document content.')
+    general_group.add_argument('--allow_list', help='Path to a CSV file with words to exclude from redaction.')
+    general_group.add_argument('--pii_detector',
+                               choices=[LOCAL_PII_OPTION, AWS_PII_OPTION],
+                               default=LOCAL_PII_OPTION,
+                               help='Core PII detection method (Local or AWS).')
+    general_group.add_argument('--aws_access_key', default='', help='Your AWS Access Key ID.')
+    general_group.add_argument('--aws_secret_key', default='', help='Your AWS Secret Access Key.')
+
+    # --- PDF/Image Redaction Arguments ---
+    pdf_group = parser.add_argument_group('PDF/Image Redaction Options (.pdf, .png, .jpg)')
+    pdf_group.add_argument('--ocr_method',
+                           choices=[SELECTABLE_TEXT_EXTRACT_OPTION, TESSERACT_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION],
+                           default=TESSERACT_TEXT_EXTRACT_OPTION,
+                           help='OCR method for text extraction from images.')
+    pdf_group.add_argument('--page_min', type=int, default=0, help='First page to redact.')
+    pdf_group.add_argument('--page_max', type=int, default=999, help='Last page to redact.')
+    pdf_group.add_argument('--prepare_for_review', action='store_true', help='Prepare files for reviewing redactions.')
+    pdf_group.add_argument('--no_images', action='store_false', dest='prepare_images', help='Disable image creation for PDF pages.')
+
+    # --- Word/Tabular Anonymisation Arguments ---
+    tabular_group = parser.add_argument_group('Word/Tabular Anonymisation Options (.docx, .csv, .xlsx)')
+    tabular_group.add_argument('--anon_strat', choices=['redact', 'encrypt', 'hash'], default='redact', help='The anonymisation strategy to apply.')
+    tabular_group.add_argument('--columns', nargs='+', default=[], help='A list of column names to anonymise in tabular data.')
+    tabular_group.add_argument('--excel_sheets', nargs='+', default=[], help='Specific Excel sheet names to process.')
+    tabular_group.add_argument('--deny_list', help='Path to a CSV file with specific terms/phrases to redact.')
+    tabular_group.add_argument('--fuzzy_mistakes', type=int, default=1, help='Number of allowed spelling mistakes for fuzzy matching.')
+
+    args = parser.parse_args()
+
+    # --- Initial Setup ---
+    ensure_output_folder_exists(args.output_dir)
+    _, file_extension = os.path.splitext(args.input_file)
+    file_extension = file_extension.lower()
+
+    # Load allow/deny lists
+    allow_list = pd.read_csv(args.allow_list) if args.allow_list else pd.DataFrame()
+    deny_list = pd.read_csv(args.deny_list).iloc[:, 0].tolist() if args.deny_list else []
+
+    # --- Route to the Correct Workflow Based on File Type ---
+
+    # Workflow 1: PDF/Image Redaction
+    if file_extension in ['.pdf', '.png', '.jpg', '.jpeg']:
+        print("--- Detected PDF/Image file. Starting Redaction Workflow... ---")
+        try:
+            # Step 1: Prepare the document
+            print("\nStep 1: Preparing document...")
+            (
+                prep_summary, prepared_pdf_paths, image_file_paths, _, _, pdf_doc,
+                image_annotations, _, original_cropboxes, page_sizes, textract_output_found, _, _, _, _
+            ) = prepare_image_or_pdf(
+                file_paths=[args.input_file], text_extract_method=args.ocr_method,
+                all_line_level_ocr_results_df=pd.DataFrame(), all_page_line_level_ocr_results_with_words_df=pd.DataFrame(),
+                first_loop_state=True, prepare_for_review=args.prepare_for_review,
+                output_folder=args.output_dir, prepare_images=args.prepare_images
+            )
+            print(f"Preparation complete. {prep_summary}")
+
+            # Step 2: Redact the prepared document
+            print("\nStep 2: Running redaction...")
+            (
+                output_summary, output_files, _, _, log_files, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _
+            ) = choose_and_run_redactor(
+                file_paths=[args.input_file], prepared_pdf_file_paths=prepared_pdf_paths,
+                pdf_image_file_paths=image_file_paths, chosen_redact_entities=chosen_redact_entities,
+                chosen_redact_comprehend_entities=chosen_comprehend_entities, text_extraction_method=args.ocr_method,
+                in_allow_list=allow_list, first_loop_state=True, page_min=args.page_min, page_max=args.page_max,
+                pymupdf_doc=pdf_doc, annotations_all_pages=image_annotations, page_sizes=page_sizes,
+                document_cropboxes=original_cropboxes, pii_identification_method=args.pii_detector,
+                aws_access_key_textbox=args.aws_access_key, aws_secret_key_textbox=args.aws_secret_key,
+                language=args.language, output_folder=args.output_dir
+            )
+
+            print("\n--- Redaction Process Complete ---")
+            print(f"Summary: {output_summary}")
+            print(f"\nOutput files saved to: {args.output_dir}")
+            print("Generated Files:", sorted(output_files))
+            if log_files: print("Log Files:", sorted(log_files))
+
+        except Exception as e:
+            print(f"\nAn error occurred during the PDF/Image redaction workflow: {e}")
+
+    # Workflow 2: Word/Tabular Data Anonymisation
+    elif file_extension in ['.docx', '.xlsx', '.xls', '.csv', '.parquet']:
+        print("--- Detected Word/Tabular file. Starting Anonymisation Workflow... ---")
+        try:
+            # Run the anonymisation function directly
+            output_summary, output_files, _, _, log_files, _, _ = anonymise_files_with_open_text(
+                file_paths=[args.input_file],
+                in_text="",  # Not used for file-based operations
+                anon_strat=args.anon_strat,
+                chosen_cols=args.columns,
+                chosen_redact_entities=chosen_redact_entities,
+                in_allow_list=allow_list,
+                in_excel_sheets=args.excel_sheets,
+                first_loop_state=True,
+                output_folder=args.output_dir,
+                in_deny_list=deny_list,
+                max_fuzzy_spelling_mistakes_num=args.fuzzy_mistakes,
+                pii_identification_method=args.pii_detector,
+                chosen_redact_comprehend_entities=chosen_comprehend_entities,
+                aws_access_key_textbox=args.aws_access_key,
+                aws_secret_key_textbox=args.aws_secret_key,
+                language=args.language
+            )
+
+            print("\n--- Anonymisation Process Complete ---")
+            print(f"Summary: {output_summary}")
+            print(f"\nOutput files saved to: {args.output_dir}")
+            print("Generated Files:", sorted(output_files))
+            if log_files: print("Log Files:", sorted(log_files))
+
+        except Exception as e:
+            print(f"\nAn error occurred during the Word/Tabular anonymisation workflow: {e}")
+
+    else:
+        print(f"Error: Unsupported file type '{file_extension}'.")
+        print("Supported types for redaction: .pdf, .png, .jpg, .jpeg")
+        print("Supported types for anonymisation: .docx, .xlsx, .xls, .csv, .parquet")
 
 if __name__ == "__main__":
-    main()
+    main()
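The heart of the new CLI is the extension-based dispatch at the bottom of main(). As a minimal standalone sketch of that routing pattern (the helper name route_workflow is illustrative, not part of the repo):

    import os

    def route_workflow(input_file: str) -> str:
        # Mirror the dispatch in cli_redact.py: pick a workflow by extension.
        _, ext = os.path.splitext(input_file)
        ext = ext.lower()
        if ext in ('.pdf', '.png', '.jpg', '.jpeg'):
            return 'redact'      # PDF/image redaction workflow
        if ext in ('.docx', '.xlsx', '.xls', '.csv', '.parquet'):
            return 'anonymise'   # Word/tabular anonymisation workflow
        raise ValueError(f"Unsupported file type '{ext}'")

    assert route_workflow('report.PDF') == 'redact'
    assert route_workflow('table.csv') == 'anonymise'
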
tools/custom_image_analyser_engine.py CHANGED
@@ -696,8 +696,8 @@ class CustomImageAnalyzerEngine:
     ) -> List[CustomImageRecognizerResult]:
 
         page_text = ""
-        page_text_mapping = []
-        all_text_line_results = []
+        page_text_mapping = list()
+        all_text_line_results = list()
         comprehend_query_number = 0
         print("custom_entities:", custom_entities)
 
@@ -774,13 +774,13 @@ class CustomImageAnalyzerEngine:
 
         # Process text in batches for AWS Comprehend
         current_batch = ""
-        current_batch_mapping = []
+        current_batch_mapping = list()
         batch_char_count = 0
         batch_word_count = 0
 
         for i, text_line in enumerate(line_level_ocr_results):
             words = text_line.text.split()
-            word_start_positions = []
+            word_start_positions = list()
             current_pos = 0
 
             for word in words:
@@ -839,7 +839,7 @@ class CustomImageAnalyzerEngine:
                 comprehend_query_number += 1
 
         # Process results and create bounding boxes
-        combined_results = []
+        combined_results = list()
         for i, text_line in enumerate(line_level_ocr_results):
             line_results = next((results for idx, results in all_text_line_results if idx == i), [])
             if line_results and i < len(ocr_results_with_words):
@@ -872,7 +872,7 @@ class CustomImageAnalyzerEngine:
         allow_list: List[str],
         ocr_results_with_words_child_info: Dict[str, Dict]
     ) -> List[CustomImageRecognizerResult]:
-        redaction_bboxes = []
+        redaction_bboxes = list()
 
         for redaction_relevant_ocr_result in redaction_relevant_ocr_results:
             #print("ocr_results_with_words_child_info:", ocr_results_with_words_child_info)
@@ -895,7 +895,7 @@ class CustomImageAnalyzerEngine:
                 matched_words = matched_text.split()
 
                 # Find the corresponding words in the OCR results
-                matching_word_boxes = []
+                matching_word_boxes = list()
 
                 current_position = 0
 
@@ -1236,13 +1236,13 @@ def run_page_text_redaction(
         )
 
         current_batch = ""
-        current_batch_mapping = []
+        current_batch_mapping = list()
         batch_char_count = 0
         batch_word_count = 0
 
         for i, text_line in enumerate(line_level_text_results_list):
             words = text_line.text.split()
-            word_start_positions = []
+            word_start_positions = list()
 
             # Calculate word start positions within the line
             current_pos = 0
@@ -1320,12 +1320,12 @@ def merge_text_bounding_boxes(analyser_results:dict, characters: List[LTChar], c
     '''
     Merge identified bounding boxes containing PII that are very close to one another
     '''
-    analysed_bounding_boxes = []
-    original_bounding_boxes = []
+    analysed_bounding_boxes = list()
+    original_bounding_boxes = list()  # List to hold original bounding boxes
 
     if len(analyser_results) > 0 and len(characters) > 0:
         # Extract bounding box coordinates for sorting
-        bounding_boxes = []
+        bounding_boxes = list()
         for result in analyser_results:
             #print("Result:", result)
             char_boxes = [char.bbox for char in characters[result.start:result.end] if isinstance(char, LTChar)]
@@ -1346,11 +1346,11 @@ def merge_text_bounding_boxes(analyser_results:dict, characters: List[LTChar], c
         # Sort the results by y-coordinate and then by x-coordinate
         bounding_boxes.sort()
 
-        merged_bounding_boxes = []
+        merged_bounding_boxes = list()
         current_box = None
         current_y = None
         current_result = None
-        current_text = []
+        current_text = list()
 
         for y, x, result, next_box, text in bounding_boxes:
             if current_y is None or current_box is None:
@@ -1406,7 +1406,7 @@ def merge_text_bounding_boxes(analyser_results:dict, characters: List[LTChar], c
     return analysed_bounding_boxes
 
 def recreate_page_line_level_ocr_results_with_page(page_line_level_ocr_results_with_words: dict):
-    reconstructed_results = []
+    reconstructed_results = list()
 
     # Assume all lines belong to the same page, so we can just read it from one item
     #page = next(iter(page_line_level_ocr_results_with_words.values()))["page"]
@@ -1445,7 +1445,7 @@ def split_words_and_punctuation_from_line(line_of_words: List[OCRResult]) -> Lis
     # Punctuation that will be split off. Hyphen is not included.
     PUNCTUATION_TO_SPLIT = {'.', ',', '?', '!', ':', ';', '(', ')', '[', ']', '{', '}'}
 
-    new_word_list = []
+    new_word_list = list()
 
     for word_result in line_of_words:
         word_text = word_result.text
@@ -1528,8 +1528,8 @@ def combine_ocr_results(ocr_results: List[OCRResult], x_threshold: float = 50.0,
     if not ocr_results:
         return {"page": page, "results": []}, {"page": page, "results": {}}
 
-    lines = []
-    current_line = []
+    lines = list()
+    current_line = list()
     for result in sorted(ocr_results, key=lambda x: (x.top, x.left)):
         if not current_line or abs(result.top - current_line[0].top) <= y_threshold:
             current_line.append(result)
@@ -1539,7 +1539,7 @@ def combine_ocr_results(ocr_results: List[OCRResult], x_threshold: float = 50.0,
     if current_line:
         lines.append(sorted(current_line, key=lambda x: x.left))
 
-    page_line_level_ocr_results = []
+    page_line_level_ocr_results = list()
     page_line_level_ocr_results_with_words = {}
     line_counter = 1
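Every change in this file replaces an empty-list initialisation (rendered blank in this diff view, most likely a `[]` literal) with a `list()` call. The two build the same empty list, so the refactor is behaviour-preserving; a quick check:

    assert list() == []          # same value
    assert type(list()) is list  # same type as a [] literal
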
tools/data_anonymise.py CHANGED
@@ -327,7 +327,7 @@ def anonymise_files_with_open_text(file_paths: List[str],
     This function anonymises data files based on the provided parameters.
 
     Parameters:
-    - file_paths (List[str]): A list of file paths to anonymise.
+    - file_paths (List[str]): A list of file paths to anonymise: '.xlsx', '.xls', '.csv', '.parquet', or '.docx'.
     - in_text (str): The text to anonymise if file_paths is 'open_text'.
     - anon_strat (str): The anonymisation strategy to use.
     - chosen_cols (List[str]): A list of column names to anonymise.
tools/example_cli_calls.txt ADDED
@@ -0,0 +1,11 @@
+python cli_redact.py --help
+
+python cli_redact.py \
+  --input_file "documents/confidential-report.pdf" \
+  --output_dir "output/redacted_reports/" \
+  --ocr_method "Local OCR model - PDFs without selectable text" \
+  --pii_detector "Local" \
+  --page_min 2 \
+  --page_max 10 \
+  --allow_list "config/project_allowlist.csv"
+
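The added example file covers only the PDF workflow. A plausible companion call for the Word/tabular workflow, using only flags defined in cli_redact.py above (the file paths here are illustrative, not from the repo):

    python cli_redact.py \
      --input_file "data/customer_records.xlsx" \
      --output_dir "output/anonymised/" \
      --anon_strat hash \
      --columns "Name" "Email" \
      --deny_list "config/deny_terms.csv" \
      --fuzzy_mistakes 2
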
tools/file_conversion.py CHANGED
@@ -72,7 +72,7 @@ def check_image_size_and_reduce(out_path:str, image:Image):
     Check if a given image size is above around 4.5mb, and reduce size if necessary. 5mb is the maximum possible to submit to AWS Textract.
     '''
 
-    all_img_details = []
+    all_img_details = list()
     page_num = 0
 
     # Check file size and resize if necessary
@@ -168,9 +168,9 @@ def convert_pdf_to_images(pdf_path: str, prepare_for_review:bool=False, page_min
     # Set page max to length of pdf if not specified
     if page_max == 0: page_max = page_count
 
-    results = []
+    results = list()
     with ThreadPoolExecutor(max_workers=num_threads) as executor:
-        futures = []
+        futures = list()
         for page_num in range(page_min, page_max):
             futures.append(executor.submit(process_single_page_for_image_conversion, pdf_path, page_num, image_dpi, create_images=create_images, input_folder=input_folder))
 
@@ -222,10 +222,10 @@ def process_file_for_image_creation(file_path:str, prepare_for_review:bool=False
 
     else:
         print(f"{file_path} is not an image or PDF file.")
-        img_path = []
-        image_sizes_width = []
-        image_sizes_height = []
-        all_img_details = []
+        img_path = list()
+        image_sizes_width = list()
+        image_sizes_height = list()
+        all_img_details = list()
 
     return img_path, image_sizes_width, image_sizes_height, all_img_details
 
@@ -234,7 +234,7 @@ def get_input_file_names(file_input:List[str]):
     Get list of input files to report to logs.
     '''
 
-    all_relevant_files = []
+    all_relevant_files = list()
     file_name_with_extension = ""
     full_file_name = ""
     total_pdf_page_count = 0
@@ -419,8 +419,8 @@ def redact_whole_pymupdf_page(rect_height:float, rect_width:float, page:Page, cu
     return whole_page_img_annotation_box
 
 def create_page_size_objects(pymupdf_doc:Document, image_sizes_width:List[float], image_sizes_height:List[float], image_file_paths:List[str]):
-    page_sizes = []
-    original_cropboxes = []
+    page_sizes = list()
+    original_cropboxes = list()
 
     for page_no, page in enumerate(pymupdf_doc):
         reported_page_no = page_no + 1
@@ -443,9 +443,6 @@ def create_page_size_objects(pymupdf_doc:Document, image_sizes_width:List[float]
         out_page_image_sizes['cropbox_x_offset'] = pymupdf_page.cropbox.x0 - pymupdf_page.mediabox.x0
 
         # cropbox_y_offset_from_top: Distance from MediaBox top edge to CropBox top edge
-        # MediaBox top y = mediabox.y1
-        # CropBox top y = cropbox.y1
-        # The difference is mediabox.y1 - cropbox.y1
         out_page_image_sizes['cropbox_y_offset_from_top'] = pymupdf_page.mediabox.y1 - pymupdf_page.cropbox.y1
 
         if image_sizes_width and image_sizes_height:
@@ -460,7 +457,7 @@ def word_level_ocr_output_to_dataframe(ocr_results: dict) -> pd.DataFrame:
     '''
     Convert a json of ocr results to a dataframe
     '''
-    rows = []
+    rows = list()
     ocr_result_page = ocr_results[0]
 
     for ocr_result in ocr_results:
@@ -540,11 +537,11 @@ def prepare_image_or_pdf(
 
     tic = time.perf_counter()
     json_from_csv = False
-    original_cropboxes = []
-    converted_file_paths = []
-    image_file_paths = []
-    # pymupdf_doc = []
-    all_img_details = []
+    original_cropboxes = list()  # Store original CropBox values
+    converted_file_paths = list()
+    image_file_paths = list()
+    # pymupdf_doc = list()
+    all_img_details = list()
     review_file_csv = pd.DataFrame()
    out_textract_path = ""
    combined_out_message = ""
@@ -557,15 +554,15 @@ def prepare_image_or_pdf(
     # If this is the first time around, set variables to 0/blank
     if first_loop_state==True:
         latest_file_completed = 0
-        out_message = []
-        all_annotations_object = []
+        out_message = list()
+        all_annotations_object = list()
     else:
         print("Now redacting file", str(latest_file_completed))
 
     # If combined out message or converted_file_paths are blank, change to a list so it can be appended to
     if isinstance(out_message, str): out_message = [out_message]
 
-    if not file_paths: file_paths = []
+    if not file_paths: file_paths = list()
 
     if isinstance(file_paths, dict): file_paths = os.path.abspath(file_paths["name"])
 
@@ -595,8 +592,8 @@ def prepare_image_or_pdf(
 
     # Loop through files to load in
     for file in file_paths_loop:
-        converted_file_path = []
-        image_file_path = []
+        converted_file_path = list()
+        image_file_path = list()
 
         if isinstance(file, str):
             file_path = file
@@ -631,12 +628,12 @@ def prepare_image_or_pdf(
 
             #Create base version of the annotation object that doesn't have any annotations in it
             if (not all_annotations_object) & (prepare_for_review == True):
-                all_annotations_object = []
+                all_annotations_object = list()
 
                 for image_path in image_file_paths:
                     annotation = {}
                     annotation["image"] = image_path
-                    annotation["boxes"] = []
+                    annotation["boxes"] = list()
 
                     all_annotations_object.append(annotation)
 
@@ -826,29 +823,6 @@ def prepare_image_or_pdf(
             else:
                 print(f"Skipping {file_path}: Expected 1 JSON file, found {len(json_files)}")
 
-        # elif file_extension in ['.csv'] and "ocr_output" in file_path:
-        #     continue
-
-        # Must be something else, return with error message
-        # else:
-        #     if prepare_for_review == False:
-        #         if text_extract_method == TESSERACT_TEXT_EXTRACT_OPTION or text_extract_method == TEXTRACT_TEXT_EXTRACT_OPTION:
-        #             if is_pdf_or_image(file_path) == False:
-        #                 out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
-        #                 print(out_message)
-        #                 raise Exception(out_message)
-
-        #         else:# text_extract_method == SELECTABLE_TEXT_EXTRACT_OPTION:
-        #             if is_pdf(file_path) == False:
-        #                 out_message = "Please upload a PDF file for text analysis."
-        #                 print(out_message)
-        #                 raise Exception(out_message)
-        #     else:
-        #         message = f"File {file_name_with_ext} not a recognised type for review, skipping"
-        #         print(message)
-        #         gr.Info(message)
-        #         continue
-
         converted_file_paths.append(converted_file_path)
         image_file_paths.extend(image_file_path)
 
@@ -966,7 +940,7 @@ def remove_duplicate_images_with_blank_boxes(data: List[dict]) -> List[dict]:
         image_groups[item['image']].append(item)
 
     # Process each group to prioritize items with non-empty boxes
-    result = []
+    result = list()
     for image, items in image_groups.items():
         # Filter items with non-empty boxes
         non_empty_boxes = [item for item in items if item.get('boxes')]
@@ -1496,7 +1470,7 @@ def create_annotation_dicts_from_annotation_df(
 def convert_annotation_json_to_review_df(
     all_annotations: List[dict],
     redaction_decision_output: pd.DataFrame = pd.DataFrame(),
-    page_sizes: List[dict] = [],
+    page_sizes: List[dict] = list(),
     do_proximity_match: bool = True
 ) -> pd.DataFrame:
     '''
@@ -2021,7 +1995,7 @@ def fill_missing_ids(df: pd.DataFrame, column_name: str = 'id', length: int = 12
     # --- Generate Unique IDs ---
     character_set = string.ascii_letters + string.digits  # a-z, A-Z, 0-9
     generated_ids_set = set()  # Keep track of IDs generated *in this run*
-    new_ids_list = []
+    new_ids_list = list()  # Store the generated IDs in order
 
     max_possible_ids = len(character_set) ** length
     if num_needed > max_possible_ids:
@@ -2228,14 +2202,14 @@ def convert_review_df_to_annotation_json(
 
 
     # --- Build JSON Structure ---
-    json_data = []
+    json_data = list()
     output_cols_for_boxes = [col for col in ["label", "color", xmin, ymin, xmax, ymax, "id", "text"] if col in review_file_df.columns]
 
     # Iterate through page_sizes_df to define the structure (one entry per image path)
     for _, row in page_sizes_df.iterrows():
         page_num = row['page']  # Already Int64
         pdf_image_path = row['image_path']
-        annotation_boxes = []
+        annotation_boxes = list()  # Default to empty list
 
         # Check if the page exists in the grouped annotations (using the faster set lookup)
         # Check pd.notna because page_num could be <NA> if conversion failed
@@ -2254,7 +2228,7 @@ def convert_review_df_to_annotation_json(
 
         except KeyError:
             print(f"Warning: Group key {page_num} not found despite being in group_keys (should not happen).")
-            annotation_boxes = []
+            annotation_boxes = list()  # Keep empty
 
         # Append the structured data for this image/page
         json_data.append({
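convert_pdf_to_images (second hunk above) fans page conversions out to a thread pool. The general submit-and-collect pattern it relies on, as a minimal self-contained sketch (render_page is a stand-in for process_single_page_for_image_conversion, not the repo's code):

    from concurrent.futures import ThreadPoolExecutor

    def render_page(page_num: int) -> str:
        # Stand-in for the real per-page conversion function.
        return f"page_{page_num}.png"

    results = list()
    with ThreadPoolExecutor(max_workers=4) as executor:
        futures = [executor.submit(render_page, n) for n in range(0, 10)]
        for future in futures:  # collect in submission order
            results.append(future.result())
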
tools/file_redaction.py
CHANGED
@@ -201,7 +201,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
201 |
pdf_file_name_with_ext = ""
|
202 |
pdf_file_name_without_ext = ""
|
203 |
page_break_return = False
|
204 |
-
blank_request_metadata =
|
205 |
all_textract_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
|
206 |
review_out_file_paths = [prepared_pdf_file_paths[0]]
|
207 |
|
@@ -387,7 +387,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
387 |
if not in_allow_list.empty:
|
388 |
in_allow_list_flat = in_allow_list.iloc[:,0].tolist()
|
389 |
else:
|
390 |
-
in_allow_list_flat =
|
391 |
|
392 |
# If string, assume file path
|
393 |
if isinstance(custom_recogniser_word_list, str):
|
@@ -396,7 +396,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
396 |
if not custom_recogniser_word_list.empty:
|
397 |
custom_recogniser_word_list_flat = custom_recogniser_word_list.iloc[:, 0].tolist()
|
398 |
else:
|
399 |
-
custom_recogniser_word_list_flat =
|
400 |
|
401 |
# Sort the strings in order from the longest string to the shortest
|
402 |
custom_recogniser_word_list_flat = sorted(custom_recogniser_word_list_flat, key=len, reverse=True)
|
@@ -412,7 +412,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
412 |
print("Could not convert whole page redaction data to number list due to:", e)
|
413 |
redact_whole_page_list_flat = redact_whole_page_list.iloc[:,0].tolist()
|
414 |
else:
|
415 |
-
redact_whole_page_list_flat =
|
416 |
|
417 |
|
418 |
|
@@ -1100,7 +1100,7 @@ def set_cropbox_safely(page: Page, original_cropbox: Optional[Rect]):
|
|
1100 |
else:
|
1101 |
page.set_cropbox(original_cropbox)
|
1102 |
|
1103 |
-
def redact_page_with_pymupdf(page:Page, page_annotations:dict, image:Image=None, custom_colours:bool=False, redact_whole_page:bool=False, convert_pikepdf_to_pymupdf_coords:bool=True, original_cropbox:List[Rect]=
|
1104 |
|
1105 |
rect_height = page.rect.height
|
1106 |
rect_width = page.rect.width
|
@@ -1127,7 +1127,7 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image:Image=None,
|
|
1127 |
image_dimensions = {}
|
1128 |
|
1129 |
out_annotation_boxes = {}
|
1130 |
-
all_image_annotation_boxes =
|
1131 |
|
1132 |
if isinstance(image, Image.Image):
|
1133 |
image_path = move_page_info(str(page))
|
@@ -1238,10 +1238,25 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image:Image=None,
|
|
1238 |
# IMAGE-BASED OCR PDF TEXT DETECTION/REDACTION WITH TESSERACT OR AWS TEXTRACT
|
1239 |
###
|
1240 |
|
1241 |
-
def merge_img_bboxes(bboxes:list, combined_results: Dict, page_signature_recogniser_results=
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1242 |
|
1243 |
-
all_bboxes =
|
1244 |
-
merged_bboxes =
|
1245 |
grouped_bboxes = defaultdict(list)
|
1246 |
|
1247 |
# Deep copy original bounding boxes to retain them
|
@@ -1256,7 +1271,7 @@ def merge_img_bboxes(bboxes:list, combined_results: Dict, page_signature_recogni
|
|
1256 |
merged_bboxes.extend(copy.deepcopy(page_signature_recogniser_results))
|
1257 |
|
1258 |
# Reconstruct bounding boxes for substrings of interest
|
1259 |
-
reconstructed_bboxes =
|
1260 |
for bbox in bboxes:
|
1261 |
bbox_box = (bbox.left, bbox.top, bbox.left + bbox.width, bbox.top + bbox.height)
|
1262 |
for line_text, line_info in combined_results.items():
|
@@ -1266,7 +1281,7 @@ def merge_img_bboxes(bboxes:list, combined_results: Dict, page_signature_recogni
|
|
1266 |
start_char = line_text.index(bbox.text)
|
1267 |
end_char = start_char + len(bbox.text)
|
1268 |
|
1269 |
-
relevant_words =
|
1270 |
current_char = 0
|
1271 |
for word in line_info['words']:
|
1272 |
word_end = current_char + len(word['text'])
|
@@ -1501,8 +1516,8 @@ def redact_image_pdf(file_path:str,
|
|
1501 |
progress_bar = tqdm(range(page_loop_start, number_of_pages), unit="pages remaining", desc="Redacting pages")
|
1502 |
|
1503 |
# If there's data from a previous run (passed in via the DataFrame parameters), add it
|
1504 |
-
all_line_level_ocr_results_list =
|
1505 |
-
all_pages_decision_process_list =
|
1506 |
|
1507 |
if not all_page_line_level_ocr_results_df.empty:
|
1508 |
all_line_level_ocr_results_list.extend(all_page_line_level_ocr_results_df.to_dict('records'))
|
@@ -1513,10 +1528,10 @@ def redact_image_pdf(file_path:str,
|
|
1513 |
# Go through each page
|
1514 |
for page_no in progress_bar:
|
1515 |
|
1516 |
-
handwriting_or_signature_boxes =
|
1517 |
-
page_signature_recogniser_results =
|
1518 |
-
page_handwriting_recogniser_results =
|
1519 |
-
page_line_level_ocr_results_with_words =
|
1520 |
page_break_return = False
|
1521 |
reported_page_number = str(page_no + 1)
|
1522 |
|
@@ -1567,7 +1582,7 @@ def redact_image_pdf(file_path:str,
|
|
1567 |
)
|
1568 |
|
1569 |
page_line_level_ocr_results_with_words = matching_page if matching_page else []
|
1570 |
-
else: page_line_level_ocr_results_with_words =
|
1571 |
|
1572 |
if page_line_level_ocr_results_with_words:
|
1573 |
print("Found OCR results for page in existing OCR with words object")
|
@@ -1581,7 +1596,7 @@ def redact_image_pdf(file_path:str,
|
|
1581 |
|
1582 |
# Check if page exists in existing textract data. If not, send to service to analyse
|
1583 |
if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
|
1584 |
-
text_blocks =
|
1585 |
|
1586 |
if not textract_data:
|
1587 |
try:
|
@@ -1619,7 +1634,7 @@ def redact_image_pdf(file_path:str,
|
|
1619 |
text_blocks, new_textract_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
|
1620 |
|
1621 |
# Check if "pages" key exists, if not, initialise it as an empty list
|
1622 |
-
if "pages" not in textract_data: textract_data["pages"] =
|
1623 |
|
1624 |
# Append the new page data
|
1625 |
textract_data["pages"].append(text_blocks)
|
@@ -1627,11 +1642,11 @@ def redact_image_pdf(file_path:str,
|
|
1627 |
except Exception as e:
|
1628 |
out_message = "Textract extraction for page " + reported_page_number + " failed due to:" + str(e)
|
1629 |
print(out_message)
|
1630 |
-
text_blocks =
|
1631 |
new_textract_request_metadata = "Failed Textract API call"
|
1632 |
|
1633 |
# Check if "pages" key exists, if not, initialise it as an empty list
|
1634 |
-
if "pages" not in textract_data: textract_data["pages"] =
|
1635 |
|
1636 |
raise Exception(out_message)
|
1637 |
|
@@ -1678,12 +1693,12 @@ def redact_image_pdf(file_path:str,
|
|
1678 |
|
1679 |
comprehend_query_number = comprehend_query_number + comprehend_query_number_new
|
1680 |
|
1681 |
-
else: page_redaction_bounding_boxes =
|
1682 |
|
1683 |
# Merge redaction bounding boxes that are close together
|
1684 |
page_merged_redaction_bboxes = merge_img_bboxes(page_redaction_bounding_boxes, page_line_level_ocr_results_with_words['results'], page_signature_recogniser_results, page_handwriting_recogniser_results, handwrite_signature_checkbox)
|
1685 |
|
1686 |
-
else: page_merged_redaction_bboxes =
|
1687 |
|
1688 |
# 3. Draw the merged boxes
|
1689 |
## Apply annotations to pdf with pymupdf
|
@@ -1710,7 +1725,7 @@ def redact_image_pdf(file_path:str,
|
|
1710 |
fill = (0, 0, 0) # Fill colour for redactions
|
1711 |
draw = ImageDraw.Draw(image)
|
1712 |
|
1713 |
-
all_image_annotations_boxes =
|
1714 |
|
1715 |
for box in page_merged_redaction_bboxes:
|
1716 |
|
@@ -1914,9 +1929,9 @@ def create_line_level_ocr_results_from_characters(char_objects:List, line_number
|
|
1914 |
Create OCRResult objects based on a list of pdfminer LTChar objects.
|
1915 |
This version is corrected to use the specified OCRResult class definition.
|
1916 |
"""
|
1917 |
-
line_level_results_out =
|
1918 |
-
line_level_characters_out =
|
1919 |
-
character_objects_out =
|
1920 |
|
1921 |
full_text = ""
|
1922 |
# [x0, y0, x1, y1]
|
@@ -1943,7 +1958,7 @@ def create_line_level_ocr_results_from_characters(char_objects:List, line_number
|
|
1943 |
line_level_characters_out.append(character_objects_out)
|
1944 |
|
1945 |
# Reset for the next line
|
1946 |
-
character_objects_out =
|
1947 |
full_text = ""
|
1948 |
overall_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')]
|
1949 |
line_number += 1
|
@@ -2003,7 +2018,7 @@ def generate_words_for_line(line_chars: List) -> List[Dict[str, Any]]:
|
|
2003 |
# The hyphen '-' is intentionally excluded to keep words like 'high-tech' together.
|
2004 |
PUNCTUATION_TO_SPLIT = {'.', ',', '?', '!', ':', ';', '(', ')', '[', ']', '{', '}'}
|
2005 |
|
2006 |
-
line_words =
|
2007 |
current_word_text = ""
|
2008 |
current_word_bbox = [float('inf'), float('inf'), -1, -1] # [x0, y0, x1, y1]
|
2009 |
prev_char = None
|
@@ -2152,7 +2167,7 @@ def create_text_redaction_process_results(analyser_results, analysed_bounding_bo
|
|
2152 |
return decision_process_table
|
2153 |
|
2154 |
def create_pikepdf_annotations_for_bounding_boxes(analysed_bounding_boxes):
|
2155 |
-
pikepdf_redaction_annotations_on_page =
|
2156 |
for analysed_bounding_box in analysed_bounding_boxes:
|
2157 |
|
2158 |
bounding_box = analysed_bounding_box["boundingBox"]
|
@@ -2282,7 +2297,7 @@ def redact_text_pdf(
|
|
2282 |
|
2283 |
#file_name = get_file_name_without_type(file_path)
|
2284 |
|
2285 |
-
if not all_page_line_level_ocr_results_with_words: all_page_line_level_ocr_results_with_words =
|
2286 |
|
2287 |
# Check that page_min and page_max are within expected ranges
|
2288 |
if page_max > number_of_pages or page_max == 0: page_max = number_of_pages
|
@@ -2315,20 +2330,20 @@ def redact_text_pdf(
|
|
2315 |
# Go page by page
|
2316 |
for page_layout in extract_pages(file_path, page_numbers = [page_no], maxpages=1):
|
2317 |
|
2318 |
-
all_page_line_text_extraction_characters =
|
2319 |
-
all_page_line_level_text_extraction_results_list =
|
2320 |
-
page_analyser_results =
|
2321 |
-
page_redaction_bounding_boxes =
|
2322 |
|
2323 |
-
characters =
|
2324 |
-
pikepdf_redaction_annotations_on_page =
|
2325 |
page_decision_process_table = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "text", "id"])
|
2326 |
page_text_ocr_outputs = pd.DataFrame(columns=["page", "text", "left", "top", "width", "height", "line"])
|
2327 |
-
page_text_ocr_outputs_list =
|
2328 |
|
2329 |
text_line_no = 1
|
2330 |
for n, text_container in enumerate(page_layout):
|
2331 |
-
characters =
|
2332 |
|
2333 |
if isinstance(text_container, LTTextContainer) or isinstance(text_container, LTAnno):
|
2334 |
characters = get_text_container_characters(text_container)
|
@@ -2390,7 +2405,7 @@ def redact_text_pdf(
|
|
2390 |
# Annotate redactions on page
|
2391 |
pikepdf_redaction_annotations_on_page = create_pikepdf_annotations_for_bounding_boxes(page_redaction_bounding_boxes)
|
2392 |
|
2393 |
-
else: pikepdf_redaction_annotations_on_page =
|
2394 |
|
2395 |
# Make pymupdf page redactions
|
2396 |
if redact_whole_page_list:
|
|
|
201 |
pdf_file_name_with_ext = ""
|
202 |
pdf_file_name_without_ext = ""
|
203 |
page_break_return = False
|
204 |
+
blank_request_metadata = list()
|
205 |
all_textract_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
|
206 |
review_out_file_paths = [prepared_pdf_file_paths[0]]
|
207 |
|
|
|
387 |
if not in_allow_list.empty:
|
388 |
in_allow_list_flat = in_allow_list.iloc[:,0].tolist()
|
389 |
else:
|
390 |
+
in_allow_list_flat = list()
|
391 |
|
392 |
# If string, assume file path
|
393 |
if isinstance(custom_recogniser_word_list, str):
|
|
|
396 |
if not custom_recogniser_word_list.empty:
|
397 |
custom_recogniser_word_list_flat = custom_recogniser_word_list.iloc[:, 0].tolist()
|
398 |
else:
|
399 |
+
custom_recogniser_word_list_flat = list()
|
400 |
|
401 |
# Sort the strings in order from the longest string to the shortest
|
402 |
custom_recogniser_word_list_flat = sorted(custom_recogniser_word_list_flat, key=len, reverse=True)
|
|
|
412 |
print("Could not convert whole page redaction data to number list due to:", e)
|
413 |
redact_whole_page_list_flat = redact_whole_page_list.iloc[:,0].tolist()
|
414 |
else:
|
415 |
+
redact_whole_page_list_flat = list()
|
416 |
|
417 |
|
418 |
|
|
|
1100 |
else:
|
1101 |
page.set_cropbox(original_cropbox)
|
1102 |
|
1103 |
+
def redact_page_with_pymupdf(page:Page, page_annotations:dict, image:Image=None, custom_colours:bool=False, redact_whole_page:bool=False, convert_pikepdf_to_pymupdf_coords:bool=True, original_cropbox:List[Rect]= list(), page_sizes_df:pd.DataFrame=pd.DataFrame()):
|
1104 |
|
1105 |
rect_height = page.rect.height
|
1106 |
rect_width = page.rect.width
|
|
|
1127 |
image_dimensions = {}
|
1128 |
|
1129 |
out_annotation_boxes = {}
|
1130 |
+
all_image_annotation_boxes = list()
|
1131 |
|
1132 |
if isinstance(image, Image.Image):
|
1133 |
image_path = move_page_info(str(page))
|
|
|
1238 |
# IMAGE-BASED OCR PDF TEXT DETECTION/REDACTION WITH TESSERACT OR AWS TEXTRACT
|
1239 |
###
|
1240 |
|
1241 |
+
def merge_img_bboxes(bboxes: list, combined_results: Dict, page_signature_recogniser_results: list = list(), page_handwriting_recogniser_results: list = list(), handwrite_signature_checkbox: List[str] = ["Extract handwriting", "Extract signatures"], horizontal_threshold: int = 50, vertical_threshold: int = 12):
|
1242 |
+
"""
|
1243 |
+
Merges bounding boxes for image annotations based on the provided results from signature and handwriting recognizers.
|
1244 |
+
|
1245 |
+
Args:
|
1246 |
+
bboxes (list): A list of bounding boxes to be merged.
|
1247 |
+
combined_results (Dict): A dictionary containing combined results with line text and their corresponding bounding boxes.
|
1248 |
+
page_signature_recogniser_results (list, optional): A list of results from the signature recognizer. Defaults to an empty list.
|
1249 |
+
page_handwriting_recogniser_results (list, optional): A list of results from the handwriting recognizer. Defaults to an empty list.
|
1250 |
+
handwrite_signature_checkbox (List[str], optional): A list of options indicating whether to extract handwriting and signatures. Defaults to ["Extract handwriting", "Extract signatures"].
|
1251 |
+
horizontal_threshold (int, optional): The threshold for merging bounding boxes horizontally. Defaults to 50.
|
1252 |
+
vertical_threshold (int, optional): The threshold for merging bounding boxes vertically. Defaults to 12.
|
1253 |
+
|
1254 |
+
Returns:
|
1255 |
+
None: This function modifies the bounding boxes in place and does not return a value.
|
1256 |
+
"""
|
1257 |
|
1258 |
+
all_bboxes = list()
|
1259 |
+
merged_bboxes = list()
|
1260 |
grouped_bboxes = defaultdict(list)
|
1261 |
|
1262 |
# Deep copy original bounding boxes to retain them
|
|
|
1271 |
merged_bboxes.extend(copy.deepcopy(page_signature_recogniser_results))
|
1272 |
|
1273 |
# Reconstruct bounding boxes for substrings of interest
|
1274 |
+
reconstructed_bboxes = list()
|
1275 |
for bbox in bboxes:
|
1276 |
bbox_box = (bbox.left, bbox.top, bbox.left + bbox.width, bbox.top + bbox.height)
|
1277 |
for line_text, line_info in combined_results.items():
|
|
|
1281 |
start_char = line_text.index(bbox.text)
|
1282 |
end_char = start_char + len(bbox.text)
|
1283 |
|
1284 |
+
relevant_words = list()
|
1285 |
current_char = 0
|
1286 |
for word in line_info['words']:
|
1287 |
word_end = current_char + len(word['text'])
|
|
|
1516 |
progress_bar = tqdm(range(page_loop_start, number_of_pages), unit="pages remaining", desc="Redacting pages")
|
1517 |
|
1518 |
# If there's data from a previous run (passed in via the DataFrame parameters), add it
|
1519 |
+
all_line_level_ocr_results_list = list()
|
1520 |
+
all_pages_decision_process_list = list()
|
1521 |
|
1522 |
if not all_page_line_level_ocr_results_df.empty:
|
1523 |
all_line_level_ocr_results_list.extend(all_page_line_level_ocr_results_df.to_dict('records'))
|
|
|
1528 |
# Go through each page
|
1529 |
for page_no in progress_bar:
|
1530 |
|
1531 |
+
handwriting_or_signature_boxes = list()
|
1532 |
+
page_signature_recogniser_results = list()
|
1533 |
+
page_handwriting_recogniser_results = list()
|
1534 |
+
page_line_level_ocr_results_with_words = list()
|
1535 |
page_break_return = False
|
1536 |
reported_page_number = str(page_no + 1)
|
1537 |
|
|
|
1582 |
)
|
1583 |
|
1584 |
page_line_level_ocr_results_with_words = matching_page if matching_page else []
|
1585 |
+
else: page_line_level_ocr_results_with_words = list()
|
1586 |
|
1587 |
if page_line_level_ocr_results_with_words:
|
1588 |
print("Found OCR results for page in existing OCR with words object")
|
|
|
1596 |
|
1597 |
# Check if page exists in existing textract data. If not, send to service to analyse
|
1598 |
if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
|
1599 |
+
text_blocks = list()
|
1600 |
|
1601 |
if not textract_data:
|
1602 |
try:
|
|
|
1634   text_blocks, new_textract_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
1635
1636   # Check if "pages" key exists, if not, initialise it as an empty list
1637 + if "pages" not in textract_data: textract_data["pages"] = list()
1638
1639   # Append the new page data
1640   textract_data["pages"].append(text_blocks)

1642   except Exception as e:
1643   out_message = "Textract extraction for page " + reported_page_number + " failed due to:" + str(e)
1644   print(out_message)
1645 + text_blocks = list()
1646   new_textract_request_metadata = "Failed Textract API call"
1647
1648   # Check if "pages" key exists, if not, initialise it as an empty list
1649 + if "pages" not in textract_data: textract_data["pages"] = list()
1650
1651   raise Exception(out_message)
1652
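The pattern above caches Textract output per page so each page is sent to the service at most once across runs. A minimal sketch of that idea, assuming a hypothetical cache shaped like `{"pages": [{"page_no": ..., "data": ...}, ...]}` (the real structure in file_redaction.py may differ):

```python
# Minimal sketch of per-page caching around a Textract call; the cache shape
# and helper names are assumptions, not the repo's confirmed API.
def get_page_blocks(textract_data, page_no, analyse_page_with_textract):
    if "pages" not in textract_data:
        textract_data["pages"] = list()
    for entry in textract_data["pages"]:
        if entry["page_no"] == page_no:
            return entry["data"]                 # cache hit: skip the API call
    data = analyse_page_with_textract(page_no)   # cache miss: one API call
    textract_data["pages"].append({"page_no": page_no, "data": data})
    return data
```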
1693
1694   comprehend_query_number = comprehend_query_number + comprehend_query_number_new
1695
1696 + else: page_redaction_bounding_boxes = list()
1697
1698   # Merge redaction bounding boxes that are close together
1699   page_merged_redaction_bboxes = merge_img_bboxes(page_redaction_bounding_boxes, page_line_level_ocr_results_with_words['results'], page_signature_recogniser_results, page_handwriting_recogniser_results, handwrite_signature_checkbox)
1700
1701 + else: page_merged_redaction_bboxes = list()
1702
1703   # 3. Draw the merged boxes
1704   ## Apply annotations to pdf with pymupdf

1725   fill = (0, 0, 0) # Fill colour for redactions
1726   draw = ImageDraw.Draw(image)
1727
1728 + all_image_annotations_boxes = list()
1729
1730   for box in page_merged_redaction_bboxes:
1731
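For image-based pages, the merged boxes are painted straight onto the rendered page image with Pillow. A runnable sketch of that drawing step, using stand-in boxes in pixel coordinates (left, top, right, bottom):

```python
# Minimal Pillow sketch of the drawing step; the image and boxes are stand-ins.
from PIL import Image, ImageDraw

image = Image.new("RGB", (600, 800), "white")   # stand-in for the rendered page
draw = ImageDraw.Draw(image)
fill = (0, 0, 0)                                # fill colour for redactions

for box in [(50, 100, 250, 120), (50, 160, 180, 180)]:
    draw.rectangle(box, fill=fill)              # paint a solid black box over the text
```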
1929   Create OCRResult objects based on a list of pdfminer LTChar objects.
1930   This version is corrected to use the specified OCRResult class definition.
1931   """
1932 + line_level_results_out = list()
1933 + line_level_characters_out = list()
1934 + character_objects_out = list()
1935
1936   full_text = ""
1937   # [x0, y0, x1, y1]

1958   line_level_characters_out.append(character_objects_out)
1959
1960   # Reset for the next line
1961 + character_objects_out = list()
1962   full_text = ""
1963   overall_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')]
1964   line_number += 1
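The routine above walks pdfminer's character objects, concatenating text and expanding a running bounding box until a line break, then resets for the next line. A small sketch of the bbox expansion, assuming the standard pdfminer convention that `LTChar.bbox` is `(x0, y0, x1, y1)` with the origin at the bottom-left:

```python
# Sketch of expanding a line-level bbox one pdfminer character at a time.
from pdfminer.layout import LTChar

def add_char_to_line_bbox(char: LTChar, overall_bbox):
    x0, y0, x1, y1 = char.bbox                    # pdfminer: origin at bottom-left
    overall_bbox[0] = min(overall_bbox[0], x0)
    overall_bbox[1] = min(overall_bbox[1], y0)
    overall_bbox[2] = max(overall_bbox[2], x1)
    overall_bbox[3] = max(overall_bbox[3], y1)
    return overall_bbox

# Start value mirrors the reset line in the diff above:
overall_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')]
```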
2018   # The hyphen '-' is intentionally excluded to keep words like 'high-tech' together.
2019   PUNCTUATION_TO_SPLIT = {'.', ',', '?', '!', ':', ';', '(', ')', '[', ']', '{', '}'}
2020
2021 + line_words = list()
2022   current_word_text = ""
2023   current_word_bbox = [float('inf'), float('inf'), -1, -1] # [x0, y0, x1, y1]
2024   prev_char = None
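A rough, self-contained sketch of the splitting rule those lines set up: break on whitespace and the punctuation set, but let hyphens ride along inside a word (how punctuation tokens themselves are kept in the real code may differ):

```python
# Illustrative splitter: whitespace and listed punctuation end a word;
# hyphens do not, so 'high-tech' stays one token.
PUNCTUATION_TO_SPLIT = {'.', ',', '?', '!', ':', ';', '(', ')', '[', ']', '{', '}'}

def split_line(text):
    words, current = [], ""
    for ch in text:
        if ch.isspace() or ch in PUNCTUATION_TO_SPLIT:
            if current:
                words.append(current)
                current = ""
        else:
            current += ch
    if current:
        words.append(current)
    return words

assert split_line("high-tech, low cost.") == ["high-tech", "low", "cost"]
```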
2167   return decision_process_table
2168
2169   def create_pikepdf_annotations_for_bounding_boxes(analysed_bounding_boxes):
2170 + pikepdf_redaction_annotations_on_page = list()
2171   for analysed_bounding_box in analysed_bounding_boxes:
2172
2173   bounding_box = analysed_bounding_box["boundingBox"]
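Each bounding box becomes a PDF annotation dictionary that pikepdf can attach to a page. A hypothetical sketch of building one such dictionary (the field choices here are illustrative; the function's actual fields may differ):

```python
# Hypothetical sketch of one pikepdf annotation per box.
# Rect is in PDF points, origin at the bottom-left of the page.
from pikepdf import Array, Dictionary, Name

def make_box_annotation(bounding_box):
    x0, y0, x1, y1 = bounding_box
    return Dictionary(
        Type=Name("/Annot"),
        Subtype=Name("/Square"),      # rectangle annotation
        Rect=Array([x0, y0, x1, y1]),
        IC=Array([0, 0, 0]),          # interior colour: black
    )
```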
2297
2298   #file_name = get_file_name_without_type(file_path)
2299
2300 + if not all_page_line_level_ocr_results_with_words: all_page_line_level_ocr_results_with_words = list()
2301
2302   # Check that page_min and page_max are within expected ranges
2303   if page_max > number_of_pages or page_max == 0: page_max = number_of_pages

2330   # Go page by page
2331   for page_layout in extract_pages(file_path, page_numbers = [page_no], maxpages=1):
2332
2333 + all_page_line_text_extraction_characters = list()
2334 + all_page_line_level_text_extraction_results_list = list()
2335 + page_analyser_results = list()
2336 + page_redaction_bounding_boxes = list()
2337
2338 + characters = list()
2339 + pikepdf_redaction_annotations_on_page = list()
2340   page_decision_process_table = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "text", "id"])
2341   page_text_ocr_outputs = pd.DataFrame(columns=["page", "text", "left", "top", "width", "height", "line"])
2342 + page_text_ocr_outputs_list = list()
2343
2344   text_line_no = 1
2345   for n, text_container in enumerate(page_layout):
2346 + characters = list()
2347
2348   if isinstance(text_container, LTTextContainer) or isinstance(text_container, LTAnno):
2349   characters = get_text_container_characters(text_container)
2405   # Annotate redactions on page
2406   pikepdf_redaction_annotations_on_page = create_pikepdf_annotations_for_bounding_boxes(page_redaction_bounding_boxes)
2407
2408 + else: pikepdf_redaction_annotations_on_page = list()
2409
2410   # Make pymupdf page redactions
2411   if redact_whole_page_list:
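The whole-page branch ends the file_redaction.py changes. For reference, a minimal PyMuPDF sketch of redacting an entire page (the 1-based page numbering and the file paths here are assumptions, not the repo's confirmed conventions):

```python
# Minimal PyMuPDF sketch of the whole-page redaction step.
import fitz  # PyMuPDF

doc = fitz.open("example.pdf")        # hypothetical input path
redact_whole_page_list = [1]          # assumed 1-based page numbers

for page_no in redact_whole_page_list:
    page = doc[page_no - 1]
    page.add_redact_annot(page.rect, fill=(0, 0, 0))  # cover the full page
    page.apply_redactions()           # burn the redactions into the page

doc.save("example_redacted.pdf")
```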
tools/redaction_review.py
CHANGED
@@ -99,8 +99,8 @@ def get_filtered_recogniser_dataframe_and_dropdowns(page_image_annotator_object:
 99   recogniser_dropdown_value:str,
100   text_dropdown_value:str,
101   page_dropdown_value:str,
102 - review_df:pd.DataFrame=
103 - page_sizes:List[str]=
102 + review_df:pd.DataFrame=list(),
103 + page_sizes:List[str]=list()):
104   '''
105   Create a filtered recogniser dataframe and associated dropdowns based on current information in the image annotator and review data frame.
106   '''

@@ -147,7 +147,7 @@ def get_filtered_recogniser_dataframe_and_dropdowns(page_image_annotator_object:
147
148   return recogniser_dataframe_out_gr, recogniser_dataframe_out, recogniser_entities_drop, recogniser_entities_list, text_entities_drop, page_entities_drop
149
150 - def update_recogniser_dataframes(page_image_annotator_object:AnnotatedImageData, recogniser_dataframe_base:pd.DataFrame, recogniser_entities_dropdown_value:str="ALL", text_dropdown_value:str="ALL", page_dropdown_value:str="ALL", review_df:pd.DataFrame=
150 + def update_recogniser_dataframes(page_image_annotator_object:AnnotatedImageData, recogniser_dataframe_base:pd.DataFrame, recogniser_entities_dropdown_value:str="ALL", text_dropdown_value:str="ALL", page_dropdown_value:str="ALL", review_df:pd.DataFrame=list(), page_sizes:list[str]=list()):
151   '''
152   Update recogniser dataframe information that appears alongside the pdf pages on the review screen.
153   '''
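A side note on the recurring `[]` to `list()` substitution in these signatures: Python evaluates default values once, at definition time, so a `list()` default is shared across calls exactly as `[]` would be. A standalone demonstration of the behaviour and the usual `None` idiom (not code from this repo):

```python
def append_one(items=list()):     # the same list object is reused on every call
    items.append(1)
    return items

print(append_one())  # [1]
print(append_one())  # [1, 1]  <- state leaks between calls

def append_one_safe(items=None):  # the common idiom: create the list per call
    if items is None:
        items = []
    items.append(1)
    return items
```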
@@ -265,7 +265,7 @@ def update_annotator_page_from_review_df(
265   if not current_page_review_df.empty:
266   # Convert the current page's review data to annotation list format for *this page*
267
268 - current_page_annotations_list =
268 + current_page_annotations_list = list()
269   # Define expected annotation dict keys, including 'image', 'page', coords, 'label', 'text', 'color' etc.
270   # Assuming review_df has compatible columns
271   expected_annotation_keys = ['label', 'color', 'xmin', 'ymin', 'xmax', 'ymax', 'text', 'id'] # Add/remove as needed

@@ -340,7 +340,7 @@ def update_annotator_page_from_review_df(
340   if not page_sizes_df.empty:
341   page_sizes = page_sizes_df.to_dict(orient='records')
342   else:
343 - page_sizes =
343 + page_sizes = list() # Ensure page_sizes is a list if df is empty
344
345   # --- Re-evaluate Coordinate Multiplication and Duplicate Removal ---
346   # Let's assume remove_duplicate_images_with_blank_boxes expects the raw list of dicts state format:

@@ -609,7 +609,7 @@ def create_annotation_objects_from_filtered_ocr_results_with_words(
609   merged_df = merged_df.sort_values('image')
610
611
612 - final_annotations_list =
612 + final_annotations_list = list()
613   box_cols = ['label', 'color', 'xmin', 'ymin', 'xmax', 'ymax', 'text', 'id']
614
615   # Now, when we group, we use `sort=False`. This tells groupby to respect the

@@ -622,7 +622,7 @@ def create_annotation_objects_from_filtered_ocr_results_with_words(
622   # Check if the group has actual annotations. iloc[0] is safe because even pages
623   # without annotations will have one row with NaN values from the merge.
624   if pd.isna(group.iloc[0].get('id')):
625 - boxes =
625 + boxes = list()
626   else:
627   valid_box_cols = [col for col in box_cols if col in group.columns]
628   # We should also sort the boxes within a page for consistency (e.g., left-to-right)
@@ -751,7 +751,7 @@ def update_annotator_object_and_filter_df(
751   recogniser_dataframe_base:pd.DataFrame=None, # Simplified default
752   zoom:int=100,
753   review_df:pd.DataFrame=None, # Use None for default empty DataFrame
754 - page_sizes:List[dict]=
754 + page_sizes:List[dict]=list(),
755   doc_full_file_name_textbox:str='',
756   input_folder:str=INPUT_FOLDER
757   ) -> Tuple[image_annotator, gr.Number, gr.Number, int, str, gr.Dataframe, pd.DataFrame, List[str], List[str], List[dict], List[AnnotatedImageData]]:

@@ -775,7 +775,7 @@ def update_annotator_object_and_filter_df(
775   # Return blank/default outputs
776
777   blank_annotator = image_annotator(
778 - value = None, boxes_alpha=0.1, box_thickness=1, label_list=
778 + value = None, boxes_alpha=0.1, box_thickness=1, label_list=list(), label_colors=list(),
779   show_label=False, height=zoom_str, width=zoom_str, box_min_size=1,
780   box_selected_thickness=2, handle_size=4, sources=None,
781   show_clear_button=False, show_share_button=False, show_remove_button=False,

@@ -851,7 +851,7 @@ def update_annotator_object_and_filter_df(
851   if not page_sizes_df.empty:
852   page_sizes = page_sizes_df.to_dict(orient='records')
853   else:
854 - page_sizes =
854 + page_sizes = list() # Ensure page_sizes is a list if df is empty
855
856   # --- OPTIMIZATION: Prepare data *only* for the current page for display ---
857   current_page_image_annotator_object = None

@@ -907,12 +907,12 @@ def update_annotator_object_and_filter_df(
907
908   except Exception as e:
909   print(f"Error calling update_recogniser_dataframes: {e}. Returning empty/default filter data.")
910 - recogniser_entities_list =
911 - recogniser_colour_list =
910 + recogniser_entities_list = list()
911 + recogniser_colour_list = list()
912   recogniser_dataframe_out_gr = gr.Dataframe(pd.DataFrame(columns=["page", "label", "text", "id"]))
913   recogniser_dataframe_modified = pd.DataFrame(columns=["page", "label", "text", "id"])
914 - text_entities_drop =
915 - page_entities_drop =
914 + text_entities_drop = list()
915 + page_entities_drop = list()
916
917
918   # --- Final Output Components ---

@@ -946,7 +946,7 @@ def update_annotator_object_and_filter_df(
946   interactive=True # Keep interactive if data is present
947   )
948
949 - page_entities_drop_redaction_list =
949 + page_entities_drop_redaction_list = list()
950   all_pages_in_doc_list = [str(i) for i in range(1, len(page_sizes) + 1)]
951   page_entities_drop_redaction_list.extend(all_pages_in_doc_list)
952

@@ -970,7 +970,7 @@ def update_all_page_annotation_object_based_on_previous_page(
970   current_page:int,
971   previous_page:int,
972   all_image_annotations:List[AnnotatedImageData],
973 - page_sizes:List[dict]=
973 + page_sizes:List[dict]=list(),
974   clear_all:bool=False
975   ):
976   '''

@@ -991,7 +991,7 @@ def update_all_page_annotation_object_based_on_previous_page(
991   page_image_annotator_object, all_image_annotations = replace_annotator_object_img_np_array_with_page_sizes_image_path(all_image_annotations, page_image_annotator_object, page_sizes, previous_page)
992
993   if clear_all == False: all_image_annotations[previous_page_zero_index] = page_image_annotator_object
994 - else: all_image_annotations[previous_page_zero_index]["boxes"] =
994 + else: all_image_annotations[previous_page_zero_index]["boxes"] = list()
995
996   return all_image_annotations, current_page, current_page
997
@@ -1003,16 +1003,16 @@ def apply_redactions_to_review_df_and_files(page_image_annotator_object:Annotate
1003   review_file_state:pd.DataFrame,
1004   output_folder:str = OUTPUT_FOLDER,
1005   save_pdf:bool=True,
1006 - page_sizes:List[dict]=
1006 + page_sizes:List[dict]=list(),
1007   COMPRESS_REDACTED_PDF:bool=COMPRESS_REDACTED_PDF,
1008   progress=gr.Progress(track_tqdm=True)):
1009   '''
1010   Apply modified redactions to a pymupdf and export review files.
1011   '''
1012
1013 - output_files =
1014 - output_log_files =
1015 - pdf_doc =
1013 + output_files = list()
1014 + output_log_files = list()
1015 + pdf_doc = list()
1016   review_df = review_file_state
1017
1018   page_image_annotator_object = all_image_annotations[current_page - 1]

@@ -1078,7 +1078,7 @@ def apply_redactions_to_review_df_and_files(page_image_annotator_object:Annotate
1078   doc = [image]
1079
1080   elif file_extension in '.csv':
1081 - pdf_doc =
1081 + pdf_doc = list()
1082
1083   # If working with pdfs
1084   elif is_pdf(file_path) == True:

@@ -1088,7 +1088,7 @@ def apply_redactions_to_review_df_and_files(page_image_annotator_object:Annotate
1088   output_files.append(orig_pdf_file_path)
1089
1090   number_of_pages = pdf_doc.page_count
1091 - original_cropboxes =
1091 + original_cropboxes = list()
1092
1093   page_sizes_df = pd.DataFrame(page_sizes)
1094   page_sizes_df[["page"]] = page_sizes_df[["page"]].apply(pd.to_numeric, errors="coerce")
@@ -1619,7 +1619,7 @@ def convert_pymupdf_coords_to_adobe(x1: float, y1: float, x2: float, y2: float,
1619
1620   return x1, adobe_y1, x2, adobe_y2
1621
1622 - def create_xfdf(review_file_df:pd.DataFrame, pdf_path:str, pymupdf_doc:object, image_paths:List[str]=
1622 + def create_xfdf(review_file_df:pd.DataFrame, pdf_path:str, pymupdf_doc:object, image_paths:List[str]=list(), document_cropboxes:List=list(), page_sizes:List[dict]=list()):
1623   '''
1624   Create an xfdf file from a review csv file and a pdf
1625   '''
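The context above shows `convert_pymupdf_coords_to_adobe` returning `x1, adobe_y1, x2, adobe_y2`. A minimal sketch of the y-axis flip such a helper performs, assuming the standard conventions (PyMuPDF measures y down from the top of the page, Adobe/XFDF up from the bottom); the actual body may differ:

```python
# Sketch: flip a top-origin (PyMuPDF) box into a bottom-origin (Adobe) box.
def convert_pymupdf_coords_to_adobe(x1: float, y1: float, x2: float, y2: float,
                                    page_height: float):
    adobe_y1 = page_height - y2   # bottom edge in Adobe coordinates
    adobe_y2 = page_height - y1   # top edge in Adobe coordinates
    return x1, adobe_y1, x2, adobe_y2
```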
@@ -1711,11 +1711,11 @@ def create_xfdf(review_file_df:pd.DataFrame, pdf_path:str, pymupdf_doc:object, i
1711   reparsed = minidom.parseString(rough_string)
1712   return reparsed.toxml() #.toprettyxml(indent=" ")
1713
1714 - def convert_df_to_xfdf(input_files:List[str], pdf_doc:Document, image_paths:List[str], output_folder:str = OUTPUT_FOLDER, document_cropboxes:List=
1714 + def convert_df_to_xfdf(input_files:List[str], pdf_doc:Document, image_paths:List[str], output_folder:str = OUTPUT_FOLDER, document_cropboxes:List=list(), page_sizes:List[dict]=list()):
1715   '''
1716   Load in files to convert a review file into an Adobe comment file format
1717   '''
1718 - output_paths =
1718 + output_paths = list()
1719   pdf_name = ""
1720   file_path_name = ""
1721
@@ -1814,7 +1814,7 @@ def parse_xfdf(xfdf_path:str):
1814   # Define the namespace
1815   namespace = {'xfdf': 'http://ns.adobe.com/xfdf/'}
1816
1817 - redactions =
1817 + redactions = list()
1818
1819   # Find all redact elements using the namespace
1820   for redact in root.findall('.//xfdf:redact', namespaces=namespace):

@@ -1846,8 +1846,8 @@ def convert_xfdf_to_dataframe(file_paths_list:List[str], pymupdf_doc, image_path
1846   Returns:
1847   - DataFrame containing redaction information
1848   '''
1849 - output_paths =
1850 - xfdf_paths =
1849 + output_paths = list()
1850 + xfdf_paths = list()
1851   df = pd.DataFrame()
1852
1853   # Sort the file paths so that the pdfs come first
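For context on the `findall` call shown above, a minimal sketch of reading redact elements from an XFDF file with the Adobe namespace (the `rect` and `page` attribute handling is an assumption about the XFDF produced here, not confirmed from the repo):

```python
# Sketch: extract redact annotations from an XFDF file with ElementTree.
import xml.etree.ElementTree as ET

def parse_xfdf_sketch(xfdf_path: str):
    tree = ET.parse(xfdf_path)
    root = tree.getroot()
    namespace = {'xfdf': 'http://ns.adobe.com/xfdf/'}

    redactions = list()
    for redact in root.findall('.//xfdf:redact', namespaces=namespace):
        rect = redact.get('rect', '')     # "x1,y1,x2,y2" in PDF points
        page = redact.get('page')         # XFDF page indices are zero-based
        coords = [float(c) for c in rect.split(',')] if rect else []
        redactions.append({'page': page, 'rect': coords})
    return redactions
```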