seanpedrickcase commited on
Commit
d3e6a24
·
1 Parent(s): d60759d

Added form, table, and layout extraction options to AWS Textract calls. Added options to config to bound document length, maximum table rows, etc.

Browse files
app.py CHANGED
@@ -2,7 +2,7 @@ import os
2
  import pandas as pd
3
  import gradio as gr
4
  from gradio_image_annotation import image_annotator
5
- from tools.config import DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX, DEFAULT_TABULAR_ANONYMISATION_STRATEGY, OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, DEFAULT_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE, OUTPUT_COST_CODES_PATH, ALLOW_LIST_PATH, COGNITO_AUTH, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_ACCESS_LOG_HEADERS, CSV_ACCESS_LOG_HEADERS, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_FEEDBACK_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, USAGE_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_USAGE_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, TEXTRACT_JOBS_S3_INPUT_LOC, TEXTRACT_TEXT_EXTRACT_OPTION, NO_REDACTION_PII_OPTION, TEXT_EXTRACTION_MODELS, PII_DETECTION_MODELS, DEFAULT_TEXT_EXTRACTION_MODEL, DEFAULT_PII_DETECTION_MODEL, LOG_FILE_NAME, CHOSEN_COMPREHEND_ENTITIES, FULL_COMPREHEND_ENTITY_LIST, CHOSEN_REDACT_ENTITIES, FULL_ENTITY_LIST, FILE_INPUT_HEIGHT, TABULAR_PII_DETECTION_MODELS, USAGE_LOG_FILE_NAME, FEEDBACK_LOG_FILE_NAME, CONFIG_FOLDER, GRADIO_TEMP_DIR, MPLCONFIGDIR, S3_FEEDBACK_LOGS_FOLDER, S3_ACCESS_LOGS_FOLDER, S3_USAGE_LOGS_FOLDER, CHOSEN_LOCAL_OCR_MODEL, DEFAULT_LANGUAGE_FULL_NAME, SHOW_LANGUAGE_SELECTION, DO_INITIAL_TABULAR_DATA_CLEAN, DEFAULT_DUPLICATE_DETECTION_THRESHOLD, DIRECT_MODE_TASK, DIRECT_MODE_INPUT_FILE, DIRECT_MODE_OUTPUT_DIR, DIRECT_MODE_DUPLICATE_TYPE, DIRECT_MODE_DEFAULT_USER, 
LOCAL_PII_OPTION, TESSERACT_TEXT_EXTRACT_OPTION, AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION, DOCUMENT_REDACTION_BUCKET, IMAGES_DPI, PREPROCESS_LOCAL_OCR_IMAGES, COMPRESS_REDACTED_PDF, RETURN_PDF_END_OF_REDACTION, DENY_LIST_PATH, WHOLE_PAGE_REDACTION_LIST_PATH, DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX, DEFAULT_TABULAR_ANONYMISATION_STRATEGY, DEFAULT_FUZZY_SPELLING_MISTAKES_NUM, DEFAULT_PAGE_MIN, DEFAULT_PAGE_MAX, DEFAULT_EXCEL_SHEETS, USE_GREEDY_DUPLICATE_DETECTION, DEFAULT_MIN_CONSECUTIVE_PAGES, DEFAULT_COMBINE_PAGES, DEFAULT_MIN_WORD_COUNT, DEFAULT_TEXT_COLUMNS, DEFAULT_SEARCH_QUERY, REMOVE_DUPLICATE_ROWS
6
  from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select, check_for_relevant_ocr_output_with_words, reset_data_vars, reset_aws_call_vars, _get_env_list, ensure_folder_exists, reset_ocr_with_words_base_dataframe, update_language_dropdown, LANGUAGE_CHOICES, MAPPED_LANGUAGE_CHOICES
7
  from tools.aws_functions import download_file_from_s3, upload_log_file_to_s3
8
  from tools.file_redaction import choose_and_run_redactor
@@ -349,7 +349,7 @@ with app:
349
  text_extract_method_radio = gr.Radio(label="""Choose text extraction method. Local options are lower quality but cost nothing - they may be worth a try if you are willing to spend some time reviewing outputs. AWS Textract has a cost per page - £2.66 ($3.50) per 1,000 pages with signature detection (default), £1.14 ($1.50) without. Change the settings in the tab below (AWS Textract signature detection) to change this.""", value = DEFAULT_TEXT_EXTRACTION_MODEL, choices=TEXT_EXTRACTION_MODELS)
350
 
351
  with gr.Accordion("Enable AWS Textract signature detection (default is off)", open = False):
352
- handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract extraction settings", choices=["Extract handwriting", "Extract signatures"], value=DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX)
353
 
354
  with gr.Row(equal_height=True):
355
  pii_identification_method_drop = gr.Radio(label = """Choose personal information detection method. The local model is lower quality but costs nothing - it may be worth a try if you are willing to spend some time reviewing outputs, or if you are only interested in searching for custom search terms (see Redaction settings - custom deny list). AWS Comprehend has a cost of around £0.0075 ($0.01) per 10,000 characters.""", value = DEFAULT_PII_DETECTION_MODEL, choices=PII_DETECTION_MODELS)
@@ -385,7 +385,7 @@ with app:
385
  with gr.Column(scale=2):
386
  textract_job_detail_df = gr.Dataframe(label="Previous job details", visible=True, type="pandas", wrap=True, interactive=True, row_count=(0, 'fixed'), col_count=(5,'fixed'), static_columns=[0,1,2,3,4], max_height=400)
387
  with gr.Column(scale=1):
388
- job_id_textbox = gr.Textbox(label = "Job ID to check status", value='', visible=True)
389
  check_state_of_textract_api_call_btn = gr.Button("Check status of Textract job and download", variant="secondary", visible=True)
390
  with gr.Row():
391
  with gr.Column():
@@ -604,7 +604,7 @@ with app:
604
  with gr.Accordion("Upload docx, xlsx, or csv files", open = True):
605
  in_data_files = gr.File(label="Choose Excel or csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.docx'], height=FILE_INPUT_HEIGHT)
606
  with gr.Accordion("Redact open text", open = False):
607
- in_text = gr.Textbox(label="Enter open text", lines=10)
608
 
609
  in_excel_sheets = gr.Dropdown(choices=["Choose Excel sheets to anonymise"], multiselect = True, label="Select Excel sheets that you want to anonymise (showing sheets present across all Excel files).", visible=False, allow_custom_value=True)
610
 
@@ -627,7 +627,7 @@ with app:
627
 
628
 
629
  ###
630
- # TABULAR DUPLICATE DETECTION TAB
631
  ###
632
  with gr.Accordion(label="Find duplicate cells in tabular data", open=False):
633
  gr.Markdown("""Find duplicate cells or rows in CSV, Excel, or Parquet files. This tool analyzes text content across all columns to identify similar or identical entries that may be duplicates. You can review the results and choose to remove duplicate rows from your files.""")
@@ -1355,6 +1355,9 @@ if __name__ == "__main__":
1355
  'deny_list_file': DENY_LIST_PATH,
1356
  'redact_whole_page_file': WHOLE_PAGE_REDACTION_LIST_PATH,
1357
  'handwrite_signature_extraction': DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX,
 
 
 
1358
  'anon_strategy': DEFAULT_TABULAR_ANONYMISATION_STRATEGY,
1359
  'excel_sheets': DEFAULT_EXCEL_SHEETS,
1360
  'fuzzy_mistakes': DEFAULT_FUZZY_SPELLING_MISTAKES_NUM,
@@ -1398,5 +1401,16 @@ if __name__ == "__main__":
1398
  if DEFAULT_TEXT_COLUMNS:
1399
  print(f"Text columns: {DEFAULT_TEXT_COLUMNS}")
1400
  print(f"Remove duplicate rows: {REMOVE_DUPLICATE_ROWS}")
 
 
 
 
 
 
 
 
 
 
 
1401
  # Run the CLI main function with direct mode arguments
1402
  main(direct_mode_args=direct_mode_args)
 
2
  import pandas as pd
3
  import gradio as gr
4
  from gradio_image_annotation import image_annotator
5
+ from tools.config import DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX, DEFAULT_TABULAR_ANONYMISATION_STRATEGY, OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, DEFAULT_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE, OUTPUT_COST_CODES_PATH, ALLOW_LIST_PATH, COGNITO_AUTH, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_ACCESS_LOG_HEADERS, CSV_ACCESS_LOG_HEADERS, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_FEEDBACK_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, USAGE_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_USAGE_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, TEXTRACT_JOBS_S3_INPUT_LOC, TEXTRACT_TEXT_EXTRACT_OPTION, NO_REDACTION_PII_OPTION, TEXT_EXTRACTION_MODELS, PII_DETECTION_MODELS, DEFAULT_TEXT_EXTRACTION_MODEL, DEFAULT_PII_DETECTION_MODEL, LOG_FILE_NAME, CHOSEN_COMPREHEND_ENTITIES, FULL_COMPREHEND_ENTITY_LIST, CHOSEN_REDACT_ENTITIES, FULL_ENTITY_LIST, FILE_INPUT_HEIGHT, TABULAR_PII_DETECTION_MODELS, USAGE_LOG_FILE_NAME, FEEDBACK_LOG_FILE_NAME, CONFIG_FOLDER, GRADIO_TEMP_DIR, MPLCONFIGDIR, S3_FEEDBACK_LOGS_FOLDER, S3_ACCESS_LOGS_FOLDER, S3_USAGE_LOGS_FOLDER, CHOSEN_LOCAL_OCR_MODEL, DEFAULT_LANGUAGE_FULL_NAME, SHOW_LANGUAGE_SELECTION, DO_INITIAL_TABULAR_DATA_CLEAN, DEFAULT_DUPLICATE_DETECTION_THRESHOLD, DIRECT_MODE_TASK, DIRECT_MODE_INPUT_FILE, DIRECT_MODE_OUTPUT_DIR, DIRECT_MODE_DUPLICATE_TYPE, DIRECT_MODE_DEFAULT_USER, 
LOCAL_PII_OPTION, TESSERACT_TEXT_EXTRACT_OPTION, AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION, DOCUMENT_REDACTION_BUCKET, IMAGES_DPI, PREPROCESS_LOCAL_OCR_IMAGES, COMPRESS_REDACTED_PDF, RETURN_PDF_END_OF_REDACTION, DENY_LIST_PATH, WHOLE_PAGE_REDACTION_LIST_PATH, DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX, DEFAULT_TABULAR_ANONYMISATION_STRATEGY, DEFAULT_FUZZY_SPELLING_MISTAKES_NUM, DEFAULT_PAGE_MIN, DEFAULT_PAGE_MAX, DEFAULT_EXCEL_SHEETS, USE_GREEDY_DUPLICATE_DETECTION, DEFAULT_MIN_CONSECUTIVE_PAGES, DEFAULT_COMBINE_PAGES, DEFAULT_MIN_WORD_COUNT, DEFAULT_TEXT_COLUMNS, DEFAULT_SEARCH_QUERY, REMOVE_DUPLICATE_ROWS, HANDWRITE_SIGNATURE_TEXTBOX_FULL_OPTIONS, MAX_OPEN_TEXT_CHARACTERS
6
  from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select, check_for_relevant_ocr_output_with_words, reset_data_vars, reset_aws_call_vars, _get_env_list, ensure_folder_exists, reset_ocr_with_words_base_dataframe, update_language_dropdown, LANGUAGE_CHOICES, MAPPED_LANGUAGE_CHOICES
7
  from tools.aws_functions import download_file_from_s3, upload_log_file_to_s3
8
  from tools.file_redaction import choose_and_run_redactor
 
349
  text_extract_method_radio = gr.Radio(label="""Choose text extraction method. Local options are lower quality but cost nothing - they may be worth a try if you are willing to spend some time reviewing outputs. AWS Textract has a cost per page - £2.66 ($3.50) per 1,000 pages with signature detection (default), £1.14 ($1.50) without. Change the settings in the tab below (AWS Textract signature detection) to change this.""", value = DEFAULT_TEXT_EXTRACTION_MODEL, choices=TEXT_EXTRACTION_MODELS)
350
 
351
  with gr.Accordion("Enable AWS Textract signature detection (default is off)", open = False):
352
+ handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract extraction settings", choices=HANDWRITE_SIGNATURE_TEXTBOX_FULL_OPTIONS, value=DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX)
353
 
354
  with gr.Row(equal_height=True):
355
  pii_identification_method_drop = gr.Radio(label = """Choose personal information detection method. The local model is lower quality but costs nothing - it may be worth a try if you are willing to spend some time reviewing outputs, or if you are only interested in searching for custom search terms (see Redaction settings - custom deny list). AWS Comprehend has a cost of around £0.0075 ($0.01) per 10,000 characters.""", value = DEFAULT_PII_DETECTION_MODEL, choices=PII_DETECTION_MODELS)
 
385
  with gr.Column(scale=2):
386
  textract_job_detail_df = gr.Dataframe(label="Previous job details", visible=True, type="pandas", wrap=True, interactive=True, row_count=(0, 'fixed'), col_count=(5,'fixed'), static_columns=[0,1,2,3,4], max_height=400)
387
  with gr.Column(scale=1):
388
+ job_id_textbox = gr.Textbox(label = "Job ID to check status", value='', visible=True, lines=2)
389
  check_state_of_textract_api_call_btn = gr.Button("Check status of Textract job and download", variant="secondary", visible=True)
390
  with gr.Row():
391
  with gr.Column():
 
604
  with gr.Accordion("Upload docx, xlsx, or csv files", open = True):
605
  in_data_files = gr.File(label="Choose Excel or csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.docx'], height=FILE_INPUT_HEIGHT)
606
  with gr.Accordion("Redact open text", open = False):
607
+ in_text = gr.Textbox(label="Enter open text", lines=10, max_length=MAX_OPEN_TEXT_CHARACTERS)
608
 
609
  in_excel_sheets = gr.Dropdown(choices=["Choose Excel sheets to anonymise"], multiselect = True, label="Select Excel sheets that you want to anonymise (showing sheets present across all Excel files).", visible=False, allow_custom_value=True)
610
 
 
627
 
628
 
629
  ###
630
+ # TABULAR DUPLICATE DETECTION
631
  ###
632
  with gr.Accordion(label="Find duplicate cells in tabular data", open=False):
633
  gr.Markdown("""Find duplicate cells or rows in CSV, Excel, or Parquet files. This tool analyzes text content across all columns to identify similar or identical entries that may be duplicates. You can review the results and choose to remove duplicate rows from your files.""")
 
1355
  'deny_list_file': DENY_LIST_PATH,
1356
  'redact_whole_page_file': WHOLE_PAGE_REDACTION_LIST_PATH,
1357
  'handwrite_signature_extraction': DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX,
1358
+ 'extract_forms': False,
1359
+ 'extract_tables': False,
1360
+ 'extract_layout': False,
1361
  'anon_strategy': DEFAULT_TABULAR_ANONYMISATION_STRATEGY,
1362
  'excel_sheets': DEFAULT_EXCEL_SHEETS,
1363
  'fuzzy_mistakes': DEFAULT_FUZZY_SPELLING_MISTAKES_NUM,
 
1401
  if DEFAULT_TEXT_COLUMNS:
1402
  print(f"Text columns: {DEFAULT_TEXT_COLUMNS}")
1403
  print(f"Remove duplicate rows: {REMOVE_DUPLICATE_ROWS}")
1404
+
1405
+ # Combine extraction options
1406
+ extraction_options = list(direct_mode_args['handwrite_signature_extraction']) if direct_mode_args['handwrite_signature_extraction'] else []
1407
+ if direct_mode_args['extract_forms']:
1408
+ extraction_options.append('Extract forms')
1409
+ if direct_mode_args['extract_tables']:
1410
+ extraction_options.append('Extract tables')
1411
+ if direct_mode_args['extract_layout']:
1412
+ extraction_options.append('Extract layout')
1413
+ direct_mode_args['handwrite_signature_extraction'] = extraction_options
1414
+
1415
  # Run the CLI main function with direct mode arguments
1416
  main(direct_mode_args=direct_mode_args)
cli_redact.py CHANGED
@@ -133,6 +133,9 @@ python cli_redact.py --input_file example_data/example_of_emails_sent_to_a_profe
133
  ## Redact specific pages with AWS OCR and signature extraction:
134
  python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --page_min 6 --page_max 7 --ocr_method "AWS Textract" --handwrite_signature_extraction "Extract handwriting" "Extract signatures"
135
 
 
 
 
136
  # Duplicate page detection
137
 
138
  ## Find duplicate pages in OCR files:
@@ -212,6 +215,9 @@ python cli_redact.py --task textract --textract_action list
212
  pdf_group.add_argument('--allow_list_file', default=ALLOW_LIST_PATH, help='Custom words file to recognize for redaction.')
213
  pdf_group.add_argument('--redact_whole_page_file', default=WHOLE_PAGE_REDACTION_LIST_PATH, help='File for pages to redact completely.')
214
  pdf_group.add_argument('--handwrite_signature_extraction', nargs='+', default=default_handwrite_signature_checkbox, help='Handwriting and signature extraction options. Choose from "Extract handwriting", "Extract signatures".')
 
 
 
215
 
216
  # --- Word/Tabular Anonymisation Arguments ---
217
  tabular_group = parser.add_argument_group('Word/Tabular Anonymisation Options (.docx, .csv, .xlsx)')
@@ -279,6 +285,16 @@ python cli_redact.py --task textract --textract_action list
279
  if args.save_to_user_folders == "True": args.save_to_user_folders = True
280
  else: args.save_to_user_folders = False
281
 
 
 
 
 
 
 
 
 
 
 
282
  if args.task in ['redact', 'deduplicate']:
283
  if args.input_file:
284
  if isinstance(args.input_file, str):
@@ -298,8 +314,6 @@ python cli_redact.py --task textract --textract_action list
298
  except Exception as e:
299
  print(f"Warning: Could not initialise usage logger: {e}")
300
 
301
- print(f"Argument args.save_to_user_folders: {args.save_to_user_folders} will be used to determine if outputs will be saved to user folders.")
302
-
303
  # Get username and folders
304
  session_hash, args.output_dir, _, args.input_dir, args.textract_input_prefix, args.textract_output_prefix, args.s3_textract_document_logs_subfolder, args.local_textract_document_logs_subfolder = get_username_and_folders(username=args.username, output_folder_textbox=args.output_dir, input_folder_textbox=args.input_dir, session_output_folder=args.save_to_user_folders, textract_document_upload_input_folder=args.textract_input_prefix, textract_document_upload_output_folder=args.textract_output_prefix, s3_textract_document_logs_subfolder=args.s3_textract_document_logs_subfolder, local_textract_document_logs_subfolder=args.local_textract_document_logs_subfolder)
305
 
@@ -711,7 +725,7 @@ python cli_redact.py --task textract --textract_action list
711
  s3_bucket_name=textract_bucket,
712
  general_s3_bucket_name=args.s3_bucket,
713
  local_output_dir=args.output_dir,
714
- analyse_signatures=signature_options,
715
  aws_region=args.aws_region
716
  )
717
 
 
133
  ## Redact specific pages with AWS OCR and signature extraction:
134
  python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --page_min 6 --page_max 7 --ocr_method "AWS Textract" --handwrite_signature_extraction "Extract handwriting" "Extract signatures"
135
 
136
+ ## Redact with AWS OCR and additional extraction options:
137
+ python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --ocr_method "AWS Textract" --extract_forms --extract_tables --extract_layout
138
+
139
  # Duplicate page detection
140
 
141
  ## Find duplicate pages in OCR files:
 
215
  pdf_group.add_argument('--allow_list_file', default=ALLOW_LIST_PATH, help='Custom words file to recognize for redaction.')
216
  pdf_group.add_argument('--redact_whole_page_file', default=WHOLE_PAGE_REDACTION_LIST_PATH, help='File for pages to redact completely.')
217
  pdf_group.add_argument('--handwrite_signature_extraction', nargs='+', default=default_handwrite_signature_checkbox, help='Handwriting and signature extraction options. Choose from "Extract handwriting", "Extract signatures".')
218
+ pdf_group.add_argument('--extract_forms', action='store_true', help='Extract forms during Textract analysis.')
219
+ pdf_group.add_argument('--extract_tables', action='store_true', help='Extract tables during Textract analysis.')
220
+ pdf_group.add_argument('--extract_layout', action='store_true', help='Extract layout during Textract analysis.')
221
 
222
  # --- Word/Tabular Anonymisation Arguments ---
223
  tabular_group = parser.add_argument_group('Word/Tabular Anonymisation Options (.docx, .csv, .xlsx)')
 
285
  if args.save_to_user_folders == "True": args.save_to_user_folders = True
286
  else: args.save_to_user_folders = False
287
 
288
+ # Combine extraction options
289
+ extraction_options = list(args.handwrite_signature_extraction) if args.handwrite_signature_extraction else []
290
+ if args.extract_forms:
291
+ extraction_options.append('Extract forms')
292
+ if args.extract_tables:
293
+ extraction_options.append('Extract tables')
294
+ if args.extract_layout:
295
+ extraction_options.append('Extract layout')
296
+ args.handwrite_signature_extraction = extraction_options
297
+
298
  if args.task in ['redact', 'deduplicate']:
299
  if args.input_file:
300
  if isinstance(args.input_file, str):
 
314
  except Exception as e:
315
  print(f"Warning: Could not initialise usage logger: {e}")
316
 
 
 
317
  # Get username and folders
318
  session_hash, args.output_dir, _, args.input_dir, args.textract_input_prefix, args.textract_output_prefix, args.s3_textract_document_logs_subfolder, args.local_textract_document_logs_subfolder = get_username_and_folders(username=args.username, output_folder_textbox=args.output_dir, input_folder_textbox=args.input_dir, session_output_folder=args.save_to_user_folders, textract_document_upload_input_folder=args.textract_input_prefix, textract_document_upload_output_folder=args.textract_output_prefix, s3_textract_document_logs_subfolder=args.s3_textract_document_logs_subfolder, local_textract_document_logs_subfolder=args.local_textract_document_logs_subfolder)
319
 
 
725
  s3_bucket_name=textract_bucket,
726
  general_s3_bucket_name=args.s3_bucket,
727
  local_output_dir=args.output_dir,
728
+ handwrite_signature_checkbox=signature_options,
729
  aws_region=args.aws_region
730
  )
731
 
lambda_entrypoint.py CHANGED
@@ -93,6 +93,9 @@ def lambda_handler(event, context):
93
  'page_min': int(arguments.get('page_min', 0)),
94
  'page_max': int(arguments.get('page_max', 0)),
95
  'handwrite_signature_extraction': arguments.get('handwrite_signature_checkbox', ['Extract handwriting', 'Extract signatures']),
 
 
 
96
 
97
  # General arguments
98
  'local_redact_entities': arguments.get('local_redact_entities', []),
@@ -156,6 +159,16 @@ def lambda_handler(event, context):
156
  'return_pdf_end_of_redaction': arguments.get('return_pdf_end_of_redaction', True)
157
  }
158
 
 
 
 
 
 
 
 
 
 
 
159
  # Download optional files if they are specified
160
  allow_list_key = arguments.get('allow_list_file')
161
  if allow_list_key:
 
93
  'page_min': int(arguments.get('page_min', 0)),
94
  'page_max': int(arguments.get('page_max', 0)),
95
  'handwrite_signature_extraction': arguments.get('handwrite_signature_checkbox', ['Extract handwriting', 'Extract signatures']),
96
+ 'extract_forms': arguments.get('extract_forms', False),
97
+ 'extract_tables': arguments.get('extract_tables', False),
98
+ 'extract_layout': arguments.get('extract_layout', False),
99
 
100
  # General arguments
101
  'local_redact_entities': arguments.get('local_redact_entities', []),
 
159
  'return_pdf_end_of_redaction': arguments.get('return_pdf_end_of_redaction', True)
160
  }
161
 
162
+ # Combine extraction options
163
+ extraction_options = list(cli_args['handwrite_signature_extraction']) if cli_args['handwrite_signature_extraction'] else []
164
+ if cli_args['extract_forms']:
165
+ extraction_options.append('Extract forms')
166
+ if cli_args['extract_tables']:
167
+ extraction_options.append('Extract tables')
168
+ if cli_args['extract_layout']:
169
+ extraction_options.append('Extract layout')
170
+ cli_args['handwrite_signature_extraction'] = extraction_options
171
+
172
  # Download optional files if they are specified
173
  allow_list_key = arguments.get('allow_list_file')
174
  if allow_list_key:
tools/aws_textract.py CHANGED
@@ -7,7 +7,7 @@ import pikepdf
7
  import time
8
  import pandas as pd
9
  from tools.custom_image_analyser_engine import OCRResult, CustomImageRecognizerResult
10
- from tools.config import AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION
11
 
12
  def extract_textract_metadata(response:object):
13
  """Extracts metadata from an AWS Textract response."""
@@ -20,20 +20,69 @@ def extract_textract_metadata(response:object):
20
  'Pages': pages
21
  })
22
 
23
- def analyse_page_with_textract(pdf_page_bytes:object, page_no:int, client:str="", handwrite_signature_checkbox:List[str]=["Extract handwriting", "Redact all identified signatures"]):
24
  '''
25
- Analyse page with AWS Textract
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  '''
27
 
28
- print("handwrite_signature_checkbox in analyse_page_with_textract:", handwrite_signature_checkbox)
29
  if client == "":
30
  try:
31
- if AWS_ACCESS_KEY and AWS_SECRET_KEY:
 
 
 
 
 
32
  client = boto3.client('textract',
33
- aws_access_key_id=AWS_ACCESS_KEY,
34
- aws_secret_access_key=AWS_SECRET_KEY, region_name=AWS_REGION)
35
- else:
 
36
  client = boto3.client('textract', region_name=AWS_REGION)
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  except:
38
  out_message = "Cannot connect to AWS Textract"
39
  print(out_message)
@@ -41,15 +90,24 @@ def analyse_page_with_textract(pdf_page_bytes:object, page_no:int, client:str=""
41
  return [], "" # Return an empty list and an empty string
42
 
43
  # Redact signatures if specified
44
- if "Redact all identified signatures" in handwrite_signature_checkbox:
45
- #print("Analysing document with signature detection")
 
 
 
 
 
 
 
46
  try:
47
- response = client.analyze_document(Document={'Bytes': pdf_page_bytes}, FeatureTypes=["SIGNATURES"])
48
  except Exception as e:
49
  print("Textract call failed due to:", e, "trying again in 3 seconds.")
50
  time.sleep(3)
51
- response = client.analyze_document(Document={'Bytes': pdf_page_bytes}, FeatureTypes=["SIGNATURES"])
52
- else:
 
 
53
  # Call detect_document_text to extract plain text
54
  try:
55
  response = client.detect_document_text(Document={'Bytes': pdf_page_bytes})
@@ -98,16 +156,33 @@ def convert_pike_pdf_page_to_bytes(pdf:object, page_num:int):
98
 
99
  def json_to_ocrresult(json_data:dict, page_width:float, page_height:float, page_no:int):
100
  '''
101
- Convert the json response from textract to the OCRResult format used elsewhere in the code. Looks for lines, words, and signatures. Handwriting and signatures are set aside especially for later in case the user wants to override the default behaviour and redact all handwriting/signatures.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  '''
103
- all_ocr_results = []
104
- signature_or_handwriting_recogniser_results = []
105
- signature_recogniser_results = []
106
- handwriting_recogniser_results = []
107
- signatures = []
108
- handwriting = []
109
- ocr_results_with_words = {}
110
- text_block={}
111
 
112
  text_line_number = 1
113
 
 
7
  import time
8
  import pandas as pd
9
  from tools.custom_image_analyser_engine import OCRResult, CustomImageRecognizerResult
10
+ from tools.config import AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION, RUN_AWS_FUNCTIONS, PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS
11
 
12
  def extract_textract_metadata(response:object):
13
  """Extracts metadata from an AWS Textract response."""
 
20
  'Pages': pages
21
  })
22
 
23
+ def analyse_page_with_textract(pdf_page_bytes:object, page_no:int, client:str="", handwrite_signature_checkbox:List[str]=["Extract handwriting"], textract_output_found:bool=False, aws_access_key_textbox:str=AWS_ACCESS_KEY, aws_secret_key_textbox:str=AWS_SECRET_KEY, RUN_AWS_FUNCTIONS:str=RUN_AWS_FUNCTIONS, PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS:str=PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS):
24
  '''
25
+ Analyzes a single page of a document using AWS Textract to extract text and other features.
26
+
27
+ Args:
28
+ pdf_page_bytes (object): The content of the PDF page or image as bytes.
29
+ page_no (int): The page number being analyzed.
30
+ client (str, optional): An optional pre-initialized AWS Textract client. If not provided,
31
+ the function will attempt to create one based on configuration.
32
+ Defaults to "".
33
+ handwrite_signature_checkbox (List[str], optional): A list of feature types to extract
34
+ from the document. Options include
35
+ "Extract handwriting", "Extract signatures",
36
+ "Extract forms", "Extract layout", "Extract tables".
37
+ Defaults to ["Extract handwriting"].
38
+ textract_output_found (bool, optional): A flag indicating whether existing Textract output
39
+ for the document has been found. This can prevent
40
+ unnecessary API calls. Defaults to False.
41
+ aws_access_key_textbox (str, optional): AWS access key provided by the user, if not using
42
+ SSO or environment variables. Defaults to AWS_ACCESS_KEY.
43
+ aws_secret_key_textbox (str, optional): AWS secret key provided by the user, if not using
44
+ SSO or environment variables. Defaults to AWS_SECRET_KEY.
45
+ RUN_AWS_FUNCTIONS (str, optional): Configuration flag (e.g., "1" or "0") to enable or
46
+ disable AWS functions. Defaults to RUN_AWS_FUNCTIONS.
47
+ PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS (str, optional): Configuration flag (e.g., "1" or "0")
48
+ to prioritize AWS SSO credentials
49
+ over environment variables.
50
+ Defaults to PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS.
51
+
52
+ Returns:
53
+ Tuple[List[Dict], str]: A tuple containing:
54
+ - A list of dictionaries, where each dictionary represents a Textract block (e.g., LINE, WORD, FORM, TABLE).
55
+ - A string containing metadata about the Textract request.
56
  '''
57
 
58
+ #print("handwrite_signature_checkbox in analyse_page_with_textract:", handwrite_signature_checkbox)
59
  if client == "":
60
  try:
61
+ # Try to connect to AWS Textract Client if using that text extraction method
62
+ if RUN_AWS_FUNCTIONS == "1" and PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS == "1":
63
+ print("Connecting to Textract via existing SSO connection")
64
+ client = boto3.client('textract', region_name=AWS_REGION)
65
+ elif aws_access_key_textbox and aws_secret_key_textbox:
66
+ print("Connecting to Textract using AWS access key and secret keys from user input.")
67
  client = boto3.client('textract',
68
+ aws_access_key_id=aws_access_key_textbox,
69
+ aws_secret_access_key=aws_secret_key_textbox, region_name=AWS_REGION)
70
+ elif RUN_AWS_FUNCTIONS == "1":
71
+ print("Connecting to Textract via existing SSO connection")
72
  client = boto3.client('textract', region_name=AWS_REGION)
73
+ elif AWS_ACCESS_KEY and AWS_SECRET_KEY:
74
+ print("Getting Textract credentials from environment variables.")
75
+ client = boto3.client('textract',
76
+ aws_access_key_id=AWS_ACCESS_KEY,
77
+ aws_secret_access_key=AWS_SECRET_KEY, region_name=AWS_REGION)
78
+ elif textract_output_found==True:
79
+ print("Existing Textract data found for file, no need to connect to AWS Textract")
80
+ client = boto3.client('textract', region_name=AWS_REGION)
81
+ else:
82
+ client = ""
83
+ out_message = "Cannot connect to AWS Textract service."
84
+ print(out_message)
85
+ raise Exception(out_message)
86
  except:
87
  out_message = "Cannot connect to AWS Textract"
88
  print(out_message)
 
90
  return [], "" # Return an empty list and an empty string
91
 
92
  # Redact signatures if specified
93
+ feature_types = []
94
+ if "Extract signatures" in handwrite_signature_checkbox or "Extract forms" in handwrite_signature_checkbox or "Extract layout" in handwrite_signature_checkbox or "Extract tables" in handwrite_signature_checkbox:
95
+ feature_types.append("SIGNATURES")
96
+ if "Extract forms" in handwrite_signature_checkbox:
97
+ feature_types.append("FORMS")
98
+ if "Extract layout" in handwrite_signature_checkbox:
99
+ feature_types.append("LAYOUT")
100
+ if "Extract tables" in handwrite_signature_checkbox:
101
+ feature_types.append("TABLES")
102
  try:
103
+ response = client.analyze_document(Document={'Bytes': pdf_page_bytes}, FeatureTypes=feature_types)
104
  except Exception as e:
105
  print("Textract call failed due to:", e, "trying again in 3 seconds.")
106
  time.sleep(3)
107
+ response = client.analyze_document(Document={'Bytes': pdf_page_bytes}, FeatureTypes=feature_types)
108
+
109
+
110
+ if not "Extract signatures" in handwrite_signature_checkbox and not "Extract forms" in handwrite_signature_checkbox and not "Extract layout" in handwrite_signature_checkbox and not "Extract tables" in handwrite_signature_checkbox:
111
  # Call detect_document_text to extract plain text
112
  try:
113
  response = client.detect_document_text(Document={'Bytes': pdf_page_bytes})
 
156
 
157
  def json_to_ocrresult(json_data:dict, page_width:float, page_height:float, page_no:int):
158
  '''
159
+ Convert the json response from Textract to the OCRResult format used elsewhere in the code.
160
+ Looks for lines, words, and signatures. Handwriting and signatures are set aside especially
161
+ for later in case the user wants to override the default behaviour and redact all
162
+ handwriting/signatures.
163
+
164
+ Args:
165
+ json_data (dict): The raw JSON response from AWS Textract for a document or page.
166
+ page_width (float): The absolute width of the page in pixels.
167
+ page_height (float): The absolute height of the page in pixels.
168
+ page_no (int): The 1-based page number being processed.
169
+
170
+ Returns:
171
+ tuple: A tuple containing:
172
+ - dict: OCR results structured as an OCRResult object (containing 'page' and 'results' list).
173
+ - list: Bounding boxes identified as handwriting or signatures.
174
+ - list: Bounding boxes identified specifically as signatures.
175
+ - list: Bounding boxes identified specifically as handwriting.
176
+ - dict: OCR results with word-level detail, structured for further processing.
177
  '''
178
+ all_ocr_results = list()
179
+ signature_or_handwriting_recogniser_results = list()
180
+ signature_recogniser_results = list()
181
+ handwriting_recogniser_results = list()
182
+ signatures = list()
183
+ handwriting = list()
184
+ ocr_results_with_words = dict()
185
+ text_block=dict()
186
 
187
  text_line_number = 1
188
 
tools/config.py CHANGED
@@ -5,10 +5,19 @@ import logging
5
  from datetime import datetime
6
  from dotenv import load_dotenv
7
  from tldextract import TLDExtract
 
8
 
9
  today_rev = datetime.now().strftime("%Y%m%d")
10
  HOST_NAME = socket.gethostname()
11
 
 
 
 
 
 
 
 
 
12
  # Set or retrieve configuration variables for the redaction app
13
 
14
  def get_or_create_env_var(var_name:str, default_value:str, print_val:bool=False):
@@ -297,6 +306,22 @@ CUSTOM_ENTITIES = get_or_create_env_var('CUSTOM_ENTITIES', "['TITLES', 'UKPOSTCO
297
 
298
  DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX = get_or_create_env_var('DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX', "['Extract handwriting']")
299
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
300
  DEFAULT_SEARCH_QUERY = get_or_create_env_var('DEFAULT_SEARCH_QUERY', '')
301
  DEFAULT_FUZZY_SPELLING_MISTAKES_NUM = int(get_or_create_env_var('DEFAULT_FUZZY_SPELLING_MISTAKES_NUM', '1'))
302
 
@@ -309,6 +334,11 @@ DEFAULT_PAGE_MAX = int(get_or_create_env_var('DEFAULT_PAGE_MAX', '999'))
309
  PAGE_BREAK_VALUE = int(get_or_create_env_var('PAGE_BREAK_VALUE', '99999'))
310
 
311
  MAX_TIME_VALUE = int(get_or_create_env_var('MAX_TIME_VALUE', '999999'))
 
 
 
 
 
312
 
313
  CUSTOM_BOX_COLOUR = get_or_create_env_var("CUSTOM_BOX_COLOUR", "") # only "grey" is currently supported as a custom box colour
314
 
 
5
  from datetime import datetime
6
  from dotenv import load_dotenv
7
  from tldextract import TLDExtract
8
+ from typing import List
9
 
10
  today_rev = datetime.now().strftime("%Y%m%d")
11
  HOST_NAME = socket.gethostname()
12
 
13
+ def _get_env_list(env_var_name: str) -> List[str]:
14
+ """Parses a comma-separated environment variable into a list of strings."""
15
+ value = env_var_name[1:-1].strip().replace('\"', '').replace("\'","")
16
+ if not value:
17
+ return []
18
+ # Split by comma and filter out any empty strings that might result from extra commas
19
+ return [s.strip() for s in value.split(',') if s.strip()]
20
+
21
  # Set or retrieve configuration variables for the redaction app
22
 
23
  def get_or_create_env_var(var_name:str, default_value:str, print_val:bool=False):
 
306
 
307
  DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX = get_or_create_env_var('DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX', "['Extract handwriting']")
308
 
309
HANDWRITE_SIGNATURE_TEXTBOX_FULL_OPTIONS = get_or_create_env_var('HANDWRITE_SIGNATURE_TEXTBOX_FULL_OPTIONS', "['Extract handwriting', 'Extract signatures']")

# Convert the raw env-var string into a Python list of option labels.
if HANDWRITE_SIGNATURE_TEXTBOX_FULL_OPTIONS:
    HANDWRITE_SIGNATURE_TEXTBOX_FULL_OPTIONS = _get_env_list(HANDWRITE_SIGNATURE_TEXTBOX_FULL_OPTIONS)

INCLUDE_FORM_EXTRACTION_TEXTRACT_OPTION = get_or_create_env_var('INCLUDE_FORM_EXTRACTION_TEXTRACT_OPTION', "False")
INCLUDE_LAYOUT_EXTRACTION_TEXTRACT_OPTION = get_or_create_env_var('INCLUDE_LAYOUT_EXTRACTION_TEXTRACT_OPTION', "False")
INCLUDE_TABLE_EXTRACTION_TEXTRACT_OPTION = get_or_create_env_var('INCLUDE_TABLE_EXTRACTION_TEXTRACT_OPTION', "False")

# Append each optional Textract feature to the visible checkbox options
# when its config flag is enabled.
_OPTIONAL_TEXTRACT_FEATURES = (
    (INCLUDE_FORM_EXTRACTION_TEXTRACT_OPTION, 'Extract forms'),
    (INCLUDE_LAYOUT_EXTRACTION_TEXTRACT_OPTION, 'Extract layout'),
    (INCLUDE_TABLE_EXTRACTION_TEXTRACT_OPTION, 'Extract tables'),
)
for _flag, _option_label in _OPTIONAL_TEXTRACT_FEATURES:
    if _flag == "True":
        HANDWRITE_SIGNATURE_TEXTBOX_FULL_OPTIONS.append(_option_label)
323
+
324
+
325
  DEFAULT_SEARCH_QUERY = get_or_create_env_var('DEFAULT_SEARCH_QUERY', '')
326
  DEFAULT_FUZZY_SPELLING_MISTAKES_NUM = int(get_or_create_env_var('DEFAULT_FUZZY_SPELLING_MISTAKES_NUM', '1'))
327
 
 
334
  PAGE_BREAK_VALUE = int(get_or_create_env_var('PAGE_BREAK_VALUE', '99999'))
335
 
336
  MAX_TIME_VALUE = int(get_or_create_env_var('MAX_TIME_VALUE', '999999'))
337
# Upper bounds on workload size; each is overridable via an environment variable.
MAX_SIMULTANEOUS_FILES = int(get_or_create_env_var('MAX_SIMULTANEOUS_FILES', '10'))  # max number of files accepted in one submission
MAX_DOC_PAGES = int(get_or_create_env_var('MAX_DOC_PAGES', '3000'))  # max pages allowed in a single document
MAX_TABLE_ROWS = int(get_or_create_env_var('MAX_TABLE_ROWS', '250000'))  # max rows allowed in a tabular input
MAX_TABLE_COLUMNS = int(get_or_create_env_var('MAX_TABLE_COLUMNS', '100'))  # max columns allowed in a tabular input
MAX_OPEN_TEXT_CHARACTERS = int(get_or_create_env_var('MAX_OPEN_TEXT_CHARACTERS', '50000'))  # max characters for free-text input
342
 
343
  CUSTOM_BOX_COLOUR = get_or_create_env_var("CUSTOM_BOX_COLOUR", "") # only "grey" is currently supported as a custom box colour
344
 
tools/data_anonymise.py CHANGED
@@ -18,7 +18,7 @@ from botocore.client import BaseClient
18
  from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
19
  from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, DictAnalyzerResult, RecognizerResult
20
  from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
21
- from tools.config import RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, OUTPUT_FOLDER, DEFAULT_LANGUAGE, aws_comprehend_language_choices, DO_INITIAL_TABULAR_DATA_CLEAN, CUSTOM_ENTITIES, PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS, AWS_REGION
22
  from tools.helper_functions import get_file_name_without_type, read_file, detect_file_type, _get_env_list
23
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_word_list_recogniser, CustomWordFuzzyRecognizer, create_nlp_analyser, load_spacy_model
24
  # Use custom version of analyze_dict to be able to track progress
@@ -261,6 +261,13 @@ def handle_docx_anonymisation(
261
  text_elements = list() # This will store the actual docx objects (paragraphs, cells)
262
  original_texts = list() # This will store the text from those objects
263
 
 
 
 
 
 
 
 
264
  # Extract from paragraphs
265
  for para in doc.paragraphs:
266
  if para.text.strip(): # Only process non-empty paragraphs
@@ -464,6 +471,14 @@ def anonymise_files_with_open_text(file_paths: List[str],
464
  else:
465
  out_message = "Please enter text or a file to redact."
466
  raise Exception(out_message)
 
 
 
 
 
 
 
 
467
 
468
  # If we have already redacted the last file, return the input out_message and file list to the relevant components
469
  if latest_file_completed >= len(file_paths):
@@ -527,9 +542,7 @@ def anonymise_files_with_open_text(file_paths: List[str],
527
 
528
  # Create xlsx file:
529
  anon_xlsx = pd.ExcelFile(file_path)
530
- anon_xlsx_export_file_name = output_folder + out_file_part + "_redacted.xlsx"
531
-
532
-
533
 
534
  # Iterate through the sheet names
535
  for sheet_name in progress.tqdm(in_excel_sheets, desc="Anonymising sheets", unit = "sheets"):
@@ -675,7 +688,20 @@ def tabular_anonymise_wrapper_func(
675
  anon_df_part = anon_df[chosen_cols_in_anon_df]
676
  anon_df_remain = anon_df.drop(chosen_cols_in_anon_df, axis = 1)
677
 
678
-
 
 
 
 
 
 
 
 
 
 
 
 
 
679
  # Anonymise the selected columns
680
  anon_df_part_out, key_string, decision_process_output_str, comprehend_query_number = anonymise_script(anon_df_part, anon_strategy, language, chosen_redact_entities, in_allow_list, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, nlp_analyser=nlp_analyser, do_initial_clean=do_initial_clean)
681
 
 
18
  from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
19
  from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, DictAnalyzerResult, RecognizerResult
20
  from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
21
+ from tools.config import RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, OUTPUT_FOLDER, DEFAULT_LANGUAGE, aws_comprehend_language_choices, DO_INITIAL_TABULAR_DATA_CLEAN, CUSTOM_ENTITIES, PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS, AWS_REGION, MAX_TABLE_ROWS, MAX_TABLE_COLUMNS, MAX_SIMULTANEOUS_FILES
22
  from tools.helper_functions import get_file_name_without_type, read_file, detect_file_type, _get_env_list
23
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_word_list_recogniser, CustomWordFuzzyRecognizer, create_nlp_analyser, load_spacy_model
24
  # Use custom version of analyze_dict to be able to track progress
 
261
  text_elements = list() # This will store the actual docx objects (paragraphs, cells)
262
  original_texts = list() # This will store the text from those objects
263
 
264
+ paragraph_count = len(doc.paragraphs)
265
+
266
+ if paragraph_count > MAX_TABLE_ROWS:
267
+ out_message = f"Number of paragraphs in document is greater than {MAX_TABLE_ROWS}. Please submit a smaller document."
268
+ print(out_message)
269
+ raise Exception(out_message)
270
+
271
  # Extract from paragraphs
272
  for para in doc.paragraphs:
273
  if para.text.strip(): # Only process non-empty paragraphs
 
471
  else:
472
  out_message = "Please enter text or a file to redact."
473
  raise Exception(out_message)
474
+
475
+ if not isinstance(file_paths, list):
476
+ file_paths = [file_paths]
477
+
478
+ if len(file_paths) > MAX_SIMULTANEOUS_FILES:
479
+ out_message = f"Number of files to anonymise is greater than {MAX_SIMULTANEOUS_FILES}. Please submit a smaller number of files."
480
+ print(out_message)
481
+ raise Exception(out_message)
482
 
483
  # If we have already redacted the last file, return the input out_message and file list to the relevant components
484
  if latest_file_completed >= len(file_paths):
 
542
 
543
  # Create xlsx file:
544
  anon_xlsx = pd.ExcelFile(file_path)
545
+ anon_xlsx_export_file_name = output_folder + out_file_part + "_redacted.xlsx"
 
 
546
 
547
  # Iterate through the sheet names
548
  for sheet_name in progress.tqdm(in_excel_sheets, desc="Anonymising sheets", unit = "sheets"):
 
688
  anon_df_part = anon_df[chosen_cols_in_anon_df]
689
  anon_df_remain = anon_df.drop(chosen_cols_in_anon_df, axis = 1)
690
 
691
+ row_count = anon_df_part.shape[0]
692
+
693
+ if row_count > MAX_TABLE_ROWS:
694
+ out_message = f"Number of rows in dataframe is greater than {MAX_TABLE_ROWS}. Please submit a smaller dataframe."
695
+ print(out_message)
696
+ raise Exception(out_message)
697
+
698
+ column_count = anon_df_part.shape[1]
699
+
700
+ if column_count > MAX_TABLE_COLUMNS:
701
+ out_message = f"Number of columns in dataframe is greater than {MAX_TABLE_COLUMNS}. Please submit a smaller dataframe."
702
+ print(out_message)
703
+ raise Exception(out_message)
704
+
705
  # Anonymise the selected columns
706
  anon_df_part_out, key_string, decision_process_output_str, comprehend_query_number = anonymise_script(anon_df_part, anon_strategy, language, chosen_redact_entities, in_allow_list, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, nlp_analyser=nlp_analyser, do_initial_clean=do_initial_clean)
707
 
tools/file_redaction.py CHANGED
@@ -20,7 +20,7 @@ import gradio as gr
20
  from gradio import Progress
21
  from collections import defaultdict # For efficient grouping
22
 
23
- from tools.config import OUTPUT_FOLDER, IMAGES_DPI, MAX_IMAGE_PIXELS, RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION, PAGE_BREAK_VALUE, MAX_TIME_VALUE, LOAD_TRUNCATED_IMAGES, INPUT_FOLDER, RETURN_PDF_END_OF_REDACTION, TESSERACT_TEXT_EXTRACT_OPTION, SELECTABLE_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION, LOCAL_PII_OPTION, AWS_PII_OPTION, NO_REDACTION_PII_OPTION, DEFAULT_LANGUAGE, textract_language_choices, aws_comprehend_language_choices, CUSTOM_ENTITIES, PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS
24
  from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult, run_page_text_redaction, recreate_page_line_level_ocr_results_with_page
25
  from tools.file_conversion import convert_annotation_json_to_review_df, redact_whole_pymupdf_page, redact_single_box, is_pdf, is_pdf_or_image, prepare_image_or_pdf, divide_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, divide_coordinates_by_page_sizes, create_annotation_dicts_from_annotation_df, remove_duplicate_images_with_blank_boxes, fill_missing_ids, fill_missing_box_ids, load_and_convert_ocr_results_with_words_json, save_pdf_with_or_without_compression, word_level_ocr_output_to_dataframe
26
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_recogniser, custom_word_list_recogniser, CustomWordFuzzyRecognizer, load_spacy_model, download_tesseract_lang_pack, create_nlp_analyser
@@ -106,7 +106,7 @@ def choose_and_run_redactor(file_paths:List[str],
106
  page_min:int=0,
107
  page_max:int=999,
108
  estimated_time_taken_state:float=0.0,
109
- handwrite_signature_checkbox:List[str]=list(["Extract handwriting", "Extract signatures"]),
110
  all_request_metadata_str:str = "",
111
  annotations_all_pages:List[dict]=list(),
112
  all_page_line_level_ocr_results_df:pd.DataFrame=None,
@@ -273,6 +273,11 @@ def choose_and_run_redactor(file_paths:List[str],
273
  file_paths_list = [os.path.abspath(file_paths)]
274
  else: file_paths_list = file_paths
275
 
 
 
 
 
 
276
  valid_extensions = {".pdf", ".jpg", ".jpeg", ".png"}
277
  # Filter only files with valid extensions. Currently only allowing one file to be redacted at a time
278
  # Filter the file_paths_list to include only files with valid extensions
@@ -374,7 +379,12 @@ def choose_and_run_redactor(file_paths:List[str],
374
 
375
  page_sizes = page_sizes_df.to_dict(orient="records")
376
 
377
- number_of_pages = pymupdf_doc.page_count
 
 
 
 
 
378
 
379
  # If we have reached the last page, return message and outputs
380
  if current_loop_page >= number_of_pages:
 
20
  from gradio import Progress
21
  from collections import defaultdict # For efficient grouping
22
 
23
+ from tools.config import OUTPUT_FOLDER, IMAGES_DPI, MAX_IMAGE_PIXELS, RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION, PAGE_BREAK_VALUE, MAX_TIME_VALUE, LOAD_TRUNCATED_IMAGES, INPUT_FOLDER, RETURN_PDF_END_OF_REDACTION, TESSERACT_TEXT_EXTRACT_OPTION, SELECTABLE_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION, LOCAL_PII_OPTION, AWS_PII_OPTION, NO_REDACTION_PII_OPTION, DEFAULT_LANGUAGE, textract_language_choices, aws_comprehend_language_choices, CUSTOM_ENTITIES, PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS, MAX_DOC_PAGES, MAX_SIMULTANEOUS_FILES
24
  from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult, run_page_text_redaction, recreate_page_line_level_ocr_results_with_page
25
  from tools.file_conversion import convert_annotation_json_to_review_df, redact_whole_pymupdf_page, redact_single_box, is_pdf, is_pdf_or_image, prepare_image_or_pdf, divide_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, divide_coordinates_by_page_sizes, create_annotation_dicts_from_annotation_df, remove_duplicate_images_with_blank_boxes, fill_missing_ids, fill_missing_box_ids, load_and_convert_ocr_results_with_words_json, save_pdf_with_or_without_compression, word_level_ocr_output_to_dataframe
26
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_recogniser, custom_word_list_recogniser, CustomWordFuzzyRecognizer, load_spacy_model, download_tesseract_lang_pack, create_nlp_analyser
 
106
  page_min:int=0,
107
  page_max:int=999,
108
  estimated_time_taken_state:float=0.0,
109
+ handwrite_signature_checkbox:List[str]=list(["Extract handwriting"]),
110
  all_request_metadata_str:str = "",
111
  annotations_all_pages:List[dict]=list(),
112
  all_page_line_level_ocr_results_df:pd.DataFrame=None,
 
273
  file_paths_list = [os.path.abspath(file_paths)]
274
  else: file_paths_list = file_paths
275
 
276
+ if len(file_paths_list) > MAX_SIMULTANEOUS_FILES:
277
+ out_message = f"Number of files to redact is greater than {MAX_SIMULTANEOUS_FILES}. Please submit a smaller number of files."
278
+ print(out_message)
279
+ raise Exception(out_message)
280
+
281
  valid_extensions = {".pdf", ".jpg", ".jpeg", ".png"}
282
  # Filter only files with valid extensions. Currently only allowing one file to be redacted at a time
283
  # Filter the file_paths_list to include only files with valid extensions
 
379
 
380
  page_sizes = page_sizes_df.to_dict(orient="records")
381
 
382
+ number_of_pages = pymupdf_doc.page_count
383
+
384
+ if number_of_pages > MAX_DOC_PAGES:
385
+ out_message = f"Number of pages in document is greater than {MAX_DOC_PAGES}. Please submit a smaller document."
386
+ print(out_message)
387
+ raise Exception(out_message)
388
 
389
  # If we have reached the last page, return message and outputs
390
  if current_loop_page >= number_of_pages:
tools/textract_batch_call.py CHANGED
@@ -25,7 +25,7 @@ def analyse_document_with_textract_api(
25
  job_df:pd.DataFrame,
26
  s3_bucket_name: str = TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET,
27
  local_output_dir: str = OUTPUT_FOLDER,
28
- analyse_signatures:List[str] = [],
29
  successful_job_number:int=0,
30
  total_document_page_count:int=1,
31
  general_s3_bucket_name: str = DOCUMENT_REDACTION_BUCKET,
@@ -43,7 +43,7 @@ def analyse_document_with_textract_api(
43
  job_df (pd.DataFrame): Dataframe containing information from previous Textract API calls.
44
  s3_bucket_name (str, optional): S3 bucket in which to save API call outputs.
45
  local_output_dir (str, optional): Local directory to save the downloaded JSON results.
46
- analyse_signatures (List[str], optional): Analyse signatures? Default is no.
47
  successful_job_number (int): The number of successful jobs that have been submitted in this session.
48
  total_document_page_count (int): The number of pages in the document
49
  aws_region (str, optional): AWS region name. Defaults to boto3 default region.
@@ -122,10 +122,10 @@ def analyse_document_with_textract_api(
122
  if not job_df.empty:
123
 
124
  if "file_name" in job_df.columns:
125
- matching_job_id_file_names = job_df.loc[(job_df["file_name"] == pdf_filename) & (job_df["signature_extraction"].astype(str) == str(analyse_signatures)), "file_name"]
126
- matching_job_id_file_names_dates = job_df.loc[(job_df["file_name"] == pdf_filename) & (job_df["signature_extraction"].astype(str) == str(analyse_signatures)), "job_date_time"]
127
- matching_job_id = job_df.loc[(job_df["file_name"] == pdf_filename) & (job_df["signature_extraction"].astype(str) == str(analyse_signatures)), "job_id"]
128
- matching_handwrite_signature = job_df.loc[(job_df["file_name"] == pdf_filename) & (job_df["signature_extraction"].astype(str) == str(analyse_signatures)), "signature_extraction"]
129
 
130
  if len(matching_job_id) > 0:
131
  pass
@@ -142,7 +142,16 @@ def analyse_document_with_textract_api(
142
  print(message)
143
 
144
  try:
145
- if "Extract signatures" in analyse_signatures:
 
 
 
 
 
 
 
 
 
146
  response = textract_client.start_document_analysis(
147
  DocumentLocation={
148
  'S3Object': {
@@ -150,20 +159,15 @@ def analyse_document_with_textract_api(
150
  'Name': s3_input_key
151
  }
152
  },
153
- FeatureTypes=['SIGNATURES'], # Analyze for signatures, forms, and tables
154
  OutputConfig={
155
  'S3Bucket': s3_bucket_name,
156
  'S3Prefix': s3_output_prefix
157
  }
158
- # Optional: Add NotificationChannel for SNS topic notifications
159
- # NotificationChannel={
160
- # 'SNSTopicArn': 'YOUR_SNS_TOPIC_ARN',
161
- # 'RoleArn': 'YOUR_IAM_ROLE_ARN_FOR_TEXTRACT_TO_ACCESS_SNS'
162
- # }
163
  )
164
  job_type="document_analysis"
165
 
166
- else:
167
  response = textract_client.start_document_text_detection(
168
  DocumentLocation={
169
  'S3Object': {
@@ -190,7 +194,7 @@ def analyse_document_with_textract_api(
190
  'job_id': job_id,
191
  'file_name': pdf_filename,
192
  'job_type': job_type,
193
- 'signature_extraction':analyse_signatures,
194
  'job_date_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
195
  }])
196
 
@@ -236,7 +240,21 @@ def return_job_status(job_id:str,
236
  max_polling_attempts: int = 1 # ~10 minutes total wait time
237
  ):
238
  '''
239
- Poll Textract for the current status of a previously-submitted job.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
240
  '''
241
 
242
  job_status = response['JobStatus']
 
25
  job_df:pd.DataFrame,
26
  s3_bucket_name: str = TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET,
27
  local_output_dir: str = OUTPUT_FOLDER,
28
+ handwrite_signature_checkbox:List[str] = list(),
29
  successful_job_number:int=0,
30
  total_document_page_count:int=1,
31
  general_s3_bucket_name: str = DOCUMENT_REDACTION_BUCKET,
 
43
  job_df (pd.DataFrame): Dataframe containing information from previous Textract API calls.
44
  s3_bucket_name (str, optional): S3 bucket in which to save API call outputs.
45
  local_output_dir (str, optional): Local directory to save the downloaded JSON results.
46
+ handwrite_signature_checkbox (List[str], optional): List of feature types to extract from the document.
47
  successful_job_number (int): The number of successful jobs that have been submitted in this session.
48
  total_document_page_count (int): The number of pages in the document
49
  aws_region (str, optional): AWS region name. Defaults to boto3 default region.
 
122
  if not job_df.empty:
123
 
124
  if "file_name" in job_df.columns:
125
+ matching_job_id_file_names = job_df.loc[(job_df["file_name"] == pdf_filename) & (job_df["signature_extraction"].astype(str) == str(handwrite_signature_checkbox)), "file_name"]
126
+ matching_job_id_file_names_dates = job_df.loc[(job_df["file_name"] == pdf_filename) & (job_df["signature_extraction"].astype(str) == str(handwrite_signature_checkbox)), "job_date_time"]
127
+ matching_job_id = job_df.loc[(job_df["file_name"] == pdf_filename) & (job_df["signature_extraction"].astype(str) == str(handwrite_signature_checkbox)), "job_id"]
128
+ matching_handwrite_signature = job_df.loc[(job_df["file_name"] == pdf_filename) & (job_df["signature_extraction"].astype(str) == str(handwrite_signature_checkbox)), "signature_extraction"]
129
 
130
  if len(matching_job_id) > 0:
131
  pass
 
142
  print(message)
143
 
144
  try:
145
+ if "Extract signatures" in handwrite_signature_checkbox or "Extract forms" in handwrite_signature_checkbox or "Extract layout" in handwrite_signature_checkbox or "Extract tables" in handwrite_signature_checkbox:
146
+ feature_types = list()
147
+ if 'Extract signatures' in handwrite_signature_checkbox:
148
+ feature_types.append('SIGNATURES')
149
+ if "Extract forms" in handwrite_signature_checkbox:
150
+ feature_types.append('FORMS')
151
+ if "Extract layout" in handwrite_signature_checkbox:
152
+ feature_types.append('LAYOUT')
153
+ if "Extract tables" in handwrite_signature_checkbox:
154
+ feature_types.append('TABLES')
155
  response = textract_client.start_document_analysis(
156
  DocumentLocation={
157
  'S3Object': {
 
159
  'Name': s3_input_key
160
  }
161
  },
162
+ FeatureTypes=feature_types, # Analyze only for the feature types selected by the user
163
  OutputConfig={
164
  'S3Bucket': s3_bucket_name,
165
  'S3Prefix': s3_output_prefix
166
  }
 
 
 
 
 
167
  )
168
  job_type="document_analysis"
169
 
170
+ if not "Extract signatures" in handwrite_signature_checkbox and not "Extract forms" in handwrite_signature_checkbox and not "Extract layout" in handwrite_signature_checkbox and not "Extract tables" in handwrite_signature_checkbox:
171
  response = textract_client.start_document_text_detection(
172
  DocumentLocation={
173
  'S3Object': {
 
194
  'job_id': job_id,
195
  'file_name': pdf_filename,
196
  'job_type': job_type,
197
+ 'signature_extraction':handwrite_signature_checkbox,
198
  'job_date_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
199
  }])
200
 
 
240
  max_polling_attempts: int = 1 # ~10 minutes total wait time
241
  ):
242
  '''
243
+ Polls the AWS Textract service to retrieve the current status of an asynchronous document analysis job.
244
+ This function checks the job status from the provided response and logs relevant information or errors.
245
+
246
+ Args:
247
+ job_id (str): The unique identifier of the Textract job.
248
+ response (dict): The response dictionary received from Textract's `get_document_analysis` or `get_document_text_detection` call.
249
+ attempts (int): The current polling attempt number.
250
+ poll_interval_seconds (int, optional): The time in seconds to wait before the next poll (currently unused in this function, but kept for context). Defaults to 0.
251
+ max_polling_attempts (int, optional): The maximum number of polling attempts allowed (currently unused in this function, but kept for context). Defaults to 1.
252
+
253
+ Returns:
254
+ str: The current status of the Textract job (e.g., 'IN_PROGRESS', 'SUCCEEDED').
255
+
256
+ Raises:
257
+ Exception: If the Textract job status is 'FAILED' or 'PARTIAL_SUCCESS', or if an unexpected status is encountered.
258
  '''
259
 
260
  job_status = response['JobStatus']