seanpedrickcase commited on
Commit
d3e6a24
·
1 Parent(s): d60759d

Added form, table, and layout extraction options to AWS Textract calls. Added options to config to bound document length, maximum table rows, etc.

Browse files
app.py CHANGED
@@ -2,7 +2,7 @@ import os
2
  import pandas as pd
3
  import gradio as gr
4
  from gradio_image_annotation import image_annotator
5
- from tools.config import DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX, DEFAULT_TABULAR_ANONYMISATION_STRATEGY, OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, DEFAULT_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE, OUTPUT_COST_CODES_PATH, ALLOW_LIST_PATH, COGNITO_AUTH, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_ACCESS_LOG_HEADERS, CSV_ACCESS_LOG_HEADERS, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_FEEDBACK_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, USAGE_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_USAGE_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, TEXTRACT_JOBS_S3_INPUT_LOC, TEXTRACT_TEXT_EXTRACT_OPTION, NO_REDACTION_PII_OPTION, TEXT_EXTRACTION_MODELS, PII_DETECTION_MODELS, DEFAULT_TEXT_EXTRACTION_MODEL, DEFAULT_PII_DETECTION_MODEL, LOG_FILE_NAME, CHOSEN_COMPREHEND_ENTITIES, FULL_COMPREHEND_ENTITY_LIST, CHOSEN_REDACT_ENTITIES, FULL_ENTITY_LIST, FILE_INPUT_HEIGHT, TABULAR_PII_DETECTION_MODELS, USAGE_LOG_FILE_NAME, FEEDBACK_LOG_FILE_NAME, CONFIG_FOLDER, GRADIO_TEMP_DIR, MPLCONFIGDIR, S3_FEEDBACK_LOGS_FOLDER, S3_ACCESS_LOGS_FOLDER, S3_USAGE_LOGS_FOLDER, CHOSEN_LOCAL_OCR_MODEL, DEFAULT_LANGUAGE_FULL_NAME, SHOW_LANGUAGE_SELECTION, DO_INITIAL_TABULAR_DATA_CLEAN, DEFAULT_DUPLICATE_DETECTION_THRESHOLD, DIRECT_MODE_TASK, DIRECT_MODE_INPUT_FILE, DIRECT_MODE_OUTPUT_DIR, DIRECT_MODE_DUPLICATE_TYPE, DIRECT_MODE_DEFAULT_USER, 
LOCAL_PII_OPTION, TESSERACT_TEXT_EXTRACT_OPTION, AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION, DOCUMENT_REDACTION_BUCKET, IMAGES_DPI, PREPROCESS_LOCAL_OCR_IMAGES, COMPRESS_REDACTED_PDF, RETURN_PDF_END_OF_REDACTION, DENY_LIST_PATH, WHOLE_PAGE_REDACTION_LIST_PATH, DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX, DEFAULT_TABULAR_ANONYMISATION_STRATEGY, DEFAULT_FUZZY_SPELLING_MISTAKES_NUM, DEFAULT_PAGE_MIN, DEFAULT_PAGE_MAX, DEFAULT_EXCEL_SHEETS, USE_GREEDY_DUPLICATE_DETECTION, DEFAULT_MIN_CONSECUTIVE_PAGES, DEFAULT_COMBINE_PAGES, DEFAULT_MIN_WORD_COUNT, DEFAULT_TEXT_COLUMNS, DEFAULT_SEARCH_QUERY, REMOVE_DUPLICATE_ROWS
6
  from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select, check_for_relevant_ocr_output_with_words, reset_data_vars, reset_aws_call_vars, _get_env_list, ensure_folder_exists, reset_ocr_with_words_base_dataframe, update_language_dropdown, LANGUAGE_CHOICES, MAPPED_LANGUAGE_CHOICES
7
  from tools.aws_functions import download_file_from_s3, upload_log_file_to_s3
8
  from tools.file_redaction import choose_and_run_redactor
@@ -349,7 +349,7 @@ with app:
349
  text_extract_method_radio = gr.Radio(label="""Choose text extraction method. Local options are lower quality but cost nothing - they may be worth a try if you are willing to spend some time reviewing outputs. AWS Textract has a cost per page - £2.66 ($3.50) per 1,000 pages with signature detection (default), £1.14 ($1.50) without. Change the settings in the tab below (AWS Textract signature detection) to change this.""", value = DEFAULT_TEXT_EXTRACTION_MODEL, choices=TEXT_EXTRACTION_MODELS)
350
 
351
  with gr.Accordion("Enable AWS Textract signature detection (default is off)", open = False):
352
- handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract extraction settings", choices=["Extract handwriting", "Extract signatures"], value=DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX)
353
 
354
  with gr.Row(equal_height=True):
355
  pii_identification_method_drop = gr.Radio(label = """Choose personal information detection method. The local model is lower quality but costs nothing - it may be worth a try if you are willing to spend some time reviewing outputs, or if you are only interested in searching for custom search terms (see Redaction settings - custom deny list). AWS Comprehend has a cost of around £0.0075 ($0.01) per 10,000 characters.""", value = DEFAULT_PII_DETECTION_MODEL, choices=PII_DETECTION_MODELS)
@@ -385,7 +385,7 @@ with app:
385
  with gr.Column(scale=2):
386
  textract_job_detail_df = gr.Dataframe(label="Previous job details", visible=True, type="pandas", wrap=True, interactive=True, row_count=(0, 'fixed'), col_count=(5,'fixed'), static_columns=[0,1,2,3,4], max_height=400)
387
  with gr.Column(scale=1):
388
- job_id_textbox = gr.Textbox(label = "Job ID to check status", value='', visible=True)
389
  check_state_of_textract_api_call_btn = gr.Button("Check status of Textract job and download", variant="secondary", visible=True)
390
  with gr.Row():
391
  with gr.Column():
@@ -604,7 +604,7 @@ with app:
604
  with gr.Accordion("Upload docx, xlsx, or csv files", open = True):
605
  in_data_files = gr.File(label="Choose Excel or csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.docx'], height=FILE_INPUT_HEIGHT)
606
  with gr.Accordion("Redact open text", open = False):
607
- in_text = gr.Textbox(label="Enter open text", lines=10)
608
 
609
  in_excel_sheets = gr.Dropdown(choices=["Choose Excel sheets to anonymise"], multiselect = True, label="Select Excel sheets that you want to anonymise (showing sheets present across all Excel files).", visible=False, allow_custom_value=True)
610
 
@@ -627,7 +627,7 @@ with app:
627
 
628
 
629
  ###
630
- # TABULAR DUPLICATE DETECTION TAB
631
  ###
632
  with gr.Accordion(label="Find duplicate cells in tabular data", open=False):
633
  gr.Markdown("""Find duplicate cells or rows in CSV, Excel, or Parquet files. This tool analyzes text content across all columns to identify similar or identical entries that may be duplicates. You can review the results and choose to remove duplicate rows from your files.""")
@@ -1355,6 +1355,9 @@ if __name__ == "__main__":
1355
  'deny_list_file': DENY_LIST_PATH,
1356
  'redact_whole_page_file': WHOLE_PAGE_REDACTION_LIST_PATH,
1357
  'handwrite_signature_extraction': DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX,
 
 
 
1358
  'anon_strategy': DEFAULT_TABULAR_ANONYMISATION_STRATEGY,
1359
  'excel_sheets': DEFAULT_EXCEL_SHEETS,
1360
  'fuzzy_mistakes': DEFAULT_FUZZY_SPELLING_MISTAKES_NUM,
@@ -1398,5 +1401,16 @@ if __name__ == "__main__":
1398
  if DEFAULT_TEXT_COLUMNS:
1399
  print(f"Text columns: {DEFAULT_TEXT_COLUMNS}")
1400
  print(f"Remove duplicate rows: {REMOVE_DUPLICATE_ROWS}")
 
 
 
 
 
 
 
 
 
 
 
1401
  # Run the CLI main function with direct mode arguments
1402
  main(direct_mode_args=direct_mode_args)
 
2
  import pandas as pd
3
  import gradio as gr
4
  from gradio_image_annotation import image_annotator
5
+ from tools.config import DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX, DEFAULT_TABULAR_ANONYMISATION_STRATEGY, OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, DEFAULT_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE, OUTPUT_COST_CODES_PATH, ALLOW_LIST_PATH, COGNITO_AUTH, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_ACCESS_LOG_HEADERS, CSV_ACCESS_LOG_HEADERS, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_FEEDBACK_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, USAGE_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_USAGE_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, TEXTRACT_JOBS_S3_INPUT_LOC, TEXTRACT_TEXT_EXTRACT_OPTION, NO_REDACTION_PII_OPTION, TEXT_EXTRACTION_MODELS, PII_DETECTION_MODELS, DEFAULT_TEXT_EXTRACTION_MODEL, DEFAULT_PII_DETECTION_MODEL, LOG_FILE_NAME, CHOSEN_COMPREHEND_ENTITIES, FULL_COMPREHEND_ENTITY_LIST, CHOSEN_REDACT_ENTITIES, FULL_ENTITY_LIST, FILE_INPUT_HEIGHT, TABULAR_PII_DETECTION_MODELS, USAGE_LOG_FILE_NAME, FEEDBACK_LOG_FILE_NAME, CONFIG_FOLDER, GRADIO_TEMP_DIR, MPLCONFIGDIR, S3_FEEDBACK_LOGS_FOLDER, S3_ACCESS_LOGS_FOLDER, S3_USAGE_LOGS_FOLDER, CHOSEN_LOCAL_OCR_MODEL, DEFAULT_LANGUAGE_FULL_NAME, SHOW_LANGUAGE_SELECTION, DO_INITIAL_TABULAR_DATA_CLEAN, DEFAULT_DUPLICATE_DETECTION_THRESHOLD, DIRECT_MODE_TASK, DIRECT_MODE_INPUT_FILE, DIRECT_MODE_OUTPUT_DIR, DIRECT_MODE_DUPLICATE_TYPE, DIRECT_MODE_DEFAULT_USER, 
LOCAL_PII_OPTION, TESSERACT_TEXT_EXTRACT_OPTION, AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION, DOCUMENT_REDACTION_BUCKET, IMAGES_DPI, PREPROCESS_LOCAL_OCR_IMAGES, COMPRESS_REDACTED_PDF, RETURN_PDF_END_OF_REDACTION, DENY_LIST_PATH, WHOLE_PAGE_REDACTION_LIST_PATH, DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX, DEFAULT_TABULAR_ANONYMISATION_STRATEGY, DEFAULT_FUZZY_SPELLING_MISTAKES_NUM, DEFAULT_PAGE_MIN, DEFAULT_PAGE_MAX, DEFAULT_EXCEL_SHEETS, USE_GREEDY_DUPLICATE_DETECTION, DEFAULT_MIN_CONSECUTIVE_PAGES, DEFAULT_COMBINE_PAGES, DEFAULT_MIN_WORD_COUNT, DEFAULT_TEXT_COLUMNS, DEFAULT_SEARCH_QUERY, REMOVE_DUPLICATE_ROWS, HANDWRITE_SIGNATURE_TEXTBOX_FULL_OPTIONS, MAX_OPEN_TEXT_CHARACTERS
6
  from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select, check_for_relevant_ocr_output_with_words, reset_data_vars, reset_aws_call_vars, _get_env_list, ensure_folder_exists, reset_ocr_with_words_base_dataframe, update_language_dropdown, LANGUAGE_CHOICES, MAPPED_LANGUAGE_CHOICES
7
  from tools.aws_functions import download_file_from_s3, upload_log_file_to_s3
8
  from tools.file_redaction import choose_and_run_redactor
 
349
  text_extract_method_radio = gr.Radio(label="""Choose text extraction method. Local options are lower quality but cost nothing - they may be worth a try if you are willing to spend some time reviewing outputs. AWS Textract has a cost per page - £2.66 ($3.50) per 1,000 pages with signature detection (default), £1.14 ($1.50) without. Change the settings in the tab below (AWS Textract signature detection) to change this.""", value = DEFAULT_TEXT_EXTRACTION_MODEL, choices=TEXT_EXTRACTION_MODELS)
350
 
351
  with gr.Accordion("Enable AWS Textract signature detection (default is off)", open = False):
352
+ handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract extraction settings", choices=HANDWRITE_SIGNATURE_TEXTBOX_FULL_OPTIONS, value=DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX)
353
 
354
  with gr.Row(equal_height=True):
355
  pii_identification_method_drop = gr.Radio(label = """Choose personal information detection method. The local model is lower quality but costs nothing - it may be worth a try if you are willing to spend some time reviewing outputs, or if you are only interested in searching for custom search terms (see Redaction settings - custom deny list). AWS Comprehend has a cost of around £0.0075 ($0.01) per 10,000 characters.""", value = DEFAULT_PII_DETECTION_MODEL, choices=PII_DETECTION_MODELS)
 
385
  with gr.Column(scale=2):
386
  textract_job_detail_df = gr.Dataframe(label="Previous job details", visible=True, type="pandas", wrap=True, interactive=True, row_count=(0, 'fixed'), col_count=(5,'fixed'), static_columns=[0,1,2,3,4], max_height=400)
387
  with gr.Column(scale=1):
388
+ job_id_textbox = gr.Textbox(label = "Job ID to check status", value='', visible=True, lines=2)
389
  check_state_of_textract_api_call_btn = gr.Button("Check status of Textract job and download", variant="secondary", visible=True)
390
  with gr.Row():
391
  with gr.Column():
 
604
  with gr.Accordion("Upload docx, xlsx, or csv files", open = True):
605
  in_data_files = gr.File(label="Choose Excel or csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.docx'], height=FILE_INPUT_HEIGHT)
606
  with gr.Accordion("Redact open text", open = False):
607
+ in_text = gr.Textbox(label="Enter open text", lines=10, max_length=MAX_OPEN_TEXT_CHARACTERS)
608
 
609
  in_excel_sheets = gr.Dropdown(choices=["Choose Excel sheets to anonymise"], multiselect = True, label="Select Excel sheets that you want to anonymise (showing sheets present across all Excel files).", visible=False, allow_custom_value=True)
610
 
 
627
 
628
 
629
  ###
630
+ # TABULAR DUPLICATE DETECTION
631
  ###
632
  with gr.Accordion(label="Find duplicate cells in tabular data", open=False):
633
  gr.Markdown("""Find duplicate cells or rows in CSV, Excel, or Parquet files. This tool analyzes text content across all columns to identify similar or identical entries that may be duplicates. You can review the results and choose to remove duplicate rows from your files.""")
 
1355
  'deny_list_file': DENY_LIST_PATH,
1356
  'redact_whole_page_file': WHOLE_PAGE_REDACTION_LIST_PATH,
1357
  'handwrite_signature_extraction': DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX,
1358
+ 'extract_forms': False,
1359
+ 'extract_tables': False,
1360
+ 'extract_layout': False,
1361
  'anon_strategy': DEFAULT_TABULAR_ANONYMISATION_STRATEGY,
1362
  'excel_sheets': DEFAULT_EXCEL_SHEETS,
1363
  'fuzzy_mistakes': DEFAULT_FUZZY_SPELLING_MISTAKES_NUM,
 
1401
  if DEFAULT_TEXT_COLUMNS:
1402
  print(f"Text columns: {DEFAULT_TEXT_COLUMNS}")
1403
  print(f"Remove duplicate rows: {REMOVE_DUPLICATE_ROWS}")
1404
+
1405
+ # Combine extraction options
1406
+ extraction_options = list(direct_mode_args['handwrite_signature_extraction']) if direct_mode_args['handwrite_signature_extraction'] else []
1407
+ if direct_mode_args['extract_forms']:
1408
+ extraction_options.append('Extract forms')
1409
+ if direct_mode_args['extract_tables']:
1410
+ extraction_options.append('Extract tables')
1411
+ if direct_mode_args['extract_layout']:
1412
+ extraction_options.append('Extract layout')
1413
+ direct_mode_args['handwrite_signature_extraction'] = extraction_options
1414
+
1415
  # Run the CLI main function with direct mode arguments
1416
  main(direct_mode_args=direct_mode_args)
cli_redact.py CHANGED
@@ -133,6 +133,9 @@ python cli_redact.py --input_file example_data/example_of_emails_sent_to_a_profe
133
  ## Redact specific pages with AWS OCR and signature extraction:
134
  python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --page_min 6 --page_max 7 --ocr_method "AWS Textract" --handwrite_signature_extraction "Extract handwriting" "Extract signatures"
135
 
 
 
 
136
  # Duplicate page detection
137
 
138
  ## Find duplicate pages in OCR files:
@@ -212,6 +215,9 @@ python cli_redact.py --task textract --textract_action list
212
  pdf_group.add_argument('--allow_list_file', default=ALLOW_LIST_PATH, help='Custom words file to recognize for redaction.')
213
  pdf_group.add_argument('--redact_whole_page_file', default=WHOLE_PAGE_REDACTION_LIST_PATH, help='File for pages to redact completely.')
214
  pdf_group.add_argument('--handwrite_signature_extraction', nargs='+', default=default_handwrite_signature_checkbox, help='Handwriting and signature extraction options. Choose from "Extract handwriting", "Extract signatures".')
 
 
 
215
 
216
  # --- Word/Tabular Anonymisation Arguments ---
217
  tabular_group = parser.add_argument_group('Word/Tabular Anonymisation Options (.docx, .csv, .xlsx)')
@@ -279,6 +285,16 @@ python cli_redact.py --task textract --textract_action list
279
  if args.save_to_user_folders == "True": args.save_to_user_folders = True
280
  else: args.save_to_user_folders = False
281
 
 
 
 
 
 
 
 
 
 
 
282
  if args.task in ['redact', 'deduplicate']:
283
  if args.input_file:
284
  if isinstance(args.input_file, str):
@@ -298,8 +314,6 @@ python cli_redact.py --task textract --textract_action list
298
  except Exception as e:
299
  print(f"Warning: Could not initialise usage logger: {e}")
300
 
301
- print(f"Argument args.save_to_user_folders: {args.save_to_user_folders} will be used to determine if outputs will be saved to user folders.")
302
-
303
  # Get username and folders
304
  session_hash, args.output_dir, _, args.input_dir, args.textract_input_prefix, args.textract_output_prefix, args.s3_textract_document_logs_subfolder, args.local_textract_document_logs_subfolder = get_username_and_folders(username=args.username, output_folder_textbox=args.output_dir, input_folder_textbox=args.input_dir, session_output_folder=args.save_to_user_folders, textract_document_upload_input_folder=args.textract_input_prefix, textract_document_upload_output_folder=args.textract_output_prefix, s3_textract_document_logs_subfolder=args.s3_textract_document_logs_subfolder, local_textract_document_logs_subfolder=args.local_textract_document_logs_subfolder)
305
 
@@ -711,7 +725,7 @@ python cli_redact.py --task textract --textract_action list
711
  s3_bucket_name=textract_bucket,
712
  general_s3_bucket_name=args.s3_bucket,
713
  local_output_dir=args.output_dir,
714
- analyse_signatures=signature_options,
715
  aws_region=args.aws_region
716
  )
717
 
 
133
  ## Redact specific pages with AWS OCR and signature extraction:
134
  python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --page_min 6 --page_max 7 --ocr_method "AWS Textract" --handwrite_signature_extraction "Extract handwriting" "Extract signatures"
135
 
136
+ ## Redact with AWS OCR and additional extraction options:
137
+ python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --ocr_method "AWS Textract" --extract_forms --extract_tables --extract_layout
138
+
139
  # Duplicate page detection
140
 
141
  ## Find duplicate pages in OCR files:
 
215
  pdf_group.add_argument('--allow_list_file', default=ALLOW_LIST_PATH, help='Custom words file to recognize for redaction.')
216
  pdf_group.add_argument('--redact_whole_page_file', default=WHOLE_PAGE_REDACTION_LIST_PATH, help='File for pages to redact completely.')
217
  pdf_group.add_argument('--handwrite_signature_extraction', nargs='+', default=default_handwrite_signature_checkbox, help='Handwriting and signature extraction options. Choose from "Extract handwriting", "Extract signatures".')
218
+ pdf_group.add_argument('--extract_forms', action='store_true', help='Extract forms during Textract analysis.')
219
+ pdf_group.add_argument('--extract_tables', action='store_true', help='Extract tables during Textract analysis.')
220
+ pdf_group.add_argument('--extract_layout', action='store_true', help='Extract layout during Textract analysis.')
221
 
222
  # --- Word/Tabular Anonymisation Arguments ---
223
  tabular_group = parser.add_argument_group('Word/Tabular Anonymisation Options (.docx, .csv, .xlsx)')
 
285
  if args.save_to_user_folders == "True": args.save_to_user_folders = True
286
  else: args.save_to_user_folders = False
287
 
288
+ # Combine extraction options
289
+ extraction_options = list(args.handwrite_signature_extraction) if args.handwrite_signature_extraction else []
290
+ if args.extract_forms:
291
+ extraction_options.append('Extract forms')
292
+ if args.extract_tables:
293
+ extraction_options.append('Extract tables')
294
+ if args.extract_layout:
295
+ extraction_options.append('Extract layout')
296
+ args.handwrite_signature_extraction = extraction_options
297
+
298
  if args.task in ['redact', 'deduplicate']:
299
  if args.input_file:
300
  if isinstance(args.input_file, str):
 
314
  except Exception as e:
315
  print(f"Warning: Could not initialise usage logger: {e}")
316
 
 
 
317
  # Get username and folders
318
  session_hash, args.output_dir, _, args.input_dir, args.textract_input_prefix, args.textract_output_prefix, args.s3_textract_document_logs_subfolder, args.local_textract_document_logs_subfolder = get_username_and_folders(username=args.username, output_folder_textbox=args.output_dir, input_folder_textbox=args.input_dir, session_output_folder=args.save_to_user_folders, textract_document_upload_input_folder=args.textract_input_prefix, textract_document_upload_output_folder=args.textract_output_prefix, s3_textract_document_logs_subfolder=args.s3_textract_document_logs_subfolder, local_textract_document_logs_subfolder=args.local_textract_document_logs_subfolder)
319
 
 
725
  s3_bucket_name=textract_bucket,
726
  general_s3_bucket_name=args.s3_bucket,
727
  local_output_dir=args.output_dir,
728
+ handwrite_signature_checkbox=signature_options,
729
  aws_region=args.aws_region
730
  )
731
 
lambda_entrypoint.py CHANGED
@@ -93,6 +93,9 @@ def lambda_handler(event, context):
93
  'page_min': int(arguments.get('page_min', 0)),
94
  'page_max': int(arguments.get('page_max', 0)),
95
  'handwrite_signature_extraction': arguments.get('handwrite_signature_checkbox', ['Extract handwriting', 'Extract signatures']),
 
 
 
96
 
97
  # General arguments
98
  'local_redact_entities': arguments.get('local_redact_entities', []),
@@ -156,6 +159,16 @@ def lambda_handler(event, context):
156
  'return_pdf_end_of_redaction': arguments.get('return_pdf_end_of_redaction', True)
157
  }
158
 
 
 
 
 
 
 
 
 
 
 
159
  # Download optional files if they are specified
160
  allow_list_key = arguments.get('allow_list_file')
161
  if allow_list_key:
 
93
  'page_min': int(arguments.get('page_min', 0)),
94
  'page_max': int(arguments.get('page_max', 0)),
95
  'handwrite_signature_extraction': arguments.get('handwrite_signature_checkbox', ['Extract handwriting', 'Extract signatures']),
96
+ 'extract_forms': arguments.get('extract_forms', False),
97
+ 'extract_tables': arguments.get('extract_tables', False),
98
+ 'extract_layout': arguments.get('extract_layout', False),
99
 
100
  # General arguments
101
  'local_redact_entities': arguments.get('local_redact_entities', []),
 
159
  'return_pdf_end_of_redaction': arguments.get('return_pdf_end_of_redaction', True)
160
  }
161
 
162
+ # Combine extraction options
163
+ extraction_options = list(cli_args['handwrite_signature_extraction']) if cli_args['handwrite_signature_extraction'] else []
164
+ if cli_args['extract_forms']:
165
+ extraction_options.append('Extract forms')
166
+ if cli_args['extract_tables']:
167
+ extraction_options.append('Extract tables')
168
+ if cli_args['extract_layout']:
169
+ extraction_options.append('Extract layout')
170
+ cli_args['handwrite_signature_extraction'] = extraction_options
171
+
172
  # Download optional files if they are specified
173
  allow_list_key = arguments.get('allow_list_file')
174
  if allow_list_key:
tools/aws_textract.py CHANGED
@@ -7,7 +7,7 @@ import pikepdf
7
  import time
8
  import pandas as pd
9
  from tools.custom_image_analyser_engine import OCRResult, CustomImageRecognizerResult
10
- from tools.config import AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION
11
 
12
  def extract_textract_metadata(response:object):
13
  """Extracts metadata from an AWS Textract response."""
@@ -20,20 +20,69 @@ def extract_textract_metadata(response:object):
20
  'Pages': pages
21
  })
22
 
23
- def analyse_page_with_textract(pdf_page_bytes:object, page_no:int, client:str="", handwrite_signature_checkbox:List[str]=["Extract handwriting", "Redact all identified signatures"]):
24
  '''
25
- Analyse page with AWS Textract
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  '''
27
 
28
- print("handwrite_signature_checkbox in analyse_page_with_textract:", handwrite_signature_checkbox)
29
  if client == "":
30
  try:
31
- if AWS_ACCESS_KEY and AWS_SECRET_KEY:
 
 
 
 
 
32
  client = boto3.client('textract',
33
- aws_access_key_id=AWS_ACCESS_KEY,
34
- aws_secret_access_key=AWS_SECRET_KEY, region_name=AWS_REGION)
35
- else:
 
36
  client = boto3.client('textract', region_name=AWS_REGION)
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  except:
38
  out_message = "Cannot connect to AWS Textract"
39
  print(out_message)
@@ -41,15 +90,24 @@ def analyse_page_with_textract(pdf_page_bytes:object, page_no:int, client:str=""
41
  return [], "" # Return an empty list and an empty string
42
 
43
  # Redact signatures if specified
44
- if "Redact all identified signatures" in handwrite_signature_checkbox:
45
- #print("Analysing document with signature detection")
 
 
 
 
 
 
 
46
  try:
47
- response = client.analyze_document(Document={'Bytes': pdf_page_bytes}, FeatureTypes=["SIGNATURES"])
48
  except Exception as e:
49
  print("Textract call failed due to:", e, "trying again in 3 seconds.")
50
  time.sleep(3)
51
- response = client.analyze_document(Document={'Bytes': pdf_page_bytes}, FeatureTypes=["SIGNATURES"])
52
- else:
 
 
53
  # Call detect_document_text to extract plain text
54
  try:
55
  response = client.detect_document_text(Document={'Bytes': pdf_page_bytes})
@@ -98,16 +156,33 @@ def convert_pike_pdf_page_to_bytes(pdf:object, page_num:int):
98
 
99
  def json_to_ocrresult(json_data:dict, page_width:float, page_height:float, page_no:int):
100
  '''
101
- Convert the json response from textract to the OCRResult format used elsewhere in the code. Looks for lines, words, and signatures. Handwriting and signatures are set aside especially for later in case the user wants to override the default behaviour and redact all handwriting/signatures.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  '''
103
- all_ocr_results = []
104
- signature_or_handwriting_recogniser_results = []
105
- signature_recogniser_results = []
106
- handwriting_recogniser_results = []
107
- signatures = []
108
- handwriting = []
109
- ocr_results_with_words = {}
110
- text_block={}
111
 
112
  text_line_number = 1
113
 
 
7
  import time
8
  import pandas as pd
9
  from tools.custom_image_analyser_engine import OCRResult, CustomImageRecognizerResult
10
+ from tools.config import AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION, RUN_AWS_FUNCTIONS, PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS
11
 
12
  def extract_textract_metadata(response:object):
13
  """Extracts metadata from an AWS Textract response."""
 
20
  'Pages': pages
21
  })
22
 
23
+ def analyse_page_with_textract(pdf_page_bytes:object, page_no:int, client:str="", handwrite_signature_checkbox:List[str]=["Extract handwriting"], textract_output_found:bool=False, aws_access_key_textbox:str=AWS_ACCESS_KEY, aws_secret_key_textbox:str=AWS_SECRET_KEY, RUN_AWS_FUNCTIONS:str=RUN_AWS_FUNCTIONS, PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS:str=PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS):
24
  '''
25
+ Analyzes a single page of a document using AWS Textract to extract text and other features.
26
+
27
+ Args:
28
+ pdf_page_bytes (object): The content of the PDF page or image as bytes.
29
+ page_no (int): The page number being analyzed.
30
+ client (str, optional): An optional pre-initialized AWS Textract client. If not provided,
31
+ the function will attempt to create one based on configuration.
32
+ Defaults to "".
33
+ handwrite_signature_checkbox (List[str], optional): A list of feature types to extract
34
+ from the document. Options include
35
+ "Extract handwriting", "Extract signatures",
36
+ "Extract forms", "Extract layout", "Extract tables".
37
+ Defaults to ["Extract handwriting"].
38
+ textract_output_found (bool, optional): A flag indicating whether existing Textract output
39
+ for the document has been found. This can prevent
40
+ unnecessary API calls. Defaults to False.
41
+ aws_access_key_textbox (str, optional): AWS access key provided by the user, if not using
42
+ SSO or environment variables. Defaults to AWS_ACCESS_KEY.
43
+ aws_secret_key_textbox (str, optional): AWS secret key provided by the user, if not using
44
+ SSO or environment variables. Defaults to AWS_SECRET_KEY.
45
+ RUN_AWS_FUNCTIONS (str, optional): Configuration flag (e.g., "1" or "0") to enable or
46
+ disable AWS functions. Defaults to RUN_AWS_FUNCTIONS.
47
+ PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS (str, optional): Configuration flag (e.g., "1" or "0")
48
+ to prioritize AWS SSO credentials
49
+ over environment variables.
50
+ Defaults to PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS.
51
+
52
+ Returns:
53
+ Tuple[List[Dict], str]: A tuple containing:
54
+ - A list of dictionaries, where each dictionary represents a Textract block (e.g., LINE, WORD, FORM, TABLE).
55
+ - A string containing metadata about the Textract request.
56
  '''
57
 
58
+ #print("handwrite_signature_checkbox in analyse_page_with_textract:", handwrite_signature_checkbox)
59
  if client == "":
60
  try:
61
+ # Try to connect to AWS Textract Client if using that text extraction method
62
+ if RUN_AWS_FUNCTIONS == "1" and PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS == "1":
63
+ print("Connecting to Textract via existing SSO connection")
64
+ client = boto3.client('textract', region_name=AWS_REGION)
65
+ elif aws_access_key_textbox and aws_secret_key_textbox:
66
+ print("Connecting to Textract using AWS access key and secret keys from user input.")
67
  client = boto3.client('textract',
68
+ aws_access_key_id=aws_access_key_textbox,
69
+ aws_secret_access_key=aws_secret_key_textbox, region_name=AWS_REGION)
70
+ elif RUN_AWS_FUNCTIONS == "1":
71
+ print("Connecting to Textract via existing SSO connection")
72
  client = boto3.client('textract', region_name=AWS_REGION)
73
+ elif AWS_ACCESS_KEY and AWS_SECRET_KEY:
74
+ print("Getting Textract credentials from environment variables.")
75
+ client = boto3.client('textract',
76
+ aws_access_key_id=AWS_ACCESS_KEY,
77
+ aws_secret_access_key=AWS_SECRET_KEY, region_name=AWS_REGION)
78
+ elif textract_output_found==True:
79
+ print("Existing Textract data found for file, no need to connect to AWS Textract")
80
+ client = boto3.client('textract', region_name=AWS_REGION)
81
+ else:
82
+ client = ""
83
+ out_message = "Cannot connect to AWS Textract service."
84
+ print(out_message)
85
+ raise Exception(out_message)
86
  except:
87
  out_message = "Cannot connect to AWS Textract"
88
  print(out_message)
 
90
  return [], "" # Return an empty list and an empty string
91
 
92
  # Redact signatures if specified
93
+ feature_types = []
94
+ if "Extract signatures" in handwrite_signature_checkbox or "Extract forms" in handwrite_signature_checkbox or "Extract layout" in handwrite_signature_checkbox or "Extract tables" in handwrite_signature_checkbox:
95
+ feature_types.append("SIGNATURES")
96
+ if "Extract forms" in handwrite_signature_checkbox:
97
+ feature_types.append("FORMS")
98
+ if "Extract layout" in handwrite_signature_checkbox:
99
+ feature_types.append("LAYOUT")
100
+ if "Extract tables" in handwrite_signature_checkbox:
101
+ feature_types.append("TABLES")
102
  try:
103
+ response = client.analyze_document(Document={'Bytes': pdf_page_bytes}, FeatureTypes=feature_types)
104
  except Exception as e:
105
  print("Textract call failed due to:", e, "trying again in 3 seconds.")
106
  time.sleep(3)
107
+ response = client.analyze_document(Document={'Bytes': pdf_page_bytes}, FeatureTypes=feature_types)
108
+
109
+
110
+ if not "Extract signatures" in handwrite_signature_checkbox and not "Extract forms" in handwrite_signature_checkbox and not "Extract layout" in handwrite_signature_checkbox and not "Extract tables" in handwrite_signature_checkbox:
111
  # Call detect_document_text to extract plain text
112
  try:
113
  response = client.detect_document_text(Document={'Bytes': pdf_page_bytes})
 
156
 
157
  def json_to_ocrresult(json_data:dict, page_width:float, page_height:float, page_no:int):
158
  '''
159
+ Convert the json response from Textract to the OCRResult format used elsewhere in the code.
160
+ Looks for lines, words, and signatures. Handwriting and signatures are set aside especially
161
+ for later in case the user wants to override the default behaviour and redact all
162
+ handwriting/signatures.
163
+
164
+ Args:
165
+ json_data (dict): The raw JSON response from AWS Textract for a document or page.
166
+ page_width (float): The absolute width of the page in pixels.
167
+ page_height (float): The absolute height of the page in pixels.
168
+ page_no (int): The 1-based page number being processed.
169
+
170
+ Returns:
171
+ tuple: A tuple containing:
172
+ - dict: OCR results structured as an OCRResult object (containing 'page' and 'results' list).
173
+ - list: Bounding boxes identified as handwriting or signatures.
174
+ - list: Bounding boxes identified specifically as signatures.
175
+ - list: Bounding boxes identified specifically as handwriting.
176
+ - dict: OCR results with word-level detail, structured for further processing.
177
  '''
178
+ all_ocr_results = list()
179
+ signature_or_handwriting_recogniser_results = list()
180
+ signature_recogniser_results = list()
181
+ handwriting_recogniser_results = list()
182
+ signatures = list()
183
+ handwriting = list()
184
+ ocr_results_with_words = dict()
185
+ text_block=dict()
186
 
187
  text_line_number = 1
188
 
tools/config.py CHANGED
@@ -5,10 +5,19 @@ import logging
5
  from datetime import datetime
6
  from dotenv import load_dotenv
7
  from tldextract import TLDExtract
 
8
 
9
  today_rev = datetime.now().strftime("%Y%m%d")
10
  HOST_NAME = socket.gethostname()
11
 
 
 
 
 
 
 
 
 
12
  # Set or retrieve configuration variables for the redaction app
13
 
14
  def get_or_create_env_var(var_name:str, default_value:str, print_val:bool=False):
@@ -297,6 +306,22 @@ CUSTOM_ENTITIES = get_or_create_env_var('CUSTOM_ENTITIES', "['TITLES', 'UKPOSTCO
297
 
298
  DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX = get_or_create_env_var('DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX', "['Extract handwriting']")
299
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
300
  DEFAULT_SEARCH_QUERY = get_or_create_env_var('DEFAULT_SEARCH_QUERY', '')
301
  DEFAULT_FUZZY_SPELLING_MISTAKES_NUM = int(get_or_create_env_var('DEFAULT_FUZZY_SPELLING_MISTAKES_NUM', '1'))
302
 
@@ -309,6 +334,11 @@ DEFAULT_PAGE_MAX = int(get_or_create_env_var('DEFAULT_PAGE_MAX', '999'))
309
  PAGE_BREAK_VALUE = int(get_or_create_env_var('PAGE_BREAK_VALUE', '99999'))
310
 
311
  MAX_TIME_VALUE = int(get_or_create_env_var('MAX_TIME_VALUE', '999999'))
 
 
 
 
 
312
 
313
  CUSTOM_BOX_COLOUR = get_or_create_env_var("CUSTOM_BOX_COLOUR", "") # only "grey" is currently supported as a custom box colour
314
 
 
5
  from datetime import datetime
6
  from dotenv import load_dotenv
7
  from tldextract import TLDExtract
8
+ from typing import List
9
 
10
  today_rev = datetime.now().strftime("%Y%m%d")
11
  HOST_NAME = socket.gethostname()
12
 
13
+ def _get_env_list(env_var_name: str) -> List[str]:
14
+ """Parses a comma-separated environment variable into a list of strings."""
15
+ value = env_var_name[1:-1].strip().replace('\"', '').replace("\'","")
16
+ if not value:
17
+ return []
18
+ # Split by comma and filter out any empty strings that might result from extra commas
19
+ return [s.strip() for s in value.split(',') if s.strip()]
20
+
21
  # Set or retrieve configuration variables for the redaction app
22
 
23
  def get_or_create_env_var(var_name:str, default_value:str, print_val:bool=False):
 
306
 
307
  DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX = get_or_create_env_var('DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX', "['Extract handwriting']")
308
 
309
HANDWRITE_SIGNATURE_TEXTBOX_FULL_OPTIONS = get_or_create_env_var('HANDWRITE_SIGNATURE_TEXTBOX_FULL_OPTIONS', "['Extract handwriting', 'Extract signatures']")

# Convert the raw env-var string into a Python list of option labels.
if HANDWRITE_SIGNATURE_TEXTBOX_FULL_OPTIONS:
    HANDWRITE_SIGNATURE_TEXTBOX_FULL_OPTIONS = _get_env_list(HANDWRITE_SIGNATURE_TEXTBOX_FULL_OPTIONS)

INCLUDE_FORM_EXTRACTION_TEXTRACT_OPTION = get_or_create_env_var('INCLUDE_FORM_EXTRACTION_TEXTRACT_OPTION', "False")
INCLUDE_LAYOUT_EXTRACTION_TEXTRACT_OPTION = get_or_create_env_var('INCLUDE_LAYOUT_EXTRACTION_TEXTRACT_OPTION', "False")
INCLUDE_TABLE_EXTRACTION_TEXTRACT_OPTION = get_or_create_env_var('INCLUDE_TABLE_EXTRACTION_TEXTRACT_OPTION', "False")

# Append each optional Textract feature to the visible checkbox options
# when its config flag is enabled.
_OPTIONAL_TEXTRACT_FEATURES = (
    (INCLUDE_FORM_EXTRACTION_TEXTRACT_OPTION, 'Extract forms'),
    (INCLUDE_LAYOUT_EXTRACTION_TEXTRACT_OPTION, 'Extract layout'),
    (INCLUDE_TABLE_EXTRACTION_TEXTRACT_OPTION, 'Extract tables'),
)
for _flag, _option_label in _OPTIONAL_TEXTRACT_FEATURES:
    if _flag == "True":
        HANDWRITE_SIGNATURE_TEXTBOX_FULL_OPTIONS.append(_option_label)
323
+
324
+
325
  DEFAULT_SEARCH_QUERY = get_or_create_env_var('DEFAULT_SEARCH_QUERY', '')
326
  DEFAULT_FUZZY_SPELLING_MISTAKES_NUM = int(get_or_create_env_var('DEFAULT_FUZZY_SPELLING_MISTAKES_NUM', '1'))
327
 
 
334
  PAGE_BREAK_VALUE = int(get_or_create_env_var('PAGE_BREAK_VALUE', '99999'))
335
 
336
  MAX_TIME_VALUE = int(get_or_create_env_var('MAX_TIME_VALUE', '999999'))
337
# Upper bounds on workload size; each is overridable via an environment variable.
MAX_SIMULTANEOUS_FILES = int(get_or_create_env_var('MAX_SIMULTANEOUS_FILES', '10'))  # max number of files accepted in one submission
MAX_DOC_PAGES = int(get_or_create_env_var('MAX_DOC_PAGES', '3000'))  # max pages allowed in a single document
MAX_TABLE_ROWS = int(get_or_create_env_var('MAX_TABLE_ROWS', '250000'))  # max rows allowed in a tabular input
MAX_TABLE_COLUMNS = int(get_or_create_env_var('MAX_TABLE_COLUMNS', '100'))  # max columns allowed in a tabular input
MAX_OPEN_TEXT_CHARACTERS = int(get_or_create_env_var('MAX_OPEN_TEXT_CHARACTERS', '50000'))  # max characters for free-text input
342
 
343
  CUSTOM_BOX_COLOUR = get_or_create_env_var("CUSTOM_BOX_COLOUR", "") # only "grey" is currently supported as a custom box colour
344
 
tools/data_anonymise.py CHANGED
@@ -18,7 +18,7 @@ from botocore.client import BaseClient
18
  from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
19
  from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, DictAnalyzerResult, RecognizerResult
20
  from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
21
- from tools.config import RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, OUTPUT_FOLDER, DEFAULT_LANGUAGE, aws_comprehend_language_choices, DO_INITIAL_TABULAR_DATA_CLEAN, CUSTOM_ENTITIES, PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS, AWS_REGION
22
  from tools.helper_functions import get_file_name_without_type, read_file, detect_file_type, _get_env_list
23
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_word_list_recogniser, CustomWordFuzzyRecognizer, create_nlp_analyser, load_spacy_model
24
  # Use custom version of analyze_dict to be able to track progress
@@ -261,6 +261,13 @@ def handle_docx_anonymisation(
261
  text_elements = list() # This will store the actual docx objects (paragraphs, cells)
262
  original_texts = list() # This will store the text from those objects
263
 
 
 
 
 
 
 
 
264
  # Extract from paragraphs
265
  for para in doc.paragraphs:
266
  if para.text.strip(): # Only process non-empty paragraphs
@@ -464,6 +471,14 @@ def anonymise_files_with_open_text(file_paths: List[str],
464
  else:
465
  out_message = "Please enter text or a file to redact."
466
  raise Exception(out_message)
 
 
 
 
 
 
 
 
467
 
468
  # If we have already redacted the last file, return the input out_message and file list to the relevant components
469
  if latest_file_completed >= len(file_paths):
@@ -527,9 +542,7 @@ def anonymise_files_with_open_text(file_paths: List[str],
527
 
528
  # Create xlsx file:
529
  anon_xlsx = pd.ExcelFile(file_path)
530
- anon_xlsx_export_file_name = output_folder + out_file_part + "_redacted.xlsx"
531
-
532
-
533
 
534
  # Iterate through the sheet names
535
  for sheet_name in progress.tqdm(in_excel_sheets, desc="Anonymising sheets", unit = "sheets"):
@@ -675,7 +688,20 @@ def tabular_anonymise_wrapper_func(
675
  anon_df_part = anon_df[chosen_cols_in_anon_df]
676
  anon_df_remain = anon_df.drop(chosen_cols_in_anon_df, axis = 1)
677
 
678
-
 
 
 
 
 
 
 
 
 
 
 
 
 
679
  # Anonymise the selected columns
680
  anon_df_part_out, key_string, decision_process_output_str, comprehend_query_number = anonymise_script(anon_df_part, anon_strategy, language, chosen_redact_entities, in_allow_list, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, nlp_analyser=nlp_analyser, do_initial_clean=do_initial_clean)
681
 
 
18
  from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
19
  from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, DictAnalyzerResult, RecognizerResult
20
  from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
21
+ from tools.config import RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, OUTPUT_FOLDER, DEFAULT_LANGUAGE, aws_comprehend_language_choices, DO_INITIAL_TABULAR_DATA_CLEAN, CUSTOM_ENTITIES, PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS, AWS_REGION, MAX_TABLE_ROWS, MAX_TABLE_COLUMNS, MAX_SIMULTANEOUS_FILES
22
  from tools.helper_functions import get_file_name_without_type, read_file, detect_file_type, _get_env_list
23
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_word_list_recogniser, CustomWordFuzzyRecognizer, create_nlp_analyser, load_spacy_model
24
  # Use custom version of analyze_dict to be able to track progress
 
261
  text_elements = list() # This will store the actual docx objects (paragraphs, cells)
262
  original_texts = list() # This will store the text from those objects
263
 
264
+ paragraph_count = len(doc.paragraphs)
265
+
266
+ if paragraph_count > MAX_TABLE_ROWS:
267
+ out_message = f"Number of paragraphs in document is greater than {MAX_TABLE_ROWS}. Please submit a smaller document."
268
+ print(out_message)
269
+ raise Exception(out_message)
270
+
271
  # Extract from paragraphs
272
  for para in doc.paragraphs:
273
  if para.text.strip(): # Only process non-empty paragraphs
 
471
  else:
472
  out_message = "Please enter text or a file to redact."
473
  raise Exception(out_message)
474
+
475
+ if not isinstance(file_paths, list):
476
+ file_paths = [file_paths]
477
+
478
+ if len(file_paths) > MAX_SIMULTANEOUS_FILES:
479
+ out_message = f"Number of files to anonymise is greater than {MAX_SIMULTANEOUS_FILES}. Please submit a smaller number of files."
480
+ print(out_message)
481
+ raise Exception(out_message)
482
 
483
  # If we have already redacted the last file, return the input out_message and file list to the relevant components
484
  if latest_file_completed >= len(file_paths):
 
542
 
543
  # Create xlsx file:
544
  anon_xlsx = pd.ExcelFile(file_path)
545
+ anon_xlsx_export_file_name = output_folder + out_file_part + "_redacted.xlsx"
 
 
546
 
547
  # Iterate through the sheet names
548
  for sheet_name in progress.tqdm(in_excel_sheets, desc="Anonymising sheets", unit = "sheets"):
 
688
  anon_df_part = anon_df[chosen_cols_in_anon_df]
689
  anon_df_remain = anon_df.drop(chosen_cols_in_anon_df, axis = 1)
690
 
691
+ row_count = anon_df_part.shape[0]
692
+
693
+ if row_count > MAX_TABLE_ROWS:
694
+ out_message = f"Number of rows in dataframe is greater than {MAX_TABLE_ROWS}. Please submit a smaller dataframe."
695
+ print(out_message)
696
+ raise Exception(out_message)
697
+
698
+ column_count = anon_df_part.shape[1]
699
+
700
+ if column_count > MAX_TABLE_COLUMNS:
701
+ out_message = f"Number of columns in dataframe is greater than {MAX_TABLE_COLUMNS}. Please submit a smaller dataframe."
702
+ print(out_message)
703
+ raise Exception(out_message)
704
+
705
  # Anonymise the selected columns
706
  anon_df_part_out, key_string, decision_process_output_str, comprehend_query_number = anonymise_script(anon_df_part, anon_strategy, language, chosen_redact_entities, in_allow_list, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, nlp_analyser=nlp_analyser, do_initial_clean=do_initial_clean)
707
 
tools/file_redaction.py CHANGED
@@ -20,7 +20,7 @@ import gradio as gr
20
  from gradio import Progress
21
  from collections import defaultdict # For efficient grouping
22
 
23
- from tools.config import OUTPUT_FOLDER, IMAGES_DPI, MAX_IMAGE_PIXELS, RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION, PAGE_BREAK_VALUE, MAX_TIME_VALUE, LOAD_TRUNCATED_IMAGES, INPUT_FOLDER, RETURN_PDF_END_OF_REDACTION, TESSERACT_TEXT_EXTRACT_OPTION, SELECTABLE_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION, LOCAL_PII_OPTION, AWS_PII_OPTION, NO_REDACTION_PII_OPTION, DEFAULT_LANGUAGE, textract_language_choices, aws_comprehend_language_choices, CUSTOM_ENTITIES, PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS
24
  from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult, run_page_text_redaction, recreate_page_line_level_ocr_results_with_page
25
  from tools.file_conversion import convert_annotation_json_to_review_df, redact_whole_pymupdf_page, redact_single_box, is_pdf, is_pdf_or_image, prepare_image_or_pdf, divide_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, divide_coordinates_by_page_sizes, create_annotation_dicts_from_annotation_df, remove_duplicate_images_with_blank_boxes, fill_missing_ids, fill_missing_box_ids, load_and_convert_ocr_results_with_words_json, save_pdf_with_or_without_compression, word_level_ocr_output_to_dataframe
26
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_recogniser, custom_word_list_recogniser, CustomWordFuzzyRecognizer, load_spacy_model, download_tesseract_lang_pack, create_nlp_analyser
@@ -106,7 +106,7 @@ def choose_and_run_redactor(file_paths:List[str],
106
  page_min:int=0,
107
  page_max:int=999,
108
  estimated_time_taken_state:float=0.0,
109
- handwrite_signature_checkbox:List[str]=list(["Extract handwriting", "Extract signatures"]),
110
  all_request_metadata_str:str = "",
111
  annotations_all_pages:List[dict]=list(),
112
  all_page_line_level_ocr_results_df:pd.DataFrame=None,
@@ -273,6 +273,11 @@ def choose_and_run_redactor(file_paths:List[str],
273
  file_paths_list = [os.path.abspath(file_paths)]
274
  else: file_paths_list = file_paths
275
 
 
 
 
 
 
276
  valid_extensions = {".pdf", ".jpg", ".jpeg", ".png"}
277
  # Filter only files with valid extensions. Currently only allowing one file to be redacted at a time
278
  # Filter the file_paths_list to include only files with valid extensions
@@ -374,7 +379,12 @@ def choose_and_run_redactor(file_paths:List[str],
374
 
375
  page_sizes = page_sizes_df.to_dict(orient="records")
376
 
377
- number_of_pages = pymupdf_doc.page_count
 
 
 
 
 
378
 
379
  # If we have reached the last page, return message and outputs
380
  if current_loop_page >= number_of_pages:
 
20
  from gradio import Progress
21
  from collections import defaultdict # For efficient grouping
22
 
23
+ from tools.config import OUTPUT_FOLDER, IMAGES_DPI, MAX_IMAGE_PIXELS, RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION, PAGE_BREAK_VALUE, MAX_TIME_VALUE, LOAD_TRUNCATED_IMAGES, INPUT_FOLDER, RETURN_PDF_END_OF_REDACTION, TESSERACT_TEXT_EXTRACT_OPTION, SELECTABLE_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION, LOCAL_PII_OPTION, AWS_PII_OPTION, NO_REDACTION_PII_OPTION, DEFAULT_LANGUAGE, textract_language_choices, aws_comprehend_language_choices, CUSTOM_ENTITIES, PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS, MAX_DOC_PAGES, MAX_SIMULTANEOUS_FILES
24
  from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult, run_page_text_redaction, recreate_page_line_level_ocr_results_with_page
25
  from tools.file_conversion import convert_annotation_json_to_review_df, redact_whole_pymupdf_page, redact_single_box, is_pdf, is_pdf_or_image, prepare_image_or_pdf, divide_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, divide_coordinates_by_page_sizes, create_annotation_dicts_from_annotation_df, remove_duplicate_images_with_blank_boxes, fill_missing_ids, fill_missing_box_ids, load_and_convert_ocr_results_with_words_json, save_pdf_with_or_without_compression, word_level_ocr_output_to_dataframe
26
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_recogniser, custom_word_list_recogniser, CustomWordFuzzyRecognizer, load_spacy_model, download_tesseract_lang_pack, create_nlp_analyser
 
106
  page_min:int=0,
107
  page_max:int=999,
108
  estimated_time_taken_state:float=0.0,
109
+ handwrite_signature_checkbox:List[str]=list(["Extract handwriting"]),
110
  all_request_metadata_str:str = "",
111
  annotations_all_pages:List[dict]=list(),
112
  all_page_line_level_ocr_results_df:pd.DataFrame=None,
 
273
  file_paths_list = [os.path.abspath(file_paths)]
274
  else: file_paths_list = file_paths
275
 
276
+ if len(file_paths_list) > MAX_SIMULTANEOUS_FILES:
277
+ out_message = f"Number of files to redact is greater than {MAX_SIMULTANEOUS_FILES}. Please submit a smaller number of files."
278
+ print(out_message)
279
+ raise Exception(out_message)
280
+
281
  valid_extensions = {".pdf", ".jpg", ".jpeg", ".png"}
282
  # Filter only files with valid extensions. Currently only allowing one file to be redacted at a time
283
  # Filter the file_paths_list to include only files with valid extensions
 
379
 
380
  page_sizes = page_sizes_df.to_dict(orient="records")
381
 
382
+ number_of_pages = pymupdf_doc.page_count
383
+
384
+ if number_of_pages > MAX_DOC_PAGES:
385
+ out_message = f"Number of pages in document is greater than {MAX_DOC_PAGES}. Please submit a smaller document."
386
+ print(out_message)
387
+ raise Exception(out_message)
388
 
389
  # If we have reached the last page, return message and outputs
390
  if current_loop_page >= number_of_pages:
tools/textract_batch_call.py CHANGED
@@ -25,7 +25,7 @@ def analyse_document_with_textract_api(
25
  job_df:pd.DataFrame,
26
  s3_bucket_name: str = TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET,
27
  local_output_dir: str = OUTPUT_FOLDER,
28
- analyse_signatures:List[str] = [],
29
  successful_job_number:int=0,
30
  total_document_page_count:int=1,
31
  general_s3_bucket_name: str = DOCUMENT_REDACTION_BUCKET,
@@ -43,7 +43,7 @@ def analyse_document_with_textract_api(
43
  job_df (pd.DataFrame): Dataframe containing information from previous Textract API calls.
44
  s3_bucket_name (str, optional): S3 bucket in which to save API call outputs.
45
  local_output_dir (str, optional): Local directory to save the downloaded JSON results.
46
- analyse_signatures (List[str], optional): Analyse signatures? Default is no.
47
  successful_job_number (int): The number of successful jobs that have been submitted in this session.
48
  total_document_page_count (int): The number of pages in the document
49
  aws_region (str, optional): AWS region name. Defaults to boto3 default region.
@@ -122,10 +122,10 @@ def analyse_document_with_textract_api(
122
  if not job_df.empty:
123
 
124
  if "file_name" in job_df.columns:
125
- matching_job_id_file_names = job_df.loc[(job_df["file_name"] == pdf_filename) & (job_df["signature_extraction"].astype(str) == str(analyse_signatures)), "file_name"]
126
- matching_job_id_file_names_dates = job_df.loc[(job_df["file_name"] == pdf_filename) & (job_df["signature_extraction"].astype(str) == str(analyse_signatures)), "job_date_time"]
127
- matching_job_id = job_df.loc[(job_df["file_name"] == pdf_filename) & (job_df["signature_extraction"].astype(str) == str(analyse_signatures)), "job_id"]
128
- matching_handwrite_signature = job_df.loc[(job_df["file_name"] == pdf_filename) & (job_df["signature_extraction"].astype(str) == str(analyse_signatures)), "signature_extraction"]
129
 
130
  if len(matching_job_id) > 0:
131
  pass
@@ -142,7 +142,16 @@ def analyse_document_with_textract_api(
142
  print(message)
143
 
144
  try:
145
- if "Extract signatures" in analyse_signatures:
 
 
 
 
 
 
 
 
 
146
  response = textract_client.start_document_analysis(
147
  DocumentLocation={
148
  'S3Object': {
@@ -150,20 +159,15 @@ def analyse_document_with_textract_api(
150
  'Name': s3_input_key
151
  }
152
  },
153
- FeatureTypes=['SIGNATURES'], # Analyze for signatures, forms, and tables
154
  OutputConfig={
155
  'S3Bucket': s3_bucket_name,
156
  'S3Prefix': s3_output_prefix
157
  }
158
- # Optional: Add NotificationChannel for SNS topic notifications
159
- # NotificationChannel={
160
- # 'SNSTopicArn': 'YOUR_SNS_TOPIC_ARN',
161
- # 'RoleArn': 'YOUR_IAM_ROLE_ARN_FOR_TEXTRACT_TO_ACCESS_SNS'
162
- # }
163
  )
164
  job_type="document_analysis"
165
 
166
- else:
167
  response = textract_client.start_document_text_detection(
168
  DocumentLocation={
169
  'S3Object': {
@@ -190,7 +194,7 @@ def analyse_document_with_textract_api(
190
  'job_id': job_id,
191
  'file_name': pdf_filename,
192
  'job_type': job_type,
193
- 'signature_extraction':analyse_signatures,
194
  'job_date_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
195
  }])
196
 
@@ -236,7 +240,21 @@ def return_job_status(job_id:str,
236
  max_polling_attempts: int = 1 # ~10 minutes total wait time
237
  ):
238
  '''
239
- Poll Textract for the current status of a previously-submitted job.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
240
  '''
241
 
242
  job_status = response['JobStatus']
 
25
  job_df:pd.DataFrame,
26
  s3_bucket_name: str = TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET,
27
  local_output_dir: str = OUTPUT_FOLDER,
28
+ handwrite_signature_checkbox:List[str] = list(),
29
  successful_job_number:int=0,
30
  total_document_page_count:int=1,
31
  general_s3_bucket_name: str = DOCUMENT_REDACTION_BUCKET,
 
43
  job_df (pd.DataFrame): Dataframe containing information from previous Textract API calls.
44
  s3_bucket_name (str, optional): S3 bucket in which to save API call outputs.
45
  local_output_dir (str, optional): Local directory to save the downloaded JSON results.
46
+ handwrite_signature_checkbox (List[str], optional): List of feature types to extract from the document.
47
  successful_job_number (int): The number of successful jobs that have been submitted in this session.
48
  total_document_page_count (int): The number of pages in the document
49
  aws_region (str, optional): AWS region name. Defaults to boto3 default region.
 
122
  if not job_df.empty:
123
 
124
  if "file_name" in job_df.columns:
125
+ matching_job_id_file_names = job_df.loc[(job_df["file_name"] == pdf_filename) & (job_df["signature_extraction"].astype(str) == str(handwrite_signature_checkbox)), "file_name"]
126
+ matching_job_id_file_names_dates = job_df.loc[(job_df["file_name"] == pdf_filename) & (job_df["signature_extraction"].astype(str) == str(handwrite_signature_checkbox)), "job_date_time"]
127
+ matching_job_id = job_df.loc[(job_df["file_name"] == pdf_filename) & (job_df["signature_extraction"].astype(str) == str(handwrite_signature_checkbox)), "job_id"]
128
+ matching_handwrite_signature = job_df.loc[(job_df["file_name"] == pdf_filename) & (job_df["signature_extraction"].astype(str) == str(handwrite_signature_checkbox)), "signature_extraction"]
129
 
130
  if len(matching_job_id) > 0:
131
  pass
 
142
  print(message)
143
 
144
  try:
145
+ if "Extract signatures" in handwrite_signature_checkbox or "Extract forms" in handwrite_signature_checkbox or "Extract layout" in handwrite_signature_checkbox or "Extract tables" in handwrite_signature_checkbox:
146
+ feature_types = list()
147
+ if 'Extract signatures' in handwrite_signature_checkbox:
148
+ feature_types.append('SIGNATURES')
149
+ if "Extract forms" in handwrite_signature_checkbox:
150
+ feature_types.append('FORMS')
151
+ if "Extract layout" in handwrite_signature_checkbox:
152
+ feature_types.append('LAYOUT')
153
+ if "Extract tables" in handwrite_signature_checkbox:
154
+ feature_types.append('TABLES')
155
  response = textract_client.start_document_analysis(
156
  DocumentLocation={
157
  'S3Object': {
 
159
  'Name': s3_input_key
160
  }
161
  },
162
+ FeatureTypes=feature_types, # Analyze only for the feature types selected by the user
163
  OutputConfig={
164
  'S3Bucket': s3_bucket_name,
165
  'S3Prefix': s3_output_prefix
166
  }
 
 
 
 
 
167
  )
168
  job_type="document_analysis"
169
 
170
+ if not "Extract signatures" in handwrite_signature_checkbox and not "Extract forms" in handwrite_signature_checkbox and not "Extract layout" in handwrite_signature_checkbox and not "Extract tables" in handwrite_signature_checkbox:
171
  response = textract_client.start_document_text_detection(
172
  DocumentLocation={
173
  'S3Object': {
 
194
  'job_id': job_id,
195
  'file_name': pdf_filename,
196
  'job_type': job_type,
197
+ 'signature_extraction':handwrite_signature_checkbox,
198
  'job_date_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
199
  }])
200
 
 
240
  max_polling_attempts: int = 1 # ~10 minutes total wait time
241
  ):
242
  '''
243
+ Polls the AWS Textract service to retrieve the current status of an asynchronous document analysis job.
244
+ This function checks the job status from the provided response and logs relevant information or errors.
245
+
246
+ Args:
247
+ job_id (str): The unique identifier of the Textract job.
248
+ response (dict): The response dictionary received from Textract's `get_document_analysis` or `get_document_text_detection` call.
249
+ attempts (int): The current polling attempt number.
250
+ poll_interval_seconds (int, optional): The time in seconds to wait before the next poll (currently unused in this function, but kept for context). Defaults to 0.
251
+ max_polling_attempts (int, optional): The maximum number of polling attempts allowed (currently unused in this function, but kept for context). Defaults to 1.
252
+
253
+ Returns:
254
+ str: The current status of the Textract job (e.g., 'IN_PROGRESS', 'SUCCEEDED').
255
+
256
+ Raises:
257
+ Exception: If the Textract job status is 'FAILED' or 'PARTIAL_SUCCESS', or if an unexpected status is encountered.
258
  '''
259
 
260
  job_status = response['JobStatus']