Commit
·
d3e6a24
1
Parent(s):
d60759d
Added form, table, and layout extraction options to AWS Textract calls. Added config options that cap document length, maximum table rows, and similar limits.
Browse files
- app.py +19 -5
- cli_redact.py +17 -3
- lambda_entrypoint.py +13 -0
- tools/aws_textract.py +97 -22
- tools/config.py +30 -0
- tools/data_anonymise.py +31 -5
- tools/file_redaction.py +13 -3
- tools/textract_batch_call.py +34 -16
app.py
CHANGED
@@ -2,7 +2,7 @@ import os
|
|
2 |
import pandas as pd
|
3 |
import gradio as gr
|
4 |
from gradio_image_annotation import image_annotator
|
5 |
-
from tools.config import DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX, DEFAULT_TABULAR_ANONYMISATION_STRATEGY, OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, DEFAULT_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE, OUTPUT_COST_CODES_PATH, ALLOW_LIST_PATH, COGNITO_AUTH, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_ACCESS_LOG_HEADERS, CSV_ACCESS_LOG_HEADERS, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_FEEDBACK_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, USAGE_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_USAGE_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, TEXTRACT_JOBS_S3_INPUT_LOC, TEXTRACT_TEXT_EXTRACT_OPTION, NO_REDACTION_PII_OPTION, TEXT_EXTRACTION_MODELS, PII_DETECTION_MODELS, DEFAULT_TEXT_EXTRACTION_MODEL, DEFAULT_PII_DETECTION_MODEL, LOG_FILE_NAME, CHOSEN_COMPREHEND_ENTITIES, FULL_COMPREHEND_ENTITY_LIST, CHOSEN_REDACT_ENTITIES, FULL_ENTITY_LIST, FILE_INPUT_HEIGHT, TABULAR_PII_DETECTION_MODELS, USAGE_LOG_FILE_NAME, FEEDBACK_LOG_FILE_NAME, CONFIG_FOLDER, GRADIO_TEMP_DIR, MPLCONFIGDIR, S3_FEEDBACK_LOGS_FOLDER, S3_ACCESS_LOGS_FOLDER, S3_USAGE_LOGS_FOLDER, CHOSEN_LOCAL_OCR_MODEL, DEFAULT_LANGUAGE_FULL_NAME, SHOW_LANGUAGE_SELECTION, DO_INITIAL_TABULAR_DATA_CLEAN, DEFAULT_DUPLICATE_DETECTION_THRESHOLD, DIRECT_MODE_TASK, DIRECT_MODE_INPUT_FILE, DIRECT_MODE_OUTPUT_DIR, DIRECT_MODE_DUPLICATE_TYPE, DIRECT_MODE_DEFAULT_USER, 
LOCAL_PII_OPTION, TESSERACT_TEXT_EXTRACT_OPTION, AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION, DOCUMENT_REDACTION_BUCKET, IMAGES_DPI, PREPROCESS_LOCAL_OCR_IMAGES, COMPRESS_REDACTED_PDF, RETURN_PDF_END_OF_REDACTION, DENY_LIST_PATH, WHOLE_PAGE_REDACTION_LIST_PATH, DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX, DEFAULT_TABULAR_ANONYMISATION_STRATEGY, DEFAULT_FUZZY_SPELLING_MISTAKES_NUM, DEFAULT_PAGE_MIN, DEFAULT_PAGE_MAX, DEFAULT_EXCEL_SHEETS, USE_GREEDY_DUPLICATE_DETECTION, DEFAULT_MIN_CONSECUTIVE_PAGES, DEFAULT_COMBINE_PAGES, DEFAULT_MIN_WORD_COUNT, DEFAULT_TEXT_COLUMNS, DEFAULT_SEARCH_QUERY, REMOVE_DUPLICATE_ROWS
|
6 |
from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select, check_for_relevant_ocr_output_with_words, reset_data_vars, reset_aws_call_vars, _get_env_list, ensure_folder_exists, reset_ocr_with_words_base_dataframe, update_language_dropdown, LANGUAGE_CHOICES, MAPPED_LANGUAGE_CHOICES
|
7 |
from tools.aws_functions import download_file_from_s3, upload_log_file_to_s3
|
8 |
from tools.file_redaction import choose_and_run_redactor
|
@@ -349,7 +349,7 @@ with app:
|
|
349 |
text_extract_method_radio = gr.Radio(label="""Choose text extraction method. Local options are lower quality but cost nothing - they may be worth a try if you are willing to spend some time reviewing outputs. AWS Textract has a cost per page - £2.66 ($3.50) per 1,000 pages with signature detection (default), £1.14 ($1.50) without. Change the settings in the tab below (AWS Textract signature detection) to change this.""", value = DEFAULT_TEXT_EXTRACTION_MODEL, choices=TEXT_EXTRACTION_MODELS)
|
350 |
|
351 |
with gr.Accordion("Enable AWS Textract signature detection (default is off)", open = False):
|
352 |
-
handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract extraction settings", choices=
|
353 |
|
354 |
with gr.Row(equal_height=True):
|
355 |
pii_identification_method_drop = gr.Radio(label = """Choose personal information detection method. The local model is lower quality but costs nothing - it may be worth a try if you are willing to spend some time reviewing outputs, or if you are only interested in searching for custom search terms (see Redaction settings - custom deny list). AWS Comprehend has a cost of around £0.0075 ($0.01) per 10,000 characters.""", value = DEFAULT_PII_DETECTION_MODEL, choices=PII_DETECTION_MODELS)
|
@@ -385,7 +385,7 @@ with app:
|
|
385 |
with gr.Column(scale=2):
|
386 |
textract_job_detail_df = gr.Dataframe(label="Previous job details", visible=True, type="pandas", wrap=True, interactive=True, row_count=(0, 'fixed'), col_count=(5,'fixed'), static_columns=[0,1,2,3,4], max_height=400)
|
387 |
with gr.Column(scale=1):
|
388 |
-
job_id_textbox = gr.Textbox(label = "Job ID to check status", value='', visible=True)
|
389 |
check_state_of_textract_api_call_btn = gr.Button("Check status of Textract job and download", variant="secondary", visible=True)
|
390 |
with gr.Row():
|
391 |
with gr.Column():
|
@@ -604,7 +604,7 @@ with app:
|
|
604 |
with gr.Accordion("Upload docx, xlsx, or csv files", open = True):
|
605 |
in_data_files = gr.File(label="Choose Excel or csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.docx'], height=FILE_INPUT_HEIGHT)
|
606 |
with gr.Accordion("Redact open text", open = False):
|
607 |
-
in_text = gr.Textbox(label="Enter open text", lines=10)
|
608 |
|
609 |
in_excel_sheets = gr.Dropdown(choices=["Choose Excel sheets to anonymise"], multiselect = True, label="Select Excel sheets that you want to anonymise (showing sheets present across all Excel files).", visible=False, allow_custom_value=True)
|
610 |
|
@@ -627,7 +627,7 @@ with app:
|
|
627 |
|
628 |
|
629 |
###
|
630 |
-
# TABULAR DUPLICATE DETECTION
|
631 |
###
|
632 |
with gr.Accordion(label="Find duplicate cells in tabular data", open=False):
|
633 |
gr.Markdown("""Find duplicate cells or rows in CSV, Excel, or Parquet files. This tool analyzes text content across all columns to identify similar or identical entries that may be duplicates. You can review the results and choose to remove duplicate rows from your files.""")
|
@@ -1355,6 +1355,9 @@ if __name__ == "__main__":
|
|
1355 |
'deny_list_file': DENY_LIST_PATH,
|
1356 |
'redact_whole_page_file': WHOLE_PAGE_REDACTION_LIST_PATH,
|
1357 |
'handwrite_signature_extraction': DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX,
|
|
|
|
|
|
|
1358 |
'anon_strategy': DEFAULT_TABULAR_ANONYMISATION_STRATEGY,
|
1359 |
'excel_sheets': DEFAULT_EXCEL_SHEETS,
|
1360 |
'fuzzy_mistakes': DEFAULT_FUZZY_SPELLING_MISTAKES_NUM,
|
@@ -1398,5 +1401,16 @@ if __name__ == "__main__":
|
|
1398 |
if DEFAULT_TEXT_COLUMNS:
|
1399 |
print(f"Text columns: {DEFAULT_TEXT_COLUMNS}")
|
1400 |
print(f"Remove duplicate rows: {REMOVE_DUPLICATE_ROWS}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1401 |
# Run the CLI main function with direct mode arguments
|
1402 |
main(direct_mode_args=direct_mode_args)
|
|
|
2 |
import pandas as pd
|
3 |
import gradio as gr
|
4 |
from gradio_image_annotation import image_annotator
|
5 |
+
from tools.config import DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX, DEFAULT_TABULAR_ANONYMISATION_STRATEGY, OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, DEFAULT_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE, OUTPUT_COST_CODES_PATH, ALLOW_LIST_PATH, COGNITO_AUTH, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_ACCESS_LOG_HEADERS, CSV_ACCESS_LOG_HEADERS, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_FEEDBACK_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, USAGE_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_USAGE_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, TEXTRACT_JOBS_S3_INPUT_LOC, TEXTRACT_TEXT_EXTRACT_OPTION, NO_REDACTION_PII_OPTION, TEXT_EXTRACTION_MODELS, PII_DETECTION_MODELS, DEFAULT_TEXT_EXTRACTION_MODEL, DEFAULT_PII_DETECTION_MODEL, LOG_FILE_NAME, CHOSEN_COMPREHEND_ENTITIES, FULL_COMPREHEND_ENTITY_LIST, CHOSEN_REDACT_ENTITIES, FULL_ENTITY_LIST, FILE_INPUT_HEIGHT, TABULAR_PII_DETECTION_MODELS, USAGE_LOG_FILE_NAME, FEEDBACK_LOG_FILE_NAME, CONFIG_FOLDER, GRADIO_TEMP_DIR, MPLCONFIGDIR, S3_FEEDBACK_LOGS_FOLDER, S3_ACCESS_LOGS_FOLDER, S3_USAGE_LOGS_FOLDER, CHOSEN_LOCAL_OCR_MODEL, DEFAULT_LANGUAGE_FULL_NAME, SHOW_LANGUAGE_SELECTION, DO_INITIAL_TABULAR_DATA_CLEAN, DEFAULT_DUPLICATE_DETECTION_THRESHOLD, DIRECT_MODE_TASK, DIRECT_MODE_INPUT_FILE, DIRECT_MODE_OUTPUT_DIR, DIRECT_MODE_DUPLICATE_TYPE, DIRECT_MODE_DEFAULT_USER, 
LOCAL_PII_OPTION, TESSERACT_TEXT_EXTRACT_OPTION, AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION, DOCUMENT_REDACTION_BUCKET, IMAGES_DPI, PREPROCESS_LOCAL_OCR_IMAGES, COMPRESS_REDACTED_PDF, RETURN_PDF_END_OF_REDACTION, DENY_LIST_PATH, WHOLE_PAGE_REDACTION_LIST_PATH, DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX, DEFAULT_TABULAR_ANONYMISATION_STRATEGY, DEFAULT_FUZZY_SPELLING_MISTAKES_NUM, DEFAULT_PAGE_MIN, DEFAULT_PAGE_MAX, DEFAULT_EXCEL_SHEETS, USE_GREEDY_DUPLICATE_DETECTION, DEFAULT_MIN_CONSECUTIVE_PAGES, DEFAULT_COMBINE_PAGES, DEFAULT_MIN_WORD_COUNT, DEFAULT_TEXT_COLUMNS, DEFAULT_SEARCH_QUERY, REMOVE_DUPLICATE_ROWS, HANDWRITE_SIGNATURE_TEXTBOX_FULL_OPTIONS, MAX_OPEN_TEXT_CHARACTERS
|
6 |
from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select, check_for_relevant_ocr_output_with_words, reset_data_vars, reset_aws_call_vars, _get_env_list, ensure_folder_exists, reset_ocr_with_words_base_dataframe, update_language_dropdown, LANGUAGE_CHOICES, MAPPED_LANGUAGE_CHOICES
|
7 |
from tools.aws_functions import download_file_from_s3, upload_log_file_to_s3
|
8 |
from tools.file_redaction import choose_and_run_redactor
|
|
|
349 |
text_extract_method_radio = gr.Radio(label="""Choose text extraction method. Local options are lower quality but cost nothing - they may be worth a try if you are willing to spend some time reviewing outputs. AWS Textract has a cost per page - £2.66 ($3.50) per 1,000 pages with signature detection (default), £1.14 ($1.50) without. Change the settings in the tab below (AWS Textract signature detection) to change this.""", value = DEFAULT_TEXT_EXTRACTION_MODEL, choices=TEXT_EXTRACTION_MODELS)
|
350 |
|
351 |
with gr.Accordion("Enable AWS Textract signature detection (default is off)", open = False):
|
352 |
+
handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract extraction settings", choices=HANDWRITE_SIGNATURE_TEXTBOX_FULL_OPTIONS, value=DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX)
|
353 |
|
354 |
with gr.Row(equal_height=True):
|
355 |
pii_identification_method_drop = gr.Radio(label = """Choose personal information detection method. The local model is lower quality but costs nothing - it may be worth a try if you are willing to spend some time reviewing outputs, or if you are only interested in searching for custom search terms (see Redaction settings - custom deny list). AWS Comprehend has a cost of around £0.0075 ($0.01) per 10,000 characters.""", value = DEFAULT_PII_DETECTION_MODEL, choices=PII_DETECTION_MODELS)
|
|
|
385 |
with gr.Column(scale=2):
|
386 |
textract_job_detail_df = gr.Dataframe(label="Previous job details", visible=True, type="pandas", wrap=True, interactive=True, row_count=(0, 'fixed'), col_count=(5,'fixed'), static_columns=[0,1,2,3,4], max_height=400)
|
387 |
with gr.Column(scale=1):
|
388 |
+
job_id_textbox = gr.Textbox(label = "Job ID to check status", value='', visible=True, lines=2)
|
389 |
check_state_of_textract_api_call_btn = gr.Button("Check status of Textract job and download", variant="secondary", visible=True)
|
390 |
with gr.Row():
|
391 |
with gr.Column():
|
|
|
604 |
with gr.Accordion("Upload docx, xlsx, or csv files", open = True):
|
605 |
in_data_files = gr.File(label="Choose Excel or csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.docx'], height=FILE_INPUT_HEIGHT)
|
606 |
with gr.Accordion("Redact open text", open = False):
|
607 |
+
in_text = gr.Textbox(label="Enter open text", lines=10, max_length=MAX_OPEN_TEXT_CHARACTERS)
|
608 |
|
609 |
in_excel_sheets = gr.Dropdown(choices=["Choose Excel sheets to anonymise"], multiselect = True, label="Select Excel sheets that you want to anonymise (showing sheets present across all Excel files).", visible=False, allow_custom_value=True)
|
610 |
|
|
|
627 |
|
628 |
|
629 |
###
|
630 |
+
# TABULAR DUPLICATE DETECTION
|
631 |
###
|
632 |
with gr.Accordion(label="Find duplicate cells in tabular data", open=False):
|
633 |
gr.Markdown("""Find duplicate cells or rows in CSV, Excel, or Parquet files. This tool analyzes text content across all columns to identify similar or identical entries that may be duplicates. You can review the results and choose to remove duplicate rows from your files.""")
|
|
|
1355 |
'deny_list_file': DENY_LIST_PATH,
|
1356 |
'redact_whole_page_file': WHOLE_PAGE_REDACTION_LIST_PATH,
|
1357 |
'handwrite_signature_extraction': DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX,
|
1358 |
+
'extract_forms': False,
|
1359 |
+
'extract_tables': False,
|
1360 |
+
'extract_layout': False,
|
1361 |
'anon_strategy': DEFAULT_TABULAR_ANONYMISATION_STRATEGY,
|
1362 |
'excel_sheets': DEFAULT_EXCEL_SHEETS,
|
1363 |
'fuzzy_mistakes': DEFAULT_FUZZY_SPELLING_MISTAKES_NUM,
|
|
|
1401 |
if DEFAULT_TEXT_COLUMNS:
|
1402 |
print(f"Text columns: {DEFAULT_TEXT_COLUMNS}")
|
1403 |
print(f"Remove duplicate rows: {REMOVE_DUPLICATE_ROWS}")
|
1404 |
+
|
1405 |
+
# Combine extraction options
|
1406 |
+
extraction_options = list(direct_mode_args['handwrite_signature_extraction']) if direct_mode_args['handwrite_signature_extraction'] else []
|
1407 |
+
if direct_mode_args['extract_forms']:
|
1408 |
+
extraction_options.append('Extract forms')
|
1409 |
+
if direct_mode_args['extract_tables']:
|
1410 |
+
extraction_options.append('Extract tables')
|
1411 |
+
if direct_mode_args['extract_layout']:
|
1412 |
+
extraction_options.append('Extract layout')
|
1413 |
+
direct_mode_args['handwrite_signature_extraction'] = extraction_options
|
1414 |
+
|
1415 |
# Run the CLI main function with direct mode arguments
|
1416 |
main(direct_mode_args=direct_mode_args)
|
cli_redact.py
CHANGED
@@ -133,6 +133,9 @@ python cli_redact.py --input_file example_data/example_of_emails_sent_to_a_profe
|
|
133 |
## Redact specific pages with AWS OCR and signature extraction:
|
134 |
python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --page_min 6 --page_max 7 --ocr_method "AWS Textract" --handwrite_signature_extraction "Extract handwriting" "Extract signatures"
|
135 |
|
|
|
|
|
|
|
136 |
# Duplicate page detection
|
137 |
|
138 |
## Find duplicate pages in OCR files:
|
@@ -212,6 +215,9 @@ python cli_redact.py --task textract --textract_action list
|
|
212 |
pdf_group.add_argument('--allow_list_file', default=ALLOW_LIST_PATH, help='Custom words file to recognize for redaction.')
|
213 |
pdf_group.add_argument('--redact_whole_page_file', default=WHOLE_PAGE_REDACTION_LIST_PATH, help='File for pages to redact completely.')
|
214 |
pdf_group.add_argument('--handwrite_signature_extraction', nargs='+', default=default_handwrite_signature_checkbox, help='Handwriting and signature extraction options. Choose from "Extract handwriting", "Extract signatures".')
|
|
|
|
|
|
|
215 |
|
216 |
# --- Word/Tabular Anonymisation Arguments ---
|
217 |
tabular_group = parser.add_argument_group('Word/Tabular Anonymisation Options (.docx, .csv, .xlsx)')
|
@@ -279,6 +285,16 @@ python cli_redact.py --task textract --textract_action list
|
|
279 |
if args.save_to_user_folders == "True": args.save_to_user_folders = True
|
280 |
else: args.save_to_user_folders = False
|
281 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
282 |
if args.task in ['redact', 'deduplicate']:
|
283 |
if args.input_file:
|
284 |
if isinstance(args.input_file, str):
|
@@ -298,8 +314,6 @@ python cli_redact.py --task textract --textract_action list
|
|
298 |
except Exception as e:
|
299 |
print(f"Warning: Could not initialise usage logger: {e}")
|
300 |
|
301 |
-
print(f"Argument args.save_to_user_folders: {args.save_to_user_folders} will be used to determine if outputs will be saved to user folders.")
|
302 |
-
|
303 |
# Get username and folders
|
304 |
session_hash, args.output_dir, _, args.input_dir, args.textract_input_prefix, args.textract_output_prefix, args.s3_textract_document_logs_subfolder, args.local_textract_document_logs_subfolder = get_username_and_folders(username=args.username, output_folder_textbox=args.output_dir, input_folder_textbox=args.input_dir, session_output_folder=args.save_to_user_folders, textract_document_upload_input_folder=args.textract_input_prefix, textract_document_upload_output_folder=args.textract_output_prefix, s3_textract_document_logs_subfolder=args.s3_textract_document_logs_subfolder, local_textract_document_logs_subfolder=args.local_textract_document_logs_subfolder)
|
305 |
|
@@ -711,7 +725,7 @@ python cli_redact.py --task textract --textract_action list
|
|
711 |
s3_bucket_name=textract_bucket,
|
712 |
general_s3_bucket_name=args.s3_bucket,
|
713 |
local_output_dir=args.output_dir,
|
714 |
-
|
715 |
aws_region=args.aws_region
|
716 |
)
|
717 |
|
|
|
133 |
## Redact specific pages with AWS OCR and signature extraction:
|
134 |
python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --page_min 6 --page_max 7 --ocr_method "AWS Textract" --handwrite_signature_extraction "Extract handwriting" "Extract signatures"
|
135 |
|
136 |
+
## Redact with AWS OCR and additional extraction options:
|
137 |
+
python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --ocr_method "AWS Textract" --extract_forms --extract_tables --extract_layout
|
138 |
+
|
139 |
# Duplicate page detection
|
140 |
|
141 |
## Find duplicate pages in OCR files:
|
|
|
215 |
pdf_group.add_argument('--allow_list_file', default=ALLOW_LIST_PATH, help='Custom words file to recognize for redaction.')
|
216 |
pdf_group.add_argument('--redact_whole_page_file', default=WHOLE_PAGE_REDACTION_LIST_PATH, help='File for pages to redact completely.')
|
217 |
pdf_group.add_argument('--handwrite_signature_extraction', nargs='+', default=default_handwrite_signature_checkbox, help='Handwriting and signature extraction options. Choose from "Extract handwriting", "Extract signatures".')
|
218 |
+
pdf_group.add_argument('--extract_forms', action='store_true', help='Extract forms during Textract analysis.')
|
219 |
+
pdf_group.add_argument('--extract_tables', action='store_true', help='Extract tables during Textract analysis.')
|
220 |
+
pdf_group.add_argument('--extract_layout', action='store_true', help='Extract layout during Textract analysis.')
|
221 |
|
222 |
# --- Word/Tabular Anonymisation Arguments ---
|
223 |
tabular_group = parser.add_argument_group('Word/Tabular Anonymisation Options (.docx, .csv, .xlsx)')
|
|
|
285 |
if args.save_to_user_folders == "True": args.save_to_user_folders = True
|
286 |
else: args.save_to_user_folders = False
|
287 |
|
288 |
+
# Combine extraction options
|
289 |
+
extraction_options = list(args.handwrite_signature_extraction) if args.handwrite_signature_extraction else []
|
290 |
+
if args.extract_forms:
|
291 |
+
extraction_options.append('Extract forms')
|
292 |
+
if args.extract_tables:
|
293 |
+
extraction_options.append('Extract tables')
|
294 |
+
if args.extract_layout:
|
295 |
+
extraction_options.append('Extract layout')
|
296 |
+
args.handwrite_signature_extraction = extraction_options
|
297 |
+
|
298 |
if args.task in ['redact', 'deduplicate']:
|
299 |
if args.input_file:
|
300 |
if isinstance(args.input_file, str):
|
|
|
314 |
except Exception as e:
|
315 |
print(f"Warning: Could not initialise usage logger: {e}")
|
316 |
|
|
|
|
|
317 |
# Get username and folders
|
318 |
session_hash, args.output_dir, _, args.input_dir, args.textract_input_prefix, args.textract_output_prefix, args.s3_textract_document_logs_subfolder, args.local_textract_document_logs_subfolder = get_username_and_folders(username=args.username, output_folder_textbox=args.output_dir, input_folder_textbox=args.input_dir, session_output_folder=args.save_to_user_folders, textract_document_upload_input_folder=args.textract_input_prefix, textract_document_upload_output_folder=args.textract_output_prefix, s3_textract_document_logs_subfolder=args.s3_textract_document_logs_subfolder, local_textract_document_logs_subfolder=args.local_textract_document_logs_subfolder)
|
319 |
|
|
|
725 |
s3_bucket_name=textract_bucket,
|
726 |
general_s3_bucket_name=args.s3_bucket,
|
727 |
local_output_dir=args.output_dir,
|
728 |
+
handwrite_signature_checkbox=signature_options,
|
729 |
aws_region=args.aws_region
|
730 |
)
|
731 |
|
lambda_entrypoint.py
CHANGED
@@ -93,6 +93,9 @@ def lambda_handler(event, context):
|
|
93 |
'page_min': int(arguments.get('page_min', 0)),
|
94 |
'page_max': int(arguments.get('page_max', 0)),
|
95 |
'handwrite_signature_extraction': arguments.get('handwrite_signature_checkbox', ['Extract handwriting', 'Extract signatures']),
|
|
|
|
|
|
|
96 |
|
97 |
# General arguments
|
98 |
'local_redact_entities': arguments.get('local_redact_entities', []),
|
@@ -156,6 +159,16 @@ def lambda_handler(event, context):
|
|
156 |
'return_pdf_end_of_redaction': arguments.get('return_pdf_end_of_redaction', True)
|
157 |
}
|
158 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
159 |
# Download optional files if they are specified
|
160 |
allow_list_key = arguments.get('allow_list_file')
|
161 |
if allow_list_key:
|
|
|
93 |
'page_min': int(arguments.get('page_min', 0)),
|
94 |
'page_max': int(arguments.get('page_max', 0)),
|
95 |
'handwrite_signature_extraction': arguments.get('handwrite_signature_checkbox', ['Extract handwriting', 'Extract signatures']),
|
96 |
+
'extract_forms': arguments.get('extract_forms', False),
|
97 |
+
'extract_tables': arguments.get('extract_tables', False),
|
98 |
+
'extract_layout': arguments.get('extract_layout', False),
|
99 |
|
100 |
# General arguments
|
101 |
'local_redact_entities': arguments.get('local_redact_entities', []),
|
|
|
159 |
'return_pdf_end_of_redaction': arguments.get('return_pdf_end_of_redaction', True)
|
160 |
}
|
161 |
|
162 |
+
# Combine extraction options
|
163 |
+
extraction_options = list(cli_args['handwrite_signature_extraction']) if cli_args['handwrite_signature_extraction'] else []
|
164 |
+
if cli_args['extract_forms']:
|
165 |
+
extraction_options.append('Extract forms')
|
166 |
+
if cli_args['extract_tables']:
|
167 |
+
extraction_options.append('Extract tables')
|
168 |
+
if cli_args['extract_layout']:
|
169 |
+
extraction_options.append('Extract layout')
|
170 |
+
cli_args['handwrite_signature_extraction'] = extraction_options
|
171 |
+
|
172 |
# Download optional files if they are specified
|
173 |
allow_list_key = arguments.get('allow_list_file')
|
174 |
if allow_list_key:
|
tools/aws_textract.py
CHANGED
@@ -7,7 +7,7 @@ import pikepdf
|
|
7 |
import time
|
8 |
import pandas as pd
|
9 |
from tools.custom_image_analyser_engine import OCRResult, CustomImageRecognizerResult
|
10 |
-
from tools.config import AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION
|
11 |
|
12 |
def extract_textract_metadata(response:object):
|
13 |
"""Extracts metadata from an AWS Textract response."""
|
@@ -20,20 +20,69 @@ def extract_textract_metadata(response:object):
|
|
20 |
'Pages': pages
|
21 |
})
|
22 |
|
23 |
-
def analyse_page_with_textract(pdf_page_bytes:object, page_no:int, client:str="", handwrite_signature_checkbox:List[str]=["Extract handwriting",
|
24 |
'''
|
25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
'''
|
27 |
|
28 |
-
print("handwrite_signature_checkbox in analyse_page_with_textract:", handwrite_signature_checkbox)
|
29 |
if client == "":
|
30 |
try:
|
31 |
-
if
|
|
|
|
|
|
|
|
|
|
|
32 |
client = boto3.client('textract',
|
33 |
-
|
34 |
-
|
35 |
-
|
|
|
36 |
client = boto3.client('textract', region_name=AWS_REGION)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
except:
|
38 |
out_message = "Cannot connect to AWS Textract"
|
39 |
print(out_message)
|
@@ -41,15 +90,24 @@ def analyse_page_with_textract(pdf_page_bytes:object, page_no:int, client:str=""
|
|
41 |
return [], "" # Return an empty list and an empty string
|
42 |
|
43 |
# Redact signatures if specified
|
44 |
-
|
45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
try:
|
47 |
-
response = client.analyze_document(Document={'Bytes': pdf_page_bytes}, FeatureTypes=
|
48 |
except Exception as e:
|
49 |
print("Textract call failed due to:", e, "trying again in 3 seconds.")
|
50 |
time.sleep(3)
|
51 |
-
response = client.analyze_document(Document={'Bytes': pdf_page_bytes}, FeatureTypes=
|
52 |
-
|
|
|
|
|
53 |
# Call detect_document_text to extract plain text
|
54 |
try:
|
55 |
response = client.detect_document_text(Document={'Bytes': pdf_page_bytes})
|
@@ -98,16 +156,33 @@ def convert_pike_pdf_page_to_bytes(pdf:object, page_num:int):
|
|
98 |
|
99 |
def json_to_ocrresult(json_data:dict, page_width:float, page_height:float, page_no:int):
|
100 |
'''
|
101 |
-
Convert the json response from
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
102 |
'''
|
103 |
-
all_ocr_results =
|
104 |
-
signature_or_handwriting_recogniser_results =
|
105 |
-
signature_recogniser_results =
|
106 |
-
handwriting_recogniser_results =
|
107 |
-
signatures =
|
108 |
-
handwriting =
|
109 |
-
ocr_results_with_words =
|
110 |
-
text_block=
|
111 |
|
112 |
text_line_number = 1
|
113 |
|
|
|
7 |
import time
|
8 |
import pandas as pd
|
9 |
from tools.custom_image_analyser_engine import OCRResult, CustomImageRecognizerResult
|
10 |
+
from tools.config import AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION, RUN_AWS_FUNCTIONS, PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS
|
11 |
|
12 |
def extract_textract_metadata(response:object):
|
13 |
"""Extracts metadata from an AWS Textract response."""
|
|
|
20 |
'Pages': pages
|
21 |
})
|
22 |
|
23 |
+
def analyse_page_with_textract(pdf_page_bytes:object, page_no:int, client:str="", handwrite_signature_checkbox:List[str]=["Extract handwriting"], textract_output_found:bool=False, aws_access_key_textbox:str=AWS_ACCESS_KEY, aws_secret_key_textbox:str=AWS_SECRET_KEY, RUN_AWS_FUNCTIONS:str=RUN_AWS_FUNCTIONS, PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS:str=PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS):
|
24 |
'''
|
25 |
+
Analyzes a single page of a document using AWS Textract to extract text and other features.
|
26 |
+
|
27 |
+
Args:
|
28 |
+
pdf_page_bytes (object): The content of the PDF page or image as bytes.
|
29 |
+
page_no (int): The page number being analyzed.
|
30 |
+
client (str, optional): An optional pre-initialized AWS Textract client. If not provided,
|
31 |
+
the function will attempt to create one based on configuration.
|
32 |
+
Defaults to "".
|
33 |
+
handwrite_signature_checkbox (List[str], optional): A list of feature types to extract
|
34 |
+
from the document. Options include
|
35 |
+
"Extract handwriting", "Extract signatures",
|
36 |
+
"Extract forms", "Extract layout", "Extract tables".
|
37 |
+
Defaults to ["Extract handwriting"].
|
38 |
+
textract_output_found (bool, optional): A flag indicating whether existing Textract output
|
39 |
+
for the document has been found. This can prevent
|
40 |
+
unnecessary API calls. Defaults to False.
|
41 |
+
aws_access_key_textbox (str, optional): AWS access key provided by the user, if not using
|
42 |
+
SSO or environment variables. Defaults to AWS_ACCESS_KEY.
|
43 |
+
aws_secret_key_textbox (str, optional): AWS secret key provided by the user, if not using
|
44 |
+
SSO or environment variables. Defaults to AWS_SECRET_KEY.
|
45 |
+
RUN_AWS_FUNCTIONS (str, optional): Configuration flag (e.g., "1" or "0") to enable or
|
46 |
+
disable AWS functions. Defaults to RUN_AWS_FUNCTIONS.
|
47 |
+
PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS (str, optional): Configuration flag (e.g., "1" or "0")
|
48 |
+
to prioritize AWS SSO credentials
|
49 |
+
over environment variables.
|
50 |
+
Defaults to PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS.
|
51 |
+
|
52 |
+
Returns:
|
53 |
+
Tuple[List[Dict], str]: A tuple containing:
|
54 |
+
- A list of dictionaries, where each dictionary represents a Textract block (e.g., LINE, WORD, FORM, TABLE).
|
55 |
+
- A string containing metadata about the Textract request.
|
56 |
'''
|
57 |
|
58 |
+
#print("handwrite_signature_checkbox in analyse_page_with_textract:", handwrite_signature_checkbox)
|
59 |
if client == "":
|
60 |
try:
|
61 |
+
# Try to connect to AWS Textract Client if using that text extraction method
|
62 |
+
if RUN_AWS_FUNCTIONS == "1" and PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS == "1":
|
63 |
+
print("Connecting to Textract via existing SSO connection")
|
64 |
+
client = boto3.client('textract', region_name=AWS_REGION)
|
65 |
+
elif aws_access_key_textbox and aws_secret_key_textbox:
|
66 |
+
print("Connecting to Textract using AWS access key and secret keys from user input.")
|
67 |
client = boto3.client('textract',
|
68 |
+
aws_access_key_id=aws_access_key_textbox,
|
69 |
+
aws_secret_access_key=aws_secret_key_textbox, region_name=AWS_REGION)
|
70 |
+
elif RUN_AWS_FUNCTIONS == "1":
|
71 |
+
print("Connecting to Textract via existing SSO connection")
|
72 |
client = boto3.client('textract', region_name=AWS_REGION)
|
73 |
+
elif AWS_ACCESS_KEY and AWS_SECRET_KEY:
|
74 |
+
print("Getting Textract credentials from environment variables.")
|
75 |
+
client = boto3.client('textract',
|
76 |
+
aws_access_key_id=AWS_ACCESS_KEY,
|
77 |
+
aws_secret_access_key=AWS_SECRET_KEY, region_name=AWS_REGION)
|
78 |
+
elif textract_output_found==True:
|
79 |
+
print("Existing Textract data found for file, no need to connect to AWS Textract")
|
80 |
+
client = boto3.client('textract', region_name=AWS_REGION)
|
81 |
+
else:
|
82 |
+
client = ""
|
83 |
+
out_message = "Cannot connect to AWS Textract service."
|
84 |
+
print(out_message)
|
85 |
+
raise Exception(out_message)
|
86 |
except:
|
87 |
out_message = "Cannot connect to AWS Textract"
|
88 |
print(out_message)
|
|
|
90 |
return [], "" # Return an empty list and an empty string
|
91 |
|
92 |
# Redact signatures if specified
|
93 |
+
feature_types = []
|
94 |
+
if "Extract signatures" in handwrite_signature_checkbox or "Extract forms" in handwrite_signature_checkbox or "Extract layout" in handwrite_signature_checkbox or "Extract tables" in handwrite_signature_checkbox:
|
95 |
+
feature_types.append("SIGNATURES")
|
96 |
+
if "Extract forms" in handwrite_signature_checkbox:
|
97 |
+
feature_types.append("FORMS")
|
98 |
+
if "Extract layout" in handwrite_signature_checkbox:
|
99 |
+
feature_types.append("LAYOUT")
|
100 |
+
if "Extract tables" in handwrite_signature_checkbox:
|
101 |
+
feature_types.append("TABLES")
|
102 |
try:
|
103 |
+
response = client.analyze_document(Document={'Bytes': pdf_page_bytes}, FeatureTypes=feature_types)
|
104 |
except Exception as e:
|
105 |
print("Textract call failed due to:", e, "trying again in 3 seconds.")
|
106 |
time.sleep(3)
|
107 |
+
response = client.analyze_document(Document={'Bytes': pdf_page_bytes}, FeatureTypes=feature_types)
|
108 |
+
|
109 |
+
|
110 |
+
if not "Extract signatures" in handwrite_signature_checkbox and not "Extract forms" in handwrite_signature_checkbox and not "Extract layout" in handwrite_signature_checkbox and not "Extract tables" in handwrite_signature_checkbox:
|
111 |
# Call detect_document_text to extract plain text
|
112 |
try:
|
113 |
response = client.detect_document_text(Document={'Bytes': pdf_page_bytes})
|
|
|
156 |
|
157 |
def json_to_ocrresult(json_data:dict, page_width:float, page_height:float, page_no:int):
|
158 |
'''
|
159 |
+
Convert the json response from Textract to the OCRResult format used elsewhere in the code.
|
160 |
+
Looks for lines, words, and signatures. Handwriting and signatures are set aside especially
|
161 |
+
for later in case the user wants to override the default behaviour and redact all
|
162 |
+
handwriting/signatures.
|
163 |
+
|
164 |
+
Args:
|
165 |
+
json_data (dict): The raw JSON response from AWS Textract for a document or page.
|
166 |
+
page_width (float): The absolute width of the page in pixels.
|
167 |
+
page_height (float): The absolute height of the page in pixels.
|
168 |
+
page_no (int): The 1-based page number being processed.
|
169 |
+
|
170 |
+
Returns:
|
171 |
+
tuple: A tuple containing:
|
172 |
+
- dict: OCR results structured as an OCRResult object (containing 'page' and 'results' list).
|
173 |
+
- list: Bounding boxes identified as handwriting or signatures.
|
174 |
+
- list: Bounding boxes identified specifically as signatures.
|
175 |
+
- list: Bounding boxes identified specifically as handwriting.
|
176 |
+
- dict: OCR results with word-level detail, structured for further processing.
|
177 |
'''
|
178 |
+
all_ocr_results = list()
|
179 |
+
signature_or_handwriting_recogniser_results = list()
|
180 |
+
signature_recogniser_results = list()
|
181 |
+
handwriting_recogniser_results = list()
|
182 |
+
signatures = list()
|
183 |
+
handwriting = list()
|
184 |
+
ocr_results_with_words = dict()
|
185 |
+
text_block=dict()
|
186 |
|
187 |
text_line_number = 1
|
188 |
|
tools/config.py
CHANGED
@@ -5,10 +5,19 @@ import logging
|
|
5 |
from datetime import datetime
|
6 |
from dotenv import load_dotenv
|
7 |
from tldextract import TLDExtract
|
|
|
8 |
|
9 |
today_rev = datetime.now().strftime("%Y%m%d")
|
10 |
HOST_NAME = socket.gethostname()
|
11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
# Set or retrieve configuration variables for the redaction app
|
13 |
|
14 |
def get_or_create_env_var(var_name:str, default_value:str, print_val:bool=False):
|
@@ -297,6 +306,22 @@ CUSTOM_ENTITIES = get_or_create_env_var('CUSTOM_ENTITIES', "['TITLES', 'UKPOSTCO
|
|
297 |
|
298 |
DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX = get_or_create_env_var('DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX', "['Extract handwriting']")
|
299 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
300 |
DEFAULT_SEARCH_QUERY = get_or_create_env_var('DEFAULT_SEARCH_QUERY', '')
|
301 |
DEFAULT_FUZZY_SPELLING_MISTAKES_NUM = int(get_or_create_env_var('DEFAULT_FUZZY_SPELLING_MISTAKES_NUM', '1'))
|
302 |
|
@@ -309,6 +334,11 @@ DEFAULT_PAGE_MAX = int(get_or_create_env_var('DEFAULT_PAGE_MAX', '999'))
|
|
309 |
PAGE_BREAK_VALUE = int(get_or_create_env_var('PAGE_BREAK_VALUE', '99999'))
|
310 |
|
311 |
MAX_TIME_VALUE = int(get_or_create_env_var('MAX_TIME_VALUE', '999999'))
|
|
|
|
|
|
|
|
|
|
|
312 |
|
313 |
CUSTOM_BOX_COLOUR = get_or_create_env_var("CUSTOM_BOX_COLOUR", "") # only "grey" is currently supported as a custom box colour
|
314 |
|
|
|
5 |
from datetime import datetime
|
6 |
from dotenv import load_dotenv
|
7 |
from tldextract import TLDExtract
|
8 |
+
from typing import List
|
9 |
|
10 |
today_rev = datetime.now().strftime("%Y%m%d")
|
11 |
HOST_NAME = socket.gethostname()
|
12 |
|
13 |
+
def _get_env_list(env_var_name: str) -> List[str]:
|
14 |
+
"""Parses a comma-separated environment variable into a list of strings."""
|
15 |
+
value = env_var_name[1:-1].strip().replace('\"', '').replace("\'","")
|
16 |
+
if not value:
|
17 |
+
return []
|
18 |
+
# Split by comma and filter out any empty strings that might result from extra commas
|
19 |
+
return [s.strip() for s in value.split(',') if s.strip()]
|
20 |
+
|
21 |
# Set or retrieve configuration variables for the redaction app
|
22 |
|
23 |
def get_or_create_env_var(var_name:str, default_value:str, print_val:bool=False):
|
|
|
306 |
|
307 |
DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX = get_or_create_env_var('DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX', "['Extract handwriting']")
|
308 |
|
309 |
+
# Options offered in the handwriting/signature checkbox group. The raw value is a
# list-formatted string (e.g. "['Extract handwriting', 'Extract signatures']").
HANDWRITE_SIGNATURE_TEXTBOX_FULL_OPTIONS = get_or_create_env_var('HANDWRITE_SIGNATURE_TEXTBOX_FULL_OPTIONS', "['Extract handwriting', 'Extract signatures']")

# Always convert to a list (an empty value becomes []), so that the .append()
# calls below never run against a plain string.
HANDWRITE_SIGNATURE_TEXTBOX_FULL_OPTIONS = _get_env_list(HANDWRITE_SIGNATURE_TEXTBOX_FULL_OPTIONS)

# Optional extra Textract feature choices; each flag is compared against the literal string "True".
INCLUDE_FORM_EXTRACTION_TEXTRACT_OPTION = get_or_create_env_var('INCLUDE_FORM_EXTRACTION_TEXTRACT_OPTION', "False")
INCLUDE_LAYOUT_EXTRACTION_TEXTRACT_OPTION = get_or_create_env_var('INCLUDE_LAYOUT_EXTRACTION_TEXTRACT_OPTION', "False")
INCLUDE_TABLE_EXTRACTION_TEXTRACT_OPTION = get_or_create_env_var('INCLUDE_TABLE_EXTRACTION_TEXTRACT_OPTION', "False")

if INCLUDE_FORM_EXTRACTION_TEXTRACT_OPTION == "True":
    HANDWRITE_SIGNATURE_TEXTBOX_FULL_OPTIONS.append('Extract forms')
if INCLUDE_LAYOUT_EXTRACTION_TEXTRACT_OPTION == "True":
    HANDWRITE_SIGNATURE_TEXTBOX_FULL_OPTIONS.append('Extract layout')
if INCLUDE_TABLE_EXTRACTION_TEXTRACT_OPTION == "True":
    HANDWRITE_SIGNATURE_TEXTBOX_FULL_OPTIONS.append('Extract tables')
|
323 |
+
|
324 |
+
|
325 |
DEFAULT_SEARCH_QUERY = get_or_create_env_var('DEFAULT_SEARCH_QUERY', '')
|
326 |
DEFAULT_FUZZY_SPELLING_MISTAKES_NUM = int(get_or_create_env_var('DEFAULT_FUZZY_SPELLING_MISTAKES_NUM', '1'))
|
327 |
|
|
|
334 |
PAGE_BREAK_VALUE = int(get_or_create_env_var('PAGE_BREAK_VALUE', '99999'))
|
335 |
|
336 |
MAX_TIME_VALUE = int(get_or_create_env_var('MAX_TIME_VALUE', '999999'))
|
337 |
+
MAX_SIMULTANEOUS_FILES = int(get_or_create_env_var('MAX_SIMULTANEOUS_FILES', '10'))
|
338 |
+
MAX_DOC_PAGES = int(get_or_create_env_var('MAX_DOC_PAGES', '3000'))
|
339 |
+
MAX_TABLE_ROWS = int(get_or_create_env_var('MAX_TABLE_ROWS', '250000'))
|
340 |
+
MAX_TABLE_COLUMNS = int(get_or_create_env_var('MAX_TABLE_COLUMNS', '100'))
|
341 |
+
MAX_OPEN_TEXT_CHARACTERS = int(get_or_create_env_var('MAX_OPEN_TEXT_CHARACTERS', '50000'))
|
342 |
|
343 |
CUSTOM_BOX_COLOUR = get_or_create_env_var("CUSTOM_BOX_COLOUR", "") # only "grey" is currently supported as a custom box colour
|
344 |
|
tools/data_anonymise.py
CHANGED
@@ -18,7 +18,7 @@ from botocore.client import BaseClient
|
|
18 |
from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
|
19 |
from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, DictAnalyzerResult, RecognizerResult
|
20 |
from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
|
21 |
-
from tools.config import RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, OUTPUT_FOLDER, DEFAULT_LANGUAGE, aws_comprehend_language_choices, DO_INITIAL_TABULAR_DATA_CLEAN, CUSTOM_ENTITIES, PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS, AWS_REGION
|
22 |
from tools.helper_functions import get_file_name_without_type, read_file, detect_file_type, _get_env_list
|
23 |
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_word_list_recogniser, CustomWordFuzzyRecognizer, create_nlp_analyser, load_spacy_model
|
24 |
# Use custom version of analyze_dict to be able to track progress
|
@@ -261,6 +261,13 @@ def handle_docx_anonymisation(
|
|
261 |
text_elements = list() # This will store the actual docx objects (paragraphs, cells)
|
262 |
original_texts = list() # This will store the text from those objects
|
263 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
264 |
# Extract from paragraphs
|
265 |
for para in doc.paragraphs:
|
266 |
if para.text.strip(): # Only process non-empty paragraphs
|
@@ -464,6 +471,14 @@ def anonymise_files_with_open_text(file_paths: List[str],
|
|
464 |
else:
|
465 |
out_message = "Please enter text or a file to redact."
|
466 |
raise Exception(out_message)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
467 |
|
468 |
# If we have already redacted the last file, return the input out_message and file list to the relevant components
|
469 |
if latest_file_completed >= len(file_paths):
|
@@ -527,9 +542,7 @@ def anonymise_files_with_open_text(file_paths: List[str],
|
|
527 |
|
528 |
# Create xlsx file:
|
529 |
anon_xlsx = pd.ExcelFile(file_path)
|
530 |
-
anon_xlsx_export_file_name = output_folder + out_file_part + "_redacted.xlsx"
|
531 |
-
|
532 |
-
|
533 |
|
534 |
# Iterate through the sheet names
|
535 |
for sheet_name in progress.tqdm(in_excel_sheets, desc="Anonymising sheets", unit = "sheets"):
|
@@ -675,7 +688,20 @@ def tabular_anonymise_wrapper_func(
|
|
675 |
anon_df_part = anon_df[chosen_cols_in_anon_df]
|
676 |
anon_df_remain = anon_df.drop(chosen_cols_in_anon_df, axis = 1)
|
677 |
|
678 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
679 |
# Anonymise the selected columns
|
680 |
anon_df_part_out, key_string, decision_process_output_str, comprehend_query_number = anonymise_script(anon_df_part, anon_strategy, language, chosen_redact_entities, in_allow_list, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, nlp_analyser=nlp_analyser, do_initial_clean=do_initial_clean)
|
681 |
|
|
|
18 |
from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
|
19 |
from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, DictAnalyzerResult, RecognizerResult
|
20 |
from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
|
21 |
+
from tools.config import RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, OUTPUT_FOLDER, DEFAULT_LANGUAGE, aws_comprehend_language_choices, DO_INITIAL_TABULAR_DATA_CLEAN, CUSTOM_ENTITIES, PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS, AWS_REGION, MAX_TABLE_ROWS, MAX_TABLE_COLUMNS, MAX_SIMULTANEOUS_FILES
|
22 |
from tools.helper_functions import get_file_name_without_type, read_file, detect_file_type, _get_env_list
|
23 |
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_word_list_recogniser, CustomWordFuzzyRecognizer, create_nlp_analyser, load_spacy_model
|
24 |
# Use custom version of analyze_dict to be able to track progress
|
|
|
261 |
text_elements = list() # This will store the actual docx objects (paragraphs, cells)
|
262 |
original_texts = list() # This will store the text from those objects
|
263 |
|
264 |
+
paragraph_count = len(doc.paragraphs)
|
265 |
+
|
266 |
+
if paragraph_count > MAX_TABLE_ROWS:
|
267 |
+
out_message = f"Number of paragraphs in document is greater than {MAX_TABLE_ROWS}. Please submit a smaller document."
|
268 |
+
print(out_message)
|
269 |
+
raise Exception(out_message)
|
270 |
+
|
271 |
# Extract from paragraphs
|
272 |
for para in doc.paragraphs:
|
273 |
if para.text.strip(): # Only process non-empty paragraphs
|
|
|
471 |
else:
|
472 |
out_message = "Please enter text or a file to redact."
|
473 |
raise Exception(out_message)
|
474 |
+
|
475 |
+
if not isinstance(file_paths, list):
|
476 |
+
file_paths = [file_paths]
|
477 |
+
|
478 |
+
if len(file_paths) > MAX_SIMULTANEOUS_FILES:
|
479 |
+
out_message = f"Number of files to anonymise is greater than {MAX_SIMULTANEOUS_FILES}. Please submit a smaller number of files."
|
480 |
+
print(out_message)
|
481 |
+
raise Exception(out_message)
|
482 |
|
483 |
# If we have already redacted the last file, return the input out_message and file list to the relevant components
|
484 |
if latest_file_completed >= len(file_paths):
|
|
|
542 |
|
543 |
# Create xlsx file:
|
544 |
anon_xlsx = pd.ExcelFile(file_path)
|
545 |
+
anon_xlsx_export_file_name = output_folder + out_file_part + "_redacted.xlsx"
|
|
|
|
|
546 |
|
547 |
# Iterate through the sheet names
|
548 |
for sheet_name in progress.tqdm(in_excel_sheets, desc="Anonymising sheets", unit = "sheets"):
|
|
|
688 |
anon_df_part = anon_df[chosen_cols_in_anon_df]
|
689 |
anon_df_remain = anon_df.drop(chosen_cols_in_anon_df, axis = 1)
|
690 |
|
691 |
+
row_count = anon_df_part.shape[0]
|
692 |
+
|
693 |
+
if row_count > MAX_TABLE_ROWS:
|
694 |
+
out_message = f"Number of rows in dataframe is greater than {MAX_TABLE_ROWS}. Please submit a smaller dataframe."
|
695 |
+
print(out_message)
|
696 |
+
raise Exception(out_message)
|
697 |
+
|
698 |
+
column_count = anon_df_part.shape[1]
|
699 |
+
|
700 |
+
if column_count > MAX_TABLE_COLUMNS:
|
701 |
+
out_message = f"Number of columns in dataframe is greater than {MAX_TABLE_COLUMNS}. Please submit a smaller dataframe."
|
702 |
+
print(out_message)
|
703 |
+
raise Exception(out_message)
|
704 |
+
|
705 |
# Anonymise the selected columns
|
706 |
anon_df_part_out, key_string, decision_process_output_str, comprehend_query_number = anonymise_script(anon_df_part, anon_strategy, language, chosen_redact_entities, in_allow_list, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, nlp_analyser=nlp_analyser, do_initial_clean=do_initial_clean)
|
707 |
|
tools/file_redaction.py
CHANGED
@@ -20,7 +20,7 @@ import gradio as gr
|
|
20 |
from gradio import Progress
|
21 |
from collections import defaultdict # For efficient grouping
|
22 |
|
23 |
-
from tools.config import OUTPUT_FOLDER, IMAGES_DPI, MAX_IMAGE_PIXELS, RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION, PAGE_BREAK_VALUE, MAX_TIME_VALUE, LOAD_TRUNCATED_IMAGES, INPUT_FOLDER, RETURN_PDF_END_OF_REDACTION, TESSERACT_TEXT_EXTRACT_OPTION, SELECTABLE_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION, LOCAL_PII_OPTION, AWS_PII_OPTION, NO_REDACTION_PII_OPTION, DEFAULT_LANGUAGE, textract_language_choices, aws_comprehend_language_choices, CUSTOM_ENTITIES, PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS
|
24 |
from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult, run_page_text_redaction, recreate_page_line_level_ocr_results_with_page
|
25 |
from tools.file_conversion import convert_annotation_json_to_review_df, redact_whole_pymupdf_page, redact_single_box, is_pdf, is_pdf_or_image, prepare_image_or_pdf, divide_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, divide_coordinates_by_page_sizes, create_annotation_dicts_from_annotation_df, remove_duplicate_images_with_blank_boxes, fill_missing_ids, fill_missing_box_ids, load_and_convert_ocr_results_with_words_json, save_pdf_with_or_without_compression, word_level_ocr_output_to_dataframe
|
26 |
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_recogniser, custom_word_list_recogniser, CustomWordFuzzyRecognizer, load_spacy_model, download_tesseract_lang_pack, create_nlp_analyser
|
@@ -106,7 +106,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
106 |
page_min:int=0,
|
107 |
page_max:int=999,
|
108 |
estimated_time_taken_state:float=0.0,
|
109 |
-
handwrite_signature_checkbox:List[str]=list(["Extract handwriting"
|
110 |
all_request_metadata_str:str = "",
|
111 |
annotations_all_pages:List[dict]=list(),
|
112 |
all_page_line_level_ocr_results_df:pd.DataFrame=None,
|
@@ -273,6 +273,11 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
273 |
file_paths_list = [os.path.abspath(file_paths)]
|
274 |
else: file_paths_list = file_paths
|
275 |
|
|
|
|
|
|
|
|
|
|
|
276 |
valid_extensions = {".pdf", ".jpg", ".jpeg", ".png"}
|
277 |
# Filter only files with valid extensions. Currently only allowing one file to be redacted at a time
|
278 |
# Filter the file_paths_list to include only files with valid extensions
|
@@ -374,7 +379,12 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
374 |
|
375 |
page_sizes = page_sizes_df.to_dict(orient="records")
|
376 |
|
377 |
-
number_of_pages = pymupdf_doc.page_count
|
|
|
|
|
|
|
|
|
|
|
378 |
|
379 |
# If we have reached the last page, return message and outputs
|
380 |
if current_loop_page >= number_of_pages:
|
|
|
20 |
from gradio import Progress
|
21 |
from collections import defaultdict # For efficient grouping
|
22 |
|
23 |
+
from tools.config import OUTPUT_FOLDER, IMAGES_DPI, MAX_IMAGE_PIXELS, RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION, PAGE_BREAK_VALUE, MAX_TIME_VALUE, LOAD_TRUNCATED_IMAGES, INPUT_FOLDER, RETURN_PDF_END_OF_REDACTION, TESSERACT_TEXT_EXTRACT_OPTION, SELECTABLE_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION, LOCAL_PII_OPTION, AWS_PII_OPTION, NO_REDACTION_PII_OPTION, DEFAULT_LANGUAGE, textract_language_choices, aws_comprehend_language_choices, CUSTOM_ENTITIES, PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS, MAX_DOC_PAGES, MAX_SIMULTANEOUS_FILES
|
24 |
from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult, run_page_text_redaction, recreate_page_line_level_ocr_results_with_page
|
25 |
from tools.file_conversion import convert_annotation_json_to_review_df, redact_whole_pymupdf_page, redact_single_box, is_pdf, is_pdf_or_image, prepare_image_or_pdf, divide_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, divide_coordinates_by_page_sizes, create_annotation_dicts_from_annotation_df, remove_duplicate_images_with_blank_boxes, fill_missing_ids, fill_missing_box_ids, load_and_convert_ocr_results_with_words_json, save_pdf_with_or_without_compression, word_level_ocr_output_to_dataframe
|
26 |
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_recogniser, custom_word_list_recogniser, CustomWordFuzzyRecognizer, load_spacy_model, download_tesseract_lang_pack, create_nlp_analyser
|
|
|
106 |
page_min:int=0,
|
107 |
page_max:int=999,
|
108 |
estimated_time_taken_state:float=0.0,
|
109 |
+
handwrite_signature_checkbox:List[str]=list(["Extract handwriting"]),
|
110 |
all_request_metadata_str:str = "",
|
111 |
annotations_all_pages:List[dict]=list(),
|
112 |
all_page_line_level_ocr_results_df:pd.DataFrame=None,
|
|
|
273 |
file_paths_list = [os.path.abspath(file_paths)]
|
274 |
else: file_paths_list = file_paths
|
275 |
|
276 |
+
if len(file_paths_list) > MAX_SIMULTANEOUS_FILES:
|
277 |
+
out_message = f"Number of files to redact is greater than {MAX_SIMULTANEOUS_FILES}. Please submit a smaller number of files."
|
278 |
+
print(out_message)
|
279 |
+
raise Exception(out_message)
|
280 |
+
|
281 |
valid_extensions = {".pdf", ".jpg", ".jpeg", ".png"}
|
282 |
# Filter only files with valid extensions. Currently only allowing one file to be redacted at a time
|
283 |
# Filter the file_paths_list to include only files with valid extensions
|
|
|
379 |
|
380 |
page_sizes = page_sizes_df.to_dict(orient="records")
|
381 |
|
382 |
+
number_of_pages = pymupdf_doc.page_count
|
383 |
+
|
384 |
+
if number_of_pages > MAX_DOC_PAGES:
|
385 |
+
out_message = f"Number of pages in document is greater than {MAX_DOC_PAGES}. Please submit a smaller document."
|
386 |
+
print(out_message)
|
387 |
+
raise Exception(out_message)
|
388 |
|
389 |
# If we have reached the last page, return message and outputs
|
390 |
if current_loop_page >= number_of_pages:
|
tools/textract_batch_call.py
CHANGED
@@ -25,7 +25,7 @@ def analyse_document_with_textract_api(
|
|
25 |
job_df:pd.DataFrame,
|
26 |
s3_bucket_name: str = TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET,
|
27 |
local_output_dir: str = OUTPUT_FOLDER,
|
28 |
-
|
29 |
successful_job_number:int=0,
|
30 |
total_document_page_count:int=1,
|
31 |
general_s3_bucket_name: str = DOCUMENT_REDACTION_BUCKET,
|
@@ -43,7 +43,7 @@ def analyse_document_with_textract_api(
|
|
43 |
job_df (pd.DataFrame): Dataframe containing information from previous Textract API calls.
|
44 |
s3_bucket_name (str, optional): S3 bucket in which to save API call outputs.
|
45 |
local_output_dir (str, optional): Local directory to save the downloaded JSON results.
|
46 |
-
|
47 |
successful_job_number (int): The number of successful jobs that have been submitted in this session.
|
48 |
total_document_page_count (int): The number of pages in the document
|
49 |
aws_region (str, optional): AWS region name. Defaults to boto3 default region.
|
@@ -122,10 +122,10 @@ def analyse_document_with_textract_api(
|
|
122 |
if not job_df.empty:
|
123 |
|
124 |
if "file_name" in job_df.columns:
|
125 |
-
matching_job_id_file_names = job_df.loc[(job_df["file_name"] == pdf_filename) & (job_df["signature_extraction"].astype(str) == str(
|
126 |
-
matching_job_id_file_names_dates = job_df.loc[(job_df["file_name"] == pdf_filename) & (job_df["signature_extraction"].astype(str) == str(
|
127 |
-
matching_job_id = job_df.loc[(job_df["file_name"] == pdf_filename) & (job_df["signature_extraction"].astype(str) == str(
|
128 |
-
matching_handwrite_signature = job_df.loc[(job_df["file_name"] == pdf_filename) & (job_df["signature_extraction"].astype(str) == str(
|
129 |
|
130 |
if len(matching_job_id) > 0:
|
131 |
pass
|
@@ -142,7 +142,16 @@ def analyse_document_with_textract_api(
|
|
142 |
print(message)
|
143 |
|
144 |
try:
|
145 |
-
if "Extract signatures" in
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
146 |
response = textract_client.start_document_analysis(
|
147 |
DocumentLocation={
|
148 |
'S3Object': {
|
@@ -150,20 +159,15 @@ def analyse_document_with_textract_api(
|
|
150 |
'Name': s3_input_key
|
151 |
}
|
152 |
},
|
153 |
-
FeatureTypes=
|
154 |
OutputConfig={
|
155 |
'S3Bucket': s3_bucket_name,
|
156 |
'S3Prefix': s3_output_prefix
|
157 |
}
|
158 |
-
# Optional: Add NotificationChannel for SNS topic notifications
|
159 |
-
# NotificationChannel={
|
160 |
-
# 'SNSTopicArn': 'YOUR_SNS_TOPIC_ARN',
|
161 |
-
# 'RoleArn': 'YOUR_IAM_ROLE_ARN_FOR_TEXTRACT_TO_ACCESS_SNS'
|
162 |
-
# }
|
163 |
)
|
164 |
job_type="document_analysis"
|
165 |
|
166 |
-
|
167 |
response = textract_client.start_document_text_detection(
|
168 |
DocumentLocation={
|
169 |
'S3Object': {
|
@@ -190,7 +194,7 @@ def analyse_document_with_textract_api(
|
|
190 |
'job_id': job_id,
|
191 |
'file_name': pdf_filename,
|
192 |
'job_type': job_type,
|
193 |
-
'signature_extraction':
|
194 |
'job_date_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
|
195 |
}])
|
196 |
|
@@ -236,7 +240,21 @@ def return_job_status(job_id:str,
|
|
236 |
max_polling_attempts: int = 1 # ~10 minutes total wait time
|
237 |
):
|
238 |
'''
|
239 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
240 |
'''
|
241 |
|
242 |
job_status = response['JobStatus']
|
|
|
25 |
job_df:pd.DataFrame,
|
26 |
s3_bucket_name: str = TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET,
|
27 |
local_output_dir: str = OUTPUT_FOLDER,
|
28 |
+
handwrite_signature_checkbox:List[str] = list(),
|
29 |
successful_job_number:int=0,
|
30 |
total_document_page_count:int=1,
|
31 |
general_s3_bucket_name: str = DOCUMENT_REDACTION_BUCKET,
|
|
|
43 |
job_df (pd.DataFrame): Dataframe containing information from previous Textract API calls.
|
44 |
s3_bucket_name (str, optional): S3 bucket in which to save API call outputs.
|
45 |
local_output_dir (str, optional): Local directory to save the downloaded JSON results.
|
46 |
+
handwrite_signature_checkbox (List[str], optional): List of feature types to extract from the document.
|
47 |
successful_job_number (int): The number of successful jobs that have been submitted in this session.
|
48 |
total_document_page_count (int): The number of pages in the document
|
49 |
aws_region (str, optional): AWS region name. Defaults to boto3 default region.
|
|
|
122 |
if not job_df.empty:
|
123 |
|
124 |
if "file_name" in job_df.columns:
|
125 |
+
matching_job_id_file_names = job_df.loc[(job_df["file_name"] == pdf_filename) & (job_df["signature_extraction"].astype(str) == str(handwrite_signature_checkbox)), "file_name"]
|
126 |
+
matching_job_id_file_names_dates = job_df.loc[(job_df["file_name"] == pdf_filename) & (job_df["signature_extraction"].astype(str) == str(handwrite_signature_checkbox)), "job_date_time"]
|
127 |
+
matching_job_id = job_df.loc[(job_df["file_name"] == pdf_filename) & (job_df["signature_extraction"].astype(str) == str(handwrite_signature_checkbox)), "job_id"]
|
128 |
+
matching_handwrite_signature = job_df.loc[(job_df["file_name"] == pdf_filename) & (job_df["signature_extraction"].astype(str) == str(handwrite_signature_checkbox)), "signature_extraction"]
|
129 |
|
130 |
if len(matching_job_id) > 0:
|
131 |
pass
|
|
|
142 |
print(message)
|
143 |
|
144 |
try:
|
145 |
+
if "Extract signatures" in handwrite_signature_checkbox or "Extract forms" in handwrite_signature_checkbox or "Extract layout" in handwrite_signature_checkbox or "Extract tables" in handwrite_signature_checkbox:
|
146 |
+
feature_types = list()
|
147 |
+
if 'Extract signatures' in handwrite_signature_checkbox:
|
148 |
+
feature_types.append('SIGNATURES')
|
149 |
+
if "Extract forms" in handwrite_signature_checkbox:
|
150 |
+
feature_types.append('FORMS')
|
151 |
+
if "Extract layout" in handwrite_signature_checkbox:
|
152 |
+
feature_types.append('LAYOUT')
|
153 |
+
if "Extract tables" in handwrite_signature_checkbox:
|
154 |
+
feature_types.append('TABLES')
|
155 |
response = textract_client.start_document_analysis(
|
156 |
DocumentLocation={
|
157 |
'S3Object': {
|
|
|
159 |
'Name': s3_input_key
|
160 |
}
|
161 |
},
|
162 |
+
FeatureTypes=feature_types, # Analyze for signatures, forms, and tables
|
163 |
OutputConfig={
|
164 |
'S3Bucket': s3_bucket_name,
|
165 |
'S3Prefix': s3_output_prefix
|
166 |
}
|
|
|
|
|
|
|
|
|
|
|
167 |
)
|
168 |
job_type="document_analysis"
|
169 |
|
170 |
+
if not "Extract signatures" in handwrite_signature_checkbox and not "Extract forms" in handwrite_signature_checkbox and not "Extract layout" in handwrite_signature_checkbox and not "Extract tables" in handwrite_signature_checkbox:
|
171 |
response = textract_client.start_document_text_detection(
|
172 |
DocumentLocation={
|
173 |
'S3Object': {
|
|
|
194 |
'job_id': job_id,
|
195 |
'file_name': pdf_filename,
|
196 |
'job_type': job_type,
|
197 |
+
'signature_extraction':handwrite_signature_checkbox,
|
198 |
'job_date_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
|
199 |
}])
|
200 |
|
|
|
240 |
max_polling_attempts: int = 1 # ~10 minutes total wait time
|
241 |
):
|
242 |
'''
|
243 |
+
Polls the AWS Textract service to retrieve the current status of an asynchronous document analysis job.
|
244 |
+
This function checks the job status from the provided response and logs relevant information or errors.
|
245 |
+
|
246 |
+
Args:
|
247 |
+
job_id (str): The unique identifier of the Textract job.
|
248 |
+
response (dict): The response dictionary received from Textract's `get_document_analysis` or `get_document_text_detection` call.
|
249 |
+
attempts (int): The current polling attempt number.
|
250 |
+
poll_interval_seconds (int, optional): The time in seconds to wait before the next poll (currently unused in this function, but kept for context). Defaults to 0.
|
251 |
+
max_polling_attempts (int, optional): The maximum number of polling attempts allowed (currently unused in this function, but kept for context). Defaults to 1.
|
252 |
+
|
253 |
+
Returns:
|
254 |
+
str: The current status of the Textract job (e.g., 'IN_PROGRESS', 'SUCCEEDED').
|
255 |
+
|
256 |
+
Raises:
|
257 |
+
Exception: If the Textract job status is 'FAILED' or 'PARTIAL_SUCCESS', or if an unexpected status is encountered.
|
258 |
'''
|
259 |
|
260 |
job_status = response['JobStatus']
|