seanpedrickcase commited on
Commit
bce761b
·
1 Parent(s): 9d7cf92

Added possibility of changing model and entity types in config file

Browse files
app.py CHANGED
@@ -1,12 +1,10 @@
1
  import os
2
- import logging
3
  import pandas as pd
4
  import gradio as gr
5
  from gradio_image_annotation import image_annotator
6
-
7
- from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, TESSERACT_FOLDER, POPPLER_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE, OUTPUT_COST_CODES_PATH, OUTPUT_ALLOW_LIST_PATH, COGNITO_AUTH, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_ACCESS_LOG_HEADERS, CSV_ACCESS_LOG_HEADERS, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_FEEDBACK_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, USAGE_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_USAGE_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, TEXTRACT_JOBS_S3_INPUT_LOC, AWS_REGION
8
- from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, no_redaction_option, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select, check_for_existing_local_ocr_file, reset_data_vars, reset_aws_call_vars
9
- from tools.aws_functions import upload_file_to_s3, download_file_from_s3, upload_log_file_to_s3
10
  from tools.file_redaction import choose_and_run_redactor
11
  from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
12
  from tools.redaction_review import apply_redactions_to_review_df_and_files, update_all_page_annotation_object_based_on_previous_page, decrease_page, increase_page, update_annotator_object_and_filter_df, update_entities_df_recogniser_entities, update_entities_df_page, update_entities_df_text, df_select_callback_dataframe_row, convert_df_to_xfdf, convert_xfdf_to_dataframe, reset_dropdowns, exclude_selected_items_from_redaction, undo_last_removal, update_selected_review_df_row_colour, update_all_entity_df_dropdowns, df_select_callback_cost, update_other_annotator_number_from_current, update_annotator_page_from_review_df, df_select_callback_ocr, df_select_callback_textract_api, get_all_rows_with_same_text
@@ -20,30 +18,7 @@ from tools.textract_batch_call import analyse_document_with_textract_api, poll_w
20
  # Suppress downcasting warnings
21
  pd.set_option('future.no_silent_downcasting', True)
22
 
23
- chosen_comprehend_entities = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE', 'PASSPORT_NUMBER','DRIVER_ID', 'USERNAME','PASSWORD', 'IP_ADDRESS','MAC_ADDRESS', 'LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER', 'INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER']
24
-
25
- full_comprehend_entity_list = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE','SSN','DATE_TIME','PASSPORT_NUMBER','DRIVER_ID','URL','AGE','USERNAME','PASSWORD','AWS_ACCESS_KEY','AWS_SECRET_KEY','IP_ADDRESS','MAC_ADDRESS','ALL','LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER','CA_SOCIAL_INSURANCE_NUMBER','US_INDIVIDUAL_TAX_IDENTIFICATION_NUMBER','UK_UNIQUE_TAXPAYER_REFERENCE_NUMBER','IN_PERMANENT_ACCOUNT_NUMBER','IN_NREGA','INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER','CA_HEALTH_NUMBER','IN_AADHAAR','IN_VOTER_NUMBER', "CUSTOM_FUZZY"]
26
-
27
- # Add custom spacy recognisers to the Comprehend list, so that local Spacy model can be used to pick up e.g. titles, streetnames, UK postcodes that are sometimes missed by comprehend
28
- chosen_comprehend_entities.extend(custom_entities)
29
- full_comprehend_entity_list.extend(custom_entities)
30
-
31
- # Entities for local PII redaction option
32
- chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", "CUSTOM"]
33
-
34
- full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS', 'CUSTOM', 'CUSTOM_FUZZY']
35
-
36
- log_file_name = 'log.csv'
37
-
38
- file_input_height = 200
39
-
40
- if RUN_AWS_FUNCTIONS == "1":
41
- default_ocr_val = textract_option
42
- default_pii_detector = local_pii_detector
43
- else:
44
- default_ocr_val = text_ocr_option
45
- default_pii_detector = local_pii_detector
46
-
47
  SAVE_LOGS_TO_CSV = eval(SAVE_LOGS_TO_CSV)
48
  SAVE_LOGS_TO_DYNAMODB = eval(SAVE_LOGS_TO_DYNAMODB)
49
 
@@ -55,6 +30,17 @@ if DYNAMODB_ACCESS_LOG_HEADERS: DYNAMODB_ACCESS_LOG_HEADERS = eval(DYNAMODB_ACCE
55
  if DYNAMODB_FEEDBACK_LOG_HEADERS: DYNAMODB_FEEDBACK_LOG_HEADERS = eval(DYNAMODB_FEEDBACK_LOG_HEADERS)
56
  if DYNAMODB_USAGE_LOG_HEADERS: DYNAMODB_USAGE_LOG_HEADERS = eval(DYNAMODB_USAGE_LOG_HEADERS)
57
 
 
 
 
 
 
 
 
 
 
 
 
58
  # Create the gradio interface
59
  app = gr.Blocks(theme = gr.themes.Base(), fill_width=True)
60
 
@@ -66,8 +52,7 @@ with app:
66
 
67
  # Pymupdf doc and all image annotations objects need to be stored as State objects as they do not have a standard Gradio component equivalent
68
  pdf_doc_state = gr.State([])
69
- all_image_annotations_state = gr.State([])
70
-
71
 
72
  all_decision_process_table_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_decision_process_table", visible=False, type="pandas", wrap=True)
73
  review_file_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="review_file_df", visible=False, type="pandas", wrap=True)
@@ -105,11 +90,11 @@ with app:
105
  backup_recogniser_entity_dataframe_base = gr.Dataframe(visible=False)
106
 
107
  # Logging state
108
- feedback_logs_state = gr.Textbox(label= "feedback_logs_state", value=FEEDBACK_LOGS_FOLDER + log_file_name, visible=False)
109
  feedback_s3_logs_loc_state = gr.Textbox(label= "feedback_s3_logs_loc_state", value=FEEDBACK_LOGS_FOLDER, visible=False)
110
- access_logs_state = gr.Textbox(label= "access_logs_state", value=ACCESS_LOGS_FOLDER + log_file_name, visible=False)
111
  access_s3_logs_loc_state = gr.Textbox(label= "access_s3_logs_loc_state", value=ACCESS_LOGS_FOLDER, visible=False)
112
- usage_logs_state = gr.Textbox(label= "usage_logs_state", value=USAGE_LOGS_FOLDER + log_file_name, visible=False)
113
  usage_s3_logs_loc_state = gr.Textbox(label= "usage_s3_logs_loc_state", value=USAGE_LOGS_FOLDER, visible=False)
114
 
115
  session_hash_textbox = gr.Textbox(label= "session_hash_textbox", value="", visible=False)
@@ -172,8 +157,8 @@ with app:
172
  s3_whole_document_textract_input_subfolder = gr.Textbox(label = "Default Textract whole_document S3 input folder", value=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, visible=False)
173
  s3_whole_document_textract_output_subfolder = gr.Textbox(label = "Default Textract whole_document S3 output folder", value=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, visible=False)
174
  successful_textract_api_call_number = gr.Number(precision=0, value=0, visible=False)
175
- no_redaction_method_drop = gr.Radio(label = """Placeholder for no redaction method after downloading Textract outputs""", value = no_redaction_option, choices=[no_redaction_option], visible=False)
176
- textract_only_method_drop = gr.Radio(label="""Placeholder for Textract method after downloading Textract outputs""", value = textract_option, choices=[textract_option], visible=False)
177
 
178
  load_s3_whole_document_textract_logs_bool = gr.Textbox(label = "Load Textract logs or not", value=LOAD_PREVIOUS_TEXTRACT_JOBS_S3, visible=False)
179
  s3_whole_document_textract_logs_subfolder = gr.Textbox(label = "Default Textract whole_document S3 input folder", value=TEXTRACT_JOBS_S3_LOC, visible=False)
@@ -233,7 +218,7 @@ with app:
233
  job_output_textbox = gr.Textbox(value="", label="Textract call outputs", visible=False)
234
  job_input_textbox = gr.Textbox(value=TEXTRACT_JOBS_S3_INPUT_LOC, label="Textract call outputs", visible=False)
235
 
236
- textract_job_output_file = gr.File(label="Textract job output files", height=file_input_height, visible=False)
237
  convert_textract_outputs_to_ocr_results = gr.Button("Placeholder - Convert Textract job outputs to OCR results (needs relevant document file uploaded above)", variant="secondary", visible=False)
238
 
239
  ###
@@ -256,15 +241,15 @@ with app:
256
  ###
257
  with gr.Tab("Redact PDFs/images"):
258
  with gr.Accordion("Redact document", open = True):
259
- in_doc_files = gr.File(label="Choose a document or image file (PDF, JPG, PNG)", file_count= "multiple", file_types=['.pdf', '.jpg', '.png', '.json', '.zip'], height=file_input_height)
260
 
261
- text_extract_method_radio = gr.Radio(label="""Choose text extraction method. Local options are lower quality but cost nothing - they may be worth a try if you are willing to spend some time reviewing outputs. AWS Textract has a cost per page - £2.66 ($3.50) per 1,000 pages with signature detection (default), £1.14 ($1.50) without. Change the settings in the tab below (AWS Textract signature detection) to change this.""", value = default_ocr_val, choices=[text_ocr_option, tesseract_ocr_option, textract_option])
262
 
263
  with gr.Accordion("AWS Textract signature detection (default is on)", open = False):
264
  handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract extraction settings", choices=["Extract handwriting", "Extract signatures"], value=["Extract handwriting", "Extract signatures"])
265
 
266
  with gr.Row(equal_height=True):
267
- pii_identification_method_drop = gr.Radio(label = """Choose personal information detection method. The local model is lower quality but costs nothing - it may be worth a try if you are willing to spend some time reviewing outputs, or if you are only interested in searching for custom search terms (see Redaction settings - custom deny list). AWS Comprehend has a cost of around £0.0075 ($0.01) per 10,000 characters.""", value = default_pii_detector, choices=[no_redaction_option, local_pii_detector, aws_pii_detector])
268
 
269
  if SHOW_COSTS == "True":
270
  with gr.Accordion("Estimated costs and time taken. Note that costs shown only include direct usage of AWS services and do not include other running costs (e.g. storage, run-time costs)", open = True, visible=True):
@@ -311,7 +296,7 @@ with app:
311
 
312
  with gr.Row():
313
  redaction_output_summary_textbox = gr.Textbox(label="Output summary", scale=1)
314
- output_file = gr.File(label="Output files", scale = 2)#, height=file_input_height)
315
  latest_file_completed_text = gr.Number(value=0, label="Number of documents redacted", interactive=False, visible=False)
316
 
317
  # Feedback elements are invisible until revealed by redaction action
@@ -326,7 +311,7 @@ with app:
326
  with gr.Tab("Review redactions", id="tab_object_annotation"):
327
 
328
  with gr.Accordion(label = "Review PDF redactions", open=True):
329
- output_review_files = gr.File(label="Upload original PDF and 'review_file' csv here to review suggested redactions. The 'ocr_output' file can also be optionally provided for text search.", file_count='multiple', height=file_input_height)
330
  upload_previous_review_file_btn = gr.Button("Review redactions based on original PDF and 'review_file' csv provided above ('ocr_output' csv optional)", variant="secondary")
331
  with gr.Row():
332
  annotate_zoom_in = gr.Button("Zoom in", visible=False)
@@ -376,20 +361,18 @@ with app:
376
  recogniser_entity_dropdown = gr.Dropdown(label="Redaction category", value="ALL", allow_custom_value=True)
377
  page_entity_dropdown = gr.Dropdown(label="Page", value="ALL", allow_custom_value=True)
378
  text_entity_dropdown = gr.Dropdown(label="Text", value="ALL", allow_custom_value=True)
 
379
  recogniser_entity_dataframe = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[], "id":[]}), col_count=(4,"fixed"), type="pandas", label="Click table row to select and go to page", headers=["page", "label", "text", "id"], show_fullscreen_button=True, wrap=True, max_height=400, static_columns=[0,1,2,3])
380
 
381
  with gr.Row(equal_height=True):
382
  exclude_selected_btn = gr.Button(value="Exclude all redactions in table")
383
 
384
  with gr.Accordion("Selected redaction row", open=True):
385
- selected_entity_dataframe_row = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[], "id":[]}), col_count=4, type="pandas", visible=True, headers=["page", "label", "text", "id"], show_fullscreen_button=True, wrap=True)
386
  exclude_selected_row_btn = gr.Button(value="Exclude specific redaction row")
387
- exclude_text_with_same_as_selected_row_btn = gr.Button(value="Exclude all redactions with same text as selected row")
388
-
389
- with gr.Row(equal_height=True):
390
- reset_dropdowns_btn = gr.Button(value="Reset filters")
391
 
392
- undo_last_removal_btn = gr.Button(value="Undo last element removal")
393
 
394
  with gr.Accordion("Search all extracted text", open=True):
395
  all_line_level_ocr_results_df = gr.Dataframe(value=pd.DataFrame(), headers=["page", "text"], col_count=(2, 'fixed'), row_count = (0, "dynamic"), label="All OCR results", visible=True, type="pandas", wrap=True, show_fullscreen_button=True, show_search='filter', show_label=False, show_copy_button=True, max_height=400)
@@ -405,12 +388,12 @@ with app:
405
  ###
406
  with gr.Tab(label="Identify duplicate pages"):
407
  with gr.Accordion("Identify duplicate pages to redact", open = True):
408
- in_duplicate_pages = gr.File(label="Upload multiple 'ocr_output.csv' data files from redaction jobs here to compare", file_count="multiple", height=file_input_height, file_types=['.csv'])
409
  with gr.Row():
410
  duplicate_threshold_value = gr.Number(value=0.9, label="Minimum similarity to be considered a duplicate (maximum = 1)", scale =1)
411
  find_duplicate_pages_btn = gr.Button(value="Identify duplicate pages", variant="primary", scale = 4)
412
 
413
- duplicate_pages_out = gr.File(label="Duplicate pages analysis output", file_count="multiple", height=file_input_height, file_types=['.csv'])
414
 
415
  ###
416
  # TEXT / TABULAR DATA TAB
@@ -420,13 +403,13 @@ with app:
420
  with gr.Accordion("Redact open text", open = False):
421
  in_text = gr.Textbox(label="Enter open text", lines=10)
422
  with gr.Accordion("Upload xlsx or csv files", open = True):
423
- in_data_files = gr.File(label="Choose Excel or csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'], height=file_input_height)
424
 
425
  in_excel_sheets = gr.Dropdown(choices=["Choose Excel sheets to anonymise"], multiselect = True, label="Select Excel sheets that you want to anonymise (showing sheets present across all Excel files).", visible=False, allow_custom_value=True)
426
 
427
  in_colnames = gr.Dropdown(choices=["Choose columns to anonymise"], multiselect = True, label="Select columns that you want to anonymise (showing columns present across all files).")
428
 
429
- pii_identification_method_drop_tabular = gr.Radio(label = "Choose PII detection method. AWS Comprehend has a cost of approximately $0.01 per 10,000 characters.", value = default_pii_detector, choices=[local_pii_detector, aws_pii_detector])
430
 
431
  with gr.Accordion("Anonymisation output format", open = False):
432
  anon_strat = gr.Radio(choices=["replace with 'REDACTED'", "replace with <ENTITY_NAME>", "redact completely", "hash", "mask"], label="Select an anonymisation method.", value = "replace with 'REDACTED'") # , "encrypt", "fake_first_name" are also available, but are not currently included as not that useful in current form
@@ -452,13 +435,13 @@ with app:
452
  with gr.Accordion("Custom allow, deny, and full page redaction lists", open = True):
453
  with gr.Row():
454
  with gr.Column():
455
- in_allow_list = gr.File(label="Import allow list file - csv table with one column of a different word/phrase on each row (case insensitive). Terms in this file will not be redacted.", file_count="multiple", height=file_input_height)
456
  in_allow_list_text = gr.Textbox(label="Custom allow list load status")
457
  with gr.Column():
458
- in_deny_list = gr.File(label="Import custom deny list - csv table with one column of a different word/phrase on each row (case insensitive). Terms in this file will always be redacted.", file_count="multiple", height=file_input_height)
459
  in_deny_list_text = gr.Textbox(label="Custom deny list load status")
460
  with gr.Column():
461
- in_fully_redacted_list = gr.File(label="Import fully redacted pages list - csv table with one column of page numbers on each row. Page numbers in this file will be fully redacted.", file_count="multiple", height=file_input_height)
462
  in_fully_redacted_list_text = gr.Textbox(label="Fully redacted page list load status")
463
  with gr.Accordion("Manually modify custom allow, deny, and full page redaction lists (NOTE: you need to press Enter after modifying/adding an entry to the lists to apply them)", open = False):
464
  with gr.Row():
@@ -467,8 +450,8 @@ with app:
467
  in_fully_redacted_list_state = gr.Dataframe(value=pd.DataFrame(), headers=["fully_redacted_pages_list"], col_count=(1, "fixed"), row_count = (0, "dynamic"), label="Fully redacted pages", visible=True, type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, datatype='number', wrap=True)
468
 
469
  with gr.Accordion("Select entity types to redact", open = True):
470
- in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Local PII identification model (click empty space in box for full list)")
471
- in_redact_comprehend_entities = gr.Dropdown(value=chosen_comprehend_entities, choices=full_comprehend_entity_list, multiselect=True, label="AWS Comprehend PII identification model (click empty space in box for full list)")
472
 
473
  with gr.Row():
474
  max_fuzzy_spelling_mistakes_num = gr.Number(label="Maximum number of spelling mistakes allowed for fuzzy matching (CUSTOM_FUZZY entity).", value=1, minimum=0, maximum=9, precision=0)
@@ -767,7 +750,7 @@ with app:
767
 
768
  ### ACCESS LOGS
769
  # Log usernames and times of access to file (to know who is using the app when running on AWS)
770
- access_callback = CSVLogger_custom(dataset_file_name=log_file_name)
771
  access_callback.setup([session_hash_textbox, host_name_textbox], ACCESS_LOGS_FOLDER)
772
  session_hash_textbox.change(lambda *args: access_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=ACCESS_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_ACCESS_LOG_HEADERS, replacement_headers=CSV_ACCESS_LOG_HEADERS), [session_hash_textbox, host_name_textbox], None, preprocess=False).\
773
  success(fn = upload_log_file_to_s3, inputs=[access_logs_state, access_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
@@ -775,25 +758,25 @@ with app:
775
  ### FEEDBACK LOGS
776
  if DISPLAY_FILE_NAMES_IN_LOGS == 'True':
777
  # User submitted feedback for pdf redactions
778
- pdf_callback = CSVLogger_custom(dataset_file_name=log_file_name)
779
  pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], FEEDBACK_LOGS_FOLDER)
780
  pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_FEEDBACK_LOG_HEADERS, replacement_headers=CSV_FEEDBACK_LOG_HEADERS), [pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], None, preprocess=False).\
781
  success(fn = upload_log_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])
782
 
783
  # User submitted feedback for data redactions
784
- data_callback = CSVLogger_custom(dataset_file_name=log_file_name)
785
  data_callback.setup([data_feedback_radio, data_further_details_text, data_full_file_name_textbox], FEEDBACK_LOGS_FOLDER)
786
  data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_FEEDBACK_LOG_HEADERS, replacement_headers=CSV_FEEDBACK_LOG_HEADERS), [data_feedback_radio, data_further_details_text, data_full_file_name_textbox], None, preprocess=False).\
787
  success(fn = upload_log_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
788
  else:
789
  # User submitted feedback for pdf redactions
790
- pdf_callback = CSVLogger_custom(dataset_file_name=log_file_name)
791
  pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], FEEDBACK_LOGS_FOLDER)
792
  pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_FEEDBACK_LOG_HEADERS, replacement_headers=CSV_FEEDBACK_LOG_HEADERS), [pdf_feedback_radio, pdf_further_details_text, placeholder_doc_file_name_no_extension_textbox_for_logs], None, preprocess=False).\
793
  success(fn = upload_log_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])
794
 
795
  # User submitted feedback for data redactions
796
- data_callback = CSVLogger_custom(dataset_file_name=log_file_name)
797
  data_callback.setup([data_feedback_radio, data_further_details_text, data_full_file_name_textbox], FEEDBACK_LOGS_FOLDER)
798
  data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_FEEDBACK_LOG_HEADERS, replacement_headers=CSV_FEEDBACK_LOG_HEADERS), [data_feedback_radio, data_further_details_text, placeholder_data_file_name_no_extension_textbox_for_logs], None, preprocess=False).\
799
  success(fn = upload_log_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
@@ -801,7 +784,7 @@ with app:
801
  ### USAGE LOGS
802
  # Log processing usage - time taken for redaction queries, and also logs for queries to Textract/Comprehend
803
 
804
- usage_callback = CSVLogger_custom(dataset_file_name=log_file_name)
805
 
806
  if DISPLAY_FILE_NAMES_IN_LOGS == 'True':
807
  usage_callback.setup([session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], USAGE_LOGS_FOLDER)
@@ -839,7 +822,7 @@ if __name__ == "__main__":
839
 
840
  main(first_loop_state, latest_file_completed=0, redaction_output_summary_textbox="", output_file_list=None,
841
  log_files_list=None, estimated_time=0, textract_metadata="", comprehend_query_num=0,
842
- current_loop_page=0, page_break=False, pdf_doc_state = [], all_image_annotations = [], all_line_level_ocr_results_df = pd.DataFrame(), all_decision_process_table = pd.DataFrame(),chosen_comprehend_entities = chosen_comprehend_entities, chosen_redact_entities = chosen_redact_entities, handwrite_signature_checkbox = ["Extract handwriting", "Extract signatures"])
843
 
844
  # AWS options - placeholder for possibility of storing data on s3 and retrieving it in app
845
  # with gr.Tab(label="Advanced options"):
 
1
  import os
 
2
  import pandas as pd
3
  import gradio as gr
4
  from gradio_image_annotation import image_annotator
5
+ from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE, OUTPUT_COST_CODES_PATH, OUTPUT_ALLOW_LIST_PATH, COGNITO_AUTH, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_ACCESS_LOG_HEADERS, CSV_ACCESS_LOG_HEADERS, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_FEEDBACK_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, USAGE_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_USAGE_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, TEXTRACT_JOBS_S3_INPUT_LOC, TEXTRACT_TEXT_EXTRACT_OPTION, NO_REDACTION_PII_OPTION, TEXT_EXTRACTION_MODELS, PII_DETECTION_MODELS, DEFAULT_TEXT_EXTRACTION_MODEL, DEFAULT_PII_DETECTION_MODEL, LOG_FILE_NAME, CHOSEN_COMPREHEND_ENTITIES, FULL_COMPREHEND_ENTITY_LIST, CHOSEN_REDACT_ENTITIES, FULL_ENTITY_LIST, FILE_INPUT_HEIGHT, TABULAR_PII_DETECTION_MODELS
6
+ from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select, check_for_existing_local_ocr_file, reset_data_vars, reset_aws_call_vars
7
+ from tools.aws_functions import download_file_from_s3, upload_log_file_to_s3
 
8
  from tools.file_redaction import choose_and_run_redactor
9
  from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
10
  from tools.redaction_review import apply_redactions_to_review_df_and_files, update_all_page_annotation_object_based_on_previous_page, decrease_page, increase_page, update_annotator_object_and_filter_df, update_entities_df_recogniser_entities, update_entities_df_page, update_entities_df_text, df_select_callback_dataframe_row, convert_df_to_xfdf, convert_xfdf_to_dataframe, reset_dropdowns, exclude_selected_items_from_redaction, undo_last_removal, update_selected_review_df_row_colour, update_all_entity_df_dropdowns, df_select_callback_cost, update_other_annotator_number_from_current, update_annotator_page_from_review_df, df_select_callback_ocr, df_select_callback_textract_api, get_all_rows_with_same_text
 
18
  # Suppress downcasting warnings
19
  pd.set_option('future.no_silent_downcasting', True)
20
 
21
+ # Convert string environment variables to boolean or list types
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  SAVE_LOGS_TO_CSV = eval(SAVE_LOGS_TO_CSV)
23
  SAVE_LOGS_TO_DYNAMODB = eval(SAVE_LOGS_TO_DYNAMODB)
24
 
 
30
  if DYNAMODB_FEEDBACK_LOG_HEADERS: DYNAMODB_FEEDBACK_LOG_HEADERS = eval(DYNAMODB_FEEDBACK_LOG_HEADERS)
31
  if DYNAMODB_USAGE_LOG_HEADERS: DYNAMODB_USAGE_LOG_HEADERS = eval(DYNAMODB_USAGE_LOG_HEADERS)
32
 
33
+ if CHOSEN_COMPREHEND_ENTITIES: CHOSEN_COMPREHEND_ENTITIES = eval(CHOSEN_COMPREHEND_ENTITIES)
34
+ if FULL_COMPREHEND_ENTITY_LIST: FULL_COMPREHEND_ENTITY_LIST = eval(FULL_COMPREHEND_ENTITY_LIST)
35
+ if CHOSEN_REDACT_ENTITIES: CHOSEN_REDACT_ENTITIES = eval(CHOSEN_REDACT_ENTITIES)
36
+ if FULL_ENTITY_LIST: FULL_ENTITY_LIST = eval(FULL_ENTITY_LIST)
37
+
38
+ # Add custom spaCy recognisers to the Comprehend list, so that the local spaCy model can be used to pick up e.g. titles, street names, and UK postcodes that are sometimes missed by Comprehend
39
+ CHOSEN_COMPREHEND_ENTITIES.extend(custom_entities)
40
+ FULL_COMPREHEND_ENTITY_LIST.extend(custom_entities)
41
+
42
+ FILE_INPUT_HEIGHT = int(FILE_INPUT_HEIGHT)
43
+
44
  # Create the gradio interface
45
  app = gr.Blocks(theme = gr.themes.Base(), fill_width=True)
46
 
 
52
 
53
  # Pymupdf doc and all image annotations objects need to be stored as State objects as they do not have a standard Gradio component equivalent
54
  pdf_doc_state = gr.State([])
55
+ all_image_annotations_state = gr.State([])
 
56
 
57
  all_decision_process_table_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_decision_process_table", visible=False, type="pandas", wrap=True)
58
  review_file_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="review_file_df", visible=False, type="pandas", wrap=True)
 
90
  backup_recogniser_entity_dataframe_base = gr.Dataframe(visible=False)
91
 
92
  # Logging state
93
+ feedback_logs_state = gr.Textbox(label= "feedback_logs_state", value=FEEDBACK_LOGS_FOLDER + LOG_FILE_NAME, visible=False)
94
  feedback_s3_logs_loc_state = gr.Textbox(label= "feedback_s3_logs_loc_state", value=FEEDBACK_LOGS_FOLDER, visible=False)
95
+ access_logs_state = gr.Textbox(label= "access_logs_state", value=ACCESS_LOGS_FOLDER + LOG_FILE_NAME, visible=False)
96
  access_s3_logs_loc_state = gr.Textbox(label= "access_s3_logs_loc_state", value=ACCESS_LOGS_FOLDER, visible=False)
97
+ usage_logs_state = gr.Textbox(label= "usage_logs_state", value=USAGE_LOGS_FOLDER + LOG_FILE_NAME, visible=False)
98
  usage_s3_logs_loc_state = gr.Textbox(label= "usage_s3_logs_loc_state", value=USAGE_LOGS_FOLDER, visible=False)
99
 
100
  session_hash_textbox = gr.Textbox(label= "session_hash_textbox", value="", visible=False)
 
157
  s3_whole_document_textract_input_subfolder = gr.Textbox(label = "Default Textract whole_document S3 input folder", value=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, visible=False)
158
  s3_whole_document_textract_output_subfolder = gr.Textbox(label = "Default Textract whole_document S3 output folder", value=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, visible=False)
159
  successful_textract_api_call_number = gr.Number(precision=0, value=0, visible=False)
160
+ no_redaction_method_drop = gr.Radio(label = """Placeholder for no redaction method after downloading Textract outputs""", value = NO_REDACTION_PII_OPTION, choices=[NO_REDACTION_PII_OPTION], visible=False)
161
+ textract_only_method_drop = gr.Radio(label="""Placeholder for Textract method after downloading Textract outputs""", value = TEXTRACT_TEXT_EXTRACT_OPTION, choices=[TEXTRACT_TEXT_EXTRACT_OPTION], visible=False)
162
 
163
  load_s3_whole_document_textract_logs_bool = gr.Textbox(label = "Load Textract logs or not", value=LOAD_PREVIOUS_TEXTRACT_JOBS_S3, visible=False)
164
  s3_whole_document_textract_logs_subfolder = gr.Textbox(label = "Default Textract whole_document S3 input folder", value=TEXTRACT_JOBS_S3_LOC, visible=False)
 
218
  job_output_textbox = gr.Textbox(value="", label="Textract call outputs", visible=False)
219
  job_input_textbox = gr.Textbox(value=TEXTRACT_JOBS_S3_INPUT_LOC, label="Textract call outputs", visible=False)
220
 
221
+ textract_job_output_file = gr.File(label="Textract job output files", height=FILE_INPUT_HEIGHT, visible=False)
222
  convert_textract_outputs_to_ocr_results = gr.Button("Placeholder - Convert Textract job outputs to OCR results (needs relevant document file uploaded above)", variant="secondary", visible=False)
223
 
224
  ###
 
241
  ###
242
  with gr.Tab("Redact PDFs/images"):
243
  with gr.Accordion("Redact document", open = True):
244
+ in_doc_files = gr.File(label="Choose a document or image file (PDF, JPG, PNG)", file_count= "multiple", file_types=['.pdf', '.jpg', '.png', '.json', '.zip'], height=FILE_INPUT_HEIGHT)
245
 
246
+ text_extract_method_radio = gr.Radio(label="""Choose text extraction method. Local options are lower quality but cost nothing - they may be worth a try if you are willing to spend some time reviewing outputs. AWS Textract has a cost per page - £2.66 ($3.50) per 1,000 pages with signature detection (default), £1.14 ($1.50) without. Change the settings in the tab below (AWS Textract signature detection) to change this.""", value = DEFAULT_TEXT_EXTRACTION_MODEL, choices=TEXT_EXTRACTION_MODELS)
247
 
248
  with gr.Accordion("AWS Textract signature detection (default is on)", open = False):
249
  handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract extraction settings", choices=["Extract handwriting", "Extract signatures"], value=["Extract handwriting", "Extract signatures"])
250
 
251
  with gr.Row(equal_height=True):
252
+ pii_identification_method_drop = gr.Radio(label = """Choose personal information detection method. The local model is lower quality but costs nothing - it may be worth a try if you are willing to spend some time reviewing outputs, or if you are only interested in searching for custom search terms (see Redaction settings - custom deny list). AWS Comprehend has a cost of around £0.0075 ($0.01) per 10,000 characters.""", value = DEFAULT_PII_DETECTION_MODEL, choices=PII_DETECTION_MODELS)
253
 
254
  if SHOW_COSTS == "True":
255
  with gr.Accordion("Estimated costs and time taken. Note that costs shown only include direct usage of AWS services and do not include other running costs (e.g. storage, run-time costs)", open = True, visible=True):
 
296
 
297
  with gr.Row():
298
  redaction_output_summary_textbox = gr.Textbox(label="Output summary", scale=1)
299
+ output_file = gr.File(label="Output files", scale = 2)#, height=FILE_INPUT_HEIGHT)
300
  latest_file_completed_text = gr.Number(value=0, label="Number of documents redacted", interactive=False, visible=False)
301
 
302
  # Feedback elements are invisible until revealed by redaction action
 
311
  with gr.Tab("Review redactions", id="tab_object_annotation"):
312
 
313
  with gr.Accordion(label = "Review PDF redactions", open=True):
314
+ output_review_files = gr.File(label="Upload original PDF and 'review_file' csv here to review suggested redactions. The 'ocr_output' file can also be optionally provided for text search.", file_count='multiple', height=FILE_INPUT_HEIGHT)
315
  upload_previous_review_file_btn = gr.Button("Review redactions based on original PDF and 'review_file' csv provided above ('ocr_output' csv optional)", variant="secondary")
316
  with gr.Row():
317
  annotate_zoom_in = gr.Button("Zoom in", visible=False)
 
361
  recogniser_entity_dropdown = gr.Dropdown(label="Redaction category", value="ALL", allow_custom_value=True)
362
  page_entity_dropdown = gr.Dropdown(label="Page", value="ALL", allow_custom_value=True)
363
  text_entity_dropdown = gr.Dropdown(label="Text", value="ALL", allow_custom_value=True)
364
+ reset_dropdowns_btn = gr.Button(value="Reset filters")
365
  recogniser_entity_dataframe = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[], "id":[]}), col_count=(4,"fixed"), type="pandas", label="Click table row to select and go to page", headers=["page", "label", "text", "id"], show_fullscreen_button=True, wrap=True, max_height=400, static_columns=[0,1,2,3])
366
 
367
  with gr.Row(equal_height=True):
368
  exclude_selected_btn = gr.Button(value="Exclude all redactions in table")
369
 
370
  with gr.Accordion("Selected redaction row", open=True):
371
+ selected_entity_dataframe_row = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[], "id":[]}), col_count=4, type="pandas", visible=True, headers=["page", "label", "text", "id"], wrap=True)
372
  exclude_selected_row_btn = gr.Button(value="Exclude specific redaction row")
373
+ exclude_text_with_same_as_selected_row_btn = gr.Button(value="Exclude all redactions with same text as selected row")
 
 
 
374
 
375
+ undo_last_removal_btn = gr.Button(value="Undo last element removal", variant="primary")
376
 
377
  with gr.Accordion("Search all extracted text", open=True):
378
  all_line_level_ocr_results_df = gr.Dataframe(value=pd.DataFrame(), headers=["page", "text"], col_count=(2, 'fixed'), row_count = (0, "dynamic"), label="All OCR results", visible=True, type="pandas", wrap=True, show_fullscreen_button=True, show_search='filter', show_label=False, show_copy_button=True, max_height=400)
 
388
  ###
389
  with gr.Tab(label="Identify duplicate pages"):
390
  with gr.Accordion("Identify duplicate pages to redact", open = True):
391
+ in_duplicate_pages = gr.File(label="Upload multiple 'ocr_output.csv' data files from redaction jobs here to compare", file_count="multiple", height=FILE_INPUT_HEIGHT, file_types=['.csv'])
392
  with gr.Row():
393
  duplicate_threshold_value = gr.Number(value=0.9, label="Minimum similarity to be considered a duplicate (maximum = 1)", scale =1)
394
  find_duplicate_pages_btn = gr.Button(value="Identify duplicate pages", variant="primary", scale = 4)
395
 
396
+ duplicate_pages_out = gr.File(label="Duplicate pages analysis output", file_count="multiple", height=FILE_INPUT_HEIGHT, file_types=['.csv'])
397
 
398
  ###
399
  # TEXT / TABULAR DATA TAB
 
403
  with gr.Accordion("Redact open text", open = False):
404
  in_text = gr.Textbox(label="Enter open text", lines=10)
405
  with gr.Accordion("Upload xlsx or csv files", open = True):
406
+ in_data_files = gr.File(label="Choose Excel or csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'], height=FILE_INPUT_HEIGHT)
407
 
408
  in_excel_sheets = gr.Dropdown(choices=["Choose Excel sheets to anonymise"], multiselect = True, label="Select Excel sheets that you want to anonymise (showing sheets present across all Excel files).", visible=False, allow_custom_value=True)
409
 
410
  in_colnames = gr.Dropdown(choices=["Choose columns to anonymise"], multiselect = True, label="Select columns that you want to anonymise (showing columns present across all files).")
411
 
412
+ pii_identification_method_drop_tabular = gr.Radio(label = "Choose PII detection method. AWS Comprehend has a cost of approximately $0.01 per 10,000 characters.", value = DEFAULT_PII_DETECTION_MODEL, choices=TABULAR_PII_DETECTION_MODELS)
413
 
414
  with gr.Accordion("Anonymisation output format", open = False):
415
  anon_strat = gr.Radio(choices=["replace with 'REDACTED'", "replace with <ENTITY_NAME>", "redact completely", "hash", "mask"], label="Select an anonymisation method.", value = "replace with 'REDACTED'") # , "encrypt", "fake_first_name" are also available, but are not currently included as not that useful in current form
 
435
  with gr.Accordion("Custom allow, deny, and full page redaction lists", open = True):
436
  with gr.Row():
437
  with gr.Column():
438
+ in_allow_list = gr.File(label="Import allow list file - csv table with one column of a different word/phrase on each row (case insensitive). Terms in this file will not be redacted.", file_count="multiple", height=FILE_INPUT_HEIGHT)
439
  in_allow_list_text = gr.Textbox(label="Custom allow list load status")
440
  with gr.Column():
441
+ in_deny_list = gr.File(label="Import custom deny list - csv table with one column of a different word/phrase on each row (case insensitive). Terms in this file will always be redacted.", file_count="multiple", height=FILE_INPUT_HEIGHT)
442
  in_deny_list_text = gr.Textbox(label="Custom deny list load status")
443
  with gr.Column():
444
+ in_fully_redacted_list = gr.File(label="Import fully redacted pages list - csv table with one column of page numbers on each row. Page numbers in this file will be fully redacted.", file_count="multiple", height=FILE_INPUT_HEIGHT)
445
  in_fully_redacted_list_text = gr.Textbox(label="Fully redacted page list load status")
446
  with gr.Accordion("Manually modify custom allow, deny, and full page redaction lists (NOTE: you need to press Enter after modifying/adding an entry to the lists to apply them)", open = False):
447
  with gr.Row():
 
450
  in_fully_redacted_list_state = gr.Dataframe(value=pd.DataFrame(), headers=["fully_redacted_pages_list"], col_count=(1, "fixed"), row_count = (0, "dynamic"), label="Fully redacted pages", visible=True, type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, datatype='number', wrap=True)
451
 
452
  with gr.Accordion("Select entity types to redact", open = True):
453
+ in_redact_entities = gr.Dropdown(value=CHOSEN_REDACT_ENTITIES, choices=FULL_ENTITY_LIST, multiselect=True, label="Local PII identification model (click empty space in box for full list)")
454
+ in_redact_comprehend_entities = gr.Dropdown(value=CHOSEN_COMPREHEND_ENTITIES, choices=FULL_COMPREHEND_ENTITY_LIST, multiselect=True, label="AWS Comprehend PII identification model (click empty space in box for full list)")
455
 
456
  with gr.Row():
457
  max_fuzzy_spelling_mistakes_num = gr.Number(label="Maximum number of spelling mistakes allowed for fuzzy matching (CUSTOM_FUZZY entity).", value=1, minimum=0, maximum=9, precision=0)
 
750
 
751
  ### ACCESS LOGS
752
  # Log usernames and times of access to file (to know who is using the app when running on AWS)
753
+ access_callback = CSVLogger_custom(dataset_file_name=LOG_FILE_NAME)
754
  access_callback.setup([session_hash_textbox, host_name_textbox], ACCESS_LOGS_FOLDER)
755
  session_hash_textbox.change(lambda *args: access_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=ACCESS_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_ACCESS_LOG_HEADERS, replacement_headers=CSV_ACCESS_LOG_HEADERS), [session_hash_textbox, host_name_textbox], None, preprocess=False).\
756
  success(fn = upload_log_file_to_s3, inputs=[access_logs_state, access_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
 
758
  ### FEEDBACK LOGS
759
  if DISPLAY_FILE_NAMES_IN_LOGS == 'True':
760
  # User submitted feedback for pdf redactions
761
+ pdf_callback = CSVLogger_custom(dataset_file_name=LOG_FILE_NAME)
762
  pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], FEEDBACK_LOGS_FOLDER)
763
  pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_FEEDBACK_LOG_HEADERS, replacement_headers=CSV_FEEDBACK_LOG_HEADERS), [pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], None, preprocess=False).\
764
  success(fn = upload_log_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])
765
 
766
  # User submitted feedback for data redactions
767
+ data_callback = CSVLogger_custom(dataset_file_name=LOG_FILE_NAME)
768
  data_callback.setup([data_feedback_radio, data_further_details_text, data_full_file_name_textbox], FEEDBACK_LOGS_FOLDER)
769
  data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_FEEDBACK_LOG_HEADERS, replacement_headers=CSV_FEEDBACK_LOG_HEADERS), [data_feedback_radio, data_further_details_text, data_full_file_name_textbox], None, preprocess=False).\
770
  success(fn = upload_log_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
771
  else:
772
  # User submitted feedback for pdf redactions
773
+ pdf_callback = CSVLogger_custom(dataset_file_name=LOG_FILE_NAME)
774
  pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], FEEDBACK_LOGS_FOLDER)
775
  pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_FEEDBACK_LOG_HEADERS, replacement_headers=CSV_FEEDBACK_LOG_HEADERS), [pdf_feedback_radio, pdf_further_details_text, placeholder_doc_file_name_no_extension_textbox_for_logs], None, preprocess=False).\
776
  success(fn = upload_log_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])
777
 
778
  # User submitted feedback for data redactions
779
+ data_callback = CSVLogger_custom(dataset_file_name=LOG_FILE_NAME)
780
  data_callback.setup([data_feedback_radio, data_further_details_text, data_full_file_name_textbox], FEEDBACK_LOGS_FOLDER)
781
  data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_FEEDBACK_LOG_HEADERS, replacement_headers=CSV_FEEDBACK_LOG_HEADERS), [data_feedback_radio, data_further_details_text, placeholder_data_file_name_no_extension_textbox_for_logs], None, preprocess=False).\
782
  success(fn = upload_log_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
 
784
  ### USAGE LOGS
785
  # Log processing usage - time taken for redaction queries, and also logs for queries to Textract/Comprehend
786
 
787
+ usage_callback = CSVLogger_custom(dataset_file_name=LOG_FILE_NAME)
788
 
789
  if DISPLAY_FILE_NAMES_IN_LOGS == 'True':
790
  usage_callback.setup([session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], USAGE_LOGS_FOLDER)
 
822
 
823
  main(first_loop_state, latest_file_completed=0, redaction_output_summary_textbox="", output_file_list=None,
824
  log_files_list=None, estimated_time=0, textract_metadata="", comprehend_query_num=0,
825
+ current_loop_page=0, page_break=False, pdf_doc_state = [], all_image_annotations = [], all_line_level_ocr_results_df = pd.DataFrame(), all_decision_process_table = pd.DataFrame(),CHOSEN_COMPREHEND_ENTITIES = CHOSEN_COMPREHEND_ENTITIES, CHOSEN_REDACT_ENTITIES = CHOSEN_REDACT_ENTITIES, handwrite_signature_checkbox = ["Extract handwriting", "Extract signatures"])
826
 
827
  # AWS options - placeholder for possibility of storing data on s3 and retrieving it in app
828
  # with gr.Tab(label="Advanced options"):
tools/config.py CHANGED
@@ -204,7 +204,7 @@ if LOGGING == 'True':
204
  # Configure logging
205
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
206
 
207
-
208
 
209
 
210
  ###
@@ -218,6 +218,80 @@ POPPLER_FOLDER = get_or_create_env_var('POPPLER_FOLDER', "") # If installing on
218
  if TESSERACT_FOLDER: add_folder_to_path(TESSERACT_FOLDER)
219
  if POPPLER_FOLDER: add_folder_to_path(POPPLER_FOLDER)
220
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
221
 
222
  # Number of pages to loop through before breaking the function and restarting from the last finished page (not currently activated).
223
  PAGE_BREAK_VALUE = get_or_create_env_var('PAGE_BREAK_VALUE', '99999')
@@ -232,9 +306,6 @@ RETURN_PDF_END_OF_REDACTION = get_or_create_env_var("RETURN_PDF_END_OF_REDACTION
232
 
233
  COMPRESS_REDACTED_PDF = get_or_create_env_var("COMPRESS_REDACTED_PDF","False") # On low memory systems, the compression options in pymupdf can cause the app to crash if the PDF is longer than 500 pages or so. Setting this to False will save the PDF only with a basic cleaning option enabled
234
 
235
-
236
-
237
-
238
  ###
239
  # APP RUN OPTIONS
240
  ###
@@ -269,7 +340,7 @@ S3_ALLOW_LIST_PATH = get_or_create_env_var('S3_ALLOW_LIST_PATH', '') # default_a
269
  if ALLOW_LIST_PATH: OUTPUT_ALLOW_LIST_PATH = ALLOW_LIST_PATH
270
  else: OUTPUT_ALLOW_LIST_PATH = 'config/default_allow_list.csv'
271
 
272
-
273
 
274
 
275
  ###
 
204
  # Configure logging
205
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
206
 
207
+ LOG_FILE_NAME = get_or_create_env_var('LOG_FILE_NAME', 'log.csv')
208
 
209
 
210
  ###
 
218
  if TESSERACT_FOLDER: add_folder_to_path(TESSERACT_FOLDER)
219
  if POPPLER_FOLDER: add_folder_to_path(POPPLER_FOLDER)
220
 
221
+ # List of models to use for text extraction and PII detection
222
+ # Text extraction models
223
+ SELECTABLE_TEXT_EXTRACT_OPTION = get_or_create_env_var('SELECTABLE_TEXT_EXTRACT_OPTION', "Local model - selectable text")
224
+ TESSERACT_TEXT_EXTRACT_OPTION = get_or_create_env_var('TESSERACT_TEXT_EXTRACT_OPTION', "Local OCR model - PDFs without selectable text")
225
+ TEXTRACT_TEXT_EXTRACT_OPTION = get_or_create_env_var('TEXTRACT_TEXT_EXTRACT_OPTION', "AWS Textract service - all PDF types")
226
+
227
+ # PII detection models
228
+ NO_REDACTION_PII_OPTION = get_or_create_env_var('NO_REDACTION_PII_OPTION', "Only extract text (no redaction)")
229
+ LOCAL_PII_OPTION = get_or_create_env_var('LOCAL_PII_OPTION', "Local")
230
+ AWS_PII_OPTION = get_or_create_env_var('AWS_PII_OPTION', "AWS Comprehend")
231
+
232
+ SHOW_LOCAL_TEXT_EXTRACTION_OPTIONS = get_or_create_env_var('SHOW_LOCAL_TEXT_EXTRACTION_OPTIONS', 'True')
233
+ SHOW_AWS_TEXT_EXTRACTION_OPTIONS = get_or_create_env_var('SHOW_AWS_TEXT_EXTRACTION_OPTIONS', 'True')
234
+
235
+ # Fall back to showing the local options if all text extraction options have been mistakenly disabled
236
+ if SHOW_LOCAL_TEXT_EXTRACTION_OPTIONS != "True" and SHOW_AWS_TEXT_EXTRACTION_OPTIONS != "True":
237
+ SHOW_LOCAL_TEXT_EXTRACTION_OPTIONS = "True"
238
+
239
+ local_model_options = []
240
+ aws_model_options = []
241
+ text_extraction_models = []
242
+
243
+ if SHOW_LOCAL_TEXT_EXTRACTION_OPTIONS == 'True':
244
+ local_model_options.append(SELECTABLE_TEXT_EXTRACT_OPTION)
245
+ local_model_options.append(TESSERACT_TEXT_EXTRACT_OPTION)
246
+
247
+ if SHOW_AWS_TEXT_EXTRACTION_OPTIONS == 'True':
248
+ aws_model_options.append(TEXTRACT_TEXT_EXTRACT_OPTION)
249
+
250
+ TEXT_EXTRACTION_MODELS = local_model_options + aws_model_options
251
+
252
+ SHOW_LOCAL_PII_DETECTION_OPTIONS = get_or_create_env_var('SHOW_LOCAL_PII_DETECTION_OPTIONS', 'True')
253
+ SHOW_AWS_PII_DETECTION_OPTIONS = get_or_create_env_var('SHOW_AWS_PII_DETECTION_OPTIONS', 'True')
254
+
255
+ if SHOW_LOCAL_PII_DETECTION_OPTIONS != "True" and SHOW_AWS_PII_DETECTION_OPTIONS != "True":
256
+ SHOW_LOCAL_PII_DETECTION_OPTIONS = "True"
257
+
258
+ local_model_options = [NO_REDACTION_PII_OPTION]
259
+ aws_model_options = []
260
+ pii_detection_models = []
261
+
262
+ if SHOW_LOCAL_PII_DETECTION_OPTIONS == 'True':
263
+ local_model_options.append(LOCAL_PII_OPTION)
264
+
265
+ if SHOW_AWS_PII_DETECTION_OPTIONS == 'True':
266
+ aws_model_options.append(AWS_PII_OPTION)
267
+
268
+ PII_DETECTION_MODELS = local_model_options + aws_model_options
269
+
270
+ if SHOW_AWS_TEXT_EXTRACTION_OPTIONS == "True":
271
+ DEFAULT_TEXT_EXTRACTION_MODEL = get_or_create_env_var('DEFAULT_TEXT_EXTRACTION_MODEL', TEXTRACT_TEXT_EXTRACT_OPTION)
272
+ else:
273
+ DEFAULT_TEXT_EXTRACTION_MODEL = get_or_create_env_var('DEFAULT_TEXT_EXTRACTION_MODEL', SELECTABLE_TEXT_EXTRACT_OPTION)
274
+
275
+ if SHOW_AWS_PII_DETECTION_OPTIONS == "True":
276
+ DEFAULT_PII_DETECTION_MODEL = get_or_create_env_var('DEFAULT_PII_DETECTION_MODEL', AWS_PII_OPTION)
277
+ else:
278
+ DEFAULT_PII_DETECTION_MODEL = get_or_create_env_var('DEFAULT_PII_DETECTION_MODEL', LOCAL_PII_OPTION)
279
+
280
+ # Create list of PII detection models for tabular redaction
281
+ TABULAR_PII_DETECTION_MODELS = PII_DETECTION_MODELS.copy()
282
+ if NO_REDACTION_PII_OPTION in TABULAR_PII_DETECTION_MODELS:
283
+ TABULAR_PII_DETECTION_MODELS.remove(NO_REDACTION_PII_OPTION)
284
+
285
+ # Entities for redaction
286
+ CHOSEN_COMPREHEND_ENTITIES = get_or_create_env_var('CHOSEN_COMPREHEND_ENTITIES', "['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE', 'PASSPORT_NUMBER','DRIVER_ID', 'USERNAME','PASSWORD', 'IP_ADDRESS','MAC_ADDRESS', 'LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER', 'INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER']")
287
+
288
+ FULL_COMPREHEND_ENTITY_LIST = get_or_create_env_var('FULL_COMPREHEND_ENTITY_LIST', "['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE','SSN','DATE_TIME','PASSPORT_NUMBER','DRIVER_ID','URL','AGE','USERNAME','PASSWORD','AWS_ACCESS_KEY','AWS_SECRET_KEY','IP_ADDRESS','MAC_ADDRESS','ALL','LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER','CA_SOCIAL_INSURANCE_NUMBER','US_INDIVIDUAL_TAX_IDENTIFICATION_NUMBER','UK_UNIQUE_TAXPAYER_REFERENCE_NUMBER','IN_PERMANENT_ACCOUNT_NUMBER','IN_NREGA','INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER','CA_HEALTH_NUMBER','IN_AADHAAR','IN_VOTER_NUMBER', 'CUSTOM_FUZZY']")
289
+
290
+ # Entities for local PII redaction option
291
+ CHOSEN_REDACT_ENTITIES = get_or_create_env_var('CHOSEN_REDACT_ENTITIES', "['TITLES', 'PERSON', 'PHONE_NUMBER', 'EMAIL_ADDRESS', 'STREETNAME', 'UKPOSTCODE', 'CUSTOM']")
292
+
293
+ FULL_ENTITY_LIST = get_or_create_env_var('FULL_ENTITY_LIST', "['TITLES', 'PERSON', 'PHONE_NUMBER', 'EMAIL_ADDRESS', 'STREETNAME', 'UKPOSTCODE', 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS', 'CUSTOM', 'CUSTOM_FUZZY']")
294
+
295
 
296
  # Number of pages to loop through before breaking the function and restarting from the last finished page (not currently activated).
297
  PAGE_BREAK_VALUE = get_or_create_env_var('PAGE_BREAK_VALUE', '99999')
 
306
 
307
  COMPRESS_REDACTED_PDF = get_or_create_env_var("COMPRESS_REDACTED_PDF","False") # On low memory systems, the compression options in pymupdf can cause the app to crash if the PDF is longer than 500 pages or so. Setting this to False will save the PDF only with a basic cleaning option enabled
308
 
 
 
 
309
  ###
310
  # APP RUN OPTIONS
311
  ###
 
340
  if ALLOW_LIST_PATH: OUTPUT_ALLOW_LIST_PATH = ALLOW_LIST_PATH
341
  else: OUTPUT_ALLOW_LIST_PATH = 'config/default_allow_list.csv'
342
 
343
+ FILE_INPUT_HEIGHT = get_or_create_env_var('FILE_INPUT_HEIGHT', '200')
344
 
345
 
346
  ###
tools/data_anonymise.py CHANGED
@@ -6,20 +6,16 @@ import time
6
  import boto3
7
  import botocore
8
  import pandas as pd
9
- from openpyxl import Workbook, load_workbook
10
-
11
  from faker import Faker
12
  from gradio import Progress
13
  from typing import List, Dict, Any
14
-
15
  from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, DictAnalyzerResult, RecognizerResult
16
  from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
17
- from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
18
-
19
  from tools.config import RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, OUTPUT_FOLDER
20
  from tools.helper_functions import get_file_name_without_type, read_file, detect_file_type
21
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_word_list_recogniser, CustomWordFuzzyRecognizer, custom_entities
22
- from tools.custom_image_analyser_engine import do_aws_comprehend_call
23
  # Use custom version of analyze_dict to be able to track progress
24
  from tools.presidio_analyzer_custom import analyze_dict
25
 
@@ -28,7 +24,7 @@ fake = Faker("en_UK")
28
  def fake_first_name(x):
29
  return fake.first_name()
30
 
31
- def initial_clean(text):
32
  #### Some of my cleaning functions
33
  html_pattern_regex = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0|&nbsp;'
34
  html_start_pattern_end_dots_regex = r'<(.*?)\.\.'
@@ -49,7 +45,7 @@ def initial_clean(text):
49
 
50
  return text
51
 
52
- def process_recognizer_result(result, recognizer_result, data_row, dictionary_key, df_dict, keys_to_keep):
53
  output = []
54
 
55
  if hasattr(result, 'value'):
@@ -115,7 +111,7 @@ def generate_decision_process_output(analyzer_results: List[DictAnalyzerResult],
115
 
116
  return decision_process_output_str
117
 
118
- def anon_consistent_names(df):
119
  # ## Pick out common names and replace them with the same person value
120
  df_dict = df.to_dict(orient="list")
121
 
@@ -553,7 +549,19 @@ def anon_wrapper_func(
553
 
554
  return out_file_paths, out_message, key_string, log_files_output_paths
555
 
556
- def anonymise_script(df:pd.DataFrame, anon_strat:str, language:str, chosen_redact_entities:List[str], in_allow_list:List[str]=[], in_deny_list:List[str]=[], max_fuzzy_spelling_mistakes_num:int=0, pii_identification_method:str="Local", chosen_redact_comprehend_entities:List[str]=[], comprehend_query_number:int=0, comprehend_client:botocore.client.BaseClient="", custom_entities=custom_entities, progress=Progress(track_tqdm=False)):
 
 
 
 
 
 
 
 
 
 
 
 
557
  '''
558
  Conduct anonymisation of a dataframe using Presidio and/or AWS Comprehend if chosen.
559
  '''
 
6
  import boto3
7
  import botocore
8
  import pandas as pd
9
+ from openpyxl import Workbook
 
10
  from faker import Faker
11
  from gradio import Progress
12
  from typing import List, Dict, Any
13
+ from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
14
  from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, DictAnalyzerResult, RecognizerResult
15
  from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
 
 
16
  from tools.config import RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, OUTPUT_FOLDER
17
  from tools.helper_functions import get_file_name_without_type, read_file, detect_file_type
18
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_word_list_recogniser, CustomWordFuzzyRecognizer, custom_entities
 
19
  # Use custom version of analyze_dict to be able to track progress
20
  from tools.presidio_analyzer_custom import analyze_dict
21
 
 
24
  def fake_first_name(x):
25
  return fake.first_name()
26
 
27
+ def initial_clean(text:str) -> str:
28
  #### Some of my cleaning functions
29
  html_pattern_regex = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0|&nbsp;'
30
  html_start_pattern_end_dots_regex = r'<(.*?)\.\.'
 
45
 
46
  return text
47
 
48
+ def process_recognizer_result(result:RecognizerResult, recognizer_result:RecognizerResult, data_row:int, dictionary_key:int, df_dict:Dict[str, List[Any]], keys_to_keep:List[str]) -> List[str]:
49
  output = []
50
 
51
  if hasattr(result, 'value'):
 
111
 
112
  return decision_process_output_str
113
 
114
+ def anon_consistent_names(df:pd.DataFrame) -> pd.DataFrame:
115
  # ## Pick out common names and replace them with the same person value
116
  df_dict = df.to_dict(orient="list")
117
 
 
549
 
550
  return out_file_paths, out_message, key_string, log_files_output_paths
551
 
552
+ def anonymise_script(df:pd.DataFrame,
553
+ anon_strat:str,
554
+ language:str,
555
+ chosen_redact_entities:List[str],
556
+ in_allow_list:List[str]=[],
557
+ in_deny_list:List[str]=[],
558
+ max_fuzzy_spelling_mistakes_num:int=0,
559
+ pii_identification_method:str="Local",
560
+ chosen_redact_comprehend_entities:List[str]=[],
561
+ comprehend_query_number:int=0,
562
+ comprehend_client:botocore.client.BaseClient="",
563
+ custom_entities:List[str]=custom_entities,
564
+ progress:Progress=Progress(track_tqdm=False)):
565
  '''
566
  Conduct anonymisation of a dataframe using Presidio and/or AWS Comprehend if chosen.
567
  '''
tools/file_conversion.py CHANGED
@@ -1,5 +1,4 @@
1
  from pdf2image import convert_from_path, pdfinfo_from_path
2
-
3
  from PIL import Image, ImageFile
4
  import os
5
  import re
@@ -14,7 +13,7 @@ import zipfile
14
  from collections import defaultdict
15
  from tqdm import tqdm
16
  from gradio import Progress
17
- from typing import List, Optional, Dict, Any
18
  from concurrent.futures import ThreadPoolExecutor, as_completed
19
  from pdf2image import convert_from_path
20
  from PIL import Image
@@ -23,14 +22,14 @@ import random
23
  import string
24
  import warnings # To warn about potential type changes
25
 
 
 
 
 
26
  IMAGE_NUM_REGEX = re.compile(r'_(\d+)\.png$')
27
 
28
  pd.set_option('future.no_silent_downcasting', True)
29
 
30
- from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, IMAGES_DPI, LOAD_TRUNCATED_IMAGES, MAX_IMAGE_PIXELS, CUSTOM_BOX_COLOUR, COMPRESS_REDACTED_PDF
31
- from tools.helper_functions import get_file_name_without_type, tesseract_ocr_option, text_ocr_option, textract_option, read_file
32
- # from tools.aws_textract import load_and_convert_textract_json
33
-
34
  image_dpi = float(IMAGES_DPI)
35
  if not MAX_IMAGE_PIXELS: Image.MAX_IMAGE_PIXELS = None
36
  else: Image.MAX_IMAGE_PIXELS = MAX_IMAGE_PIXELS
@@ -596,8 +595,8 @@ def prepare_image_or_pdf(
596
 
597
  elif is_pdf_or_image(file_path): # Alternatively, if it's an image
598
  # Check if the file is an image type and the user selected text ocr option
599
- if file_extension in ['.jpg', '.jpeg', '.png'] and in_redact_method == text_ocr_option:
600
- in_redact_method = tesseract_ocr_option
601
 
602
  # Convert image to a pymupdf document
603
  pymupdf_doc = pymupdf.open() # Create a new empty document
@@ -765,13 +764,13 @@ def prepare_image_or_pdf(
765
 
766
  # Must be something else, return with error message
767
  else:
768
- if in_redact_method == tesseract_ocr_option or in_redact_method == textract_option:
769
  if is_pdf_or_image(file_path) == False:
770
  out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
771
  print(out_message)
772
  raise Exception(out_message)
773
 
774
- elif in_redact_method == text_ocr_option:
775
  if is_pdf(file_path) == False:
776
  out_message = "Please upload a PDF file for text analysis."
777
  print(out_message)
 
1
  from pdf2image import convert_from_path, pdfinfo_from_path
 
2
  from PIL import Image, ImageFile
3
  import os
4
  import re
 
13
  from collections import defaultdict
14
  from tqdm import tqdm
15
  from gradio import Progress
16
+ from typing import List, Dict, Any
17
  from concurrent.futures import ThreadPoolExecutor, as_completed
18
  from pdf2image import convert_from_path
19
  from PIL import Image
 
22
  import string
23
  import warnings # To warn about potential type changes
24
 
25
+ from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, IMAGES_DPI, LOAD_TRUNCATED_IMAGES, MAX_IMAGE_PIXELS, CUSTOM_BOX_COLOUR, COMPRESS_REDACTED_PDF, TESSERACT_TEXT_EXTRACT_OPTION, SELECTABLE_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION
26
+ from tools.helper_functions import get_file_name_without_type, read_file
27
+ # from tools.aws_textract import load_and_convert_textract_json
28
+
29
  IMAGE_NUM_REGEX = re.compile(r'_(\d+)\.png$')
30
 
31
  pd.set_option('future.no_silent_downcasting', True)
32
 
 
 
 
 
33
  image_dpi = float(IMAGES_DPI)
34
  if not MAX_IMAGE_PIXELS: Image.MAX_IMAGE_PIXELS = None
35
  else: Image.MAX_IMAGE_PIXELS = MAX_IMAGE_PIXELS
 
595
 
596
  elif is_pdf_or_image(file_path): # Alternatively, if it's an image
597
  # Check if the file is an image type and the user selected text ocr option
598
+ if file_extension in ['.jpg', '.jpeg', '.png'] and in_redact_method == SELECTABLE_TEXT_EXTRACT_OPTION:
599
+ in_redact_method = TESSERACT_TEXT_EXTRACT_OPTION
600
 
601
  # Convert image to a pymupdf document
602
  pymupdf_doc = pymupdf.open() # Create a new empty document
 
764
 
765
  # Must be something else, return with error message
766
  else:
767
+ if in_redact_method == TESSERACT_TEXT_EXTRACT_OPTION or in_redact_method == TEXTRACT_TEXT_EXTRACT_OPTION:
768
  if is_pdf_or_image(file_path) == False:
769
  out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
770
  print(out_message)
771
  raise Exception(out_message)
772
 
773
+ elif in_redact_method == SELECTABLE_TEXT_EXTRACT_OPTION:
774
  if is_pdf(file_path) == False:
775
  out_message = "Please upload a PDF file for text analysis."
776
  print(out_message)
tools/file_redaction.py CHANGED
@@ -19,11 +19,11 @@ import gradio as gr
19
  from gradio import Progress
20
  from collections import defaultdict # For efficient grouping
21
 
22
- from tools.config import OUTPUT_FOLDER, IMAGES_DPI, MAX_IMAGE_PIXELS, RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION, PAGE_BREAK_VALUE, MAX_TIME_VALUE, LOAD_TRUNCATED_IMAGES, INPUT_FOLDER, RETURN_PDF_END_OF_REDACTION
23
- from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult, run_page_text_redaction, merge_text_bounding_boxes, recreate_page_line_level_ocr_results_with_page
24
- from tools.file_conversion import convert_annotation_json_to_review_df, redact_whole_pymupdf_page, redact_single_box, convert_pymupdf_to_image_coords, is_pdf, is_pdf_or_image, prepare_image_or_pdf, divide_coordinates_by_page_sizes, multiply_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, divide_coordinates_by_page_sizes, create_annotation_dicts_from_annotation_df, remove_duplicate_images_with_blank_boxes, fill_missing_ids, fill_missing_box_ids, load_and_convert_ocr_results_with_words_json, save_pdf_with_or_without_compression
25
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser, CustomWordFuzzyRecognizer
26
- from tools.helper_functions import get_file_name_without_type, clean_unicode_text, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, no_redaction_option
27
  from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult, load_and_convert_textract_json
28
 
29
  ImageFile.LOAD_TRUNCATED_IMAGES = LOAD_TRUNCATED_IMAGES.lower() == "true"
@@ -242,7 +242,7 @@ def choose_and_run_redactor(file_paths:List[str],
242
  combined_out_message = combined_out_message + end_message
243
 
244
  # Only send across review file if redaction has been done
245
- if pii_identification_method != no_redaction_option:
246
 
247
  if len(review_out_file_paths) == 1:
248
  #review_file_path = [x for x in out_file_paths if "review_file" in x]
@@ -262,12 +262,12 @@ def choose_and_run_redactor(file_paths:List[str],
262
  # Prepare documents and images as required if they don't already exist
263
  prepare_images_flag = None # Determines whether to call prepare_image_or_pdf
264
 
265
- if textract_output_found and text_extraction_method == textract_option:
266
  print("Existing Textract outputs found, not preparing images or documents.")
267
  prepare_images_flag = False
268
  #return # No need to call `prepare_image_or_pdf`, exit early
269
 
270
- elif text_extraction_method == text_ocr_option:
271
  print("Running text extraction analysis, not preparing images.")
272
  prepare_images_flag = False
273
 
@@ -316,7 +316,7 @@ def choose_and_run_redactor(file_paths:List[str],
316
  combined_out_message = combined_out_message + "\n" + out_message
317
 
318
  # Only send across review file if redaction has been done
319
- if pii_identification_method != no_redaction_option:
320
  # If only pdf currently in review outputs, add on the latest review file
321
  if len(review_out_file_paths) == 1:
322
  #review_file_path = [x for x in out_file_paths if "review_file" in x]
@@ -361,7 +361,7 @@ def choose_and_run_redactor(file_paths:List[str],
361
 
362
 
363
  # Try to connect to AWS services directly only if RUN_AWS_FUNCTIONS environmental variable is 1, otherwise an environment variable or direct textbox input is needed.
364
- if pii_identification_method == aws_pii_detector:
365
  if aws_access_key_textbox and aws_secret_key_textbox:
366
  print("Connecting to Comprehend using AWS access key and secret keys from user input.")
367
  comprehend_client = boto3.client('comprehend',
@@ -384,7 +384,7 @@ def choose_and_run_redactor(file_paths:List[str],
384
  comprehend_client = ""
385
 
386
  # Try to connect to AWS Textract Client if using that text extraction method
387
- if text_extraction_method == textract_option:
388
  if aws_access_key_textbox and aws_secret_key_textbox:
389
  print("Connecting to Textract using AWS access key and secret keys from user input.")
390
  textract_client = boto3.client('textract',
@@ -429,10 +429,10 @@ def choose_and_run_redactor(file_paths:List[str],
429
  pdf_file_name_with_ext = os.path.basename(file_path)
430
 
431
  is_a_pdf = is_pdf(file_path) == True
432
- if is_a_pdf == False and text_extraction_method == text_ocr_option:
433
  # If user has not submitted a pdf, assume it's an image
434
  print("File is not a PDF, assuming that image analysis needs to be used.")
435
- text_extraction_method = tesseract_ocr_option
436
  else:
437
  out_message = "No file selected"
438
  print(out_message)
@@ -443,7 +443,7 @@ def choose_and_run_redactor(file_paths:List[str],
443
  review_file_path = orig_pdf_file_path + '_review_file.csv'
444
 
445
  # Remove any existing review_file paths from the review file outputs
446
- if text_extraction_method == tesseract_ocr_option or text_extraction_method == textract_option:
447
 
448
  #Analyse and redact image-based pdf or image
449
  if is_pdf_or_image(file_path) == False:
@@ -490,7 +490,7 @@ def choose_and_run_redactor(file_paths:List[str],
490
  all_textract_request_metadata.extend(new_textract_request_metadata)
491
 
492
 
493
- elif text_extraction_method == text_ocr_option:
494
 
495
  if is_pdf(file_path) == False:
496
  out_message = "Please upload a PDF file for text analysis. If you have an image, select 'Image analysis'."
@@ -541,7 +541,7 @@ def choose_and_run_redactor(file_paths:List[str],
541
 
542
 
543
  # Save redacted file
544
- if pii_identification_method != no_redaction_option:
545
  if RETURN_PDF_END_OF_REDACTION == True:
546
  progress(0.9, "Saving redacted file")
547
 
@@ -589,7 +589,7 @@ def choose_and_run_redactor(file_paths:List[str],
589
 
590
  review_file_state.to_csv(review_file_path, index=None)
591
 
592
- if pii_identification_method != no_redaction_option:
593
  out_file_paths.append(review_file_path)
594
 
595
  # Make a combined message for the file
@@ -1249,7 +1249,7 @@ def redact_image_pdf(file_path:str,
1249
  allow_list:List[str]=None,
1250
  page_min:int=0,
1251
  page_max:int=999,
1252
- text_extraction_method:str=tesseract_ocr_option,
1253
  handwrite_signature_checkbox:List[str]=["Extract handwriting", "Extract signatures"],
1254
  textract_request_metadata:list=[],
1255
  current_loop_page:int=0,
@@ -1287,7 +1287,7 @@ def redact_image_pdf(file_path:str,
1287
  - allow_list (List[str], optional): A list of entity types to allow in the PDF. Defaults to None.
1288
  - page_min (int, optional): The minimum page number to start redaction from. Defaults to 0.
1289
  - page_max (int, optional): The maximum page number to end redaction at. Defaults to 999.
1290
- - text_extraction_method (str, optional): The type of analysis to perform on the PDF. Defaults to tesseract_ocr_option.
1291
  - handwrite_signature_checkbox (List[str], optional): A list of options for redacting handwriting and signatures. Defaults to ["Extract handwriting", "Extract signatures"].
1292
  - textract_request_metadata (list, optional): Metadata related to the redaction request. Defaults to an empty string.
1293
  - page_break_return (bool, optional): Indicates if the function should return after a page break. Defaults to False.
@@ -1336,7 +1336,7 @@ def redact_image_pdf(file_path:str,
1336
  print(out_message)
1337
  raise Exception(out_message)
1338
 
1339
- if text_extraction_method == textract_option and textract_client == "":
1340
  out_message_warning = "Connection to AWS Textract service unsuccessful. Redaction will only continue if local AWS Textract results can be found."
1341
  print(out_message_warning)
1342
  #raise Exception(out_message)
@@ -1353,7 +1353,7 @@ def redact_image_pdf(file_path:str,
1353
  print("Page range:", str(page_min + 1), "to", str(page_max))
1354
 
1355
  # If running Textract, check if file already exists. If it does, load in existing data
1356
- if text_extraction_method == textract_option:
1357
  textract_json_file_path = output_folder + file_name + "_textract.json"
1358
  textract_data, is_missing, log_files_output_paths = load_and_convert_textract_json(textract_json_file_path, log_files_output_paths, page_sizes_df)
1359
  original_textract_data = textract_data.copy()
@@ -1361,7 +1361,7 @@ def redact_image_pdf(file_path:str,
1361
  print("Successfully loaded in Textract analysis results from file")
1362
 
1363
  # If running local OCR option, check if file already exists. If it does, load in existing data
1364
- if text_extraction_method == tesseract_ocr_option:
1365
  all_page_line_level_ocr_results_with_words_json_file_path = output_folder + file_name + "_ocr_results_with_words.json"
1366
  all_page_line_level_ocr_results_with_words, is_missing, log_files_output_paths = load_and_convert_ocr_results_with_words_json(all_page_line_level_ocr_results_with_words_json_file_path, log_files_output_paths, page_sizes_df)
1367
  original_all_page_line_level_ocr_results_with_words = all_page_line_level_ocr_results_with_words.copy()
@@ -1428,7 +1428,7 @@ def redact_image_pdf(file_path:str,
1428
  # Step 1: Perform OCR. Either with Tesseract, or with AWS Textract
1429
 
1430
  # If using Tesseract
1431
- if text_extraction_method == tesseract_ocr_option:
1432
 
1433
  if all_page_line_level_ocr_results_with_words:
1434
  # Find the first dict where 'page' matches
@@ -1452,7 +1452,7 @@ def redact_image_pdf(file_path:str,
1452
  all_page_line_level_ocr_results_with_words.append(page_line_level_ocr_results_with_words)
1453
 
1454
  # Check if page exists in existing textract data. If not, send to service to analyse
1455
- if text_extraction_method == textract_option:
1456
  text_blocks = []
1457
 
1458
  if not textract_data:
@@ -1527,7 +1527,7 @@ def redact_image_pdf(file_path:str,
1527
 
1528
  all_line_level_ocr_results_df_list.append(line_level_ocr_results_df)
1529
 
1530
- if pii_identification_method != no_redaction_option:
1531
  # Step 2: Analyse text and identify PII
1532
  if chosen_redact_entities or chosen_redact_comprehend_entities:
1533
 
@@ -1667,7 +1667,7 @@ def redact_image_pdf(file_path:str,
1667
  annotations_all_pages.append(page_image_annotations)
1668
 
1669
 
1670
- if text_extraction_method == textract_option:
1671
  if original_textract_data != textract_data:
1672
  # Write the updated existing textract data back to the JSON file
1673
  with open(textract_json_file_path, 'w') as json_file:
@@ -1676,7 +1676,7 @@ def redact_image_pdf(file_path:str,
1676
  if textract_json_file_path not in log_files_output_paths:
1677
  log_files_output_paths.append(textract_json_file_path)
1678
 
1679
- if text_extraction_method == tesseract_ocr_option:
1680
  if original_all_page_line_level_ocr_results_with_words != all_page_line_level_ocr_results_with_words:
1681
  # Write the updated existing textract data back to the JSON file
1682
  with open(all_page_line_level_ocr_results_with_words_json_file_path, 'w') as json_file:
@@ -1715,7 +1715,7 @@ def redact_image_pdf(file_path:str,
1715
  progress.close(_tqdm=progress_bar)
1716
  tqdm._instances.clear()
1717
 
1718
- if text_extraction_method == textract_option:
1719
  # Write the updated existing textract data back to the JSON file
1720
  if original_textract_data != textract_data:
1721
  with open(textract_json_file_path, 'w') as json_file:
@@ -1724,7 +1724,7 @@ def redact_image_pdf(file_path:str,
1724
  if textract_json_file_path not in log_files_output_paths:
1725
  log_files_output_paths.append(textract_json_file_path)
1726
 
1727
- if text_extraction_method == tesseract_ocr_option:
1728
  if original_all_page_line_level_ocr_results_with_words != all_page_line_level_ocr_results_with_words:
1729
  # Write the updated existing textract data back to the JSON file
1730
  with open(all_page_line_level_ocr_results_with_words_json_file_path, 'w') as json_file:
@@ -1739,7 +1739,7 @@ def redact_image_pdf(file_path:str,
1739
 
1740
  return pymupdf_doc, all_pages_decision_process_table, log_files_output_paths, textract_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words
1741
 
1742
- if text_extraction_method == textract_option:
1743
  # Write the updated existing textract data back to the JSON file
1744
 
1745
  if original_textract_data != textract_data:
@@ -1749,7 +1749,7 @@ def redact_image_pdf(file_path:str,
1749
  if textract_json_file_path not in log_files_output_paths:
1750
  log_files_output_paths.append(textract_json_file_path)
1751
 
1752
- if text_extraction_method == tesseract_ocr_option:
1753
  if original_all_page_line_level_ocr_results_with_words != all_page_line_level_ocr_results_with_words:
1754
  # Write the updated existing textract data back to the JSON file
1755
  with open(all_page_line_level_ocr_results_with_words_json_file_path, 'w') as json_file:
@@ -2095,7 +2095,7 @@ def redact_text_pdf(
2095
  all_page_line_text_extraction_characters.extend(line_characters)
2096
 
2097
  ### REDACTION
2098
- if pii_identification_method != no_redaction_option:
2099
 
2100
  if chosen_redact_entities or chosen_redact_comprehend_entities:
2101
  page_redaction_bounding_boxes = run_page_text_redaction(
 
19
  from gradio import Progress
20
  from collections import defaultdict # For efficient grouping
21
 
22
+ from tools.config import OUTPUT_FOLDER, IMAGES_DPI, MAX_IMAGE_PIXELS, RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION, PAGE_BREAK_VALUE, MAX_TIME_VALUE, LOAD_TRUNCATED_IMAGES, INPUT_FOLDER, RETURN_PDF_END_OF_REDACTION, TESSERACT_TEXT_EXTRACT_OPTION, SELECTABLE_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION, LOCAL_PII_OPTION, AWS_PII_OPTION, NO_REDACTION_PII_OPTION
23
+ from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult, run_page_text_redaction, recreate_page_line_level_ocr_results_with_page
24
+ from tools.file_conversion import convert_annotation_json_to_review_df, redact_whole_pymupdf_page, redact_single_box, is_pdf, is_pdf_or_image, prepare_image_or_pdf, divide_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, divide_coordinates_by_page_sizes, create_annotation_dicts_from_annotation_df, remove_duplicate_images_with_blank_boxes, fill_missing_ids, fill_missing_box_ids, load_and_convert_ocr_results_with_words_json, save_pdf_with_or_without_compression
25
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser, CustomWordFuzzyRecognizer
26
+ from tools.helper_functions import get_file_name_without_type, clean_unicode_text
27
  from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult, load_and_convert_textract_json
28
 
29
  ImageFile.LOAD_TRUNCATED_IMAGES = LOAD_TRUNCATED_IMAGES.lower() == "true"
 
242
  combined_out_message = combined_out_message + end_message
243
 
244
  # Only send across review file if redaction has been done
245
+ if pii_identification_method != NO_REDACTION_PII_OPTION:
246
 
247
  if len(review_out_file_paths) == 1:
248
  #review_file_path = [x for x in out_file_paths if "review_file" in x]
 
262
  # Prepare documents and images as required if they don't already exist
263
  prepare_images_flag = None # Determines whether to call prepare_image_or_pdf
264
 
265
+ if textract_output_found and text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
266
  print("Existing Textract outputs found, not preparing images or documents.")
267
  prepare_images_flag = False
268
  #return # No need to call `prepare_image_or_pdf`, exit early
269
 
270
+ elif text_extraction_method == SELECTABLE_TEXT_EXTRACT_OPTION:
271
  print("Running text extraction analysis, not preparing images.")
272
  prepare_images_flag = False
273
 
 
316
  combined_out_message = combined_out_message + "\n" + out_message
317
 
318
  # Only send across review file if redaction has been done
319
+ if pii_identification_method != NO_REDACTION_PII_OPTION:
320
  # If only pdf currently in review outputs, add on the latest review file
321
  if len(review_out_file_paths) == 1:
322
  #review_file_path = [x for x in out_file_paths if "review_file" in x]
 
361
 
362
 
363
  # Try to connect to AWS services directly only if RUN_AWS_FUNCTIONS environmental variable is 1, otherwise an environment variable or direct textbox input is needed.
364
+ if pii_identification_method == AWS_PII_OPTION:
365
  if aws_access_key_textbox and aws_secret_key_textbox:
366
  print("Connecting to Comprehend using AWS access key and secret keys from user input.")
367
  comprehend_client = boto3.client('comprehend',
 
384
  comprehend_client = ""
385
 
386
  # Try to connect to AWS Textract Client if using that text extraction method
387
+ if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
388
  if aws_access_key_textbox and aws_secret_key_textbox:
389
  print("Connecting to Textract using AWS access key and secret keys from user input.")
390
  textract_client = boto3.client('textract',
 
429
  pdf_file_name_with_ext = os.path.basename(file_path)
430
 
431
  is_a_pdf = is_pdf(file_path) == True
432
+ if is_a_pdf == False and text_extraction_method == SELECTABLE_TEXT_EXTRACT_OPTION:
433
  # If user has not submitted a pdf, assume it's an image
434
  print("File is not a PDF, assuming that image analysis needs to be used.")
435
+ text_extraction_method = TESSERACT_TEXT_EXTRACT_OPTION
436
  else:
437
  out_message = "No file selected"
438
  print(out_message)
 
443
  review_file_path = orig_pdf_file_path + '_review_file.csv'
444
 
445
  # Remove any existing review_file paths from the review file outputs
446
+ if text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION or text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
447
 
448
  #Analyse and redact image-based pdf or image
449
  if is_pdf_or_image(file_path) == False:
 
490
  all_textract_request_metadata.extend(new_textract_request_metadata)
491
 
492
 
493
+ elif text_extraction_method == SELECTABLE_TEXT_EXTRACT_OPTION:
494
 
495
  if is_pdf(file_path) == False:
496
  out_message = "Please upload a PDF file for text analysis. If you have an image, select 'Image analysis'."
 
541
 
542
 
543
  # Save redacted file
544
+ if pii_identification_method != NO_REDACTION_PII_OPTION:
545
  if RETURN_PDF_END_OF_REDACTION == True:
546
  progress(0.9, "Saving redacted file")
547
 
 
589
 
590
  review_file_state.to_csv(review_file_path, index=None)
591
 
592
+ if pii_identification_method != NO_REDACTION_PII_OPTION:
593
  out_file_paths.append(review_file_path)
594
 
595
  # Make a combined message for the file
 
1249
  allow_list:List[str]=None,
1250
  page_min:int=0,
1251
  page_max:int=999,
1252
+ text_extraction_method:str=TESSERACT_TEXT_EXTRACT_OPTION,
1253
  handwrite_signature_checkbox:List[str]=["Extract handwriting", "Extract signatures"],
1254
  textract_request_metadata:list=[],
1255
  current_loop_page:int=0,
 
1287
  - allow_list (List[str], optional): A list of entity types to allow in the PDF. Defaults to None.
1288
  - page_min (int, optional): The minimum page number to start redaction from. Defaults to 0.
1289
  - page_max (int, optional): The maximum page number to end redaction at. Defaults to 999.
1290
+ - text_extraction_method (str, optional): The type of analysis to perform on the PDF. Defaults to TESSERACT_TEXT_EXTRACT_OPTION.
1291
  - handwrite_signature_checkbox (List[str], optional): A list of options for redacting handwriting and signatures. Defaults to ["Extract handwriting", "Extract signatures"].
1292
  - textract_request_metadata (list, optional): Metadata related to the redaction request. Defaults to an empty string.
1293
  - page_break_return (bool, optional): Indicates if the function should return after a page break. Defaults to False.
 
1336
  print(out_message)
1337
  raise Exception(out_message)
1338
 
1339
+ if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION and textract_client == "":
1340
  out_message_warning = "Connection to AWS Textract service unsuccessful. Redaction will only continue if local AWS Textract results can be found."
1341
  print(out_message_warning)
1342
  #raise Exception(out_message)
 
1353
  print("Page range:", str(page_min + 1), "to", str(page_max))
1354
 
1355
  # If running Textract, check if file already exists. If it does, load in existing data
1356
+ if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
1357
  textract_json_file_path = output_folder + file_name + "_textract.json"
1358
  textract_data, is_missing, log_files_output_paths = load_and_convert_textract_json(textract_json_file_path, log_files_output_paths, page_sizes_df)
1359
  original_textract_data = textract_data.copy()
 
1361
  print("Successfully loaded in Textract analysis results from file")
1362
 
1363
  # If running local OCR option, check if file already exists. If it does, load in existing data
1364
+ if text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION:
1365
  all_page_line_level_ocr_results_with_words_json_file_path = output_folder + file_name + "_ocr_results_with_words.json"
1366
  all_page_line_level_ocr_results_with_words, is_missing, log_files_output_paths = load_and_convert_ocr_results_with_words_json(all_page_line_level_ocr_results_with_words_json_file_path, log_files_output_paths, page_sizes_df)
1367
  original_all_page_line_level_ocr_results_with_words = all_page_line_level_ocr_results_with_words.copy()
 
1428
  # Step 1: Perform OCR. Either with Tesseract, or with AWS Textract
1429
 
1430
  # If using Tesseract
1431
+ if text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION:
1432
 
1433
  if all_page_line_level_ocr_results_with_words:
1434
  # Find the first dict where 'page' matches
 
1452
  all_page_line_level_ocr_results_with_words.append(page_line_level_ocr_results_with_words)
1453
 
1454
  # Check if page exists in existing textract data. If not, send to service to analyse
1455
+ if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
1456
  text_blocks = []
1457
 
1458
  if not textract_data:
 
1527
 
1528
  all_line_level_ocr_results_df_list.append(line_level_ocr_results_df)
1529
 
1530
+ if pii_identification_method != NO_REDACTION_PII_OPTION:
1531
  # Step 2: Analyse text and identify PII
1532
  if chosen_redact_entities or chosen_redact_comprehend_entities:
1533
 
 
1667
  annotations_all_pages.append(page_image_annotations)
1668
 
1669
 
1670
+ if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
1671
  if original_textract_data != textract_data:
1672
  # Write the updated existing textract data back to the JSON file
1673
  with open(textract_json_file_path, 'w') as json_file:
 
1676
  if textract_json_file_path not in log_files_output_paths:
1677
  log_files_output_paths.append(textract_json_file_path)
1678
 
1679
+ if text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION:
1680
  if original_all_page_line_level_ocr_results_with_words != all_page_line_level_ocr_results_with_words:
1681
  # Write the updated existing textract data back to the JSON file
1682
  with open(all_page_line_level_ocr_results_with_words_json_file_path, 'w') as json_file:
 
1715
  progress.close(_tqdm=progress_bar)
1716
  tqdm._instances.clear()
1717
 
1718
+ if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
1719
  # Write the updated existing textract data back to the JSON file
1720
  if original_textract_data != textract_data:
1721
  with open(textract_json_file_path, 'w') as json_file:
 
1724
  if textract_json_file_path not in log_files_output_paths:
1725
  log_files_output_paths.append(textract_json_file_path)
1726
 
1727
+ if text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION:
1728
  if original_all_page_line_level_ocr_results_with_words != all_page_line_level_ocr_results_with_words:
1729
  # Write the updated existing textract data back to the JSON file
1730
  with open(all_page_line_level_ocr_results_with_words_json_file_path, 'w') as json_file:
 
1739
 
1740
  return pymupdf_doc, all_pages_decision_process_table, log_files_output_paths, textract_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words
1741
 
1742
+ if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
1743
  # Write the updated existing textract data back to the JSON file
1744
 
1745
  if original_textract_data != textract_data:
 
1749
  if textract_json_file_path not in log_files_output_paths:
1750
  log_files_output_paths.append(textract_json_file_path)
1751
 
1752
+ if text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION:
1753
  if original_all_page_line_level_ocr_results_with_words != all_page_line_level_ocr_results_with_words:
1754
  # Write the updated existing textract data back to the JSON file
1755
  with open(all_page_line_level_ocr_results_with_words_json_file_path, 'w') as json_file:
 
2095
  all_page_line_text_extraction_characters.extend(line_characters)
2096
 
2097
  ### REDACTION
2098
+ if pii_identification_method != NO_REDACTION_PII_OPTION:
2099
 
2100
  if chosen_redact_entities or chosen_redact_comprehend_entities:
2101
  page_redaction_bounding_boxes = run_page_text_redaction(
tools/helper_functions.py CHANGED
@@ -9,16 +9,7 @@ import unicodedata
9
  from typing import List
10
  from math import ceil
11
  from gradio_image_annotation import image_annotator
12
- from tools.config import CUSTOM_HEADER_VALUE, CUSTOM_HEADER, OUTPUT_FOLDER, INPUT_FOLDER, SESSION_OUTPUT_FOLDER, AWS_USER_POOL_ID, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC
13
-
14
- # Names for options labels
15
- text_ocr_option = "Local model - selectable text"
16
- tesseract_ocr_option = "Local OCR model - PDFs without selectable text"
17
- textract_option = "AWS Textract service - all PDF types"
18
-
19
- no_redaction_option = "Only extract text (no redaction)"
20
- local_pii_detector = "Local"
21
- aws_pii_detector = "AWS Comprehend"
22
 
23
  def reset_state_vars():
24
  return [], pd.DataFrame(), pd.DataFrame(), 0, "", image_annotator(
@@ -438,9 +429,9 @@ def calculate_aws_costs(number_of_pages:str,
438
  comprehend_unit_cost:float=0.0001,
439
  comprehend_size_unit_average:float=250,
440
  average_characters_per_page:float=2000,
441
- textract_option:str=textract_option,
442
- no_redaction_option:str=no_redaction_option,
443
- aws_pii_detector:str=aws_pii_detector):
444
  '''
445
  Calculate the approximate cost of submitting a document to AWS Textract and/or AWS Comprehend, assuming that Textract outputs do not already exist in the output folder.
446
 
@@ -457,9 +448,9 @@ def calculate_aws_costs(number_of_pages:str,
457
  - comprehend_unit_cost (float, optional): Cost per 'unit' (300 character minimum) for identifying PII in text with AWS Comprehend.
458
  - comprehend_size_unit_average (float, optional): Average size of a 'unit' of text passed to AWS Comprehend by the app through the batching process
459
  - average_characters_per_page (float, optional): Average number of characters on an A4 page.
460
- - textract_option (str, optional): String label for the text_extract_method_radio button for AWS Textract.
461
- - no_redaction_option (str, optional): String label for pii_identification_method_drop for no redaction.
462
- - aws_pii_detector (str, optional): String label for pii_identification_method_drop for AWS Comprehend.
463
  '''
464
  text_extraction_cost = 0
465
  pii_identification_cost = 0
@@ -467,14 +458,14 @@ def calculate_aws_costs(number_of_pages:str,
467
  number_of_pages = int(number_of_pages)
468
 
469
  if textract_output_found_checkbox != True:
470
- if text_extract_method_radio == textract_option:
471
  text_extraction_cost = number_of_pages * textract_page_cost
472
 
473
  if "Extract signatures" in handwrite_signature_checkbox:
474
  text_extraction_cost += (textract_signature_cost * number_of_pages)
475
 
476
- if pii_identification_method != no_redaction_option:
477
- if pii_identification_method == aws_pii_detector:
478
  comprehend_page_cost = ceil(average_characters_per_page / comprehend_size_unit_average) * comprehend_unit_cost
479
  pii_identification_cost = comprehend_page_cost * number_of_pages
480
 
@@ -497,11 +488,11 @@ def calculate_time_taken(number_of_pages:str,
497
  local_text_extraction_page_time:float=0.3,
498
  local_pii_redaction_page_time:float=0.5,
499
  local_ocr_extraction_page_time:float=1.5,
500
- textract_option:str=textract_option,
501
- text_ocr_option:str=text_ocr_option,
502
- local_ocr_option:str=tesseract_ocr_option,
503
- no_redaction_option:str=no_redaction_option,
504
- aws_pii_detector:str=aws_pii_detector):
505
  '''
506
  Calculate the approximate time to redact a document.
507
 
@@ -516,11 +507,11 @@ def calculate_time_taken(number_of_pages:str,
516
  - local_text_redaction_page_time (float, optional): Approximate time to extract text on a page with the local text redaction option.
517
  - local_pii_redaction_page_time (float, optional): Approximate time to redact text on a page with the local text redaction option.
518
  - local_ocr_extraction_page_time (float, optional): Approximate time to extract text from a page with the local OCR redaction option.
519
- - textract_option (str, optional): String label for the text_extract_method_radio button for AWS Textract.
520
- - text_ocr_option (str, optional): String label for text_extract_method_radio for text extraction.
521
  - local_ocr_option (str, optional): String label for text_extract_method_radio for local OCR.
522
- - no_redaction_option (str, optional): String label for pii_identification_method_drop for no redaction.
523
- - aws_pii_detector (str, optional): String label for pii_identification_method_drop for AWS Comprehend.
524
  '''
525
  calculated_time_taken = 0
526
  page_conversion_time_taken = 0
@@ -530,22 +521,22 @@ def calculate_time_taken(number_of_pages:str,
530
  number_of_pages = int(number_of_pages)
531
 
532
  # Page preparation/conversion to image time
533
- if (text_extract_method_radio != text_ocr_option) and (textract_output_found_checkbox != True):
534
  page_conversion_time_taken = number_of_pages * convert_page_time
535
 
536
  # Page text extraction time
537
- if text_extract_method_radio == textract_option:
538
  if textract_output_found_checkbox != True:
539
  page_extraction_time_taken = number_of_pages * textract_page_time
540
  elif text_extract_method_radio == local_ocr_option:
541
  if local_ocr_output_found_checkbox != True:
542
  page_extraction_time_taken = number_of_pages * local_ocr_extraction_page_time
543
- elif text_extract_method_radio == text_ocr_option:
544
  page_conversion_time_taken = number_of_pages * local_text_extraction_page_time
545
 
546
  # Page redaction time
547
- if pii_identification_method != no_redaction_option:
548
- if pii_identification_method == aws_pii_detector:
549
  page_redaction_time_taken = number_of_pages * comprehend_page_time
550
  else:
551
  page_redaction_time_taken = number_of_pages * local_pii_redaction_page_time
 
9
  from typing import List
10
  from math import ceil
11
  from gradio_image_annotation import image_annotator
12
+ from tools.config import CUSTOM_HEADER_VALUE, CUSTOM_HEADER, OUTPUT_FOLDER, INPUT_FOLDER, SESSION_OUTPUT_FOLDER, AWS_USER_POOL_ID, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, SELECTABLE_TEXT_EXTRACT_OPTION, TESSERACT_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION, NO_REDACTION_PII_OPTION, AWS_PII_OPTION
 
 
 
 
 
 
 
 
 
13
 
14
  def reset_state_vars():
15
  return [], pd.DataFrame(), pd.DataFrame(), 0, "", image_annotator(
 
429
  comprehend_unit_cost:float=0.0001,
430
  comprehend_size_unit_average:float=250,
431
  average_characters_per_page:float=2000,
432
+ TEXTRACT_TEXT_EXTRACT_OPTION:str=TEXTRACT_TEXT_EXTRACT_OPTION,
433
+ NO_REDACTION_PII_OPTION:str=NO_REDACTION_PII_OPTION,
434
+ AWS_PII_OPTION:str=AWS_PII_OPTION):
435
  '''
436
  Calculate the approximate cost of submitting a document to AWS Textract and/or AWS Comprehend, assuming that Textract outputs do not already exist in the output folder.
437
 
 
448
  - comprehend_unit_cost (float, optional): Cost per 'unit' (300 character minimum) for identifying PII in text with AWS Comprehend.
449
  - comprehend_size_unit_average (float, optional): Average size of a 'unit' of text passed to AWS Comprehend by the app through the batching process
450
  - average_characters_per_page (float, optional): Average number of characters on an A4 page.
451
+ - TEXTRACT_TEXT_EXTRACT_OPTION (str, optional): String label for the text_extract_method_radio button for AWS Textract.
452
+ - NO_REDACTION_PII_OPTION (str, optional): String label for pii_identification_method_drop for no redaction.
453
+ - AWS_PII_OPTION (str, optional): String label for pii_identification_method_drop for AWS Comprehend.
454
  '''
455
  text_extraction_cost = 0
456
  pii_identification_cost = 0
 
458
  number_of_pages = int(number_of_pages)
459
 
460
  if textract_output_found_checkbox != True:
461
+ if text_extract_method_radio == TEXTRACT_TEXT_EXTRACT_OPTION:
462
  text_extraction_cost = number_of_pages * textract_page_cost
463
 
464
  if "Extract signatures" in handwrite_signature_checkbox:
465
  text_extraction_cost += (textract_signature_cost * number_of_pages)
466
 
467
+ if pii_identification_method != NO_REDACTION_PII_OPTION:
468
+ if pii_identification_method == AWS_PII_OPTION:
469
  comprehend_page_cost = ceil(average_characters_per_page / comprehend_size_unit_average) * comprehend_unit_cost
470
  pii_identification_cost = comprehend_page_cost * number_of_pages
471
 
 
488
  local_text_extraction_page_time:float=0.3,
489
  local_pii_redaction_page_time:float=0.5,
490
  local_ocr_extraction_page_time:float=1.5,
491
+ TEXTRACT_TEXT_EXTRACT_OPTION:str=TEXTRACT_TEXT_EXTRACT_OPTION,
492
+ SELECTABLE_TEXT_EXTRACT_OPTION:str=SELECTABLE_TEXT_EXTRACT_OPTION,
493
+ local_ocr_option:str=TESSERACT_TEXT_EXTRACT_OPTION,
494
+ NO_REDACTION_PII_OPTION:str=NO_REDACTION_PII_OPTION,
495
+ AWS_PII_OPTION:str=AWS_PII_OPTION):
496
  '''
497
  Calculate the approximate time to redact a document.
498
 
 
507
  - local_text_redaction_page_time (float, optional): Approximate time to extract text on a page with the local text redaction option.
508
  - local_pii_redaction_page_time (float, optional): Approximate time to redact text on a page with the local text redaction option.
509
  - local_ocr_extraction_page_time (float, optional): Approximate time to extract text from a page with the local OCR redaction option.
510
+ - TEXTRACT_TEXT_EXTRACT_OPTION (str, optional): String label for the text_extract_method_radio button for AWS Textract.
511
+ - SELECTABLE_TEXT_EXTRACT_OPTION (str, optional): String label for text_extract_method_radio for text extraction.
512
  - local_ocr_option (str, optional): String label for text_extract_method_radio for local OCR.
513
+ - NO_REDACTION_PII_OPTION (str, optional): String label for pii_identification_method_drop for no redaction.
514
+ - AWS_PII_OPTION (str, optional): String label for pii_identification_method_drop for AWS Comprehend.
515
  '''
516
  calculated_time_taken = 0
517
  page_conversion_time_taken = 0
 
521
  number_of_pages = int(number_of_pages)
522
 
523
  # Page preparation/conversion to image time
524
+ if (text_extract_method_radio != SELECTABLE_TEXT_EXTRACT_OPTION) and (textract_output_found_checkbox != True):
525
  page_conversion_time_taken = number_of_pages * convert_page_time
526
 
527
  # Page text extraction time
528
+ if text_extract_method_radio == TEXTRACT_TEXT_EXTRACT_OPTION:
529
  if textract_output_found_checkbox != True:
530
  page_extraction_time_taken = number_of_pages * textract_page_time
531
  elif text_extract_method_radio == local_ocr_option:
532
  if local_ocr_output_found_checkbox != True:
533
  page_extraction_time_taken = number_of_pages * local_ocr_extraction_page_time
534
+ elif text_extract_method_radio == SELECTABLE_TEXT_EXTRACT_OPTION:
535
  page_conversion_time_taken = number_of_pages * local_text_extraction_page_time
536
 
537
  # Page redaction time
538
+ if pii_identification_method != NO_REDACTION_PII_OPTION:
539
+ if pii_identification_method == AWS_PII_OPTION:
540
  page_redaction_time_taken = number_of_pages * comprehend_page_time
541
  else:
542
  page_redaction_time_taken = number_of_pages * local_pii_redaction_page_time
tools/redaction_review.py CHANGED
@@ -14,8 +14,8 @@ import pymupdf
14
  from PIL import ImageDraw, Image
15
  from datetime import datetime, timezone, timedelta
16
 
17
- from tools.config import OUTPUT_FOLDER, CUSTOM_BOX_COLOUR, MAX_IMAGE_PIXELS, INPUT_FOLDER, COMPRESS_REDACTED_PDF
18
- from tools.file_conversion import is_pdf, convert_annotation_json_to_review_df, convert_review_df_to_annotation_json, process_single_page_for_image_conversion, multiply_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, create_annotation_dicts_from_annotation_df, remove_duplicate_images_with_blank_boxes, fill_missing_ids, divide_coordinates_by_page_sizes, save_pdf_with_or_without_compression
19
  from tools.helper_functions import get_file_name_without_type, detect_file_type
20
  from tools.file_redaction import redact_page_with_pymupdf
21
 
 
14
  from PIL import ImageDraw, Image
15
  from datetime import datetime, timezone, timedelta
16
 
17
+ from tools.config import OUTPUT_FOLDER, MAX_IMAGE_PIXELS, INPUT_FOLDER, COMPRESS_REDACTED_PDF
18
+ from tools.file_conversion import is_pdf, convert_annotation_json_to_review_df, convert_review_df_to_annotation_json, process_single_page_for_image_conversion, multiply_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, remove_duplicate_images_with_blank_boxes, fill_missing_ids, divide_coordinates_by_page_sizes, save_pdf_with_or_without_compression
19
  from tools.helper_functions import get_file_name_without_type, detect_file_type
20
  from tools.file_redaction import redact_page_with_pymupdf
21