Commit ec98119
Parent: c71d0c1

Comprehend now uses custom spacy recognisers on top of defaults. Added zoom functionality to annotator. Fixed some pdf mediabox issues and redacted image output issues.
Files changed:
- app.py (+37 -14)
- tools/aws_functions.py (+1 -1)
- tools/custom_image_analyser_engine.py (+10 -0)
- tools/file_conversion.py (+6 -4)
- tools/file_redaction.py (+126 -46)
- tools/helper_functions.py (+12 -1)
- tools/load_spacy_model_custom_recognisers.py (+1 -0)
- tools/redaction_review.py (+16 -3)
app.py CHANGED

@@ -13,9 +13,10 @@ from tools.helper_functions import ensure_output_folder_exists, add_folder_to_pa
 from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
 from tools.file_redaction import choose_and_run_redactor
 from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
-from tools.redaction_review import apply_redactions, crop, get_boxes_json, modify_existing_page_redactions, decrease_page, increase_page, update_annotator
+from tools.redaction_review import apply_redactions, crop, get_boxes_json, modify_existing_page_redactions, decrease_page, increase_page, update_annotator, update_zoom
 from tools.data_anonymise import anonymise_data_files
 from tools.auth import authenticate_user
+from tools.load_spacy_model_custom_recognisers import custom_entities
 
 today_rev = datetime.now().strftime("%Y%m%d")

@@ -29,6 +30,10 @@ chosen_comprehend_entities = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT
 full_comprehend_entity_list = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE','SSN','DATE_TIME','PASSPORT_NUMBER','DRIVER_ID','URL','AGE','USERNAME','PASSWORD','AWS_ACCESS_KEY','AWS_SECRET_KEY','IP_ADDRESS','MAC_ADDRESS','ALL','LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER','CA_SOCIAL_INSURANCE_NUMBER','US_INDIVIDUAL_TAX_IDENTIFICATION_NUMBER','UK_UNIQUE_TAXPAYER_REFERENCE_NUMBER','IN_PERMANENT_ACCOUNT_NUMBER','IN_NREGA','INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER','CA_HEALTH_NUMBER','IN_AADHAAR','IN_VOTER_NUMBER']
 
+# Add custom spacy recognisers to the Comprehend list, so that local Spacy model can be used to pick up e.g. titles, streetnames, UK postcodes that are sometimes missed by comprehend
+chosen_comprehend_entities.extend(custom_entities)
+full_comprehend_entity_list.extend(custom_entities)
+
 chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE"]
 
 full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS']

@@ -117,6 +122,12 @@ with app:
     default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=default_allow_list_loc, visible=False)
 
+    ## Annotator zoom value
+    annotator_zoom_number = gr.Number(label = "Current annotator zoom level", value=100, precision=0, visible=False)
+    zoom_true_bool = gr.State(True)
+    zoom_false_bool = gr.State(False)
+
+
     ###
     # UI DESIGN
     ###

@@ -164,6 +175,9 @@ with app:
     annotate_current_page = gr.Number(value=1, label="Page (press enter to change)", precision=0, scale = 2)
     annotate_max_pages = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 1)
     annotation_next_page_button = gr.Button("Next page", scale = 3)
+    with gr.Row():
+        annotate_zoom_in = gr.Button("Zoom in")
+        annotate_zoom_out = gr.Button("Zoom out")
 
     annotation_button_apply = gr.Button("Apply revised redactions", variant="primary")

@@ -238,9 +252,9 @@ with app:
     in_allow_list_text = gr.Textbox(label="Custom allow list load status")
 
     with gr.Accordion("Add or remove entity types to redact", open = False):
-        in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Entities to redact - local PII identification model (click close to down arrow for full list)")
-
         in_redact_comprehend_entities = gr.Dropdown(value=chosen_comprehend_entities, choices=full_comprehend_entity_list, multiselect=True, label="Entities to redact - AWS Comprehend PII identification model (click close to down arrow for full list)")
+
+        in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Entities to redact - local PII identification model (click close to down arrow for full list)")
 
     handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract settings", choices=["Redact all identified handwriting", "Redact all identified signatures"], value=["Redact all identified handwriting", "Redact all identified signatures"])
     #with gr.Row():

@@ -260,18 +274,19 @@ with app:
     ###
     in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_textbox, doc_file_name_with_extension_textbox])
 
-    document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox]).\
+    document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator]).\
         then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state], api_name="prepare_doc").\
         then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
-        outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number], api_name="redact_doc")
-
+        outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number], api_name="redact_doc").\
+        then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom])
 
     # If the app has completed a batch of pages, it will run this until the end of all pages in the document
     current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
-        outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number])
+        outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number]).\
+        then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom])
 
     # If a file has been completed, the function will continue onto the next document
-    latest_file_completed_text.change(fn=update_annotator, inputs=[all_image_annotations_state, page_min], outputs=[annotator, annotate_current_page, annotate_current_page_bottom]).\
+    latest_file_completed_text.change(fn=update_annotator, inputs=[all_image_annotations_state, page_min, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom]).\
         then(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
     # latest_file_completed_text.change(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, current_loop_page_number], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state]).\
     #     then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return],

@@ -284,12 +299,20 @@ with app:
     # Page controls at top
     annotate_current_page.submit(
         modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
-        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
+        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
 
     annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
-        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
+        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
     annotation_next_page_button.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
-        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
+        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
+
+    # Zoom in and out on annotator
+    annotate_zoom_in.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
+        then(update_zoom, inputs=[annotator_zoom_number, zoom_true_bool], outputs=[annotator_zoom_number]).\
+        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
+    annotate_zoom_out.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
+        then(update_zoom, inputs=[annotator_zoom_number, zoom_false_bool], outputs=[annotator_zoom_number]).\
+        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
 
     #annotation_button_get.click(get_boxes_json, annotator, json_boxes)
     annotation_button_apply.click(apply_redactions, inputs=[annotator, in_doc_files, pdf_doc_state, all_image_annotations_state, annotate_current_page], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files], scroll_to_output=True)

@@ -297,12 +320,12 @@ with app:
     # Page controls at bottom
     annotate_current_page_bottom.submit(
         modify_existing_page_redactions, inputs = [annotator, annotate_current_page_bottom, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page]).\
-        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
+        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
 
     annotation_last_page_button_bottom.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
-        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
+        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
     annotation_next_page_button_bottom.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
-        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
+        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
 
     ###
     # TABULAR DATA REDACTION
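
The zoom buttons reuse the same three-step event chain as the page controls: persist the boxes drawn on the current page, step the hidden zoom state, then re-render the annotator from that state. A minimal, self-contained sketch of the pattern, with hypothetical stand-in functions rather than the app's real modify_existing_page_redactions/update_zoom/update_annotator:

import gradio as gr

def step_zoom(zoom, zoom_in):  # stand-in for update_zoom
    return zoom + 10 if zoom_in else zoom - 10

def render(zoom):  # stand-in for update_annotator
    return f"rendered at {zoom}%"

with gr.Blocks() as demo:
    zoom_state = gr.Number(value=100, visible=False)
    zoom_in_flag = gr.State(True)  # a baked-in boolean, like zoom_true_bool above
    out = gr.Textbox()
    zoom_btn = gr.Button("Zoom in")
    # click -> update the zoom state -> re-render from the new state
    zoom_btn.click(step_zoom, inputs=[zoom_state, zoom_in_flag], outputs=[zoom_state]).\
        then(render, inputs=[zoom_state], outputs=[out])

demo.launch()

Holding the direction flags in gr.State objects lets the two buttons share one update function instead of needing separate zoom-in and zoom-out callbacks.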
tools/aws_functions.py CHANGED

@@ -10,7 +10,7 @@ PandasDataFrame = Type[pd.DataFrame]
 # Get AWS credentials
 bucket_name=""
 
-RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "0")
+RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "1")
 print(f'The value of RUN_AWS_FUNCTIONS is {RUN_AWS_FUNCTIONS}')
 
 AWS_REGION = get_or_create_env_var('AWS_REGION', 'eu-west-2')
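
The one-line change here flips the default for RUN_AWS_FUNCTIONS, so AWS-backed features are on unless explicitly disabled. get_or_create_env_var comes from tools/helper_functions.py; a plausible sketch of its behaviour, assuming it seeds the environment with the default when the variable is unset:

import os

def get_or_create_env_var(var_name, default_value):
    # Read the variable if it is already set in the environment
    value = os.environ.get(var_name)
    if value is None:
        # Otherwise store and return the supplied default (assumed behaviour)
        os.environ[var_name] = default_value
        value = default_value
    return value

RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "1")  # now opted in by default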
tools/custom_image_analyser_engine.py CHANGED

@@ -13,6 +13,7 @@ from copy import deepcopy
 from tools.helper_functions import clean_unicode_text
 from tools.aws_functions import comprehend_client
 from tools.presidio_analyzer_custom import recognizer_result_from_dict
+from tools.load_spacy_model_custom_recognisers import custom_entities
 #import string # Import string to get a list of common punctuation characters
 
 @dataclass

@@ -491,6 +492,14 @@ class CustomImageAnalyzerEngine:
                 analyzer_results_by_line[i] = analyzer_result
 
             elif pii_identification_method == "AWS Comprehend":
+
+                # If using AWS Comprehend, Spacy model is only used to identify the custom entities created. Comprehend can't pick up Titles, Streetnames, and UKPostcodes specifically
+                text_analyzer_kwargs["entities"] = [entity for entity in chosen_redact_comprehend_entities if entity in custom_entities]
+
+                spacy_analyzer_result = self.analyzer_engine.analyze(
+                    text=line_level_ocr_result.text, **text_analyzer_kwargs)
+                analyzer_results_by_line[i].extend(spacy_analyzer_result)
+
                 if len(line_level_ocr_result.text) >= 3:
                     # Add line to current batch with a separator
                     if current_batch:

@@ -509,6 +518,7 @@ class CustomImageAnalyzerEngine:
                     Text=current_batch,
                     LanguageCode=text_analyzer_kwargs["language"]
                 )
+
             except Exception as e:
                 print(e)
                 time.sleep(3)
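
This hunk makes the Comprehend path a hybrid: the local Presidio/spaCy engine runs first, restricted to the custom entities (TITLES, UKPOSTCODE, STREETNAME) that Comprehend has no types for, and the line text still goes to Comprehend in batches for everything else. A rough sketch of that flow under assumptions: a default AnalyzerEngine stands in for the app's engine (which, unlike the default, has the custom recognisers registered), and the Comprehend call needs valid AWS credentials:

from presidio_analyzer import AnalyzerEngine
import boto3

custom_entities = ["TITLES", "UKPOSTCODE", "STREETNAME"]
chosen = ["NAME", "EMAIL", "TITLES", "UKPOSTCODE"]  # example user selection

# In the app, the engine has custom spaCy recognisers registered for the
# entities above; a plain AnalyzerEngine() would simply return no matches for them.
analyzer = AnalyzerEngine()
comprehend = boto3.client("comprehend", region_name="eu-west-2")

line = "Mr Smith lives at SW1A 1AA"

# 1. Local pass, restricted to the custom recognisers Comprehend lacks
local_results = analyzer.analyze(
    text=line, language="en",
    entities=[e for e in chosen if e in custom_entities])

# 2. Remote pass for the standard Comprehend entity types (requires credentials)
remote = comprehend.detect_pii_entities(Text=line, LanguageCode="en")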
tools/file_conversion.py CHANGED

@@ -11,6 +11,8 @@ import pymupdf
 from gradio import Progress
 from typing import List, Optional
 
+image_dpi = 300.0
+
 def is_pdf_or_image(filename):
     """
     Check if a file name is a PDF or an image file.

@@ -42,7 +44,7 @@ def is_pdf(filename):
 # %%
 ## Convert pdf to image if necessary
 
-def convert_pdf_to_images(pdf_path:str, page_min:int = 0, progress=Progress(track_tqdm=True)):
+def convert_pdf_to_images(pdf_path:str, page_min:int = 0, image_dpi:float = image_dpi, progress=Progress(track_tqdm=True)):
 
     # Get the number of pages in the PDF
    page_count = pdfinfo_from_path(pdf_path)['Pages']

@@ -70,7 +72,7 @@ def convert_pdf_to_images(pdf_path:str, page_min:int = 0, progress=Progress(trac
 
        else:
-            image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1, dpi=300)
+            image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1, dpi=image_dpi, use_cropbox=True, use_pdftocairo=False)
 
            image = image_l[0]

@@ -334,7 +336,7 @@ def prepare_image_or_pdf(
 
     return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc
 
-def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str]):
+def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str], image_dpi:float=image_dpi):
     file_path_without_ext = get_file_path_end(in_file_path)
 
     out_file_paths = out_text_file_path

@@ -344,7 +346,7 @@ def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str]):
 
     pdf_text_image_paths = process_file(out_text_file_path[0])
     out_text_image_file_path = output_folder + file_path_without_ext + "_text_redacted_as_img.pdf"
-    pdf_text_image_paths[0].save(out_text_image_file_path, "PDF" ,resolution=300.0, save_all=True, append_images=pdf_text_image_paths[1:])
+    pdf_text_image_paths[0].save(out_text_image_file_path, "PDF" ,resolution=image_dpi, save_all=True, append_images=pdf_text_image_paths[1:])
 
     # out_file_paths.append(out_text_image_file_path)
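
The hard-coded DPI values are lifted into a module-level image_dpi constant that is threaded through both converters, and pdf2image now renders with use_cropbox=True, presumably so rendered images line up with the visible cropbox rather than the full mediabox, matching the coordinate fixes in tools/file_redaction.py below. A small usage sketch of the new rendering path, with a hypothetical sample.pdf:

from pdf2image import convert_from_path

image_dpi = 300.0
# Render only the first page at image_dpi, cropped to the cropbox,
# as convert_pdf_to_images now does page by page
pages = convert_from_path("sample.pdf", first_page=1, last_page=1,
                          dpi=image_dpi, use_cropbox=True, use_pdftocairo=False)
pages[0].save("sample_page1.png")
print(pages[0].size)  # pixel size scales with dpi: points / 72 * dpi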
tools/file_redaction.py CHANGED

@@ -27,8 +27,8 @@ from collections import defaultdict # For efficient grouping
 from presidio_analyzer import RecognizerResult
 
 from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult
-from tools.file_conversion import process_file
-from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
+from tools.file_conversion import process_file, image_dpi
+from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities
 from tools.helper_functions import get_file_path_end, output_folder, clean_unicode_text, get_or_create_env_var
 from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
 # from tools.data_anonymise import generate_decision_process_output

@@ -314,8 +314,8 @@ def choose_and_run_redactor(file_paths:List[str],
 
     # Save file
     if is_pdf(file_path) == False:
-        out_image_file_path = output_folder + file_path_without_ext + "
-        pymupdf_doc[0].save(out_image_file_path, "PDF" ,resolution=
+        out_image_file_path = output_folder + file_path_without_ext + "_redacted_as_pdf.pdf"
+        pymupdf_doc[0].save(out_image_file_path, "PDF" ,resolution=image_dpi, save_all=False)#, append_images=pymupdf_doc[:1])
 
     else:
         out_image_file_path = output_folder + file_path_without_ext + "_redacted.pdf"

@@ -413,35 +413,40 @@ def choose_and_run_redactor(file_paths:List[str],
 
     return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
 
-def
-    '''
-    Convert annotations from pikepdf to pymupdf format
-    '''
-    page_y_adjust = (rect_height - mediabox_height) / 2 # Center adjustment
-    rect_coordinates = [float(coord) for coord in rect_field]
-    #
-    x1, y1, x2, y2 = rect_coordinates
+def convert_pikepdf_coords_to_pymupdf(pymupdf_page, pikepdf_bbox):
+    '''
+    Convert annotations from pikepdf to pymupdf format, handling the mediabox larger than rect.
+    '''
+    # Use cropbox if available, otherwise use mediabox
+    reference_box = pymupdf_page.rect
+    mediabox = pymupdf_page.mediabox
+
+    reference_box_height = reference_box.height
+    reference_box_width = reference_box.width
+
+    # Convert PyMuPDF coordinates back to PDF coordinates (bottom-left origin)
+    media_height = mediabox.height
+    media_width = mediabox.width
+
+    media_reference_y_diff = media_height - reference_box_height
+    media_reference_x_diff = media_width - reference_box_width
+
+    y_diff_ratio = media_reference_y_diff / reference_box_height
+    x_diff_ratio = media_reference_x_diff / reference_box_width
+
+    # Extract the annotation rectangle field
+    rect_field = pikepdf_bbox["/Rect"]
+    rect_coordinates = [float(coord) for coord in rect_field] # Convert to floats
+
+    # Unpack coordinates
+    x1, y1, x2, y2 = rect_coordinates
+
+    new_x1 = x1 - (media_reference_x_diff * x_diff_ratio)
+    new_y1 = media_height - y2 - (media_reference_y_diff * y_diff_ratio)
+    new_x2 = x2 - (media_reference_x_diff * x_diff_ratio)
+    new_y2 = media_height - y1 - (media_reference_y_diff * y_diff_ratio)
+
+    return new_x1, new_y1, new_x2, new_y2
 
 def convert_pikepdf_to_image_coords(pymupdf_page, annot, image:Image):
     '''

@@ -496,6 +501,64 @@ def convert_image_coords_to_pymupdf(pymupdf_page, annot:CustomImageRecognizerRes
 
     return x1, new_y1, x2, new_y2
 
+# def convert_pymupdf_to_image_coords(pymupdf_page, x1, y1, x2, y2, image: Image):
+#     '''
+#     Converts coordinates from pymupdf format to image coordinates.
+#     '''
+
+#     rect_height = pymupdf_page.rect.height
+#     rect_width = pymupdf_page.rect.width
+
+#     image_page_width, image_page_height = image.size
+
+#     # Calculate scaling factors between pymupdf and PIL image
+#     scale_width = image_page_width / rect_width
+#     scale_height = image_page_height / rect_height
+
+#     x1_image = x1 * scale_width
+#     y1_image = ((rect_height - y2) * scale_height)
+#     x2_image = x2 * scale_width
+#     y2_image = ((rect_height - y1) * scale_height)
+
+#     return x1_image, y1_image, x2_image, y2_image
+
+def convert_pymupdf_to_image_coords(pymupdf_page, x1, y1, x2, y2, image: Image):
+    '''
+    Converts coordinates from pymupdf format to image coordinates,
+    accounting for mediabox dimensions.
+    '''
+
+    rect_height = pymupdf_page.rect.height
+    rect_width = pymupdf_page.rect.width
+
+    # Get mediabox dimensions
+    mediabox = pymupdf_page.mediabox
+    mediabox_width = mediabox.width
+    mediabox_height = mediabox.height
+
+    image_page_width, image_page_height = image.size
+
+    # Calculate scaling factors using mediabox dimensions
+    scale_width = image_page_width / mediabox_width
+    scale_height = image_page_height / mediabox_height
+
+    print("scale_width:", scale_width)
+    print("scale_height:", scale_height)
+
+    rect_to_mediabox_x_scale = mediabox_width / rect_width
+    rect_to_mediabox_y_scale = mediabox_height / rect_height
+
+    print("rect_to_mediabox_x_scale:", rect_to_mediabox_x_scale)
+    print("rect_to_mediabox_y_scale:", rect_to_mediabox_y_scale)
+
+    # Adjust coordinates based on scaling factors
+    x1_image = (x1 * scale_width) * rect_to_mediabox_x_scale
+    y1_image = (y1 * scale_height) * rect_to_mediabox_y_scale
+    x2_image = (x2 * scale_width) * rect_to_mediabox_x_scale
+    y2_image = (y2 * scale_height) * rect_to_mediabox_y_scale
+
+    return x1_image, y1_image, x2_image, y2_image
+
 def convert_gradio_annotation_coords_to_pymupdf(pymupdf_page:Page, annot:dict, image:Image):
     '''
     Converts an image with redaction coordinates from a gradio annotation component to pymupdf coordinates.

@@ -587,25 +650,25 @@ def redact_page_with_pymupdf(page:Page, annotations_on_page, image = None):
 
         # Else it should be a pikepdf annotation object
         else:
-            x1, pymupdf_y1, x2, pymupdf_y2 =
+            x1, pymupdf_y1, x2, pymupdf_y2 = convert_pikepdf_coords_to_pymupdf(page, annot)
 
             rect = Rect(x1, pymupdf_y1, x2, pymupdf_y2)
 
             img_annotation_box = {}
 
             if image:
-                img_annotation_box["
-                img_annotation_box["
-                img_annotation_box["
+                img_width, img_height = image.size
+
+                x1, image_y1, x2, image_y2 = convert_pymupdf_to_image_coords(page, x1, pymupdf_y1, x2, pymupdf_y2, image)
+
+                img_annotation_box["xmin"] = x1 #* (img_width / rect_width) # Use adjusted x1
+                img_annotation_box["ymin"] = image_y1 #* (img_width / rect_width) # Use adjusted y1
+                img_annotation_box["xmax"] = x2# * (img_height / rect_height) # Use adjusted x2
+                img_annotation_box["ymax"] = image_y2 #* (img_height / rect_height) # Use adjusted y2
+                img_annotation_box["color"] = (0, 0, 0)
 
             if isinstance(annot, Dictionary):
-                #print("Trying to get label out of annotation", annot["/T"])
                 img_annotation_box["label"] = str(annot["/T"])
-                #print("Label is:", img_annotation_box["label"])
             else:
                 img_annotation_box["label"] = "REDACTION"

@@ -646,6 +709,18 @@ def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_result
     merged_bboxes = []
     grouped_bboxes = defaultdict(list)
 
+
+    # Process signature and handwriting results
+    if signature_recogniser_results or handwriting_recogniser_results:
+        if "Redact all identified handwriting" in handwrite_signature_checkbox:
+            #print("Handwriting boxes exist at merge:", handwriting_recogniser_results)
+            merged_bboxes.extend(handwriting_recogniser_results)
+
+        if "Redact all identified signatures" in handwrite_signature_checkbox:
+            #print("Signature boxes exist at merge:", signature_recogniser_results)
+            merged_bboxes.extend(signature_recogniser_results)
+
+
     # Reconstruct bounding boxes for substrings of interest
     reconstructed_bboxes = []
     for bbox in bboxes:

@@ -735,16 +810,6 @@ def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_result
 
         merged_bboxes.append(merged_box)
 
-    # Process signature and handwriting results
-    if signature_recogniser_results or handwriting_recogniser_results:
-        if "Redact all identified handwriting" in handwrite_signature_checkbox:
-            #print("Handwriting boxes exist at merge:", handwriting_recogniser_results)
-            merged_bboxes.extend(handwriting_recogniser_results)
-
-        if "Redact all identified signatures" in handwrite_signature_checkbox:
-            #print("Signature boxes exist at merge:", signature_recogniser_results)
-            merged_bboxes.extend(signature_recogniser_results)
-
     #print("bboxes:", bboxes)
 
     return merged_bboxes

@@ -1483,6 +1548,21 @@ def redact_text_pdf(
                 all_text_line_results.append((i, text_line_analyser_result))
 
             elif pii_identification_method == "AWS Comprehend":
+
+                # First use the local Spacy model to pick up custom entities that AWS Comprehend can't search for.
+                custom_redact_entities = [entity for entity in chosen_redact_comprehend_entities if entity in custom_entities]
+
+                text_line_analyser_result = nlp_analyser.analyze(
+                    text=text_line.text,
+                    language=language,
+                    entities=custom_redact_entities,
+                    score_threshold=score_threshold,
+                    return_decision_process=True,
+                    allow_list=allow_list
+                )
+                all_text_line_results.append((i, text_line_analyser_result))
+
+
                 if len(text_line.text) >= 3:
                     # Add separator between lines
                     if current_batch:
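
The new convert_pikepdf_coords_to_pymupdf is the core of the mediabox fix: when a page's mediabox is taller or wider than its visible rect (cropbox), the pikepdf /Rect coordinates are flipped against the mediabox height and then pulled back by a margin-scaled offset. A worked example of the arithmetic, with assumed page geometry (a 612x1008 pt mediabox cropped to a visible 612x792 pt rect):

# Assumed geometry standing in for pymupdf_page.mediabox and pymupdf_page.rect
media_width, media_height = 612.0, 1008.0
ref_width, ref_height = 612.0, 792.0

# Assumed pikepdf annotation "/Rect", bottom-left origin
x1, y1, x2, y2 = 100.0, 500.0, 200.0, 550.0

y_diff = media_height - ref_height  # 216.0 of hidden margin
x_diff = media_width - ref_width    # 0.0
y_ratio = y_diff / ref_height
x_ratio = x_diff / ref_width

# Flip to top-left origin against the mediabox, then pull back by the
# margin-scaled offset, exactly as convert_pikepdf_coords_to_pymupdf does
new_y1 = media_height - y2 - (y_diff * y_ratio)  # ~399.1
new_y2 = media_height - y1 - (y_diff * y_ratio)  # ~449.1
new_x1 = x1 - (x_diff * x_ratio)                 # 100.0, unchanged
new_x2 = x2 - (x_diff * x_ratio)                 # 200.0
print(new_x1, new_y1, new_x2, new_y2)

With no horizontal margin the x coordinates pass through untouched; only the y pair picks up the flip plus the 216 * (216/792) correction for the cropped-off strip.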
tools/helper_functions.py CHANGED

@@ -3,9 +3,20 @@ import re
 import gradio as gr
 import pandas as pd
 import unicodedata
+from gradio_image_annotation import image_annotator
 
 def reset_state_vars():
-    return [], [], pd.DataFrame(), pd.DataFrame(), 0, ""
+    return [], [], pd.DataFrame(), pd.DataFrame(), 0, "", image_annotator(
+        label="Modify redaction boxes",
+        label_list=["Redaction"],
+        label_colors=[(0, 0, 0)],
+        show_label=False,
+        sources=None,#["upload"],
+        show_clear_button=False,
+        show_share_button=False,
+        show_remove_button=False,
+        interactive=False
+    )
 
 def get_or_create_env_var(var_name, default_value):
     # Get the environment variable if it exists
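
reset_state_vars now also returns a freshly-configured, non-interactive image_annotator, so the annotation component itself is cleared when a new redaction run starts (it is the extra annotator output added to document_redact_btn.click in app.py). This leans on the Gradio pattern of returning a component instance from a callback to reconfigure an output; a minimal sketch of that pattern with a generic Textbox rather than the project's annotator:

import gradio as gr

def reset():
    # Returning a new component instance replaces the output's value and config
    return gr.Textbox(value="", interactive=False)

with gr.Blocks() as demo:
    box = gr.Textbox(value="stale contents")
    gr.Button("Reset").click(reset, outputs=[box])

demo.launch()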
tools/load_spacy_model_custom_recognisers.py CHANGED

@@ -10,6 +10,7 @@ import re
 # %%
 model_name = "en_core_web_lg" #"en_core_web_trf"
 score_threshold = 0.001
+custom_entities = ["TITLES", "UKPOSTCODE", "STREETNAME"]
 
 # %% [markdown]
 # #### Custom recognisers
tools/redaction_review.py CHANGED

@@ -37,9 +37,22 @@ def increase_page(number:int, image_annotator_object:AnnotatedImageData):
     else:
         return max_pages, max_pages
 
-def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int):
+def update_zoom(current_zoom_level:int, decrease:bool=True):
+    if decrease == False:
+        if current_zoom_level >= 50:
+            current_zoom_level -= 10
+    else:
+        if current_zoom_level < 100:
+            current_zoom_level += 10
+
+    return current_zoom_level
+
+
+def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, zoom:int=100):
     # print("\nImage annotator object:", image_annotator_object)
 
+    zoom_str = str(zoom) + '%'
+
     if not image_annotator_object:
         return image_annotator(
             label="Modify redaction boxes",

@@ -76,8 +89,8 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int):
         #label_list=["Redaction"],
         #label_colors=[(0, 0, 0)],
         show_label=False,
-        height=
-        width=
+        height=zoom_str,
+        width=zoom_str,
         box_min_size=1,
         box_selected_thickness=2,
         handle_size=4,
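
Zoom is applied by re-rendering the annotator with percentage height and width strings, so the component scales rather than re-rasterising the page. One wiring note: the boolean argument reads inverted, since zoom_true_bool (the zoom-in button in app.py) lands in the decrease=True branch, which increases the percentage. A quick check of the clamping behaviour of update_zoom as committed:

from tools.redaction_review import update_zoom

for level, flag in [(90, True), (100, True), (100, False), (50, False), (40, False)]:
    print(level, flag, "->", update_zoom(level, flag))
# 90 True -> 100
# 100 True -> 100 (cap: only levels below 100 are increased)
# 100 False -> 90
# 50 False -> 40
# 40 False -> 40 (floor: only levels of 50 or more are decreased)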