Sean Pedrick-Case committed
Commit 45c751d · unverified · 2 Parent(s): 9de60e6 c3a8cd7

Merge pull request #2 from seanpedrick-case/dev
app.py CHANGED
@@ -13,7 +13,7 @@ from gradio_image_annotation.image_annotator import AnnotatedImageData
from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
from tools.file_redaction import choose_and_run_redactor
- from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
+ from tools.file_conversion import prepare_image_or_pdf, get_input_file_names, CUSTOM_BOX_COLOUR
from tools.redaction_review import apply_redactions, modify_existing_page_redactions, decrease_page, increase_page, update_annotator, update_zoom, update_entities_df, df_select_callback
from tools.data_anonymise import anonymise_data_files
from tools.auth import authenticate_user
@@ -41,6 +41,8 @@ full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREET

language = 'en'

+
+
host_name = socket.gethostname()
feedback_logs_folder = 'feedback/' + today_rev + '/' + host_name + '/'
access_logs_folder = 'logs/' + today_rev + '/' + host_name + '/'
@@ -84,6 +86,8 @@ with app:
log_files_output_list_state = gr.State([])

review_file_state = gr.State(pd.DataFrame())
+
+ do_not_save_pdf_state = gr.State(False)

# Logging state
log_file_name = 'log.csv'
@@ -117,7 +121,7 @@ with app:


## Annotator zoom value
- annotator_zoom_number = gr.Number(label = "Current annotator zoom level", value=80, precision=0, visible=False)
+ annotator_zoom_number = gr.Number(label = "Current annotator zoom level", value=100, precision=0, visible=False)
zoom_true_bool = gr.State(True)
zoom_false_bool = gr.State(False)

@@ -344,16 +348,18 @@ with app:
# Page controls at top
annotate_current_page.submit(
modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
- then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
-
-
+ then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
+ then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])

annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
- then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
+ then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
+ then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
+
annotation_next_page_button.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
- then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
+ then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
+ then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])

# Zoom in and out on annotator
annotate_zoom_in.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
@@ -370,18 +376,23 @@ with app:
#annotation_button_get.click(get_boxes_json, annotator, json_boxes)
annotation_button_apply.click(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output], scroll_to_output=True)

+ do_not_save_pdf_state
+
# Page controls at bottom
annotate_current_page_bottom.submit(
modify_existing_page_redactions, inputs = [annotator, annotate_current_page_bottom, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
- then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
+ then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
+ then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])

annotation_last_page_button_bottom.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
- then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
+ then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
+ then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])

annotation_next_page_button_bottom.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
- then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
+ then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
+ then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])

# Review side bar controls
recogniser_entity_dropdown.select(update_entities_df, inputs=[recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs=[recogniser_entity_dataframe])
@@ -420,13 +431,13 @@ with app:
app.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])

# If running on AWS, load in the default allow list file from S3
- if RUN_AWS_FUNCTIONS == "1":
- print("default_allow_list_output_folder_location:", default_allow_list_loc)
- if not os.path.exists(default_allow_list_loc):
- app.load(download_file_from_s3, inputs=[s3_default_bucket, s3_default_allow_list_file, default_allow_list_output_folder_location]).\
- then(load_in_default_allow_list, inputs = [default_allow_list_output_folder_location], outputs=[in_allow_list])
- else:
- app.load(load_in_default_allow_list, inputs = [default_allow_list_output_folder_location], outputs=[in_allow_list])
+ # if RUN_AWS_FUNCTIONS == "1":
+ # print("default_allow_list_output_folder_location:", default_allow_list_loc)
+ # if not os.path.exists(default_allow_list_loc):
+ # app.load(download_file_from_s3, inputs=[s3_default_bucket, s3_default_allow_list_file, default_allow_list_output_folder_location]).\
+ # then(load_in_default_allow_list, inputs = [default_allow_list_output_folder_location], outputs=[in_allow_list])
+ # else:
+ # app.load(load_in_default_allow_list, inputs = [default_allow_list_output_folder_location], outputs=[in_allow_list])

# Log usernames and times of access to file (to know who is using the app when running on AWS)
access_callback = CSVLogger_custom(dataset_file_name=log_file_name)
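The notable wiring change in app.py is the new do_not_save_pdf_state flag, threaded as an extra input into apply_redactions at the end of each page-navigation chain. A minimal, self-contained sketch of this Gradio pattern (the component names page_box, save_flag, and status are hypothetical; only gr.State and the .then() chaining mirror the commit):

import gradio as gr

def go_to_page(page):
    # Hypothetical navigation callback: clamp the page number
    return max(1, int(page))

def save_outputs(page, do_not_save):
    # Mirrors how do_not_save_pdf_state gates the expensive save step
    if do_not_save:
        return f"page {page}: save skipped"
    return f"page {page}: saved"

with gr.Blocks() as demo:
    save_flag = gr.State(True)  # analogous to do_not_save_pdf_state
    page_box = gr.Number(value=1, precision=0, label="Page")
    status = gr.Textbox(label="Status")

    # submit() fires the first callback; .then() chains the second and
    # receives the State value as an ordinary input
    page_box.submit(go_to_page, inputs=[page_box], outputs=[page_box]).\
        then(save_outputs, inputs=[page_box, save_flag], outputs=[status])

demo.launch()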
doc_redaction_amplify_app ADDED
@@ -0,0 +1 @@
+ Subproject commit 9585642e4d1f72fc49971789693d5584661084c8
tools/auth.py CHANGED
@@ -7,13 +7,13 @@ import base64
from tools.helper_functions import get_or_create_env_var

client_id = get_or_create_env_var('AWS_CLIENT_ID', '')
- print(f'The value of AWS_CLIENT_ID is {client_id}')
+ #print(f'The value of AWS_CLIENT_ID is {client_id}')

client_secret = get_or_create_env_var('AWS_CLIENT_SECRET', '')
- print(f'The value of AWS_CLIENT_SECRET is {client_secret}')
+ #print(f'The value of AWS_CLIENT_SECRET is {client_secret}')

user_pool_id = get_or_create_env_var('AWS_USER_POOL_ID', '')
- print(f'The value of AWS_USER_POOL_ID is {user_pool_id}')
+ #print(f'The value of AWS_USER_POOL_ID is {user_pool_id}')

def calculate_secret_hash(client_id, client_secret, username):
message = username + client_id
@@ -46,24 +46,26 @@ def authenticate_user(username:str, password:str, user_pool_id:str=user_pool_id,

try:

- # response = client.initiate_auth(
- # AuthFlow='USER_PASSWORD_AUTH',
- # AuthParameters={
- # 'USERNAME': username,
- # 'PASSWORD': password,
- # },
- # ClientId=client_id
- # )
+ if client_secret == '':
+ response = client.initiate_auth(
+ AuthFlow='USER_PASSWORD_AUTH',
+ AuthParameters={
+ 'USERNAME': username,
+ 'PASSWORD': password,
+ },
+ ClientId=client_id
+ )

- response = client.initiate_auth(
- AuthFlow='USER_PASSWORD_AUTH',
- AuthParameters={
- 'USERNAME': username,
- 'PASSWORD': password,
- 'SECRET_HASH': secret_hash
- },
- ClientId=client_id
- )
+ else:
+ response = client.initiate_auth(
+ AuthFlow='USER_PASSWORD_AUTH',
+ AuthParameters={
+ 'USERNAME': username,
+ 'PASSWORD': password,
+ 'SECRET_HASH': secret_hash
+ },
+ ClientId=client_id
+ )

# If successful, you'll receive an AuthenticationResult in the response
if response.get('AuthenticationResult'):
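For reference, calculate_secret_hash (whose first lines appear in the hunk above) computes the standard AWS Cognito SECRET_HASH: an HMAC-SHA256 of username + client_id keyed with the app client secret, base64-encoded. A sketch assuming that standard construction, which is what the body presumably contains:

import base64
import hashlib
import hmac

def calculate_secret_hash(client_id: str, client_secret: str, username: str) -> str:
    # Standard Cognito SECRET_HASH: HMAC-SHA256 over (username + client_id),
    # keyed with the app client secret
    message = username + client_id
    digest = hmac.new(
        client_secret.encode("utf-8"),
        msg=message.encode("utf-8"),
        digestmod=hashlib.sha256,
    ).digest()
    # Cognito expects the base64-encoded digest as the SECRET_HASH parameter
    return base64.b64encode(digest).decode()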
tools/aws_textract.py CHANGED
@@ -145,8 +145,9 @@ def json_to_ocrresult(json_data, page_width, page_height, page_no):

# Extract text and bounding box for the line
line_text = text_block.get('Text', '')
-
words = []
+ current_line_handwriting_results = [] # Track handwriting results for this line
+
if 'Relationships' in text_block:
for relationship in text_block['Relationships']:
if relationship['Type'] == 'CHILD':
@@ -179,35 +180,56 @@ def json_to_ocrresult(json_data, page_width, page_height, page_no):
if text_type == "HANDWRITING":
is_handwriting = True
entity_name = "HANDWRITING"
- word_end = len(entity_name)
-
- recogniser_result = CustomImageRecognizerResult(entity_type=entity_name, text= word_text, score= confidence, start=0, end=word_end, left=word_left, top=word_top, width=word_width_abs, height=word_height_abs)
-
- if recogniser_result not in handwriting:
- handwriting.append(recogniser_result)
- #print("Handwriting found:", handwriting[-1])
+ word_end = len(word_text)
+
+ recogniser_result = CustomImageRecognizerResult(
+ entity_type=entity_name,
+ text=word_text,
+ score=confidence,
+ start=0,
+ end=word_end,
+ left=word_left,
+ top=word_top,
+ width=word_width_abs,
+ height=word_height_abs
+ )
+
+ # Add to handwriting collections immediately
+ handwriting.append(recogniser_result)
+ handwriting_recogniser_results.append(recogniser_result)
+ signature_or_handwriting_recogniser_results.append(recogniser_result)
+ current_line_handwriting_results.append(recogniser_result)

# If handwriting or signature, add to bounding box

elif (text_block['BlockType'] == 'SIGNATURE'):
line_text = "SIGNATURE"
-
is_signature = True
entity_name = "SIGNATURE"
- confidence = text_block['Confidence']
- word_end = len(entity_name)
-
- recogniser_result = CustomImageRecognizerResult(entity_type=entity_name, text= line_text, score= confidence, start=0, end=word_end, left=line_left, top=line_top, width=width_abs, height=height_abs)
-
- if recogniser_result not in signatures:
- signatures.append(recogniser_result)
- #print("Signature found:", signatures[-1])
-
- words = []
- words.append({
- 'text': line_text,
- 'bounding_box': (line_left, line_top, line_right, line_bottom)
- })
+ confidence = text_block.get('Confidence', 0)
+ word_end = len(line_text)
+
+ recogniser_result = CustomImageRecognizerResult(
+ entity_type=entity_name,
+ text=line_text,
+ score=confidence,
+ start=0,
+ end=word_end,
+ left=line_left,
+ top=line_top,
+ width=width_abs,
+ height=height_abs
+ )
+
+ # Add to signature collections immediately
+ signatures.append(recogniser_result)
+ signature_recogniser_results.append(recogniser_result)
+ signature_or_handwriting_recogniser_results.append(recogniser_result)
+
+ words = [{
+ 'text': line_text,
+ 'bounding_box': (line_left, line_top, line_right, line_bottom)
+ }]

ocr_results_with_children["text_line_" + str(i)] = {
"line": i,
tools/custom_image_analyser_engine.py CHANGED
@@ -14,6 +14,7 @@ from tools.helper_functions import clean_unicode_text
from tools.presidio_analyzer_custom import recognizer_result_from_dict
from tools.load_spacy_model_custom_recognisers import custom_entities
#import string # Import string to get a list of common punctuation characters
+ import re # Add this import at the top of the file

@dataclass
class OCRResult:
@@ -493,11 +494,12 @@ class CustomImageAnalyzerEngine:

elif pii_identification_method == "AWS Comprehend":

- # If using AWS Comprehend, Spacy model is only used to identify the custom entities created. Comprehend can't pick up Titles, Streetnames, and UKPostcodes specifically
+ # If using AWS Comprehend, Spacy model is only used to identify the custom entities created. This is because Comprehend can't pick up Titles, Streetnames, and UKPostcodes, or a custom deny list specifically
text_analyzer_kwargs["entities"] = [entity for entity in chosen_redact_comprehend_entities if entity in custom_entities]

spacy_analyzer_result = self.analyzer_engine.analyze(
text=line_level_ocr_result.text, **text_analyzer_kwargs)
+
analyzer_results_by_line[i].extend(spacy_analyzer_result)

if len(line_level_ocr_result.text) >= 3:
@@ -573,7 +575,7 @@ class CustomImageAnalyzerEngine:
for result in analyzer_result:
# Extract the relevant portion of text based on start and end
relevant_text = line_level_ocr_results[i].text[result.start:result.end]
-
+
# Find the corresponding entry in ocr_results_with_children
child_words = ocr_results_with_children_line_level['words']
@@ -583,13 +585,23 @@ class CustomImageAnalyzerEngine:
word_num = 0 # Initialize word count
total_width = 0 # Initialize total width

- for word_text in relevant_text.split(): # Iterate through each word in relevant_text
- #print("Looking for word_text:", word_text)
- for word in child_words:
- #if word['text'].strip(string.punctuation).strip() == word_text.strip(string.punctuation).strip(): # Check for exact match
- if word_text in word['text']:
+ split_relevant_text = relevant_text.split()
+
+ loop_child_words = child_words.copy()
+
+ for word_text in split_relevant_text: # Iterate through each word in relevant_text
+
+ quote_str = '"'
+ replace_str = '(?:"|“|”)'
+
+ word_regex = rf'(?<!\w){re.escape(word_text.strip()).replace(quote_str, replace_str)}(?!\w)'
+
+ for word in loop_child_words:
+ # Check for regex as whole word
+
+ if re.search(word_regex, word['text']):
+ #if re.search(r'\b' + re.escape(word_text) + r'\b', word['text']):
found_word = word
- #print("found_word:", found_word)

if word_num == 0: # First word
left = found_word['bounding_box'][0]
@@ -598,6 +610,10 @@ class CustomImageAnalyzerEngine:
all_words += found_word['text'] + " " # Concatenate words
total_width = found_word['bounding_box'][2] - left # Add each word's width
word_num += 1
+
+ # Drop the first word of child_words
+ loop_child_words = loop_child_words[1:] # Skip the first word
+
break # Move to the next word in relevant_text

width = total_width + horizontal_buffer # Set width to total width of all matched words
@@ -621,9 +637,9 @@ class CustomImageAnalyzerEngine:
result_reset_pos.start = 0
result_reset_pos.end = len(relevant_text)

- #print("result_reset_pos:", result_reset_pos)
- #print("relevant_line_ocr_result:", relevant_line_ocr_result)
- #print("ocr_results_with_children_line_level:", ocr_results_with_children_line_level)
+ print("result_reset_pos:", result_reset_pos)
+ print("relevant_line_ocr_result:", relevant_line_ocr_result)
+ print("ocr_results_with_children_line_level:", ocr_results_with_children_line_level)

# Map the analyzer results to bounding boxes for this line
line_results = self.map_analyzer_results_to_bounding_boxes(
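The new word-matching logic above swaps a loose substring test for a whole-word regex that also tolerates typographic double quotes. A standalone rendition of that regex (assuming the quote alternation is meant to cover straight plus left/right curly quotes, as the repaired pattern above shows):

import re

def build_word_regex(word_text: str) -> str:
    # Whole-word match ((?<!\w) ... (?!\w)) with straight and curly
    # double quotes treated as interchangeable
    quote_str = '"'
    replace_str = '(?:"|“|”)'
    return rf'(?<!\w){re.escape(word_text.strip()).replace(quote_str, replace_str)}(?!\w)'

print(bool(re.search(build_word_regex('"Smith"'), 'He said “Smith” today')))  # True
print(bool(re.search(build_word_regex('cat'), 'concatenate')))                # False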
tools/file_conversion.py CHANGED
@@ -1,5 +1,5 @@
from pdf2image import convert_from_path, pdfinfo_from_path
- from tools.helper_functions import get_file_path_end, output_folder, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, read_file
+ from tools.helper_functions import get_file_path_end, output_folder, tesseract_ocr_option, text_ocr_option, textract_option, read_file, get_or_create_env_var
from PIL import Image, ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True
import os
@@ -48,7 +48,8 @@ def is_pdf(filename):
# %%
## Convert pdf to image if necessary

-
+ CUSTOM_BOX_COLOUR = get_or_create_env_var("CUSTOM_BOX_COLOUR", "")
+ print(f'The value of CUSTOM_BOX_COLOUR is {CUSTOM_BOX_COLOUR}')

def process_single_page(pdf_path: str, page_num: int, image_dpi: float, output_dir: str = 'input') -> tuple[int, str]:
try:
@@ -261,7 +262,10 @@ def redact_single_box(pymupdf_page:Page, pymupdf_rect:Rect, img_annotation_box:d
else:
out_colour = img_annotation_box["color"]
else:
- out_colour = (0,0,0)
+ if CUSTOM_BOX_COLOUR == "grey":
+ out_colour = (0.5, 0.5, 0.5)
+ else:
+ out_colour = (0,0,0)

shape.finish(color=out_colour, fill=out_colour) # Black fill for the rectangle
#shape.finish(color=(0, 0, 0)) # Black fill for the rectangle
@@ -478,11 +482,12 @@ def prepare_image_or_pdf(
annotation["image"] = image_path

all_annotations_object.append(annotation)
-
- #print("all_annotations_object:", all_annotations_object)
-

elif is_pdf_or_image(file_path): # Alternatively, if it's an image
+ # Check if the file is an image type and the user selected text ocr option
+ if file_extension in ['.jpg', '.jpeg', '.png'] and in_redact_method == text_ocr_option:
+ in_redact_method = tesseract_ocr_option
+
# Convert image to a pymupdf document
pymupdf_doc = pymupdf.open() # Create a new empty document

@@ -491,14 +496,16 @@ def prepare_image_or_pdf(
page = pymupdf_doc.new_page(width=img.width, height=img.height) # Add a new page
page.insert_image(rect, filename=file_path) # Insert the image into the page

+ file_path_str = str(file_path)
+
+ image_file_paths = process_file(file_path_str, prepare_for_review)
+
+ print("Inserted image into PDF file")

- # Check if the file is an image type and the user selected text ocr option
- elif file_extension in ['.jpg', '.jpeg', '.png'] and in_redact_method == text_ocr_option:
- in_redact_method = tesseract_ocr_option

elif file_extension in ['.csv']:
review_file_csv = read_file(file)
- all_annotations_object = convert_pandas_df_to_review_json(review_file_csv)
+ all_annotations_object = convert_pandas_df_to_review_json(review_file_csv, image_file_paths)
json_from_csv = True
print("Converted CSV review file to json")

@@ -618,12 +625,7 @@ def prepare_image_or_pdf(
out_message.append(out_time)
out_message_out = '\n'.join(out_message)

- #if prepare_for_review == False:
number_of_pages = len(image_file_paths)
- #else:
- # number_of_pages = len(all_annotations_object)
-
- #print("all_annotations_object at end:", all_annotations_object)

return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv

@@ -650,23 +652,6 @@ def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str],

return out_message, out_file_paths

- # Example DataFrames
- # df1 = pd.DataFrame({
- # 'xmin': [10, 20, 30],
- # 'xmax': [15, 25, 35],
- # 'ymin': [40, 50, 60],
- # 'ymax': [45, 55, 65],
- # 'info1': ['A', 'B', 'C']
- # })
-
- # df2 = pd.DataFrame({
- # 'xmin': [12, 18, 32],
- # 'xmax': [14, 24, 34],
- # 'ymin': [42, 48, 62],
- # 'ymax': [44, 54, 66],
- # 'info2': ['X', 'Y', 'Z']
- # })
-
def join_values_within_threshold(df1, df2):
# Threshold for matching
threshold = 5
@@ -757,25 +742,38 @@ def convert_review_json_to_pandas_df(data:List[dict], text_join_data=pd.DataFram

return df

- def convert_pandas_df_to_review_json(df: pd.DataFrame) -> List[dict]:
+ def convert_pandas_df_to_review_json(df: pd.DataFrame, image_paths: List[Image.Image]) -> List[dict]:
+ '''
+ Convert a review csv to a json file for use by the Gradio Annotation object
+ '''
# Keep only necessary columns
df = df[["image", "page", "xmin", "ymin", "xmax", "ymax", "color", "label"]]

# Group the DataFrame by the 'image' column
- grouped = df.groupby('image')
+ grouped_csv_pages = df.groupby('page')

# Create a list to hold the JSON data
json_data = []

- # Iterate over each group
- for image_path, group in grouped:
- # Convert each group to a list of box dictionaries
- boxes = group.drop(columns=['image', 'page']).to_dict(orient='records')
-
+ for n, pdf_image_path in enumerate(image_paths):
+ reported_page_number = int(n + 1)
+
+ if reported_page_number in df["page"].values:
+
+ # Convert each relevant group to a list of box dictionaries
+ selected_csv_pages = grouped_csv_pages.get_group(reported_page_number)
+ annotation_boxes = selected_csv_pages.drop(columns=['image', 'page']).to_dict(orient='records')
+
+ annotation = {
+ "image": pdf_image_path,
+ "boxes": annotation_boxes
+ }
+
+ else:
+ annotation = {}
+ annotation["image"] = pdf_image_path
+
# Append the structured data to the json_data list
- json_data.append({
- "image": image_path,
- "boxes": boxes
- })
+ json_data.append(annotation)

return json_data
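A usage sketch for the reworked convert_pandas_df_to_review_json, which now walks the page images and attaches boxes only to pages present in the CSV (the sample paths and values below are hypothetical; the import assumes the repo layout):

import pandas as pd
from tools.file_conversion import convert_pandas_df_to_review_json

# Two-page document; the review CSV only has a box on page 1
review_df = pd.DataFrame({
    "image": ["input/page_1.png"],
    "page": [1],
    "xmin": [10.0], "ymin": [20.0], "xmax": [110.0], "ymax": [60.0],
    "color": ["(0, 0, 0)"], "label": ["PERSON"],
})
image_paths = ["input/page_1.png", "input/page_2.png"]

annotations = convert_pandas_df_to_review_json(review_df, image_paths)

assert annotations[0]["boxes"][0]["label"] == "PERSON"  # page 1 keeps its box
assert "boxes" not in annotations[1]                    # page 2 is image-only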
tools/file_redaction.py CHANGED
@@ -269,7 +269,7 @@ def choose_and_run_redactor(file_paths:List[str],
print("Redacting file:", file_path_without_ext)

is_a_pdf = is_pdf(file_path) == True
- if is_a_pdf == False:
+ if is_a_pdf == False and in_redact_method == text_ocr_option:
# If user has not submitted a pdf, assume it's an image
print("File is not a pdf, assuming that image analysis needs to be used.")
in_redact_method = tesseract_ocr_option
@@ -708,8 +708,6 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image=None, custo
if image:
img_width, img_height = image.size

- print("annot:", annot)
-
x1, image_y1, x2, image_y2 = convert_pymupdf_to_image_coords(page, x1, pymupdf_y1, x2, pymupdf_y2, image)

img_annotation_box["xmin"] = x1 #* (img_width / rect_width) # Use adjusted x1
@@ -745,16 +743,11 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image=None, custo
"boxes": all_image_annotation_boxes
}

-
-
-
page.apply_redactions(images=0, graphics=0)
page.clean_contents()

return page, out_annotation_boxes

-
-
def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_results=[], handwriting_recogniser_results=[], handwrite_signature_checkbox: List[str]=["Redact all identified handwriting", "Redact all identified signatures"], horizontal_threshold:int=50, vertical_threshold:int=12):

all_bboxes = []
@@ -832,7 +825,11 @@ def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_result
for next_box in group[1:]:
if next_box.left - (merged_box.left + merged_box.width) <= horizontal_threshold:
new_text = merged_box.text + " " + next_box.text
- new_entity_type = merged_box.entity_type + " - " + next_box.entity_type
+
+ if merged_box.entity_type != next_box.entity_type:
+ new_entity_type = merged_box.entity_type + " - " + next_box.entity_type
+ else:
+ new_entity_type = merged_box.entity_type

new_left = min(merged_box.left, next_box.left)
new_top = min(merged_box.top, next_box.top)
@@ -973,9 +970,6 @@ def redact_image_pdf(file_path:str,
print("Page range:", str(page_min + 1), "to", str(page_max))
#print("Current_loop_page:", current_loop_page)

- if analysis_type == tesseract_ocr_option: ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".csv"
- elif analysis_type == textract_option: ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.csv"
-
# If running Textract, check if file already exists. If it does, load in existing data
# Import results from json and convert
if analysis_type == textract_option:
@@ -984,7 +978,6 @@ def redact_image_pdf(file_path:str,
log_files_output_paths.append(json_file_path)

if not os.path.exists(json_file_path):
- no_textract_file = True
print("No existing Textract results file found.")
existing_data = {}
#text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
@@ -1042,12 +1035,8 @@ def redact_image_pdf(file_path:str,

# Step 1: Perform OCR. Either with Tesseract, or with AWS Textract
if analysis_type == tesseract_ocr_option:
-
word_level_ocr_results = image_analyser.perform_ocr(image)
-
- # Combine OCR results
line_level_ocr_results, line_level_ocr_results_with_children = combine_ocr_results(word_level_ocr_results)
-

# Import results from json and convert
if analysis_type == textract_option:
@@ -1085,44 +1074,6 @@ def redact_image_pdf(file_path:str,
text_blocks = next(page['data'] for page in existing_data["pages"] if page['page_no'] == reported_page_number)


- # if not os.path.exists(json_file_path):
- # text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
- # log_files_output_paths.append(json_file_path)
- # request_metadata = request_metadata + "\n" + new_request_metadata
-
- # existing_data = {"pages":[text_blocks]}
-
-
- # else:
- # # Open the file and load the JSON data
- # print("Found existing Textract json results file.")
- # with open(json_file_path, 'r') as json_file:
- # existing_data = json.load(json_file)
-
- # # Check if the current reported_page_number exists in the loaded JSON
- # page_exists = any(page['page_no'] == reported_page_number for page in existing_data.get("pages", []))
-
- # if not page_exists: # If the page does not exist, analyze again
- # print(f"Page number {reported_page_number} not found in existing data. Analyzing again.")
- # text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
-
- # # Check if "pages" key exists, if not, initialize it as an empty list
- # if "pages" not in existing_data:
- # existing_data["pages"] = []
-
- # # Append the new page data
- # existing_data["pages"].append(text_blocks)
-
- # # Write the updated existing_data back to the JSON file
- # with open(json_file_path, 'w') as json_file:
- # json.dump(existing_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
-
- # log_files_output_paths.append(json_file_path)
- # request_metadata = request_metadata + "\n" + new_request_metadata
- # else:
- # # If the page exists, retrieve the data
- # text_blocks = next(page['data'] for page in existing_data["pages"] if page['page_no'] == reported_page_number)
-
line_level_ocr_results, handwriting_or_signature_boxes, signature_recogniser_results, handwriting_recogniser_results, line_level_ocr_results_with_children = json_to_ocrresult(text_blocks, page_width, page_height, reported_page_number)

# Step 2: Analyze text and identify PII
@@ -1194,7 +1145,8 @@ def redact_image_pdf(file_path:str,
else:
#print("redact_whole_page_list:", redact_whole_page_list)
if redact_whole_page_list:
- if current_loop_page in redact_whole_page_list: redact_whole_page = True
+ int_reported_page_number = int(reported_page_number)
+ if int_reported_page_number in redact_whole_page_list: redact_whole_page = True
else: redact_whole_page = False
else: redact_whole_page = False

@@ -1345,8 +1297,14 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
character_objects_out.append(char) # Collect character objects

if isinstance(char, LTAnno):
+
+ added_text = char.get_text()
+
+ # Handle double quotes
+ added_text = added_text.replace('"', '\\"') # Escape double quotes
+
# Handle space separately by finalizing the word
- full_text += char.get_text() # Adds space or newline
+ full_text += added_text # Adds space or newline

if current_word: # Only finalize if there is a current word
word_bboxes.append((current_word, current_word_bbox))
@@ -1354,7 +1312,7 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
current_word_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')] # Reset for next word

# Check for line break (assuming a new line is indicated by a specific character)
- if '\n' in char.get_text():
+ if '\n' in added_text:
#print("char_anno:", char)
# Finalize the current line
if current_word:
@@ -1373,7 +1331,6 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup

# Concatenate text for LTChar

-
#full_text += char.get_text()
#added_text = re.sub(r'[^\x00-\x7F]+', ' ', char.get_text())
added_text = char.get_text()
@@ -1382,8 +1339,6 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
added_text = clean_unicode_text(added_text)
full_text += added_text # Adds space or newline, removing

-
-
# Update overall bounding box
x0, y0, x1, y1 = char.bbox
overall_bbox[0] = min(overall_bbox[0], x0) # x0
@@ -1480,7 +1435,10 @@ def merge_text_bounding_boxes(analyser_results, characters: List[LTChar], combin
merged_box[3] = max(current_box[3], next_box[3]) # Adjust the top
merged_result.end = max(current_result.end, result.end) # Extend text range
try:
- merged_result.entity_type = current_result.entity_type + " - " + result.entity_type
+ if current_result.entity_type != result.entity_type:
+ merged_result.entity_type = current_result.entity_type + " - " + result.entity_type
+ else:
+ merged_result.entity_type = current_result.entity_type
except Exception as e:
print("Unable to combine result entity types:", e)
if current_text:
@@ -1877,8 +1835,9 @@ def redact_text_pdf(

# Make pymupdf page redactions
#print("redact_whole_page_list:", redact_whole_page_list)
- if redact_whole_page_list:
- if current_loop_page in redact_whole_page_list: redact_whole_page = True
+ if redact_whole_page_list:
+ int_reported_page_number = int(reported_page_number)
+ if int_reported_page_number in redact_whole_page_list: redact_whole_page = True
else: redact_whole_page = False
else: redact_whole_page = False

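Both entity-merge fixes in this file apply the same rule: only concatenate entity types when they differ, so merging two boxes of the same type no longer produces labels like "PERSON - PERSON". The rule in isolation (function name hypothetical):

def merge_entity_types(current: str, nxt: str) -> str:
    # Concatenate only when the types differ; otherwise keep the single type
    if current != nxt:
        return current + " - " + nxt
    return current

assert merge_entity_types("PERSON", "PERSON") == "PERSON"
assert merge_entity_types("PERSON", "EMAIL_ADDRESS") == "PERSON - EMAIL_ADDRESS"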
tools/load_spacy_model_custom_recognisers.py CHANGED
@@ -24,14 +24,22 @@ except:
print("Successfully downloaded and imported spaCy model", model_name)

# #### Custom recognisers
- # Allow user to create their own recogniser
def custom_word_list_recogniser(custom_list:List[str]=[]):
- custom_regex = '\\b' + '\\b|\\b'.join(rf"{re.escape(term)}" for term in custom_list) + '\\b'
- custom_pattern = Pattern(name="custom_pattern",regex=custom_regex, score = 1)
+ # Create regex pattern, handling quotes carefully

- #print("custom_pattern:", custom_pattern)
+ quote_str = '"'
+ replace_str = '(?:"|“|”)'
+
+ custom_regex = '|'.join(
+ rf'(?<!\w){re.escape(term.strip()).replace(quote_str, replace_str)}(?!\w)'
+ for term in custom_list
+ )
+ print(custom_regex)
+
+ custom_pattern = Pattern(name="custom_pattern", regex=custom_regex, score = 1)
+
custom_recogniser = PatternRecognizer(supported_entity="CUSTOM", name="CUSTOM", patterns = [custom_pattern],
- global_regex_flags=re.DOTALL | re.MULTILINE | re.IGNORECASE)
+ global_regex_flags=re.DOTALL | re.MULTILINE | re.IGNORECASE)

return custom_recogniser

tools/redaction_review.py CHANGED
@@ -49,18 +49,12 @@ def update_zoom(current_zoom_level:int, annotate_current_page:int, decrease:bool

return current_zoom_level, annotate_current_page

- def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, recogniser_entities_drop=gr.Dropdown(value="ALL", allow_custom_value=True), recogniser_dataframe_gr=gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]})), zoom:int=80):
'''
Update a gradio_image_annotation object with new annotation data
'''
recogniser_entities = []
recogniser_dataframe = pd.DataFrame()
- #recogniser_entities_drop = gr.Dropdown(value="ALL", allow_custom_value=True)
- #recogniser_dataframe_gr = gr.Dataframe(pd.DataFrame(data={"page":[""], "label":[""]}))
-
- #print("recogniser_dataframe_gr", recogniser_dataframe_gr)
- #print("recogniser_dataframe_gr shape", recogniser_dataframe_gr.shape)
- #print("recogniser_dataframe_gr.iloc[0,0]:", recogniser_dataframe_gr.iloc[0,0])

if recogniser_dataframe_gr.iloc[0,0] == "":
try:
@@ -228,7 +222,7 @@ def modify_existing_page_redactions(image_annotated:AnnotatedImageData, current_

return all_image_annotations, current_page, current_page, recogniser_entities_drop, recogniser_dataframe_out

- def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], doc:Document, all_image_annotations:List[AnnotatedImageData], current_page:int, review_file_state, progress=gr.Progress(track_tqdm=True)):
'''
Apply modified redactions to a pymupdf and export review files
'''
@@ -251,75 +245,76 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d

file_paths = [file_paths]

for file_path in file_paths:
- print("file_path:", file_path)
file_base = get_file_path_end(file_path)

file_extension = os.path.splitext(file_path)[1].lower()

- # If working with image docs
- if (is_pdf(file_path) == False) & (file_extension not in '.csv'):
- image = Image.open(file_paths[-1])

- #image = pdf_doc

- draw = ImageDraw.Draw(image)

- for img_annotation_box in image_annotated['boxes']:
- coords = [img_annotation_box["xmin"],
- img_annotation_box["ymin"],
- img_annotation_box["xmax"],
- img_annotation_box["ymax"]]

- fill = img_annotation_box["color"]

- draw.rectangle(coords, fill=fill)

- image.save(output_folder + file_base + "_redacted.png")

- doc = [image]

- elif file_extension in '.csv':
- print("This is a csv")
- pdf_doc = []

- # If working with pdfs
- elif is_pdf(file_path) == True:
- pdf_doc = pymupdf.open(file_path)

- number_of_pages = pdf_doc.page_count

- print("Saving pages to file.")

- for i in progress.tqdm(range(0, number_of_pages), desc="Saving redactions to file", unit = "pages"):

- #print("Saving page", str(i))
-
- image_loc = all_image_annotations[i]['image']
- #print("Image location:", image_loc)

- # Load in image object
- if isinstance(image_loc, np.ndarray):
- image = Image.fromarray(image_loc.astype('uint8'))
- #all_image_annotations[i]['image'] = image_loc.tolist()
- elif isinstance(image_loc, Image.Image):
- image = image_loc
- #image_out_folder = output_folder + file_base + "_page_" + str(i) + ".png"
- #image_loc.save(image_out_folder)
- #all_image_annotations[i]['image'] = image_out_folder
- elif isinstance(image_loc, str):
- image = Image.open(image_loc)

- pymupdf_page = pdf_doc.load_page(i) #doc.load_page(current_page -1)
- pymupdf_page = redact_page_with_pymupdf(pymupdf_page, all_image_annotations[i], image)
314
 
315
- else:
316
- print("File type not recognised.")
317
-
318
- #try:
319
- if pdf_doc:
320
- out_pdf_file_path = output_folder + file_base + "_redacted.pdf"
321
- pdf_doc.save(out_pdf_file_path)
322
- output_files.append(out_pdf_file_path)
323
 
324
  try:
325
  print("Saving annotations to JSON")
@@ -331,7 +326,7 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
331
 
332
  print("Saving annotations to CSV review file")
333
 
334
- print("review_file_state:", review_file_state)
335
 
336
  # Convert json to csv and also save this
337
  review_df = convert_review_json_to_pandas_df(all_image_annotations, review_file_state)
 
49
 
50
  return current_zoom_level, annotate_current_page
51
 
52
+ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, recogniser_entities_drop=gr.Dropdown(value="ALL", allow_custom_value=True), recogniser_dataframe_gr=gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]})), zoom:int=100):
53
  '''
54
  Update a gradio_image_annotation object with new annotation data
55
  '''
56
  recogniser_entities = []
57
  recogniser_dataframe = pd.DataFrame()
 
 
59
  if recogniser_dataframe_gr.iloc[0,0] == "":
60
  try:
 
222
 
223
  return all_image_annotations, current_page, current_page, recogniser_entities_drop, recogniser_dataframe_out
224
 
225
+ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], doc:Document, all_image_annotations:List[AnnotatedImageData], current_page:int, review_file_state, save_pdf:bool=True, progress=gr.Progress(track_tqdm=True)):
226
  '''
227
  Apply modified redactions to a pymupdf and export review files
228
  '''
 
245
  file_paths = [file_paths]
246
 
247
  for file_path in file_paths:
248
+ #print("file_path:", file_path)
249
  file_base = get_file_path_end(file_path)
250
 
251
  file_extension = os.path.splitext(file_path)[1].lower()
252
 
253
+ if save_pdf == True:
254
+ # If working with image docs
255
+ if (is_pdf(file_path) == False) & (file_extension not in '.csv'):
256
+ image = Image.open(file_paths[-1])
257
 
258
+ #image = pdf_doc
259
 
260
+ draw = ImageDraw.Draw(image)
261
 
262
+ for img_annotation_box in image_annotated['boxes']:
263
+ coords = [img_annotation_box["xmin"],
264
+ img_annotation_box["ymin"],
265
+ img_annotation_box["xmax"],
266
+ img_annotation_box["ymax"]]
267
 
268
+ fill = img_annotation_box["color"]
269
 
270
+ draw.rectangle(coords, fill=fill)
271
 
272
+ image.save(output_folder + file_base + "_redacted.png")
273
 
274
+ doc = [image]
275
 
276
+ elif file_extension in '.csv':
277
+ print("This is a csv")
278
+ pdf_doc = []
279
 
280
+ # If working with pdfs
281
+ elif is_pdf(file_path) == True:
282
+ pdf_doc = pymupdf.open(file_path)
283
 
284
+ number_of_pages = pdf_doc.page_count
285
 
286
+ print("Saving pages to file.")
287
 
288
+ for i in progress.tqdm(range(0, number_of_pages), desc="Saving redactions to file", unit = "pages"):
289
 
290
+ #print("Saving page", str(i))
291
+
292
+ image_loc = all_image_annotations[i]['image']
293
+ #print("Image location:", image_loc)
294
 
295
+ # Load in image object
296
+ if isinstance(image_loc, np.ndarray):
297
+ image = Image.fromarray(image_loc.astype('uint8'))
298
+ #all_image_annotations[i]['image'] = image_loc.tolist()
299
+ elif isinstance(image_loc, Image.Image):
300
+ image = image_loc
301
+ #image_out_folder = output_folder + file_base + "_page_" + str(i) + ".png"
302
+ #image_loc.save(image_out_folder)
303
+ #all_image_annotations[i]['image'] = image_out_folder
304
+ elif isinstance(image_loc, str):
305
+ image = Image.open(image_loc)
306
 
307
+ pymupdf_page = pdf_doc.load_page(i) #doc.load_page(current_page -1)
308
+ pymupdf_page = redact_page_with_pymupdf(pymupdf_page, all_image_annotations[i], image)
309
 
310
+ else:
311
+ print("File type not recognised.")
312
+
313
+ #try:
314
+ if pdf_doc:
315
+ out_pdf_file_path = output_folder + file_base + "_redacted.pdf"
316
+ pdf_doc.save(out_pdf_file_path)
317
+ output_files.append(out_pdf_file_path)
318
 
319
  try:
320
  print("Saving annotations to JSON")
 
326
 
327
  print("Saving annotations to CSV review file")
328
 
329
+ #print("review_file_state:", review_file_state)
330
 
331
  # Convert json to csv and also save this
332
  review_df = convert_review_json_to_pandas_df(all_image_annotations, review_file_state)
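
The new save_pdf flag (default True, preserving the old behaviour) lets callers refresh the review JSON/CSV outputs without re-rendering the redacted PDF or image. A hedged usage sketch; the argument values are illustrative and the return values are omitted:

    # Sketch: regenerate review files only, skipping the slow PDF export.
    apply_redactions(
        image_annotated=annotator_output,
        file_paths=["example.pdf"],
        doc=pymupdf_doc,
        all_image_annotations=all_image_annotations,
        current_page=1,
        review_file_state=review_df,
        save_pdf=False,  # review outputs only; no redacted PDF/PNG written
    )

One caveat in the retained branching: file_extension in '.csv' is a substring containment test ('.c' would also pass), so an equality check such as file_extension == '.csv' would be stricter.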