Commit a770956
Parent(s): 928b1e9
Enhance file handling and UI features: improve the Gradio app layout with a fill-width option and integrate new settings for deny and fully-redacted-page lists (placeholders so far). Update the file conversion functions to handle CSV inputs and add CSV review file generation for redactions. All original and merged redaction boxes are now retained.
Files changed:
- .dockerignore +1 -0
- .gitignore +1 -0
- Dockerfile +1 -0
- app.py +53 -35
- tools/file_conversion.py +86 -18
- tools/file_redaction.py +399 -136
- tools/helper_functions.py +12 -12
- tools/redaction_review.py +108 -73
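For context, the new deny list and fully-redacted-pages list follow the same shape as the existing allow list: a CSV with a single column, one term (or page number) per row, as described by the new file-upload labels in app.py below. A minimal sketch of producing such files with pandas; the file names are the defaults declared in the diff, and whether a header row is expected is an assumption not confirmed by this commit:

    import pandas as pd

    # One word/phrase per row, case sensitive; terms in the deny list are always redacted.
    pd.DataFrame(["Example Ltd", "Jane Doe"]).to_csv("default_deny_list.csv", index=False, header=False)

    # One page number per row; these pages are redacted in full.
    pd.DataFrame([1, 5]).to_csv("default_fully_redacted_list.csv", index=False, header=False)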
.dockerignore CHANGED
@@ -6,6 +6,7 @@
 *.ipynb
 examples/*
 processing/*
+input/*
 output/*
 tools/__pycache__/*
 old_code/*
.gitignore CHANGED
@@ -6,6 +6,7 @@
 *.ipynb
 examples/*
 processing/*
+input/*
 output/*
 tools/__pycache__/*
 old_code/*
Dockerfile CHANGED
@@ -52,6 +52,7 @@ RUN useradd -m -u 1000 user
 
 # Create required directories
 RUN mkdir -p /home/user/app/output \
+    && mkdir -p /home/user/app/input \
     && mkdir -p /home/user/app/tld \
     && mkdir -p /home/user/app/logs \
     && chown -R user:user /home/user/app
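The new input/ directory (created in the image and excluded by .dockerignore/.gitignore) is where page images are cached. A small sketch of the path layout it implies, mirroring the out_path construction added to process_single_page in tools/file_conversion.py below (the PDF name here is illustrative):

    import os

    # output_dir defaults to 'input' and is resolved against the working directory
    output_dir = os.path.join(os.getcwd(), "input")
    out_path = os.path.join(output_dir, f"{os.path.basename('/tmp/example.pdf')}_0.png")
    # -> <cwd>/input/example.pdf_0.png
    os.makedirs(os.path.dirname(out_path), exist_ok=True)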
app.py CHANGED
@@ -54,7 +54,7 @@ else:
     default_pii_detector = local_pii_detector
 
 # Create the gradio interface
-app = gr.Blocks(theme = gr.themes.Base())
+app = gr.Blocks(theme = gr.themes.Base(), fill_width=True)
 
 with app:
 
@@ -67,7 +67,7 @@ with app:
     all_line_level_ocr_results_df_state = gr.State(pd.DataFrame())
     all_decision_process_table_state = gr.State(pd.DataFrame())
 
-
+
 
     session_hash_state = gr.State()
     s3_output_folder_state = gr.State()
@@ -106,15 +106,7 @@ with app:
     estimated_time_taken_number = gr.Number(label = "estimated_time_taken_number", value=0.0, precision=1, visible=False) # This keeps track of the time taken to redact files for logging purposes.
     annotate_previous_page = gr.Number(value=0, label="Previous page", precision=0, visible=False) # Keeps track of the last page that the annotator was on
 
-    s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
-
-    ## S3 default bucket and allow list file state
-    default_allow_list_file_name = "default_allow_list.csv"
-    default_allow_list_loc = output_folder + "/" + default_allow_list_file_name
-
-    s3_default_bucket = gr.Textbox(label = "Default S3 bucket", value=bucket_name, visible=False)
-    s3_default_allow_list_file = gr.Textbox(label = "Default allow list file", value=default_allow_list_file_name, visible=False)
-    default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=default_allow_list_loc, visible=False)
+    s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
 
 
     ## Annotator zoom value
@@ -125,6 +117,25 @@ with app:
     clear_all_page_redactions = gr.State(True)
     prepare_for_review_bool = gr.Checkbox(value=True, visible=False)
 
+    ## Settings page variables
+    default_allow_list_file_name = "default_allow_list.csv"
+    default_allow_list_loc = output_folder + "/" + default_allow_list_file_name
+    in_allow_list_state = gr.State(pd.DataFrame())
+
+    default_deny_list_file_name = "default_deny_list.csv"
+    default_deny_list_loc = output_folder + "/" + default_deny_list_file_name
+    in_deny_list_state = gr.State(pd.DataFrame())
+    in_deny_list_text_in = gr.Textbox(value="Deny list", visible=False)
+
+    fully_redacted_list_file_name = "default_fully_redacted_list.csv"
+    fully_redacted_list_loc = output_folder + "/" + fully_redacted_list_file_name
+    in_fully_redacted_list_state = gr.State(pd.DataFrame())
+    in_fully_redacted_text_in = gr.Textbox(value="Fully redacted page list", visible=False)
+
+    # S3 settings for default allow list load
+    s3_default_bucket = gr.Textbox(label = "Default S3 bucket", value=bucket_name, visible=False)
+    s3_default_allow_list_file = gr.Textbox(label = "Default allow list file", value=default_allow_list_file_name, visible=False)
+    default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=default_allow_list_loc, visible=False)
 
     ###
     # UI DESIGN
@@ -172,6 +183,10 @@ with app:
     # Object annotation
     with gr.Tab("Review redactions", id="tab_object_annotation"):
 
+        with gr.Accordion(label = "Review previous redactions", open=True):
+            output_review_files = gr.File(label="Review output files", file_count='multiple')
+            upload_previous_review_file_btn = gr.Button("Review previously created redaction file (upload original PDF and ...review_file.csv)")
+
         with gr.Row():
             annotation_last_page_button = gr.Button("Previous page", scale = 3)
             annotate_current_page = gr.Number(value=1, label="Page (press enter to change)", precision=0, scale = 2)
@@ -203,9 +218,7 @@ with app:
             annotate_max_pages_bottom = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 1)
             annotation_next_page_button_bottom = gr.Button("Next page", scale = 3)
 
-
-        upload_previous_review_file_btn = gr.Button("Review previously created redaction file (upload original PDF and ...redactions.json)")
-
+
     # TEXT / TABULAR DATA TAB
     with gr.Tab(label="Open text or Excel/csv files"):
         gr.Markdown(
@@ -236,8 +249,6 @@ with app:
         data_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
         data_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
 
-
-
     # SETTINGS TAB
     with gr.Tab(label="Redaction settings"):
         gr.Markdown(
@@ -250,14 +261,18 @@ with app:
             page_min = gr.Number(precision=0,minimum=0,maximum=9999, label="Lowest page to redact")
             page_max = gr.Number(precision=0,minimum=0,maximum=9999, label="Highest page to redact")
 
-
         with gr.Accordion("Settings for documents and open text/xlsx/csv files", open = True):
             with gr.Row():
-
-
-                gr.Markdown("""Import allow list file - csv table with one column of a different word/phrase on each row (case sensitive). Terms in this file will not be redacted.""")
+                with gr.Column():
+                    in_allow_list = gr.File(label="Import allow list file - csv table with one column of a different word/phrase on each row (case sensitive). Terms in this file will not be redacted.", file_count="multiple", height=50)
                     in_allow_list_text = gr.Textbox(label="Custom allow list load status")
-
+                with gr.Column():
+                    in_deny_list = gr.File(label="Import custom deny list - csv table with one column of a different word/phrase on each row (case sensitive). Terms in this file will always be redacted.", file_count="multiple", height=50)
+                    in_deny_list_text = gr.Textbox(label="Custom deny list load status")
+                with gr.Column():
+                    in_fully_redacted_list = gr.File(label="Import fully redacted pages list - csv table with one column of page numbers on each row. Page numbers in this file will be fully redacted.", file_count="multiple", height=50)
+                    in_fully_redacted_list_text = gr.Textbox(label="Fully redacted page list load status")
+
         with gr.Accordion("Add or remove entity types to redact", open = False):
             in_redact_comprehend_entities = gr.Dropdown(value=chosen_comprehend_entities, choices=full_comprehend_entity_list, multiselect=True, label="Entities to redact - AWS Comprehend PII identification model (click close to down arrow for full list)")
 
@@ -266,15 +281,11 @@ with app:
             handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract settings", choices=["Redact all identified handwriting", "Redact all identified signatures"], value=["Redact all identified handwriting"])
             #with gr.Row():
             in_redact_language = gr.Dropdown(value = "en", choices = ["en"], label="Redaction language (only English currently supported)", multiselect=False, visible=False)
-
 
         with gr.Accordion("Settings for open text or xlsx/csv files", open = True):
             anon_strat = gr.Radio(choices=["replace with <REDACTED>", "replace with <ENTITY_NAME>", "redact", "hash", "mask", "encrypt", "fake_first_name"], label="Select an anonymisation method.", value = "replace with <REDACTED>")
 
-        log_files_output = gr.File(label="Log file output", interactive=False)
-
-        # If a custom allow list is uploaded
-        in_allow_list.change(fn=custom_regex_load, inputs=[in_allow_list], outputs=[in_allow_list_text, in_allow_list_state])
+        log_files_output = gr.File(label="Log file output", interactive=False)
 
     ###
     # PDF/IMAGE REDACTION
@@ -283,25 +294,22 @@ with app:
 
     document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator]).\
         then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state], api_name="prepare_doc").\
-        then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
+        then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
                     outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number], api_name="redact_doc").\
         then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])
 
     # If the app has completed a batch of pages, it will run this until the end of all pages in the document
-    current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
+    current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
                     outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number]).\
         then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])
 
     # If a file has been completed, the function will continue onto the next document
     latest_file_completed_text.change(fn=update_annotator, inputs=[all_image_annotations_state, page_min, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page]).\
         then(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
-    # latest_file_completed_text.change(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, current_loop_page_number], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state]).\
-    # then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return],
-    # outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state]).\
-    #then(fn=update_annotator, inputs=[all_image_annotations_state, page_min], outputs=[annotator, annotate_current_page]).\
-    #then(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
 
-    ###
+    ###
+    # REVIEW PDF REDACTIONS
+    ###
 
     # Page controls at top
     annotate_current_page.submit(
@@ -326,7 +334,7 @@ with app:
         then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])
 
     #annotation_button_get.click(get_boxes_json, annotator, json_boxes)
-    annotation_button_apply.click(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files], scroll_to_output=True)
+    annotation_button_apply.click(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output], scroll_to_output=True)
 
     # Page controls at bottom
    annotate_current_page_bottom.submit(
@@ -355,6 +363,16 @@ with app:
     text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state]).\
         then(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
 
+    ###
+    # SETTINGS PAGE INPUT / OUTPUT
+    ###
+    # If a custom allow list is uploaded
+    in_allow_list.change(fn=custom_regex_load, inputs=[in_allow_list], outputs=[in_allow_list_text, in_allow_list_state])
+    in_deny_list.change(fn=custom_regex_load, inputs=[in_deny_list, in_deny_list_text_in], outputs=[in_deny_list_text, in_deny_list_state])
+    in_fully_redacted_list.change(fn=custom_regex_load, inputs=[in_fully_redacted_list, in_fully_redacted_text_in], outputs=[in_fully_redacted_list_text, in_fully_redacted_list_state])
+
+
+
     ###
     # APP LOAD AND LOGGING
     ###
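All three upload components now route through custom_regex_load, whose body is not part of this commit. A hedged, hypothetical sketch of what such a loader plausibly does, given the inputs/outputs wired above (an uploaded one-column CSV going into a gr.State plus a status textbox); the function name and behaviour here are assumptions, not the project's actual implementation:

    import pandas as pd

    def custom_regex_load_sketch(file_paths, list_name="Allow list"):
        # Hypothetical stand-in: read the first uploaded one-column CSV and return
        # a status message plus the DataFrame held by the matching gr.State
        # (e.g. in_deny_list_state).
        if not file_paths:
            return f"{list_name}: no file loaded", pd.DataFrame()
        path = getattr(file_paths[0], "name", file_paths[0])  # Gradio may pass file objects or paths
        df = pd.read_csv(path, header=None)
        return f"{list_name} file loaded: {len(df)} rows", df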
tools/file_conversion.py CHANGED
@@ -1,13 +1,13 @@
 from pdf2image import convert_from_path, pdfinfo_from_path
-from tools.helper_functions import get_file_path_end, output_folder, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
+from tools.helper_functions import get_file_path_end, output_folder, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, read_file
 from PIL import Image, ImageFile
 ImageFile.LOAD_TRUNCATED_IMAGES = True
 import os
 import re
-import gradio as gr
 import time
 import json
 import pymupdf
+import pandas as pd
 from tqdm import tqdm
 from gradio import Progress
 from typing import List, Optional
@@ -48,10 +48,15 @@ def is_pdf(filename):
 
 
 
-def process_single_page(pdf_path: str, page_num: int, image_dpi: float) -> tuple[int, str]:
+def process_single_page(pdf_path: str, page_num: int, image_dpi: float, output_dir: str = 'input') -> tuple[int, str]:
     try:
-
+        # Construct the full output directory path relative to the current working directory
+        output_dir = os.path.join(os.getcwd(), output_dir)
+
+        # Use the output_dir to construct the out_path
+        out_path = os.path.join(output_dir, f"{os.path.basename(pdf_path)}_{page_num}.png")
         os.makedirs(os.path.dirname(out_path), exist_ok=True)
+
         if os.path.exists(out_path):
             print(f"Loading existing image for page {page_num + 1}")
             image = Image.open(out_path)
@@ -67,7 +72,7 @@ def process_single_page(pdf_path: str, page_num: int, image_dpi: float) -> tuple
         print(f"Error processing page {page_num + 1}: {e}")
         return page_num, None
 
-def convert_pdf_to_images(pdf_path: str, prepare_for_review:bool=False, page_min: int = 0, image_dpi: float = 200, num_threads: int = 8):
+def convert_pdf_to_images(pdf_path: str, prepare_for_review:bool=False, page_min: int = 0, image_dpi: float = 200, num_threads: int = 8, output_dir: str = '/input'):
 
     # If preparing for review, just load the first page
     if prepare_for_review == True:
@@ -252,6 +257,7 @@ def prepare_image_or_pdf(
     """
 
     tic = time.perf_counter()
+    json_from_csv = False
 
     # If this is the first time around, set variables to 0/blank
     if first_loop_state==True:
@@ -341,10 +347,15 @@ def prepare_image_or_pdf(
         if file_extension in ['.jpg', '.jpeg', '.png'] and in_redact_method == text_ocr_option:
             in_redact_method = tesseract_ocr_option
 
+        if file_extension in ['.csv']:
+            review_file_csv = read_file(file)
+            all_annotations_object = convert_pandas_df_to_review_json(review_file_csv)
+            json_from_csv = True
+
         # If the file name ends with redactions.json, assume it is an annoations object, overwrite the current variable
-        if
+        if (file_extension in ['.json']) | (json_from_csv == True):
 
-            if prepare_for_review == True:
+            if (file_extension in ['.json']) & (prepare_for_review == True):
                 print("Preparing file for review")
                 if isinstance(file_path, str):
                     with open(file_path, 'r') as json_file:
@@ -353,6 +364,20 @@ def prepare_image_or_pdf(
                     # Assuming file_path is a NamedString or similar
                     all_annotations_object = json.loads(file_path) # Use loads for string content
 
+            # Assume it's a textract json
+            elif (file_extension in ['.json']) & (prepare_for_review != True):
+                # If the file loaded has end textract.json, assume this is a textract response object. Save this to the output folder so it can be found later during redaction and go to the next file.
+                json_contents = json.load(file_path)
+                # Write the response to a JSON file in output folder
+                out_folder = output_folder + file_path_without_ext + ".json"
+                with open(out_folder, 'w') as json_file:
+                    json.dump(json_contents, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
+                continue
+
+            # If you have an annotations object from the above code
+            if all_annotations_object:
+                #print("out_annotations_object found:", all_annotations_object)
+
                 # Get list of page numbers
                 image_file_paths_pages = [
                     int(re.search(r'_(\d+)\.png$', os.path.basename(s)).group(1))
@@ -380,19 +405,11 @@ def prepare_image_or_pdf(
                 #print("all_annotations_object:", all_annotations_object)
 
                 # Write the response to a JSON file in output folder
-                out_folder = output_folder + file_path_without_ext +
+                out_folder = output_folder + file_path_without_ext + ".json"
                 with open(out_folder, 'w') as json_file:
                     json.dump(all_annotations_object, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
                 continue
-
-            else:
-                # If the file loaded has end textract.json, assume this is a textract response object. Save this to the output folder so it can be found later during redaction and go to the next file.
-                json_contents = json.load(file_path)
-                # Write the response to a JSON file in output folder
-                out_folder = output_folder + file_path_without_ext + file_extension
-                with open(out_folder, 'w') as json_file:
-                    json.dump(json_contents, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
-                continue
+
 
         # Must be a pdf or image at this point
         else:
@@ -428,7 +445,6 @@ def prepare_image_or_pdf(
                 page = pymupdf_doc.new_page(width=img.width, height=img.height) # Add a new page
                 page.insert_image(rect, filename=file_path) # Insert the image into the page
 
-
     toc = time.perf_counter()
     out_time = f"File '{file_path_without_ext}' prepared in {toc - tic:0.1f} seconds."
 
@@ -467,3 +483,55 @@ def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str],
     #print("Out file paths:", out_file_paths)
 
     return out_message, out_file_paths
+
+
+def convert_review_json_to_pandas_df(data:List[dict]) -> pd.DataFrame:
+    # Flatten the data
+    flattened_data = []
+
+    for entry in data:
+        #print("entry:", entry)
+        #print("flattened_data:", flattened_data)
+        image_path = entry["image"]
+
+        # Use regex to find the number before .png
+        match = re.search(r'_(\d+)\.png$', image_path)
+        if match:
+            number = match.group(1) # Extract the number
+            print(number) # Output: 0
+            reported_number = int(number) + 1
+        else:
+            print("No number found before .png")
+
+        for box in entry["boxes"]:
+            data_to_add = {"image": image_path, "page":reported_number, **box}
+            #print("data_to_add:", data_to_add)
+            flattened_data.append(data_to_add)
+
+    # Convert to a DataFrame
+    df = pd.DataFrame(flattened_data)
+
+    return df
+
+def convert_pandas_df_to_review_json(df: pd.DataFrame) -> List[dict]:
+    # Keep only necessary columns
+    df = df[["image", "page", "xmin", "ymin", "xmax", "ymax", "color", "label"]]
+
+    # Group the DataFrame by the 'image' column
+    grouped = df.groupby('image')
+
+    # Create a list to hold the JSON data
+    json_data = []
+
+    # Iterate over each group
+    for image_path, group in grouped:
+        # Convert each group to a list of box dictionaries
+        boxes = group.drop(columns=['image', 'page']).to_dict(orient='records')
+
+        # Append the structured data to the json_data list
+        json_data.append({
+            "image": image_path,
+            "boxes": boxes
+        })
+
+    return json_data
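The two new helpers above round-trip the annotator's JSON (one entry per page image, each with a "boxes" list) to and from the flat review CSV that prepare_image_or_pdf now accepts. A minimal usage sketch; the file paths and box values are illustrative only, and note that non-scalar fields such as the colour tuple come back as strings after a CSV round trip:

    import pandas as pd
    from tools.file_conversion import convert_review_json_to_pandas_df, convert_pandas_df_to_review_json

    # Example annotation object in the shape the annotator produces (values are illustrative)
    annotations = [{
        "image": "input/example.pdf_0.png",
        "boxes": [{"xmin": 10, "ymin": 20, "xmax": 110, "ymax": 40, "color": (0, 0, 0), "label": "PERSON"}],
    }]

    review_df = convert_review_json_to_pandas_df(annotations)  # adds a 1-based "page" column parsed from the image name
    review_df.to_csv("example_review_file.csv", index=False)

    # Loading the CSV back (as prepare_image_or_pdf now does for .csv uploads) restores the grouped JSON shape
    round_trip = convert_pandas_df_to_review_json(pd.read_csv("example_review_file.csv"))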
tools/file_redaction.py
CHANGED
@@ -4,6 +4,7 @@ import json
|
|
4 |
import io
|
5 |
import os
|
6 |
import boto3
|
|
|
7 |
|
8 |
from tqdm import tqdm
|
9 |
from PIL import Image, ImageChops, ImageFile, ImageDraw
|
@@ -25,7 +26,7 @@ from collections import defaultdict # For efficient grouping
|
|
25 |
from presidio_analyzer import RecognizerResult
|
26 |
from tools.aws_functions import RUN_AWS_FUNCTIONS
|
27 |
from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult
|
28 |
-
from tools.file_conversion import process_file, image_dpi
|
29 |
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser
|
30 |
from tools.helper_functions import get_file_path_end, output_folder, clean_unicode_text, get_or_create_env_var, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
|
31 |
from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
|
@@ -68,6 +69,8 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
68 |
chosen_redact_comprehend_entities:List[str],
|
69 |
in_redact_method:str,
|
70 |
in_allow_list:List[List[str]]=None,
|
|
|
|
|
71 |
latest_file_completed:int=0,
|
72 |
out_message:list=[],
|
73 |
out_file_paths:list=[],
|
@@ -99,6 +102,8 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
99 |
- chosen_redact_comprehend_entities (List[str]): A list of entity types to redact from files, chosen from the official list from AWS Comprehend service
|
100 |
- in_redact_method (str): The method to use for redaction.
|
101 |
- in_allow_list (List[List[str]], optional): A list of allowed terms for redaction. Defaults to None.
|
|
|
|
|
102 |
- latest_file_completed (int, optional): The index of the last completed file. Defaults to 0.
|
103 |
- out_message (list, optional): A list to store output messages. Defaults to an empty list.
|
104 |
- out_file_paths (list, optional): A list to store paths to the output files. Defaults to an empty list.
|
@@ -188,7 +193,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
188 |
|
189 |
if not in_allow_list.empty:
|
190 |
in_allow_list_flat = in_allow_list.iloc[:,0].tolist()
|
191 |
-
print("In allow list:", in_allow_list_flat)
|
192 |
else:
|
193 |
in_allow_list_flat = []
|
194 |
|
@@ -236,7 +241,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
236 |
file_paths_list = file_paths
|
237 |
file_paths_loop = [file_paths_list[int(latest_file_completed)]]
|
238 |
|
239 |
-
print("file_paths_list in choose_redactor function:", file_paths_list)
|
240 |
|
241 |
|
242 |
for file in file_paths_loop:
|
@@ -269,7 +274,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
269 |
|
270 |
print("Redacting file " + file_path_without_ext + " as an image-based file")
|
271 |
|
272 |
-
pymupdf_doc,all_decision_process_table,
|
273 |
prepared_pdf_image_paths,
|
274 |
language,
|
275 |
chosen_redact_entities,
|
@@ -300,7 +305,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
300 |
|
301 |
elif in_redact_method == text_ocr_option:
|
302 |
|
303 |
-
|
304 |
|
305 |
if is_pdf(file_path) == False:
|
306 |
out_message = "Please upload a PDF file for text analysis. If you have an image, select 'Image analysis'."
|
@@ -353,12 +358,12 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
353 |
|
354 |
out_file_paths.append(out_image_file_path)
|
355 |
|
356 |
-
if
|
357 |
-
|
358 |
|
359 |
logs_output_file_name = out_image_file_path + "_decision_process_output.csv"
|
360 |
all_decision_process_table.to_csv(logs_output_file_name, index = None, encoding="utf-8")
|
361 |
-
|
362 |
|
363 |
all_text_output_file_name = out_image_file_path + "_ocr_output.csv"
|
364 |
all_line_level_ocr_results_df.to_csv(all_text_output_file_name, index = None, encoding="utf-8")
|
@@ -366,12 +371,23 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
366 |
|
367 |
# Save the gradio_annotation_boxes to a JSON file
|
368 |
try:
|
369 |
-
|
|
|
|
|
370 |
with open(out_annotation_file_path, 'w') as f:
|
371 |
json.dump(annotations_all_pages, f)
|
372 |
-
|
373 |
-
|
374 |
-
print("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
375 |
|
376 |
# Make a combined message for the file
|
377 |
if isinstance(out_message, list):
|
@@ -578,7 +594,50 @@ def move_page_info(file_path: str) -> str:
|
|
578 |
|
579 |
return new_file_path
|
580 |
|
581 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
582 |
|
583 |
mediabox_height = page.mediabox[3] - page.mediabox[1]
|
584 |
mediabox_width = page.mediabox[2] - page.mediabox[0]
|
@@ -669,40 +728,42 @@ def redact_page_with_pymupdf(page:Page, annotations_on_page, image = None, custo
|
|
669 |
|
670 |
all_image_annotation_boxes.append(img_annotation_box)
|
671 |
|
672 |
-
|
673 |
-
#print("Rect:", rect)
|
674 |
-
#middle_y = (pymupdf_y1 + pymupdf_y2) / 2
|
675 |
-
rect_small_pixel_height = Rect(pymupdf_x1, pymupdf_y1 + 2, pymupdf_x2, pymupdf_y2 - 2) # Slightly smaller than outside box
|
676 |
|
677 |
-
|
678 |
-
|
679 |
-
page
|
|
|
|
|
|
|
|
|
680 |
|
681 |
-
|
682 |
-
shape = page.new_shape()
|
683 |
-
shape.draw_rect(rect)
|
684 |
|
685 |
-
|
|
|
686 |
|
687 |
-
|
688 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
689 |
|
690 |
-
|
691 |
-
out_colour = convert_color_to_range_0_1(img_annotation_box["color"])
|
692 |
-
else:
|
693 |
-
out_colour = img_annotation_box["color"]
|
694 |
-
else:
|
695 |
-
out_colour = (0,0,0)
|
696 |
|
697 |
-
|
698 |
-
#shape.finish(color=(0, 0, 0)) # Black fill for the rectangle
|
699 |
-
shape.commit()
|
700 |
|
701 |
out_annotation_boxes = {
|
702 |
"image": image_path, #Image.open(image_path), #image_path,
|
703 |
"boxes": all_image_annotation_boxes
|
704 |
}
|
705 |
|
|
|
|
|
|
|
706 |
page.apply_redactions(images=0, graphics=0)
|
707 |
page.clean_contents()
|
708 |
|
@@ -713,33 +774,38 @@ def bounding_boxes_overlap(box1, box2):
|
|
713 |
return (box1[0] < box2[2] and box2[0] < box1[2] and
|
714 |
box1[1] < box2[3] and box2[1] < box1[3])
|
715 |
|
|
|
|
|
|
|
|
|
716 |
def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_results=[], handwriting_recogniser_results=[], handwrite_signature_checkbox: List[str]=["Redact all identified handwriting", "Redact all identified signatures"], horizontal_threshold:int=50, vertical_threshold:int=12):
|
|
|
|
|
717 |
merged_bboxes = []
|
718 |
grouped_bboxes = defaultdict(list)
|
719 |
|
720 |
-
|
|
|
|
|
|
|
721 |
if signature_recogniser_results or handwriting_recogniser_results:
|
722 |
if "Redact all identified handwriting" in handwrite_signature_checkbox:
|
723 |
-
|
724 |
-
merged_bboxes.extend(handwriting_recogniser_results)
|
725 |
|
726 |
if "Redact all identified signatures" in handwrite_signature_checkbox:
|
727 |
-
|
728 |
-
merged_bboxes.extend(signature_recogniser_results)
|
729 |
-
|
730 |
|
731 |
# Reconstruct bounding boxes for substrings of interest
|
732 |
reconstructed_bboxes = []
|
733 |
for bbox in bboxes:
|
734 |
-
#print("bbox:", bbox)
|
735 |
bbox_box = (bbox.left, bbox.top, bbox.left + bbox.width, bbox.top + bbox.height)
|
736 |
for line_text, line_info in combined_results.items():
|
737 |
line_box = line_info['bounding_box']
|
738 |
-
if bounding_boxes_overlap(bbox_box, line_box):
|
739 |
if bbox.text in line_text:
|
740 |
start_char = line_text.index(bbox.text)
|
741 |
end_char = start_char + len(bbox.text)
|
742 |
-
|
743 |
relevant_words = []
|
744 |
current_char = 0
|
745 |
for word in line_info['words']:
|
@@ -753,16 +819,13 @@ def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_result
|
|
753 |
current_char += 1 # +1 for space if the word doesn't already end with a space
|
754 |
|
755 |
if relevant_words:
|
756 |
-
#print("Relevant words:", relevant_words)
|
757 |
left = min(word['bounding_box'][0] for word in relevant_words)
|
758 |
top = min(word['bounding_box'][1] for word in relevant_words)
|
759 |
right = max(word['bounding_box'][2] for word in relevant_words)
|
760 |
bottom = max(word['bounding_box'][3] for word in relevant_words)
|
761 |
-
|
762 |
-
# Combine the text of all relevant words
|
763 |
combined_text = " ".join(word['text'] for word in relevant_words)
|
764 |
|
765 |
-
# Calculate new dimensions for the merged box
|
766 |
reconstructed_bbox = CustomImageRecognizerResult(
|
767 |
bbox.entity_type,
|
768 |
bbox.start,
|
@@ -771,13 +834,13 @@ def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_result
|
|
771 |
left,
|
772 |
top,
|
773 |
right - left, # width
|
774 |
-
bottom - top, # height
|
775 |
combined_text
|
776 |
)
|
777 |
-
reconstructed_bboxes.append(
|
|
|
778 |
break
|
779 |
else:
|
780 |
-
# If the bbox text is not found in any line in combined_results, keep the original bbox
|
781 |
reconstructed_bboxes.append(bbox)
|
782 |
|
783 |
# Group reconstructed bboxes by approximate vertical proximity
|
@@ -791,35 +854,141 @@ def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_result
|
|
791 |
merged_box = group[0]
|
792 |
for next_box in group[1:]:
|
793 |
if next_box.left - (merged_box.left + merged_box.width) <= horizontal_threshold:
|
794 |
-
|
795 |
-
|
796 |
-
new_text = merged_box.text
|
797 |
-
else:
|
798 |
-
new_text = merged_box.text + " " + next_box.text
|
799 |
-
|
800 |
-
if merged_box.text == next_box.text:
|
801 |
-
new_text = merged_box.text
|
802 |
-
new_entity_type = merged_box.entity_type # Keep the original entity type
|
803 |
-
else:
|
804 |
-
new_text = merged_box.text + " " + next_box.text
|
805 |
-
new_entity_type = merged_box.entity_type + " - " + next_box.entity_type # Concatenate entity types
|
806 |
|
807 |
new_left = min(merged_box.left, next_box.left)
|
808 |
new_top = min(merged_box.top, next_box.top)
|
809 |
new_width = max(merged_box.left + merged_box.width, next_box.left + next_box.width) - new_left
|
810 |
new_height = max(merged_box.top + merged_box.height, next_box.top + next_box.height) - new_top
|
|
|
811 |
merged_box = CustomImageRecognizerResult(
|
812 |
new_entity_type, merged_box.start, merged_box.end, merged_box.score, new_left, new_top, new_width, new_height, new_text
|
813 |
)
|
814 |
else:
|
815 |
merged_bboxes.append(merged_box)
|
816 |
-
merged_box = next_box
|
817 |
|
818 |
merged_bboxes.append(merged_box)
|
819 |
|
820 |
-
|
821 |
-
|
822 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
823 |
|
824 |
def redact_image_pdf(file_path:str,
|
825 |
prepared_pdf_file_paths:List[str],
|
@@ -846,7 +1015,7 @@ def redact_image_pdf(file_path:str,
|
|
846 |
custom_recogniser_word_list:List[str]=[],
|
847 |
redact_whole_page_list:List[str]=[],
|
848 |
page_break_val:int=int(page_break_value),
|
849 |
-
|
850 |
max_time:int=int(max_time_value),
|
851 |
progress=Progress(track_tqdm=True)):
|
852 |
|
@@ -878,7 +1047,7 @@ def redact_image_pdf(file_path:str,
|
|
878 |
- custom_recogniser_word_list (optional): A list of custom words that the user has chosen specifically to redact.
|
879 |
- redact_whole_page_list (optional, List[str]): A list of pages to fully redact.
|
880 |
- page_break_val (int, optional): The value at which to trigger a page break. Defaults to 3.
|
881 |
-
-
|
882 |
- max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
|
883 |
- progress (Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
|
884 |
|
@@ -901,12 +1070,12 @@ def redact_image_pdf(file_path:str,
|
|
901 |
if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
|
902 |
print("Connection to AWS Comprehend service unsuccessful.")
|
903 |
|
904 |
-
return pymupdf_doc, all_decision_process_table,
|
905 |
|
906 |
if analysis_type == textract_option and textract_client == "":
|
907 |
print("Connection to AWS Textract service unsuccessful.")
|
908 |
|
909 |
-
return pymupdf_doc, all_decision_process_table,
|
910 |
|
911 |
tic = time.perf_counter()
|
912 |
|
@@ -937,14 +1106,14 @@ def redact_image_pdf(file_path:str,
|
|
937 |
if analysis_type == textract_option:
|
938 |
|
939 |
json_file_path = output_folder + file_name + "_textract.json"
|
940 |
-
|
941 |
|
942 |
if not os.path.exists(json_file_path):
|
943 |
no_textract_file = True
|
944 |
print("No existing Textract results file found.")
|
945 |
existing_data = {}
|
946 |
#text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
|
947 |
-
#
|
948 |
#request_metadata = request_metadata + "\n" + new_request_metadata
|
949 |
#wrapped_text_blocks = {"pages":[text_blocks]}
|
950 |
else:
|
@@ -1015,7 +1184,7 @@ def redact_image_pdf(file_path:str,
|
|
1015 |
|
1016 |
if not existing_data:
|
1017 |
text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
|
1018 |
-
|
1019 |
request_metadata = request_metadata + "\n" + new_request_metadata
|
1020 |
|
1021 |
existing_data = {"pages":[text_blocks]}
|
@@ -1043,7 +1212,7 @@ def redact_image_pdf(file_path:str,
|
|
1043 |
|
1044 |
# if not os.path.exists(json_file_path):
|
1045 |
# text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
|
1046 |
-
#
|
1047 |
# request_metadata = request_metadata + "\n" + new_request_metadata
|
1048 |
|
1049 |
# existing_data = {"pages":[text_blocks]}
|
@@ -1073,7 +1242,7 @@ def redact_image_pdf(file_path:str,
|
|
1073 |
# with open(json_file_path, 'w') as json_file:
|
1074 |
# json.dump(existing_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
|
1075 |
|
1076 |
-
#
|
1077 |
# request_metadata = request_metadata + "\n" + new_request_metadata
|
1078 |
# else:
|
1079 |
# # If the page exists, retrieve the data
|
@@ -1204,7 +1373,7 @@ def redact_image_pdf(file_path:str,
|
|
1204 |
|
1205 |
current_loop_page += 1
|
1206 |
|
1207 |
-
return pymupdf_doc, all_decision_process_table,
|
1208 |
|
1209 |
if is_pdf(file_path) == False:
|
1210 |
images.append(image)
|
@@ -1225,7 +1394,7 @@ def redact_image_pdf(file_path:str,
|
|
1225 |
with open(json_file_path, 'w') as json_file:
|
1226 |
json.dump(existing_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
|
1227 |
|
1228 |
-
return pymupdf_doc, all_decision_process_table,
|
1229 |
|
1230 |
if analysis_type == textract_option:
|
1231 |
# Write the updated existing textract data back to the JSON file
|
@@ -1233,7 +1402,7 @@ def redact_image_pdf(file_path:str,
|
|
1233 |
with open(json_file_path, 'w') as json_file:
|
1234 |
json.dump(existing_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
|
1235 |
|
1236 |
-
return pymupdf_doc, all_decision_process_table,
|
1237 |
|
1238 |
|
1239 |
###
|
@@ -1349,16 +1518,18 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
|
|
1349 |
|
1350 |
return line_level_results_out, line_level_characters_out # Return both results and character objects
|
1351 |
|
1352 |
-
def merge_text_bounding_boxes(analyser_results
|
1353 |
'''
|
1354 |
Merge identified bounding boxes containing PII that are very close to one another
|
1355 |
'''
|
1356 |
analysed_bounding_boxes = []
|
|
|
|
|
1357 |
if len(analyser_results) > 0 and len(characters) > 0:
|
1358 |
# Extract bounding box coordinates for sorting
|
1359 |
bounding_boxes = []
|
1360 |
-
text_out = []
|
1361 |
for result in analyser_results:
|
|
|
1362 |
char_boxes = [char.bbox for char in characters[result.start:result.end] if isinstance(char, LTChar)]
|
1363 |
char_text = [char._text for char in characters[result.start:result.end] if isinstance(char, LTChar)]
|
1364 |
if char_boxes:
|
@@ -1367,9 +1538,12 @@ def merge_text_bounding_boxes(analyser_results:CustomImageRecognizerResult, char
|
|
1367 |
bottom = min(box[1] for box in char_boxes)
|
1368 |
right = max(box[2] for box in char_boxes)
|
1369 |
top = max(box[3] for box in char_boxes) + vertical_padding
|
1370 |
-
|
|
|
1371 |
|
1372 |
-
|
|
|
|
|
1373 |
|
1374 |
# Sort the results by y-coordinate and then by x-coordinate
|
1375 |
bounding_boxes.sort()
|
@@ -1380,74 +1554,163 @@ def merge_text_bounding_boxes(analyser_results:CustomImageRecognizerResult, char
|
|
1380 |
current_result = None
|
1381 |
current_text = []
|
1382 |
|
1383 |
-
for y, x, result,
|
1384 |
-
#print(f"Considering result: {result}")
|
1385 |
-
#print(f"Character box: {char_box}")
|
1386 |
-
|
1387 |
if current_y is None or current_box is None:
|
1388 |
-
|
1389 |
-
|
|
|
1390 |
current_result = result
|
1391 |
current_text = list(text)
|
1392 |
-
#print(f"Starting new box: {current_box}")
|
1393 |
else:
|
1394 |
-
vertical_diff_bboxes = abs(
|
1395 |
-
horizontal_diff_bboxes = abs(
|
1396 |
-
|
1397 |
-
|
1398 |
-
|
1399 |
-
|
1400 |
-
|
1401 |
-
|
1402 |
-
|
1403 |
-
|
1404 |
-
current_box[2]
|
1405 |
-
|
1406 |
-
|
|
|
|
|
|
|
1407 |
try:
|
1408 |
-
|
1409 |
except Exception as e:
|
1410 |
-
print("Unable to combine result entity types:")
|
1411 |
-
print(e)
|
1412 |
-
# Add a space if current_text is not empty
|
1413 |
if current_text:
|
1414 |
-
|
1415 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
1416 |
|
1417 |
-
#print(f"Latest merged box: {current_box[-1]}")
|
1418 |
else:
|
1419 |
-
|
1420 |
-
|
1421 |
-
#
|
1422 |
-
#
|
1423 |
-
|
1424 |
-
#
|
1425 |
-
|
1426 |
-
|
|
|
1427 |
current_result = result
|
1428 |
current_text = list(text)
|
1429 |
-
#print(f"Starting new box: {current_box}")
|
1430 |
-
|
1431 |
-
# After finishing with the current result, add the last box for this result
|
1432 |
-
if current_box:
|
1433 |
-
merged_bounding_boxes.append({"text":"".join(current_text), "boundingBox": current_box, "result": current_result})
|
1434 |
-
#print(f"Appending final box for result: {current_box}")
|
1435 |
-
|
1436 |
-
if not merged_bounding_boxes:
|
1437 |
-
analysed_bounding_boxes.extend(
|
1438 |
-
{"text":text, "boundingBox": char.bbox, "result": result}
|
1439 |
-
for result in analyser_results
|
1440 |
-
for char in characters[result.start:result.end]
|
1441 |
-
if isinstance(char, LTChar)
|
1442 |
-
)
|
1443 |
-
else:
|
1444 |
-
analysed_bounding_boxes.extend(merged_bounding_boxes)
|
1445 |
|
1446 |
-
#
|
1447 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1448 |
return analysed_bounding_boxes
|
1449 |
|
1450 |
|
1451 |
|
1452 |
def create_text_redaction_process_results(analyser_results, analysed_bounding_boxes, page_num):
|
1453 |
decision_process_table = pd.DataFrame()
|
|
|
4 |
import io
|
5 |
import os
|
6 |
import boto3
|
7 |
+
import copy
|
8 |
|
9 |
from tqdm import tqdm
|
10 |
from PIL import Image, ImageChops, ImageFile, ImageDraw
|
|
|
26 |
from presidio_analyzer import RecognizerResult
|
27 |
from tools.aws_functions import RUN_AWS_FUNCTIONS
|
28 |
from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult
|
29 |
+
from tools.file_conversion import process_file, image_dpi, convert_review_json_to_pandas_df
|
30 |
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser
|
31 |
from tools.helper_functions import get_file_path_end, output_folder, clean_unicode_text, get_or_create_env_var, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
|
32 |
from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
|
|
|
69 |
chosen_redact_comprehend_entities:List[str],
|
70 |
in_redact_method:str,
|
71 |
in_allow_list:List[List[str]]=None,
|
72 |
+
in_deny_list:List[List[str]]=None,
|
73 |
+
in_fully_redacted_list:List[List[str]]=None,
|
74 |
latest_file_completed:int=0,
|
75 |
out_message:list=[],
|
76 |
out_file_paths:list=[],
|
|
|
102 |
- chosen_redact_comprehend_entities (List[str]): A list of entity types to redact from files, chosen from the official list from AWS Comprehend service
|
103 |
- in_redact_method (str): The method to use for redaction.
|
104 |
- in_allow_list (List[List[str]], optional): A list of allowed terms for redaction. Defaults to None.
|
105 |
+
- in_deny_list (List[List[str]], optional): A list of specific terms to always redact (deny list). Defaults to None.
|
106 |
+
- in_fully_redacted_list (List[List[str]], optional): A list of pages to fully redact. Defaults to None.
|
107 |
- latest_file_completed (int, optional): The index of the last completed file. Defaults to 0.
|
108 |
- out_message (list, optional): A list to store output messages. Defaults to an empty list.
|
109 |
- out_file_paths (list, optional): A list to store paths to the output files. Defaults to an empty list.
|
|
|
193 |
|
194 |
if not in_allow_list.empty:
|
195 |
in_allow_list_flat = in_allow_list.iloc[:,0].tolist()
|
196 |
+
#print("In allow list:", in_allow_list_flat)
|
197 |
else:
|
198 |
in_allow_list_flat = []
|
199 |
|
|
|
241 |
file_paths_list = file_paths
|
242 |
file_paths_loop = [file_paths_list[int(latest_file_completed)]]
|
243 |
|
244 |
+
# print("file_paths_list in choose_redactor function:", file_paths_list)
|
245 |
|
246 |
|
247 |
for file in file_paths_loop:
|
|
|
274 |
|
275 |
print("Redacting file " + file_path_without_ext + " as an image-based file")
|
276 |
|
277 |
+
pymupdf_doc,all_decision_process_table,log_files_output_paths,new_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number = redact_image_pdf(file_path,
|
278 |
prepared_pdf_image_paths,
|
279 |
language,
|
280 |
chosen_redact_entities,
|
|
|
305 |
|
306 |
elif in_redact_method == text_ocr_option:
|
307 |
|
308 |
+
#log_files_output_paths = []
|
309 |
|
310 |
if is_pdf(file_path) == False:
|
311 |
out_message = "Please upload a PDF file for text analysis. If you have an image, select 'Image analysis'."
|
|
|
358 |
|
359 |
out_file_paths.append(out_image_file_path)
|
360 |
|
361 |
+
#if log_files_output_paths:
|
362 |
+
# log_files_output_paths.extend(log_files_output_paths)
|
363 |
|
364 |
logs_output_file_name = out_image_file_path + "_decision_process_output.csv"
|
365 |
all_decision_process_table.to_csv(logs_output_file_name, index = None, encoding="utf-8")
|
366 |
+
log_files_output_paths.append(logs_output_file_name)
|
367 |
|
368 |
all_text_output_file_name = out_image_file_path + "_ocr_output.csv"
|
369 |
all_line_level_ocr_results_df.to_csv(all_text_output_file_name, index = None, encoding="utf-8")
|
|
|
371 |
|
372 |
# Save the gradio_annotation_boxes to a JSON file
|
373 |
try:
|
374 |
+
print("Saving annotations to JSON")
|
375 |
+
|
376 |
+
out_annotation_file_path = out_image_file_path + '_review_file.json'
|
377 |
with open(out_annotation_file_path, 'w') as f:
|
378 |
json.dump(annotations_all_pages, f)
|
379 |
+
log_files_output_paths.append(out_annotation_file_path)
|
380 |
+
|
381 |
+
print("Saving annotations to CSV")
|
382 |
+
|
383 |
+
# Convert json to csv and also save this
|
384 |
+
review_df = convert_review_json_to_pandas_df(annotations_all_pages)
|
385 |
+
out_review_file_file_path = out_image_file_path + '_review_file.csv'
|
386 |
+
review_df.to_csv(out_review_file_file_path, index=None)
|
387 |
+
out_file_paths.append(out_review_file_file_path)
|
388 |
+
|
389 |
+
except Exception as e:
|
390 |
+
print("Could not save annotations to JSON or CSV review file:", e)
|
391 |
|
392 |
# Make a combined message for the file
|
393 |
if isinstance(out_message, list):
|
|
|
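The review CSV written above is a flattened version of the per-page annotation JSON. A minimal sketch of that flattening, assuming each page entry carries an "image" path and a list of box dictionaries; the real convert_review_json_to_pandas_df lives in tools/file_conversion.py and may differ in detail:

import pandas as pd

def flatten_review_json(annotations_all_pages: list) -> pd.DataFrame:
    # Each page entry looks like {"image": "<path>", "boxes": [{"label": ..., "xmin": ..., ...}, ...]}
    rows = []
    for page in annotations_all_pages:
        for box in page.get("boxes", []):
            row = {"image": page["image"]}
            row.update(box)
            rows.append(row)
    return pd.DataFrame(rows)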
594 |
|
595 |
return new_file_path
|
596 |
|
597 |
+
def convert_color_to_range_0_1(color):
|
598 |
+
return tuple(component / 255 for component in color)
|
599 |
+
|
600 |
+
def redact_single_box(pymupdf_page:Page, pymupdf_rect:Rect, img_annotation_box:dict, custom_colours:bool=False):
|
601 |
+
pymupdf_x1 = pymupdf_rect[0]
|
602 |
+
pymupdf_y1 = pymupdf_rect[1]
|
603 |
+
pymupdf_x2 = pymupdf_rect[2]
|
604 |
+
pymupdf_y2 = pymupdf_rect[3]
|
605 |
+
|
606 |
+
# Calculate area to actually remove text from the pdf (different from black box size)
|
607 |
+
redact_bottom_y = pymupdf_y1 + 2
|
608 |
+
redact_top_y = pymupdf_y2 - 2
|
609 |
+
|
610 |
+
# Calculate the middle y value and set a small height if default values are too close together
|
611 |
+
if (redact_top_y - redact_bottom_y) < 1:
|
612 |
+
middle_y = (pymupdf_y1 + pymupdf_y2) / 2
|
613 |
+
redact_bottom_y = middle_y - 1
|
614 |
+
redact_top_y = middle_y + 1
|
615 |
+
|
616 |
+
#print("Rect:", rect)
|
617 |
+
|
618 |
+
rect_small_pixel_height = Rect(pymupdf_x1, redact_bottom_y, pymupdf_x2, redact_top_y) # Slightly smaller than outside box
|
619 |
+
|
620 |
+
# Add the annotation to the middle of the character line, so that it doesn't delete text from adjacent lines
|
621 |
+
#page.add_redact_annot(rect)#rect_small_pixel_height)
|
622 |
+
pymupdf_page.add_redact_annot(rect_small_pixel_height)
|
623 |
+
|
624 |
+
# Set up drawing a black box over the whole rect
|
625 |
+
shape = pymupdf_page.new_shape()
|
626 |
+
shape.draw_rect(pymupdf_rect)
|
627 |
+
|
628 |
+
if custom_colours == True:
|
629 |
+
if img_annotation_box["color"][0] > 1:
|
630 |
+
out_colour = convert_color_to_range_0_1(img_annotation_box["color"])
|
631 |
+
else:
|
632 |
+
out_colour = img_annotation_box["color"]
|
633 |
+
else:
|
634 |
+
out_colour = (0,0,0)
|
635 |
+
|
636 |
+
shape.finish(color=out_colour, fill=out_colour) # Black fill for the rectangle
|
637 |
+
#shape.finish(color=(0, 0, 0)) # Black fill for the rectangle
|
638 |
+
shape.commit()
|
639 |
+
|
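A rough, illustrative driver for redact_single_box, assuming a PDF already open with PyMuPDF; the file name and coordinates are made up and the rectangle would normally come from the annotation boxes:

import pymupdf
from tools.file_redaction import redact_single_box

doc = pymupdf.open("example.pdf")                      # hypothetical input PDF
page = doc[0]

img_annotation_box = {"label": "PERSON", "color": (0, 0, 0)}
rect = pymupdf.Rect(100, 200, 250, 215)                # PDF coordinates of the text to hide

redact_single_box(page, rect, img_annotation_box, custom_colours=False)
page.apply_redactions(images=0, graphics=0)            # actually strip the underlying text
doc.save("example_redacted.pdf")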
640 |
+
def redact_page_with_pymupdf(page:Page, annotations_on_page:dict, image:Image.Image=None, custom_colours:bool=False, redact_whole_page:bool=False):
|
641 |
|
642 |
mediabox_height = page.mediabox[3] - page.mediabox[1]
|
643 |
mediabox_width = page.mediabox[2] - page.mediabox[0]
|
|
|
728 |
|
729 |
all_image_annotation_boxes.append(img_annotation_box)
|
730 |
|
731 |
+
redact_single_box(page, rect, img_annotation_box, custom_colours)
|
|
|
|
|
|
|
732 |
|
733 |
+
# If whole page is to be redacted, do that here
|
734 |
+
if redact_whole_page == True:
|
735 |
+
# Small border to page that remains white
|
736 |
+
border = 5
|
737 |
+
# Define the coordinates for the Rect
|
738 |
+
whole_page_x1, whole_page_y1 = 0 + border, 0 + border # Bottom-left corner
|
739 |
+
whole_page_x2, whole_page_y2 = rect_width - border, rect_height - border # Top-right corner
|
740 |
|
741 |
+
whole_page_image_x1, whole_page_image_y1, whole_page_image_x2, whole_page_image_y2 = convert_pymupdf_to_image_coords(page, whole_page_x1, whole_page_y1, whole_page_x2, whole_page_y2, image)
|
|
|
|
|
742 |
|
743 |
+
# Create new image annotation element based on whole page coordinates
|
744 |
+
whole_page_rect = Rect(whole_page_x1, whole_page_y1, whole_page_x2, whole_page_y2)
|
745 |
|
746 |
+
# Write whole page annotation to annotation boxes
|
747 |
+
whole_page_img_annotation_box = {}
|
748 |
+
whole_page_img_annotation_box["xmin"] = whole_page_image_x1
|
749 |
+
whole_page_img_annotation_box["ymin"] = whole_page_image_y1
|
750 |
+
whole_page_img_annotation_box["xmax"] = whole_page_image_x2
|
751 |
+
whole_page_img_annotation_box["ymax"] = whole_page_image_y2
|
752 |
+
whole_page_img_annotation_box["color"] = (0,0,0)
|
753 |
+
whole_page_img_annotation_box["label"] = "Whole page"
|
754 |
|
755 |
+
redact_single_box(page, whole_page_rect, whole_page_img_annotation_box, custom_colours)
|
|
|
|
|
|
|
|
|
|
|
756 |
|
757 |
+
all_image_annotation_boxes.append(whole_page_img_annotation_box)
|
|
|
|
|
758 |
|
759 |
out_annotation_boxes = {
|
760 |
"image": image_path, #Image.open(image_path), #image_path,
|
761 |
"boxes": all_image_annotation_boxes
|
762 |
}
|
763 |
|
764 |
+
|
765 |
+
|
766 |
+
|
767 |
page.apply_redactions(images=0, graphics=0)
|
768 |
page.clean_contents()
|
769 |
|
|
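The whole-page branch above converts PyMuPDF coordinates into image coordinates before recording the annotation box. A generic sketch of that kind of conversion, assuming the page image covers the full mediabox; the project's convert_pymupdf_to_image_coords may handle cropping and rotation differently:

def pdf_to_image_coords(page, x1, y1, x2, y2, image):
    # Scale by image size over mediabox size; PDF origin is bottom-left, image origin is top-left
    image_width, image_height = image.size
    scale_x = image_width / page.mediabox.width
    scale_y = image_height / page.mediabox.height
    img_x1 = x1 * scale_x
    img_x2 = x2 * scale_x
    img_y1 = image_height - (y2 * scale_y)
    img_y2 = image_height - (y1 * scale_y)
    return img_x1, img_y1, img_x2, img_y2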
|
774 |
return (box1[0] < box2[2] and box2[0] < box1[2] and
|
775 |
box1[1] < box2[3] and box2[1] < box1[3])
|
776 |
|
777 |
+
from collections import defaultdict
|
778 |
+
from typing import List, Dict
|
779 |
+
import copy
|
780 |
+
|
781 |
def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_results=[], handwriting_recogniser_results=[], handwrite_signature_checkbox: List[str]=["Redact all identified handwriting", "Redact all identified signatures"], horizontal_threshold:int=50, vertical_threshold:int=12):
|
782 |
+
|
783 |
+
all_bboxes = []
|
784 |
merged_bboxes = []
|
785 |
grouped_bboxes = defaultdict(list)
|
786 |
|
787 |
+
# Deep copy original bounding boxes to retain them
|
788 |
+
original_bboxes = copy.deepcopy(bboxes)
|
789 |
+
|
790 |
+
# Process signature and handwriting results
|
791 |
if signature_recogniser_results or handwriting_recogniser_results:
|
792 |
if "Redact all identified handwriting" in handwrite_signature_checkbox:
|
793 |
+
merged_bboxes.extend(copy.deepcopy(handwriting_recogniser_results))
|
|
|
794 |
|
795 |
if "Redact all identified signatures" in handwrite_signature_checkbox:
|
796 |
+
merged_bboxes.extend(copy.deepcopy(signature_recogniser_results))
|
|
|
|
|
797 |
|
798 |
# Reconstruct bounding boxes for substrings of interest
|
799 |
reconstructed_bboxes = []
|
800 |
for bbox in bboxes:
|
|
|
801 |
bbox_box = (bbox.left, bbox.top, bbox.left + bbox.width, bbox.top + bbox.height)
|
802 |
for line_text, line_info in combined_results.items():
|
803 |
line_box = line_info['bounding_box']
|
804 |
+
if bounding_boxes_overlap(bbox_box, line_box):
|
805 |
if bbox.text in line_text:
|
806 |
start_char = line_text.index(bbox.text)
|
807 |
end_char = start_char + len(bbox.text)
|
808 |
+
|
809 |
relevant_words = []
|
810 |
current_char = 0
|
811 |
for word in line_info['words']:
|
|
|
819 |
current_char += 1 # +1 for space if the word doesn't already end with a space
|
820 |
|
821 |
if relevant_words:
|
|
|
822 |
left = min(word['bounding_box'][0] for word in relevant_words)
|
823 |
top = min(word['bounding_box'][1] for word in relevant_words)
|
824 |
right = max(word['bounding_box'][2] for word in relevant_words)
|
825 |
bottom = max(word['bounding_box'][3] for word in relevant_words)
|
826 |
+
|
|
|
827 |
combined_text = " ".join(word['text'] for word in relevant_words)
|
828 |
|
|
|
829 |
reconstructed_bbox = CustomImageRecognizerResult(
|
830 |
bbox.entity_type,
|
831 |
bbox.start,
|
|
|
834 |
left,
|
835 |
top,
|
836 |
right - left, # width
|
837 |
+
bottom - top, # height,
|
838 |
combined_text
|
839 |
)
|
840 |
+
#reconstructed_bboxes.append(bbox) # Add original bbox
|
841 |
+
reconstructed_bboxes.append(reconstructed_bbox) # Add merged bbox
|
842 |
break
|
843 |
else:
|
|
|
844 |
reconstructed_bboxes.append(bbox)
|
845 |
|
846 |
# Group reconstructed bboxes by approximate vertical proximity
|
|
|
854 |
merged_box = group[0]
|
855 |
for next_box in group[1:]:
|
856 |
if next_box.left - (merged_box.left + merged_box.width) <= horizontal_threshold:
|
857 |
+
new_text = merged_box.text + " " + next_box.text
|
858 |
+
new_entity_type = merged_box.entity_type + " - " + next_box.entity_type
|
|
|
|
|
859 |
|
860 |
new_left = min(merged_box.left, next_box.left)
|
861 |
new_top = min(merged_box.top, next_box.top)
|
862 |
new_width = max(merged_box.left + merged_box.width, next_box.left + next_box.width) - new_left
|
863 |
new_height = max(merged_box.top + merged_box.height, next_box.top + next_box.height) - new_top
|
864 |
+
|
865 |
merged_box = CustomImageRecognizerResult(
|
866 |
new_entity_type, merged_box.start, merged_box.end, merged_box.score, new_left, new_top, new_width, new_height, new_text
|
867 |
)
|
868 |
else:
|
869 |
merged_bboxes.append(merged_box)
|
870 |
+
merged_box = next_box
|
871 |
|
872 |
merged_bboxes.append(merged_box)
|
873 |
|
874 |
+
all_bboxes.extend(original_bboxes)
|
875 |
+
#all_bboxes.extend(reconstructed_bboxes)
|
876 |
+
all_bboxes.extend(merged_bboxes)
|
877 |
+
|
878 |
+
# Return the unique original and merged bounding boxes
|
879 |
+
unique_bboxes = list({(bbox.left, bbox.top, bbox.width, bbox.height): bbox for bbox in all_bboxes}.values())
|
880 |
+
return unique_bboxes
|
881 |
+
|
882 |
+
|
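The final deduplication above keeps one box per unique (left, top, width, height) key, so a merged box that exactly coincides with an original is only returned once. The same dictionary trick on plain tuples, as a quick illustration:

# Later entries win on key collisions, mirroring the dict comprehension in merge_img_bboxes
boxes = [(10, 20, 50, 12), (10, 20, 50, 12), (80, 20, 30, 12)]
unique = list({(b[0], b[1], b[2], b[3]): b for b in boxes}.values())
print(unique)  # [(10, 20, 50, 12), (80, 20, 30, 12)]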
883 |
+
# def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_results=[], handwriting_recogniser_results=[], handwrite_signature_checkbox: List[str]=["Redact all identified handwriting", "Redact all identified signatures"], horizontal_threshold:int=50, vertical_threshold:int=12):
|
884 |
+
# merged_bboxes = []
|
885 |
+
# grouped_bboxes = defaultdict(list)
|
886 |
+
|
887 |
+
# # Process signature and handwriting results
|
888 |
+
# if signature_recogniser_results or handwriting_recogniser_results:
|
889 |
+
# if "Redact all identified handwriting" in handwrite_signature_checkbox:
|
890 |
+
# #print("Handwriting boxes exist at merge:", handwriting_recogniser_results)
|
891 |
+
# merged_bboxes.extend(handwriting_recogniser_results)
|
892 |
+
|
893 |
+
# if "Redact all identified signatures" in handwrite_signature_checkbox:
|
894 |
+
# #print("Signature boxes exist at merge:", signature_recogniser_results)
|
895 |
+
# merged_bboxes.extend(signature_recogniser_results)
|
896 |
+
|
897 |
+
|
898 |
+
# # Reconstruct bounding boxes for substrings of interest
|
899 |
+
# reconstructed_bboxes = []
|
900 |
+
# for bbox in bboxes:
|
901 |
+
# #print("bbox:", bbox)
|
902 |
+
# bbox_box = (bbox.left, bbox.top, bbox.left + bbox.width, bbox.top + bbox.height)
|
903 |
+
# for line_text, line_info in combined_results.items():
|
904 |
+
# line_box = line_info['bounding_box']
|
905 |
+
# if bounding_boxes_overlap(bbox_box, line_box):
|
906 |
+
# if bbox.text in line_text:
|
907 |
+
# start_char = line_text.index(bbox.text)
|
908 |
+
# end_char = start_char + len(bbox.text)
|
909 |
+
|
910 |
+
# relevant_words = []
|
911 |
+
# current_char = 0
|
912 |
+
# for word in line_info['words']:
|
913 |
+
# word_end = current_char + len(word['text'])
|
914 |
+
# if current_char <= start_char < word_end or current_char < end_char <= word_end or (start_char <= current_char and word_end <= end_char):
|
915 |
+
# relevant_words.append(word)
|
916 |
+
# if word_end >= end_char:
|
917 |
+
# break
|
918 |
+
# current_char = word_end
|
919 |
+
# if not word['text'].endswith(' '):
|
920 |
+
# current_char += 1 # +1 for space if the word doesn't already end with a space
|
921 |
+
|
922 |
+
# if relevant_words:
|
923 |
+
# #print("Relevant words:", relevant_words)
|
924 |
+
# left = min(word['bounding_box'][0] for word in relevant_words)
|
925 |
+
# top = min(word['bounding_box'][1] for word in relevant_words)
|
926 |
+
# right = max(word['bounding_box'][2] for word in relevant_words)
|
927 |
+
# bottom = max(word['bounding_box'][3] for word in relevant_words)
|
928 |
+
|
929 |
+
# # Combine the text of all relevant words
|
930 |
+
# combined_text = " ".join(word['text'] for word in relevant_words)
|
931 |
+
|
932 |
+
# # Calculate new dimensions for the merged box
|
933 |
+
# reconstructed_bbox = CustomImageRecognizerResult(
|
934 |
+
# bbox.entity_type,
|
935 |
+
# bbox.start,
|
936 |
+
# bbox.end,
|
937 |
+
# bbox.score,
|
938 |
+
# left,
|
939 |
+
# top,
|
940 |
+
# right - left, # width
|
941 |
+
# bottom - top, # height
|
942 |
+
# combined_text
|
943 |
+
# )
|
944 |
+
# # Add both the original and the merged bounding box
|
945 |
+
# reconstructed_bboxes.append(bbox) # Retain the original bbox
|
946 |
+
# reconstructed_bboxes.append(reconstructed_bbox) # Add the merged bbox
|
947 |
+
# break
|
948 |
+
# else:
|
949 |
+
# # If the bbox text is not found in any line in combined_results, keep the original bbox
|
950 |
+
# reconstructed_bboxes.append(bbox)
|
951 |
+
|
952 |
+
# # Group reconstructed bboxes by approximate vertical proximity
|
953 |
+
# for box in reconstructed_bboxes:
|
954 |
+
# grouped_bboxes[round(box.top / vertical_threshold)].append(box)
|
955 |
+
|
956 |
+
# # Merge within each group
|
957 |
+
# for _, group in grouped_bboxes.items():
|
958 |
+
# group.sort(key=lambda box: box.left)
|
959 |
+
|
960 |
+
# merged_box = group[0]
|
961 |
+
# for next_box in group[1:]:
|
962 |
+
# if next_box.left - (merged_box.left + merged_box.width) <= horizontal_threshold:
|
963 |
+
# # Calculate new dimensions for the merged box
|
964 |
+
# if merged_box.text == next_box.text:
|
965 |
+
# new_text = merged_box.text
|
966 |
+
# else:
|
967 |
+
# new_text = merged_box.text + " " + next_box.text
|
968 |
+
|
969 |
+
# if merged_box.text == next_box.text:
|
970 |
+
# new_text = merged_box.text
|
971 |
+
# new_entity_type = merged_box.entity_type # Keep the original entity type
|
972 |
+
# else:
|
973 |
+
# new_text = merged_box.text + " " + next_box.text
|
974 |
+
# new_entity_type = merged_box.entity_type + " - " + next_box.entity_type # Concatenate entity types
|
975 |
+
|
976 |
+
# new_left = min(merged_box.left, next_box.left)
|
977 |
+
# new_top = min(merged_box.top, next_box.top)
|
978 |
+
# new_width = max(merged_box.left + merged_box.width, next_box.left + next_box.width) - new_left
|
979 |
+
# new_height = max(merged_box.top + merged_box.height, next_box.top + next_box.height) - new_top
|
980 |
+
# merged_box = CustomImageRecognizerResult(
|
981 |
+
# new_entity_type, merged_box.start, merged_box.end, merged_box.score, new_left, new_top, new_width, new_height, new_text
|
982 |
+
# )
|
983 |
+
# else:
|
984 |
+
# merged_bboxes.append(merged_box)
|
985 |
+
# merged_box = next_box
|
986 |
+
|
987 |
+
# merged_bboxes.append(merged_box)
|
988 |
+
|
989 |
+
# #print("bboxes:", bboxes)
|
990 |
+
|
991 |
+
# return merged_bboxes
|
992 |
|
993 |
def redact_image_pdf(file_path:str,
|
994 |
prepared_pdf_file_paths:List[str],
|
|
|
1015 |
custom_recogniser_word_list:List[str]=[],
|
1016 |
redact_whole_page_list:List[str]=[],
|
1017 |
page_break_val:int=int(page_break_value),
|
1018 |
+
log_files_output_paths:List=[],
|
1019 |
max_time:int=int(max_time_value),
|
1020 |
progress=Progress(track_tqdm=True)):
|
1021 |
|
|
|
1047 |
- custom_recogniser_word_list (optional): A list of custom words that the user has chosen specifically to redact.
|
1048 |
- redact_whole_page_list (optional, List[str]): A list of pages to fully redact.
|
1049 |
- page_break_val (int, optional): The value at which to trigger a page break. Defaults to 3.
|
1050 |
+
- log_files_output_paths (List, optional): List of file paths used for saving redaction process logging results.
|
1051 |
- max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
|
1052 |
- progress (Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
|
1053 |
|
|
|
1070 |
if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
|
1071 |
print("Connection to AWS Comprehend service unsuccessful.")
|
1072 |
|
1073 |
+
return pymupdf_doc, all_decision_process_table, log_files_output_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
|
1074 |
|
1075 |
if analysis_type == textract_option and textract_client == "":
|
1076 |
print("Connection to AWS Textract service unsuccessful.")
|
1077 |
|
1078 |
+
return pymupdf_doc, all_decision_process_table, log_files_output_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
|
1079 |
|
1080 |
tic = time.perf_counter()
|
1081 |
|
|
|
1106 |
if analysis_type == textract_option:
|
1107 |
|
1108 |
json_file_path = output_folder + file_name + "_textract.json"
|
1109 |
+
log_files_output_paths.append(json_file_path)
|
1110 |
|
1111 |
if not os.path.exists(json_file_path):
|
1112 |
no_textract_file = True
|
1113 |
print("No existing Textract results file found.")
|
1114 |
existing_data = {}
|
1115 |
#text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
|
1116 |
+
#log_files_output_paths.append(json_file_path)
|
1117 |
#request_metadata = request_metadata + "\n" + new_request_metadata
|
1118 |
#wrapped_text_blocks = {"pages":[text_blocks]}
|
1119 |
else:
|
|
|
1184 |
|
1185 |
if not existing_data:
|
1186 |
text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
|
1187 |
+
log_files_output_paths.append(json_file_path)
|
1188 |
request_metadata = request_metadata + "\n" + new_request_metadata
|
1189 |
|
1190 |
existing_data = {"pages":[text_blocks]}
|
|
|
1212 |
|
1213 |
# if not os.path.exists(json_file_path):
|
1214 |
# text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
|
1215 |
+
# log_files_output_paths.append(json_file_path)
|
1216 |
# request_metadata = request_metadata + "\n" + new_request_metadata
|
1217 |
|
1218 |
# existing_data = {"pages":[text_blocks]}
|
|
|
1242 |
# with open(json_file_path, 'w') as json_file:
|
1243 |
# json.dump(existing_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
|
1244 |
|
1245 |
+
# log_files_output_paths.append(json_file_path)
|
1246 |
# request_metadata = request_metadata + "\n" + new_request_metadata
|
1247 |
# else:
|
1248 |
# # If the page exists, retrieve the data
|
|
|
1373 |
|
1374 |
current_loop_page += 1
|
1375 |
|
1376 |
+
return pymupdf_doc, all_decision_process_table, log_files_output_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
|
1377 |
|
1378 |
if is_pdf(file_path) == False:
|
1379 |
images.append(image)
|
|
|
1394 |
with open(json_file_path, 'w') as json_file:
|
1395 |
json.dump(existing_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
|
1396 |
|
1397 |
+
return pymupdf_doc, all_decision_process_table, log_files_output_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
|
1398 |
|
1399 |
if analysis_type == textract_option:
|
1400 |
# Write the updated existing textract data back to the JSON file
|
|
|
1402 |
with open(json_file_path, 'w') as json_file:
|
1403 |
json.dump(existing_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
|
1404 |
|
1405 |
+
return pymupdf_doc, all_decision_process_table, log_files_output_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
|
1406 |
|
1407 |
|
1408 |
###
|
|
|
1518 |
|
1519 |
return line_level_results_out, line_level_characters_out # Return both results and character objects
|
1520 |
|
1521 |
+
def merge_text_bounding_boxes(analyser_results, characters: List[LTChar], combine_pixel_dist: int = 20, vertical_padding: int = 0):
|
1522 |
'''
|
1523 |
Merge identified bounding boxes containing PII that are very close to one another
|
1524 |
'''
|
1525 |
analysed_bounding_boxes = []
|
1526 |
+
original_bounding_boxes = [] # List to hold original bounding boxes
|
1527 |
+
|
1528 |
if len(analyser_results) > 0 and len(characters) > 0:
|
1529 |
# Extract bounding box coordinates for sorting
|
1530 |
bounding_boxes = []
|
|
|
1531 |
for result in analyser_results:
|
1532 |
+
#print("Result:", result)
|
1533 |
char_boxes = [char.bbox for char in characters[result.start:result.end] if isinstance(char, LTChar)]
|
1534 |
char_text = [char._text for char in characters[result.start:result.end] if isinstance(char, LTChar)]
|
1535 |
if char_boxes:
|
|
|
1538 |
bottom = min(box[1] for box in char_boxes)
|
1539 |
right = max(box[2] for box in char_boxes)
|
1540 |
top = max(box[3] for box in char_boxes) + vertical_padding
|
1541 |
+
bbox = [left, bottom, right, top]
|
1542 |
+
bounding_boxes.append((bottom, left, result, bbox, char_text)) # (y, x, result, bbox, text)
|
1543 |
|
1544 |
+
# Store original bounding boxes
|
1545 |
+
original_bounding_boxes.append({"text": "".join(char_text), "boundingBox": bbox, "result": copy.deepcopy(result)})
|
1546 |
+
#print("Original bounding boxes:", original_bounding_boxes)
|
1547 |
|
1548 |
# Sort the results by y-coordinate and then by x-coordinate
|
1549 |
bounding_boxes.sort()
|
|
|
1554 |
current_result = None
|
1555 |
current_text = []
|
1556 |
|
1557 |
+
for y, x, result, next_box, text in bounding_boxes:
|
|
|
|
|
|
|
1558 |
if current_y is None or current_box is None:
|
1559 |
+
# Initialize the first bounding box
|
1560 |
+
current_box = next_box
|
1561 |
+
current_y = next_box[1]
|
1562 |
current_result = result
|
1563 |
current_text = list(text)
|
|
|
1564 |
else:
|
1565 |
+
vertical_diff_bboxes = abs(next_box[1] - current_y)
|
1566 |
+
horizontal_diff_bboxes = abs(next_box[0] - current_box[2])
|
1567 |
+
|
1568 |
+
if vertical_diff_bboxes <= 5 and horizontal_diff_bboxes <= combine_pixel_dist:
|
1569 |
+
# Merge bounding boxes
|
1570 |
+
#print("Merging boxes")
|
1571 |
+
merged_box = current_box.copy()
|
1572 |
+
merged_result = current_result
|
1573 |
+
merged_text = current_text.copy()
|
1574 |
+
|
1575 |
+
#print("current_box_max_x:", current_box[2])
|
1576 |
+
#print("char_max_x:", next_box[2])
|
1577 |
+
|
1578 |
+
merged_box[2] = next_box[2] # Extend horizontally
|
1579 |
+
merged_box[3] = max(current_box[3], next_box[3]) # Adjust the top
|
1580 |
+
merged_result.end = max(current_result.end, result.end) # Extend text range
|
1581 |
try:
|
1582 |
+
merged_result.entity_type = current_result.entity_type + " - " + result.entity_type
|
1583 |
except Exception as e:
|
1584 |
+
print("Unable to combine result entity types:", e)
|
|
|
|
|
1585 |
if current_text:
|
1586 |
+
merged_text.append(" ") # Add space between texts
|
1587 |
+
merged_text.extend(text)
|
1588 |
+
|
1589 |
+
merged_bounding_boxes.append({
|
1590 |
+
"text": "".join(merged_text),
|
1591 |
+
"boundingBox": merged_box,
|
1592 |
+
"result": merged_result
|
1593 |
+
})
|
1594 |
|
|
|
1595 |
else:
|
1596 |
+
# Save the current merged box before starting a new one
|
1597 |
+
# merged_bounding_boxes.append({
|
1598 |
+
# "text": "".join(current_text),
|
1599 |
+
# "boundingBox": current_box,
|
1600 |
+
# "result": current_result
|
1601 |
+
# })
|
1602 |
+
# Start a new bounding box
|
1603 |
+
current_box = next_box
|
1604 |
+
current_y = next_box[1]
|
1605 |
current_result = result
|
1606 |
current_text = list(text)
|
|
|
|
|
1607 |
|
1608 |
+
# Handle the last box
|
1609 |
+
# if current_box is not None:
|
1610 |
+
# merged_bounding_boxes.append({
|
1611 |
+
# "text": "".join(current_text),
|
1612 |
+
# "boundingBox": current_box,
|
1613 |
+
# "result": current_result
|
1614 |
+
# })
|
1615 |
+
|
1616 |
+
# Combine original and merged bounding boxes
|
1617 |
+
analysed_bounding_boxes.extend(original_bounding_boxes)
|
1618 |
+
analysed_bounding_boxes.extend(merged_bounding_boxes)
|
1619 |
+
|
1620 |
+
#print("Analysed bounding boxes:", analysed_bounding_boxes)
|
1621 |
+
|
1622 |
return analysed_bounding_boxes
|
1623 |
|
1624 |
|
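The merge rule in the new merge_text_bounding_boxes joins two PII boxes when they sit on roughly the same line (vertical difference of at most 5 points) and are within combine_pixel_dist of each other horizontally. A self-contained sketch of that criterion on bare [left, bottom, right, top] lists:

def should_merge(current_box, next_box, combine_pixel_dist=20):
    # Boxes are [left, bottom, right, top]; merge when on the same line and close horizontally
    vertical_diff = abs(next_box[1] - current_box[1])
    horizontal_diff = abs(next_box[0] - current_box[2])
    return vertical_diff <= 5 and horizontal_diff <= combine_pixel_dist

print(should_merge([10, 700, 60, 712], [75, 701, 120, 712]))   # True: same line, 15pt gap
print(should_merge([10, 700, 60, 712], [200, 701, 240, 712]))  # False: 140pt gap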
1625 |
+
# def merge_text_bounding_boxes(analyser_results, characters:List[LTChar], combine_pixel_dist:int=20, vertical_padding:int=0):
|
1626 |
+
# '''
|
1627 |
+
# Merge identified bounding boxes containing PII that are very close to one another
|
1628 |
+
# '''
|
1629 |
+
# analysed_bounding_boxes = []
|
1630 |
+
# if len(analyser_results) > 0 and len(characters) > 0:
|
1631 |
+
# # Extract bounding box coordinates for sorting
|
1632 |
+
# bounding_boxes = []
|
1633 |
+
# text_out = []
|
1634 |
+
# for result in analyser_results:
|
1635 |
+
# char_boxes = [char.bbox for char in characters[result.start:result.end] if isinstance(char, LTChar)]
|
1636 |
+
# char_text = [char._text for char in characters[result.start:result.end] if isinstance(char, LTChar)]
|
1637 |
+
# if char_boxes:
|
1638 |
+
# # Calculate the bounding box that encompasses all characters
|
1639 |
+
# left = min(box[0] for box in char_boxes)
|
1640 |
+
# bottom = min(box[1] for box in char_boxes)
|
1641 |
+
# right = max(box[2] for box in char_boxes)
|
1642 |
+
# top = max(box[3] for box in char_boxes) + vertical_padding
|
1643 |
+
# bounding_boxes.append((bottom, left, result, [left, bottom, right, top], char_text)) # (y, x, result, bbox, text)
|
1644 |
+
|
1645 |
+
# char_text = "".join(char_text)
|
1646 |
+
|
1647 |
+
# # Sort the results by y-coordinate and then by x-coordinate
|
1648 |
+
# bounding_boxes.sort()
|
1649 |
+
|
1650 |
+
# merged_bounding_boxes = []
|
1651 |
+
# current_box = None
|
1652 |
+
# current_y = None
|
1653 |
+
# current_result = None
|
1654 |
+
# current_text = []
|
1655 |
+
|
1656 |
+
# for y, x, result, char_box, text in bounding_boxes:
|
1657 |
+
# #print(f"Considering result: {result}")
|
1658 |
+
# #print(f"Character box: {char_box}")
|
1659 |
+
|
1660 |
+
# if current_y is None or current_box is None:
|
1661 |
+
# current_box = char_box
|
1662 |
+
# current_y = char_box[1]
|
1663 |
+
# current_result = result
|
1664 |
+
# current_text = list(text)
|
1665 |
+
# #print(f"Starting new box: {current_box}")
|
1666 |
+
# else:
|
1667 |
+
# vertical_diff_bboxes = abs(char_box[1] - current_y)
|
1668 |
+
# horizontal_diff_bboxes = abs(char_box[0] - current_box[2])
|
1669 |
+
|
1670 |
+
# if (
|
1671 |
+
# vertical_diff_bboxes <= 5 and horizontal_diff_bboxes <= combine_pixel_dist
|
1672 |
+
# ):
|
1673 |
+
# #print("box is being extended")
|
1674 |
+
# current_box[2] = char_box[2] # Extend the current box horizontally
|
1675 |
+
# current_box[3] = max(current_box[3], char_box[3]) # Ensure the top is the highest
|
1676 |
+
# current_result.end = max(current_result.end, result.end) # Extend the text range
|
1677 |
+
# try:
|
1678 |
+
# current_result.entity_type = current_result.entity_type + " - " + result.entity_type
|
1679 |
+
# except Exception as e:
|
1680 |
+
# print("Unable to combine result entity types:")
|
1681 |
+
# print(e)
|
1682 |
+
# # Add a space if current_text is not empty
|
1683 |
+
# if current_text:
|
1684 |
+
# current_text.append(" ") # Add space between texts
|
1685 |
+
# current_text.extend(text)
|
1686 |
+
|
1687 |
+
# #print(f"Latest merged box: {current_box[-1]}")
|
1688 |
+
# else:
|
1689 |
+
# merged_bounding_boxes.append(
|
1690 |
+
# {"text":"".join(current_text),"boundingBox": current_box, "result": current_result})
|
1691 |
+
|
1692 |
+
# # Reset current_box and current_y after appending
|
1693 |
+
# current_box = char_box
|
1694 |
+
# current_y = char_box[1]
|
1695 |
+
# current_result = result
|
1696 |
+
# current_text = list(text)
|
1697 |
+
|
1698 |
+
# # After finishing with the current result, add the last box for this result
|
1699 |
+
# if current_box:
|
1700 |
+
# merged_bounding_boxes.append({"text":"".join(current_text), "boundingBox": current_box, "result": current_result})
|
1701 |
+
|
1702 |
+
# if not merged_bounding_boxes:
|
1703 |
+
# analysed_bounding_boxes.extend(
|
1704 |
+
# {"text":text, "boundingBox": char.bbox, "result": result}
|
1705 |
+
# for result in analyser_results
|
1706 |
+
# for char in characters[result.start:result.end]
|
1707 |
+
# if isinstance(char, LTChar)
|
1708 |
+
# )
|
1709 |
+
# else:
|
1710 |
+
# analysed_bounding_boxes.extend(merged_bounding_boxes)
|
1711 |
+
|
1712 |
+
# return analysed_bounding_boxes
|
1713 |
+
|
1714 |
|
1715 |
def create_text_redaction_process_results(analyser_results, analysed_bounding_boxes, page_num):
|
1716 |
decision_process_table = pd.DataFrame()
|
tools/helper_functions.py
CHANGED
@@ -3,6 +3,7 @@ import re
|
|
3 |
import gradio as gr
|
4 |
import pandas as pd
|
5 |
import unicodedata
|
|
|
6 |
from gradio_image_annotation import image_annotator
|
7 |
|
8 |
def reset_state_vars():
|
@@ -38,13 +39,11 @@ textract_option = "AWS Textract service - all PDF types"
|
|
38 |
local_pii_detector = "Local"
|
39 |
aws_pii_detector = "AWS Comprehend"
|
40 |
|
|
|
|
|
41 |
|
42 | - env_var_name = 'GRADIO_OUTPUT_FOLDER'
43 | -
44 | - default_value = 'output/'
45 | -
46 | - output_folder = get_or_create_env_var(env_var_name, default_value)
47 | - print(f'The value of {env_var_name} is {output_folder}')
|
48 |
|
49 |
def load_in_default_allow_list(allow_list_file_path):
|
50 |
if isinstance(allow_list_file_path, str):
|
@@ -105,7 +104,7 @@ def ensure_output_folder_exists():
|
|
105 |
else:
|
106 |
print(f"The 'output/' folder already exists.")
|
107 |
|
108 |
-
def custom_regex_load(in_file):
|
109 |
'''
|
110 |
When file is loaded, update the column dropdown choices and write to relevant data states.
|
111 |
'''
|
@@ -113,6 +112,7 @@ def custom_regex_load(in_file):
|
|
113 |
custom_regex = pd.DataFrame()
|
114 |
|
115 |
if in_file:
|
|
|
116 |
|
117 |
file_list = [string.name for string in in_file]
|
118 |
|
@@ -122,13 +122,13 @@ def custom_regex_load(in_file):
|
|
122 |
custom_regex = pd.read_csv(regex_file_name, low_memory=False, header=None)
|
123 |
#regex_file_name_no_ext = get_file_path_end(regex_file_name)
|
124 |
|
125 | - output_text = "Allow list file loaded."
126 | print(output_text)
127 | else:
128 | - error = "No allow list file provided."
129 | - print(error)
130 | - output_text = error
131 | - return error, custom_regex
|
132 |
|
133 |
return output_text, custom_regex
|
134 |
|
|
|
3 |
import gradio as gr
|
4 |
import pandas as pd
|
5 |
import unicodedata
|
6 |
+
from typing import List
|
7 |
from gradio_image_annotation import image_annotator
|
8 |
|
9 |
def reset_state_vars():
|
|
|
39 |
local_pii_detector = "Local"
|
40 |
aws_pii_detector = "AWS Comprehend"
|
41 |
|
42 |
+
output_folder = get_or_create_env_var('GRADIO_OUTPUT_FOLDER', 'output/')
|
43 |
+
print(f'The value of GRADIO_OUTPUT_FOLDER is {output_folder}')
|
44 |
|
45 |
+
input_folder = get_or_create_env_var('GRADIO_INPUT_FOLDER', 'input/')
|
46 |
+
print(f'The value of GRADIO_INPUT_FOLDER is {input_folder}')
|
|
|
|
|
|
|
|
|
47 |
|
48 |
def load_in_default_allow_list(allow_list_file_path):
|
49 |
if isinstance(allow_list_file_path, str):
|
|
|
104 |
else:
|
105 |
print(f"The 'output/' folder already exists.")
|
106 |
|
107 |
+
def custom_regex_load(in_file:List[str], file_type:str = "Allow list"):
|
108 |
'''
|
109 |
When file is loaded, update the column dropdown choices and write to relevant data states.
|
110 |
'''
|
|
|
112 |
custom_regex = pd.DataFrame()
|
113 |
|
114 |
if in_file:
|
115 |
+
print("File type:", file_type)
|
116 |
|
117 |
file_list = [string.name for string in in_file]
|
118 |
|
|
|
122 |
custom_regex = pd.read_csv(regex_file_name, low_memory=False, header=None)
|
123 |
#regex_file_name_no_ext = get_file_path_end(regex_file_name)
|
124 |
|
125 |
+
output_text = file_type + " file loaded."
|
126 |
+
|
127 |
print(output_text)
|
128 |
else:
|
129 |
+
output_text = "No file provided."
|
130 |
+
print(output_text)
|
131 |
+
return output_text, custom_regex
|
|
|
132 |
|
133 |
return output_text, custom_regex
|
134 |
|
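A hedged usage sketch for the reworked custom_regex_load: in the app it receives Gradio upload objects, so a small stand-in with a .name attribute is enough to exercise it outside the UI. The deny_list.csv path and contents are illustrative, and real upload objects may carry more state:

import pandas as pd
from types import SimpleNamespace
from tools.helper_functions import custom_regex_load

# Illustrative one-column CSV of terms for the deny list
pd.DataFrame(["John Smith", "Project Falcon"]).to_csv("deny_list.csv", index=False, header=False)

uploaded = [SimpleNamespace(name="deny_list.csv")]  # mimics a Gradio file upload entry
message, deny_df = custom_regex_load(uploaded, file_type="Deny list")
print(message)                      # "Deny list file loaded."
print(deny_df.iloc[:, 0].tolist())  # ["John Smith", "Project Falcon"]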
tools/redaction_review.py
CHANGED
@@ -1,13 +1,15 @@
|
|
1 |
import gradio as gr
|
|
|
2 |
import numpy as np
|
3 |
from typing import List
|
4 |
from gradio_image_annotation import image_annotator
|
5 |
from gradio_image_annotation.image_annotator import AnnotatedImageData
|
6 |
|
7 |
-
from tools.file_conversion import is_pdf,
|
8 |
from tools.helper_functions import get_file_path_end, output_folder
|
9 |
from tools.file_redaction import redact_page_with_pymupdf
|
10 |
import json
|
|
|
11 |
import pymupdf
|
12 |
from fitz import Document
|
13 |
from PIL import ImageDraw, Image
|
@@ -138,13 +140,14 @@ def modify_existing_page_redactions(image_annotated:AnnotatedImageData, current_
|
|
138 |
|
139 |
return all_image_annotations, current_page, current_page
|
140 |
|
141 |
-
def apply_redactions(image_annotated:AnnotatedImageData, file_paths:str, doc:Document, all_image_annotations:List[AnnotatedImageData], current_page:int, progress=gr.Progress(track_tqdm=True)):
|
142 |
'''
|
143 |
-
Apply modified redactions to a pymupdf
|
144 |
'''
|
145 |
#print("all_image_annotations:", all_image_annotations)
|
146 |
|
147 |
output_files = []
|
|
|
148 |
|
149 |
image_annotated['image'] = all_image_annotations[current_page - 1]["image"]
|
150 |
|
@@ -154,86 +157,100 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:str, doc:Doc
|
|
154 |
print("No image annotations found")
|
155 |
return doc, all_image_annotations
|
156 |
|
157 | - if isinstance(file_paths, list):
158 | -
159 | - else:
160 | - file_path = file_paths
172 | - # image = Image.open(image_annotated['image'])
173 | - # except:
174 | - # image = Image.fromarray(image_annotated['image'].astype('uint8'))
179 | - coords = [img_annotation_box["xmin"],
180 | - img_annotation_box["ymin"],
181 | - img_annotation_box["xmax"],
182 | - img_annotation_box["ymax"]]
183-236 | - (remaining removed lines of the old apply_redactions are not legible in this extract)
237 |
|
238 |
def crop(annotations:AnnotatedImageData):
|
239 |
if annotations["boxes"]:
|
@@ -246,3 +263,21 @@ def crop(annotations:AnnotatedImageData):
|
|
246 |
|
247 |
def get_boxes_json(annotations:AnnotatedImageData):
|
248 |
return annotations["boxes"]
|
|
|
|
|
|
|
|
1 |
import gradio as gr
|
2 |
+
import pandas as pd
|
3 |
import numpy as np
|
4 |
from typing import List
|
5 |
from gradio_image_annotation import image_annotator
|
6 |
from gradio_image_annotation.image_annotator import AnnotatedImageData
|
7 |
|
8 |
+
from tools.file_conversion import is_pdf, convert_review_json_to_pandas_df
|
9 |
from tools.helper_functions import get_file_path_end, output_folder
|
10 |
from tools.file_redaction import redact_page_with_pymupdf
|
11 |
import json
|
12 |
+
import os
|
13 |
import pymupdf
|
14 |
from fitz import Document
|
15 |
from PIL import ImageDraw, Image
|
|
|
140 |
|
141 |
return all_image_annotations, current_page, current_page
|
142 |
|
143 |
+
def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], doc:Document, all_image_annotations:List[AnnotatedImageData], current_page:int, progress=gr.Progress(track_tqdm=True)):
|
144 |
'''
|
145 |
+
Apply modified redactions to a pymupdf and export review files
|
146 |
'''
|
147 |
#print("all_image_annotations:", all_image_annotations)
|
148 |
|
149 |
output_files = []
|
150 |
+
output_log_files = []
|
151 |
|
152 |
image_annotated['image'] = all_image_annotations[current_page - 1]["image"]
|
153 |
|
|
|
157 |
print("No image annotations found")
|
158 |
return doc, all_image_annotations
|
159 |
|
160 |
+
if isinstance(file_paths, str):
|
161 |
+
file_paths = [file_paths]
|
|
|
|
|
162 |
|
163 |
+
for file_path in file_paths:
|
164 |
+
print("file_path:", file_path)
|
165 |
+
file_base = get_file_path_end(file_path)
|
166 |
+
|
167 |
+
file_extension = os.path.splitext(file_path)[1].lower()
|
168 |
+
|
169 |
+
# If working with image docs
|
170 |
+
if (is_pdf(file_path) == False) & (file_extension not in '.csv'):
|
171 |
+
image = Image.open(file_paths[-1])
|
172 |
|
173 |
+
#image = pdf_doc
|
174 |
|
175 |
+
draw = ImageDraw.Draw(image)
|
|
|
|
|
|
|
176 |
|
177 |
+
for img_annotation_box in image_annotated['boxes']:
|
178 |
+
coords = [img_annotation_box["xmin"],
|
179 |
+
img_annotation_box["ymin"],
|
180 |
+
img_annotation_box["xmax"],
|
181 |
+
img_annotation_box["ymax"]]
|
182 |
|
183 |
+
fill = img_annotation_box["color"]
|
|
|
|
|
|
|
|
|
184 |
|
185 |
+
draw.rectangle(coords, fill=fill)
|
186 |
|
187 |
+
image.save(output_folder + file_base + "_redacted.png")
|
188 |
|
189 |
+
doc = [image]
|
190 |
|
191 |
+
elif file_extension in '.csv':
|
192 |
+
print("This is a csv")
|
193 |
+
pdf_doc = []
|
194 |
|
195 |
+
# If working with pdfs
|
196 |
+
elif is_pdf(file_path) == True:
|
197 |
+
pdf_doc = pymupdf.open(file_path)
|
198 |
+
|
199 |
+
number_of_pages = pdf_doc.page_count
|
200 |
+
|
201 |
+
print("Saving pages to file.")
|
202 |
+
|
203 |
+
for i in progress.tqdm(range(0, number_of_pages), desc="Saving redactions to file", unit = "pages"):
|
204 |
+
|
205 |
+
#print("Saving page", str(i))
|
206 |
+
|
207 |
+
image_loc = all_image_annotations[i]['image']
|
208 |
+
#print("Image location:", image_loc)
|
209 |
+
|
210 |
+
# Load in image object
|
211 |
+
if isinstance(image_loc, np.ndarray):
|
212 |
+
image = Image.fromarray(image_loc.astype('uint8'))
|
213 |
+
#all_image_annotations[i]['image'] = image_loc.tolist()
|
214 |
+
elif isinstance(image_loc, Image.Image):
|
215 |
+
image = image_loc
|
216 |
+
#image_out_folder = output_folder + file_base + "_page_" + str(i) + ".png"
|
217 |
+
#image_loc.save(image_out_folder)
|
218 |
+
#all_image_annotations[i]['image'] = image_out_folder
|
219 |
+
elif isinstance(image_loc, str):
|
220 |
+
image = Image.open(image_loc)
|
221 |
+
|
222 |
+
pymupdf_page = pdf_doc.load_page(i) #doc.load_page(current_page -1)
|
223 |
+
pymupdf_page = redact_page_with_pymupdf(pymupdf_page, all_image_annotations[i], image)
|
224 |
+
|
225 |
+
else:
|
226 |
+
print("File type not recognised.")
|
227 |
+
|
228 |
+
#try:
|
229 |
+
if pdf_doc:
|
230 |
+
out_pdf_file_path = output_folder + file_base + "_redacted.pdf"
|
231 |
+
pdf_doc.save(out_pdf_file_path)
|
232 |
+
output_files.append(out_pdf_file_path)
|
233 |
+
|
234 |
+
try:
|
235 |
+
# print("Saving annotations to JSON")
|
236 |
+
|
237 |
+
out_annotation_file_path = output_folder + file_base + '_review_file.json'
|
238 |
+
with open(out_annotation_file_path, 'w') as f:
|
239 |
+
json.dump(all_image_annotations, f)
|
240 |
+
output_log_files.append(out_annotation_file_path)
|
241 |
+
|
242 |
+
print("Saving annotations to CSV review file")
|
243 |
+
|
244 |
+
# Convert json to csv and also save this
|
245 |
+
review_df = convert_review_json_to_pandas_df(all_image_annotations)
|
246 |
+
out_review_file_file_path = output_folder + file_base + '_review_file.csv'
|
247 |
+
review_df.to_csv(out_review_file_file_path, index=None)
|
248 |
+
output_files.append(out_review_file_file_path)
|
249 |
+
|
250 |
+
except Exception as e:
|
251 |
+
print("Could not save annotations to JSON or CSV review file:", e)
|
252 |
+
|
253 |
+
return doc, all_image_annotations, output_files, output_log_files
|
254 |
|
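For image inputs, the branch above simply paints filled rectangles over the page image. A self-contained sketch of that idea with PIL; sizes, coordinates and the output path are illustrative:

from PIL import Image, ImageDraw

image = Image.new("RGB", (800, 1000), "white")   # stands in for a scanned page
draw = ImageDraw.Draw(image)

boxes = [{"xmin": 100, "ymin": 200, "xmax": 260, "ymax": 220, "color": (0, 0, 0)}]
for box in boxes:
    draw.rectangle([box["xmin"], box["ymin"], box["xmax"], box["ymax"]], fill=box["color"])

image.save("page_redacted.png")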
255 |
def crop(annotations:AnnotatedImageData):
|
256 |
if annotations["boxes"]:
|
|
|
263 |
|
264 |
def get_boxes_json(annotations:AnnotatedImageData):
|
265 |
return annotations["boxes"]
|
266 |
+
# Group the DataFrame by the 'image' column
|
267 |
+
grouped = df.groupby('image')
|
268 |
+
|
269 |
+
# Create a list to hold the JSON data
|
270 |
+
json_data = []
|
271 |
+
|
272 |
+
# Iterate over each group
|
273 |
+
for image_path, group in grouped:
|
274 |
+
# Convert each group to a list of box dictionaries
|
275 |
+
boxes = group.drop(columns='image').to_dict(orient='records')
|
276 |
+
|
277 |
+
# Append the structured data to the json_data list
|
278 |
+
json_data.append({
|
279 |
+
"image": image_path,
|
280 |
+
"boxes": boxes
|
281 |
+
})
|
282 |
+
|
283 |
+
return json_data
|
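The trailing added lines above group a review DataFrame by its 'image' column and rebuild the per-page annotation structure; the enclosing function definition sits outside the hunk shown here. A hedged, self-contained sketch of that round trip, using a hypothetical wrapper name:

import pandas as pd

def review_df_to_annotation_json(df: pd.DataFrame) -> list:
    # Hypothetical wrapper name; mirrors the groupby logic in the added lines above
    json_data = []
    for image_path, group in df.groupby('image'):
        boxes = group.drop(columns='image').to_dict(orient='records')
        json_data.append({"image": image_path, "boxes": boxes})
    return json_data

df = pd.DataFrame([
    {"image": "page_1.png", "label": "PERSON", "xmin": 100, "ymin": 200, "xmax": 260, "ymax": 220},
    {"image": "page_1.png", "label": "EMAIL_ADDRESS", "xmin": 300, "ymin": 400, "xmax": 480, "ymax": 420},
])
print(review_df_to_annotation_json(df))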