Merge pull request #11 from seanpedrick-case/dev
Files changed:

- .dockerignore +1 -1
- .gitignore +1 -0
- Dockerfile +1 -0
- README.md +32 -2
- app.py +41 -40
- how_to_create_exe_dist.txt +4 -2
- requirements.txt +17 -16
- tools/aws_functions.py +10 -1
- tools/custom_image_analyser_engine.py +1 -5
- tools/file_conversion.py +0 -4
- tools/file_redaction.py +64 -130
- tools/redaction_review.py +2 -2
.dockerignore CHANGED

```diff
@@ -16,5 +16,5 @@ build/*
 dist/*
 build_deps/*
 logs/*
-
+config/*
 user_guide/*
```
.gitignore CHANGED

```diff
@@ -16,5 +16,6 @@ build/*
 dist/*
 build_deps/*
 logs/*
+config/*
 doc_redaction_amplify_app/*
 user_guide/*
```
Dockerfile CHANGED

```diff
@@ -56,6 +56,7 @@ RUN mkdir -p /home/user/app/output \
 && mkdir -p /home/user/app/input \
 && mkdir -p /home/user/app/tld \
 && mkdir -p /home/user/app/logs \
+&& mkdir -p /home/user/app/config \
 && chown -R user:user /home/user/app

 # Copy installed packages from builder stage
```
README.md CHANGED

```diff
@@ -34,7 +34,16 @@ NOTE: The app is not 100% accurate, and it will miss some personal information.
 - [Handwriting and signature redaction](#handwriting-and-signature-redaction)
 - [Reviewing and modifying suggested redactions](#reviewing-and-modifying-suggested-redactions)
 
-See the [advanced user guide here](#advanced-user-guide)
+See the [advanced user guide here](#advanced-user-guide):
+- [Modifying and merging redaction review files](#modifying-and-merging-redaction-review-files)
+- [Modifying existing redaction review files](#modifying-existing-redaction-review-files)
+- [Merging existing redaction review files](#merging-existing-redaction-review-files)
+- [Identifying and redacting duplicate pages](#identifying-and-redacting-duplicate-pages)
+- [Fuzzy search and redaction](#fuzzy-search-and-redaction)
+- [Export redactions to and import from Adobe Acrobat](#export-to-and-import-from-adobe)
+- [Exporting to Adobe Acrobat](#exporting-to-adobe-acrobat)
+- [Importing from Adobe Acrobat](#importing-from-adobe-acrobat)
+- [Using AWS Textract and Comprehend when not running in an AWS environment](#using-aws-textract-and-comprehend-when-not-running-in-an-aws-environment)
 
 ## Example data files
 
@@ -292,4 +301,25 @@ The app also allows you to import .xfdf files from Adobe Acrobat. To do this, go
 
 When you click the 'convert .xfdf comment file to review_file.csv' button, the app should take you up to the top of the screen where the new review file has been created and can be downloaded.
 
-![Convert xfdf to csv](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/img/import_from_adobe.PNG)
+![Convert xfdf to csv](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/img/import_from_adobe.PNG)
+
+## Using AWS Textract and Comprehend when not running in an AWS environment
+
+AWS Textract and Comprehend give much better results for text extraction and document redaction than the local model options in the app. The most secure way to access them in the Redaction app is to run the app in a secure AWS environment with relevant permissions. Alternatively, you could run the app on your own system while logged in to AWS SSO with relevant permissions.
+
+However, it is possible to access these services directly via API from outside an AWS environment by creating IAM users and access keys with relevant permissions to access AWS Textract and Comprehend services. Please check with your IT and data security teams that this approach is acceptable for your data before trying the following approaches.
+
+To do the following, in your AWS environment you will need to create a new user with permissions for "textract:AnalyzeDocument", "textract:DetectDocumentText", and "comprehend:DetectPiiEntities". Under security credentials, create new access keys - note down the access key and secret key.
+
+### Direct access by passing AWS access keys through app
+The Redaction Settings tab now has boxes for entering the AWS access key and secret key. If you paste the relevant keys into these boxes before performing redaction, you should be able to use these services in the app.
+
+### Picking up AWS access keys through an .env file
+The app also has the capability of picking up AWS access key details through a .env file located at '/config/aws_config.env' (default), or an alternative .env file location specified by the environment variable AWS_CONFIG_PATH. The .env file should look like the following, with just two lines:
+
+AWS_ACCESS_KEY=<your-access-key>
+AWS_SECRET_KEY=<your-secret-key>
+
+The app should then pick up these keys when trying to access the AWS Textract and Comprehend services during redaction.
+
+Again, a lot can potentially go wrong with AWS solutions that are insecure, so before trying the above please consult with your AWS and data security teams.
```
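As a quick way to check that a new IAM user's keys and permissions work before pasting them into the app, a short boto3 sketch can exercise the three permissions listed above. This is not part of the app; the region and sample file name are placeholders:

```python
# Illustrative permissions check for the IAM user described in the README.
# Not part of the app; region and sample file are placeholders.
import boto3

access_key = "<your-access-key>"
secret_key = "<your-secret-key>"
region = "eu-west-2"

comprehend = boto3.client("comprehend", region_name=region,
                          aws_access_key_id=access_key, aws_secret_access_key=secret_key)
textract = boto3.client("textract", region_name=region,
                        aws_access_key_id=access_key, aws_secret_access_key=secret_key)

# comprehend:DetectPiiEntities
resp = comprehend.detect_pii_entities(Text="Jane Doe, 1 High Street", LanguageCode="en")
print([e["Type"] for e in resp["Entities"]])

# textract:DetectDocumentText and textract:AnalyzeDocument on a sample page image
with open("sample_page.png", "rb") as f:  # hypothetical file
    doc = {"Bytes": f.read()}
print(len(textract.detect_document_text(Document=doc)["Blocks"]))
print(len(textract.analyze_document(Document=doc, FeatureTypes=["SIGNATURES"])["Blocks"]))
```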
app.py CHANGED

```diff
@@ -2,7 +2,7 @@ import os
 import socket
 
 # By default TLDExtract will try to pull files from the internet. I have instead downloaded this file locally to avoid the requirement for an internet connection.
-os.environ['TLDEXTRACT_CACHE'] = 'tld/.tld_set_snapshot'
+#os.environ['TLDEXTRACT_CACHE'] = 'tld/.tld_set_snapshot'
 
 import gradio as gr
 import pandas as pd
@@ -65,7 +65,8 @@ with app:
 ###
 # STATE VARIABLES
 ###
-
+
+# Pymupdf doc and all image annotations objects need to be stored as State objects as they do not have a standard Gradio component equivalent
 pdf_doc_state = gr.State([])
 all_image_annotations_state = gr.State([])
 
@@ -73,12 +74,12 @@ with app:
 all_decision_process_table_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_decision_process_table", visible=False, type="pandas") # gr.State(pd.DataFrame())
 review_file_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="review_file_df", visible=False, type="pandas") #gr.State(pd.DataFrame())
 
-session_hash_state = gr.State()
-s3_output_folder_state = gr.State()
+session_hash_state = gr.Textbox(label= "session_hash_state", value="", visible=False) #.State()
+s3_output_folder_state = gr.Textbox(label= "s3_output_folder_state", value="", visible=False) #.State()
 
-first_loop_state = gr.State(True)
-second_loop_state = gr.State(False)
-do_not_save_pdf_state = gr.State(False)
+first_loop_state = gr.Checkbox(label="first_loop_state", value=True, visible=False) #.State(True)
+second_loop_state = gr.Checkbox(label="second_loop_state", value=False, visible=False) #.State(False)
+do_not_save_pdf_state = gr.Checkbox(label="do_not_save_pdf_state", value=False, visible=False) #.State(False)
 
 prepared_pdf_state = gr.Dropdown(label = "prepared_pdf_list", value="", allow_custom_value=True,visible=False) #gr.State([])
 images_pdf_state = gr.Dropdown(label = "images_pdf_list", value="", allow_custom_value=True,visible=False) #gr.State([]) # List of pdf pages converted to PIL images
@@ -92,12 +93,12 @@ with app:
 # Logging state
 log_file_name = 'log.csv'
 
-feedback_logs_state = gr.State(feedback_logs_folder + log_file_name)
-feedback_s3_logs_loc_state = gr.State(feedback_logs_folder)
-access_logs_state = gr.State(access_logs_folder + log_file_name)
-access_s3_logs_loc_state = gr.State(access_logs_folder)
-usage_logs_state = gr.State(usage_logs_folder + log_file_name)
-usage_s3_logs_loc_state = gr.State(usage_logs_folder)
+feedback_logs_state = gr.Textbox(label= "feedback_logs_state", value=feedback_logs_folder + log_file_name, visible=False) #State(feedback_logs_folder + log_file_name)
+feedback_s3_logs_loc_state = gr.Textbox(label= "feedback_s3_logs_loc_state", value=feedback_logs_folder, visible=False) #State(feedback_logs_folder)
+access_logs_state = gr.Textbox(label= "access_logs_state", value=access_logs_folder + log_file_name, visible=False) #State(access_logs_folder + log_file_name)
+access_s3_logs_loc_state = gr.Textbox(label= "access_s3_logs_loc_state", value=access_logs_folder, visible=False) #State(access_logs_folder)
+usage_logs_state = gr.Textbox(label= "usage_logs_state", value=usage_logs_folder + log_file_name, visible=False) #State(usage_logs_folder + log_file_name)
+usage_s3_logs_loc_state = gr.Textbox(label= "usage_s3_logs_loc_state", value=usage_logs_folder, visible=False) #State(usage_logs_folder)
 
 # Invisible text boxes to hold the session hash/username, Textract request metadata, data file names just for logging purposes.
 session_hash_textbox = gr.Textbox(label= "session_hash_textbox", value="", visible=False)
@@ -121,11 +122,11 @@ with app:
 
 ## Annotator zoom value
 annotator_zoom_number = gr.Number(label = "Current annotator zoom level", value=80, precision=0, visible=False)
-zoom_true_bool = gr.State(True)
-zoom_false_bool = gr.State(False)
+zoom_true_bool = gr.Checkbox(label="zoom_true_bool", value=True, visible=False) #State(True)
+zoom_false_bool = gr.Checkbox(label="zoom_false_bool", value=False, visible=False) #State(False)
 
-clear_all_page_redactions = gr.State(True)
-prepare_for_review_bool = gr.Checkbox(value=True, visible=False)
+clear_all_page_redactions = gr.Checkbox(label="clear_all_page_redactions", value=True, visible=False) #State(True)
+prepare_for_review_bool = gr.Checkbox(label="prepare_for_review_bool", value=True, visible=False)
 
 ## Settings page variables
 default_allow_list_file_name = "default_allow_list.csv"
@@ -148,11 +149,11 @@ with app:
 default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=default_allow_list_loc, visible=False)
 
 # Base dataframe for recognisers that is not modified subsequent to load
-recogniser_entity_dataframe_base = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=2, type="pandas", visible=False)
+recogniser_entity_dataframe_base = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=2, type="pandas", visible=False, label="recogniser_entity_dataframe_base")
 
 # Duplicate page detection
 in_duplicate_pages_text = gr.Textbox(label="in_duplicate_pages_text", visible=False)
-duplicate_pages_df = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="…
+duplicate_pages_df = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="duplicate_pages_df", visible=False, type="pandas")
 
 
 
@@ -177,12 +178,12 @@ with app:
 with gr.Tab("Redact PDFs/images"):
 with gr.Accordion("Redact document", open = True):
 in_doc_files = gr.File(label="Choose a document or image file (PDF, JPG, PNG)", file_count= "single", file_types=['.pdf', '.jpg', '.png', '.json'], height=file_input_height)
-if RUN_AWS_FUNCTIONS == "1":
…
-else:
…
+# if RUN_AWS_FUNCTIONS == "1":
+in_redaction_method = gr.Radio(label="Choose text extraction method. AWS Textract has a cost per page - $3.50 per 1,000 pages with signature detection (default), $1.50 without. Go to Redaction settings - AWS Textract options to remove signature detection.", value = default_ocr_val, choices=[text_ocr_option, tesseract_ocr_option, textract_option])
+pii_identification_method_drop = gr.Radio(label = "Choose PII detection method. AWS Comprehend has a cost of approximately $0.01 per 10,000 characters.", value = default_pii_detector, choices=[local_pii_detector, aws_pii_detector])
+# else:
+#     in_redaction_method = gr.Radio(label="Choose text extraction method.", value = default_ocr_val, choices=[text_ocr_option, tesseract_ocr_option])
+#     pii_identification_method_drop = gr.Radio(label = "Choose PII detection method.", value = default_pii_detector, choices=[local_pii_detector], visible=False)
 
 gr.Markdown("""If you only want to redact certain pages, or certain entities (e.g. just email addresses, or a custom list of terms), please go to the redaction settings tab.""")
 document_redact_btn = gr.Button("Redact document", variant="primary")
@@ -336,14 +337,14 @@ with app:
 page_min = gr.Number(precision=0,minimum=0,maximum=9999, label="Lowest page to redact")
 page_max = gr.Number(precision=0,minimum=0,maximum=9999, label="Highest page to redact")
 
-with gr.Accordion("AWS Textract …
+with gr.Accordion("AWS Textract options", open = False):
 handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract settings", choices=["Redact all identified handwriting", "Redact all identified signatures"], value=["Redact all identified handwriting", "Redact all identified signatures"])
 #with gr.Row():
 in_redact_language = gr.Dropdown(value = "en", choices = ["en"], label="Redaction language (only English currently supported)", multiselect=False, visible=False)
 
 with gr.Row():
-aws_access_key_textbox = gr.Textbox(value='', label="AWS access key for account with permissions for AWS Textract and Comprehend", visible=…
-aws_secret_key_textbox = gr.Textbox(value='', label="AWS secret key for account with permissions for AWS Textract and Comprehend", visible=…
+aws_access_key_textbox = gr.Textbox(value='', label="AWS access key for account with permissions for AWS Textract and Comprehend", visible=True, type="password")
+aws_secret_key_textbox = gr.Textbox(value='', label="AWS secret key for account with permissions for AWS Textract and Comprehend", visible=True, type="password")
 
 with gr.Accordion("Settings for open text or xlsx/csv files", open = False):
 anon_strat = gr.Radio(choices=["replace with <REDACTED>", "replace with <ENTITY_NAME>", "redact", "hash", "mask", "encrypt", "fake_first_name"], label="Select an anonymisation method.", value = "replace with <REDACTED>")
@@ -355,8 +356,6 @@ with app:
 merge_multiple_review_files_btn = gr.Button("Merge multiple review files into one", variant="primary")
 
 
-
-
 ### UI INTERACTION ###
 
 ###
@@ -365,14 +364,13 @@ with app:
 in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list])
 
 document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
…
-outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files], api_name="redact_doc").\
+then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state],
+outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state], api_name="redact_doc").\
 then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
 
 # If the app has completed a batch of pages, it will run this until the end of all pages in the document
-current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox],
-outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files]).\
+current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state],
+outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state]).\
 then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
 
 # If a file has been completed, the function will continue onto the next document
@@ -386,7 +384,7 @@ with app:
 # Upload previous files for modifying redactions
 upload_previous_review_file_btn.click(fn=reset_review_vars, inputs=None, outputs=[recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
 then(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
-then(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, …
+then(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state], api_name="prepare_doc").\
 then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
 
 # Page controls at top
@@ -445,12 +443,12 @@ with app:
 
 # Convert review file to xfdf Adobe format
 convert_review_file_to_adobe_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
-then(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, …
+then(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state]).\
 then(convert_df_to_xfdf, inputs=[output_review_files, pdf_doc_state, images_pdf_state], outputs=[adobe_review_files_out])
 
 # Convert xfdf Adobe file back to review_file.csv
 convert_adobe_to_review_file_btn.click(fn=get_input_file_names, inputs=[adobe_review_files_out], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
-then(fn = prepare_image_or_pdf, inputs=[adobe_review_files_out, in_redaction_method, …
+then(fn = prepare_image_or_pdf, inputs=[adobe_review_files_out, in_redaction_method, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state]).\
 then(fn=convert_xfdf_to_dataframe, inputs=[adobe_review_files_out, pdf_doc_state, images_pdf_state], outputs=[output_review_files], scroll_to_output=True)
 
 ###
@@ -542,14 +540,17 @@ print(f'The value of GRADIO_SERVER_PORT is {GRADIO_SERVER_PORT}')
 ROOT_PATH = get_or_create_env_var('ROOT_PATH', '')
 print(f'The value of ROOT_PATH is {ROOT_PATH}')
 
+DEFAULT_CONCURRENCY_LIMIT = get_or_create_env_var('DEFAULT_CONCURRENCY_LIMIT', '5')
+print(f'The value of DEFAULT_CONCURRENCY_LIMIT is {DEFAULT_CONCURRENCY_LIMIT}')
+
 if __name__ == "__main__":
 
 if RUN_DIRECT_MODE == "0":
 
 if os.environ['COGNITO_AUTH'] == "1":
-app.queue(max_size=MAX_QUEUE_SIZE).launch(show_error=True, auth=authenticate_user, max_file_size=MAX_FILE_SIZE, server_port=GRADIO_SERVER_PORT, root_path=ROOT_PATH)
+app.queue(max_size=int(MAX_QUEUE_SIZE), default_concurrency_limit=int(DEFAULT_CONCURRENCY_LIMIT)).launch(show_error=True, auth=authenticate_user, max_file_size=MAX_FILE_SIZE, server_port=GRADIO_SERVER_PORT, root_path=ROOT_PATH)
 else:
-app.queue(max_size=MAX_QUEUE_SIZE).launch(show_error=True, inbrowser=True, max_file_size=MAX_FILE_SIZE, server_port=GRADIO_SERVER_PORT, root_path=ROOT_PATH)
+app.queue(max_size=int(MAX_QUEUE_SIZE), default_concurrency_limit=int(DEFAULT_CONCURRENCY_LIMIT)).launch(show_error=True, inbrowser=True, max_file_size=MAX_FILE_SIZE, server_port=GRADIO_SERVER_PORT, root_path=ROOT_PATH)
 
 else:
 from tools.cli_redact import main
```
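A recurring change in app.py is swapping gr.State holders for invisible standard components (Textbox/Checkbox) that carry the same values between events; one plausible motivation is that, unlike gr.State, invisible components have stable names and appear in the generated API. The sketch below is illustrative only (component names are hypothetical, not from this PR) and combines that pattern with the new configurable queue concurrency:

```python
# Minimal sketch of the two app.py patterns above, assuming Gradio 5.x.
# Names here are hypothetical; only the pattern mirrors the PR.
import os
import gradio as gr

# Env-var-with-default pattern, as used for DEFAULT_CONCURRENCY_LIMIT
concurrency = int(os.environ.get("DEFAULT_CONCURRENCY_LIMIT", "5"))

with gr.Blocks() as demo:
    # Invisible Checkbox standing in for gr.State(True): same boolean value,
    # but it is a named, serialisable component
    first_loop = gr.Checkbox(label="first_loop", value=True, visible=False)
    out = gr.Textbox(label="result")
    btn = gr.Button("Run")

    def run(is_first_loop: bool) -> str:
        return "first pass" if is_first_loop else "later pass"

    # The flag is passed like any other input component
    btn.click(run, inputs=[first_loop], outputs=[out])

demo.queue(max_size=5, default_concurrency_limit=concurrency).launch()
```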
how_to_create_exe_dist.txt CHANGED

```diff
@@ -1,3 +1,5 @@
+Here are instructions for creating an .exe runnable version of the redaction app. Tested until Gradio version 5.17.0
+
 1. Create minimal environment to run the app in conda. E.g. 'conda create --name new_env'
 
 2. Activate the environment 'conda activate new_env'
@@ -14,7 +16,7 @@ NOTE: for ensuring that spaCy models are loaded into the program correctly in re
 
 9. Run the following (This helped me: https://github.com/pyinstaller/pyinstaller/issues/8108):
 
-a) In command line: pyi-makespec --additional-hooks-dir="build_deps" --add-data "tesseract/:tesseract/" --add-data "poppler/poppler-24.02.0/:poppler/poppler-24.02.0/" --collect-data=gradio_client --collect-data=gradio --hidden-import=gradio_image_annotation --collect-data=gradio_image_annotation --collect-all=gradio_image_annotation --hidden-import pyarrow.vendored.version --hidden-import pydicom.encoders --hidden-import=safehttpx --collect-all=safehttpx --hidden-import=presidio_analyzer --collect-all=presidio_analyzer --hidden-import=presidio_anonymizer --collect-all=presidio_anonymizer --hidden-import=presidio_image_redactor --collect-all=presidio_image_redactor --name DocRedactApp_0.…
+a) In command line: pyi-makespec --additional-hooks-dir="build_deps" --add-data "tesseract/:tesseract/" --add-data "poppler/poppler-24.02.0/:poppler/poppler-24.02.0/" --collect-data=gradio_client --collect-data=gradio --hidden-import=gradio_image_annotation --collect-data=gradio_image_annotation --collect-all=gradio_image_annotation --hidden-import pyarrow.vendored.version --hidden-import pydicom.encoders --hidden-import=safehttpx --collect-all=safehttpx --hidden-import=presidio_analyzer --collect-all=presidio_analyzer --hidden-import=presidio_anonymizer --collect-all=presidio_anonymizer --hidden-import=presidio_image_redactor --collect-all=presidio_image_redactor --name DocRedactApp_0.3.0 app.py
 
 # Add --onefile to the above if you would like everything packaged as a single exe, although this will need to be extracted upon starting the app, slowing down initialisation time significantly.
 
@@ -30,7 +32,7 @@ a = Analysis(
 
 hook-presidio-image-redactor.py
 
-c) Back in command line, run this: pyinstaller --clean --noconfirm DocRedactApp_0.…
+c) Back in command line, run this: pyinstaller --clean --noconfirm DocRedactApp_0.3.0.spec
 
 
 9. A 'dist' folder will be created with the executable inside along with all dependencies('dist\redaction').
```
requirements.txt CHANGED

```diff
@@ -1,30 +1,31 @@
-pdfminer.six==…
+pdfminer.six==20240706
 pdf2image==1.17.0
-pymupdf==1.…
+pymupdf==1.25.3
 opencv-python==4.10.0.84
-presidio_analyzer==2.2.…
-presidio_anonymizer==2.2.…
-presidio-image-redactor==0.0.…
-pikepdf==…
+presidio_analyzer==2.2.357
+presidio_anonymizer==2.2.357
+presidio-image-redactor==0.0.55
+pikepdf==9.5.2
 pandas==2.2.3
 nltk==3.9.1
-scikit-learn==1.…
+scikit-learn==1.6.1
 spacy==3.8.3
 #en_core_web_lg @ https://github.com/explosion/spacy-#models/releases/download/en_core_web_lg-3.8.0/en_core_web_sm-#3.8.0.tar.gz
 en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
-gradio==5.…
+gradio==5.18.0
-boto3==1.36.…
+boto3==1.36.26
-pyarrow==…
+pyarrow==19.0.1
-openpyxl==3.1.…
+openpyxl==3.1.5
-Faker==…
+Faker==36.1.1
 python-levenshtein==0.26.1
 spaczz==0.6.1
-gradio_image_annotation==0.2.5
-# The following version includes rotation and image zoom options
-
+#gradio_image_annotation==0.2.5
+# The following version includes rotation and image zoom options
+https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.0/gradio_image_annotation-0.3.0-py3-none-any.whl
 rapidfuzz==3.12.1
+python-dotenv==1.0.1
 numpy==1.26.4
-awslambdaric==3.0.…
+awslambdaric==3.0.1
```
tools/aws_functions.py CHANGED

```diff
@@ -4,18 +4,27 @@ import boto3
 import tempfile
 import os
 from tools.helper_functions import get_or_create_env_var
+from dotenv import load_dotenv
 
 PandasDataFrame = Type[pd.DataFrame]
 
 # Get AWS credentials
 bucket_name=""
 
-RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "…
+RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "0")
 print(f'The value of RUN_AWS_FUNCTIONS is {RUN_AWS_FUNCTIONS}')
 
 AWS_REGION = get_or_create_env_var('AWS_REGION', 'eu-west-2')
 print(f'The value of AWS_REGION is {AWS_REGION}')
 
+# If you have an aws_config env file in the config folder, you can load in AWS keys this way
+AWS_CONFIG_PATH = get_or_create_env_var('AWS_CONFIG_PATH', '/env/aws_config.env')
+print(f'The value of AWS_CONFIG_PATH is {AWS_CONFIG_PATH}')
+
+if os.path.exists(AWS_CONFIG_PATH):
+    print("Loading AWS keys from config folder")
+    load_dotenv(AWS_CONFIG_PATH)
+
 AWS_ACCESS_KEY = get_or_create_env_var('AWS_ACCESS_KEY', '')
 if AWS_ACCESS_KEY:
     print(f'AWS_ACCESS_KEY found in environment variables')
```
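The effect of the block above is that load_dotenv copies the key=value pairs from aws_config.env into os.environ, where the existing AWS_ACCESS_KEY/AWS_SECRET_KEY lookups then find them. A minimal sketch of that flow, assuming python-dotenv is installed (the path below is illustrative; the app reads it from AWS_CONFIG_PATH):

```python
# Minimal sketch of the .env pickup above: load_dotenv populates os.environ
# from the file, so later environment lookups succeed.
import os
from dotenv import load_dotenv

load_dotenv("config/aws_config.env")  # illustrative path

print(bool(os.environ.get("AWS_ACCESS_KEY")))  # True once the file is loaded
print(bool(os.environ.get("AWS_SECRET_KEY")))
```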
tools/custom_image_analyser_engine.py CHANGED

```diff
@@ -515,6 +515,7 @@ def do_aws_comprehend_call(current_batch, current_batch_mapping, comprehend_clie
 
     except Exception as e:
         if attempt == max_retries - 1:
+            print("AWS Comprehend calls failed due to", e)
             raise
         time.sleep(retry_delay)
 
@@ -571,7 +572,6 @@ def run_page_text_redaction(
 allow_list=allow_list
 )
 
-#print("page_analyser_result:", page_analyser_result)
 
 all_text_line_results = map_back_entity_results(
 page_analyser_result,
@@ -579,10 +579,8 @@ def run_page_text_redaction(
 all_text_line_results
 )
 
-#print("all_text_line_results:", all_text_line_results)
 
 elif pii_identification_method == "AWS Comprehend":
-#print("page text:", page_text)
 
 # Process custom entities if any
 if custom_entities:
@@ -600,8 +598,6 @@ def run_page_text_redaction(
 allow_list=allow_list
 )
 
-print("page_analyser_result:", page_analyser_result)
-
 all_text_line_results = map_back_entity_results(
 page_analyser_result,
 page_text_mapping,
```
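The first hunk above logs the final error before re-raising once the retry budget is exhausted. The surrounding retry loop is not shown in the diff; a plausible reconstruction of the pattern (variable names assumed, not copied from the repo) looks like this:

```python
# Illustrative retry-with-delay pattern around an AWS Comprehend call.
# max_retries/retry_delay and the call itself are assumptions for this sketch.
import time

def call_with_retries(comprehend_client, text: str, max_retries: int = 3, retry_delay: int = 3):
    for attempt in range(max_retries):
        try:
            return comprehend_client.detect_pii_entities(Text=text, LanguageCode="en")
        except Exception as e:
            if attempt == max_retries - 1:
                # Log the final failure before giving up, as in the PR
                print("AWS Comprehend calls failed due to", e)
                raise
            time.sleep(retry_delay)
```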
tools/file_conversion.py CHANGED

```diff
@@ -464,12 +464,10 @@ def redact_whole_pymupdf_page(rect_height, rect_width, image, page, custom_colou
 def prepare_image_or_pdf(
     file_paths: List[str],
     in_redact_method: str,
-    in_allow_list: Optional[List[List[str]]] = None,
     latest_file_completed: int = 0,
     out_message: List[str] = [],
     first_loop_state: bool = False,
     number_of_pages:int = 1,
-    current_loop_page_number:int=0,
     all_annotations_object:List = [],
     prepare_for_review:bool = False,
     in_fully_redacted_list:List[int]=[],
@@ -484,12 +482,10 @@ def prepare_image_or_pdf(
     Args:
         file_paths (List[str]): List of file paths to process.
         in_redact_method (str): The redaction method to use.
-        in_allow_list (optional, Optional[List[List[str]]]): List of allowed terms for redaction.
         latest_file_completed (optional, int): Index of the last completed file.
         out_message (optional, List[str]): List to store output messages.
         first_loop_state (optional, bool): Flag indicating if this is the first iteration.
         number_of_pages (optional, int): integer indicating the number of pages in the document
-        current_loop_page_number (optional, int): Current number of loop
         all_annotations_object(optional, List of annotation objects): All annotations for current document
         prepare_for_review(optional, bool): Is this preparation step preparing pdfs and json files to review current redactions?
         in_fully_redacted_list(optional, List of int): A list of pages to fully redact
```
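With in_allow_list and current_loop_page_number removed from the signature, callers pass the remaining state straight through, as the reworked app.py wiring above reflects. A hedged call sketch against the trimmed signature, using keyword arguments and placeholder values (the file path is hypothetical, and further parameters after in_fully_redacted_list are omitted):

```python
# Illustrative call against the trimmed signature; values are placeholders.
from tools.file_conversion import prepare_image_or_pdf
from tools.helper_functions import text_ocr_option

out = prepare_image_or_pdf(
    file_paths=["example.pdf"],   # hypothetical input file
    in_redact_method=text_ocr_option,
    latest_file_completed=0,
    out_message=[],
    first_loop_state=True,
    number_of_pages=1,
    all_annotations_object=[],
    prepare_for_review=False,
    in_fully_redacted_list=[],
)
```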
tools/file_redaction.py
CHANGED
@@ -29,7 +29,7 @@ from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRRes
|
|
29 |
from tools.file_conversion import process_file, image_dpi, convert_review_json_to_pandas_df, redact_whole_pymupdf_page, redact_single_box, convert_pymupdf_to_image_coords
|
30 |
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser, CustomWordFuzzyRecognizer
|
31 |
from tools.helper_functions import get_file_name_without_type, output_folder, clean_unicode_text, get_or_create_env_var, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
|
32 |
-
from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
|
33 |
from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult
|
34 |
from tools.presidio_analyzer_custom import recognizer_result_from_dict
|
35 |
|
@@ -78,9 +78,9 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
78 |
custom_recogniser_word_list:List[str]=None,
|
79 |
redact_whole_page_list:List[str]=None,
|
80 |
latest_file_completed:int=0,
|
81 |
-
out_message:
|
82 |
-
out_file_paths:
|
83 |
-
log_files_output_paths:
|
84 |
first_loop_state:bool=False,
|
85 |
page_min:int=0,
|
86 |
page_max:int=999,
|
@@ -99,6 +99,8 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
99 |
match_fuzzy_whole_phrase_bool:bool=True,
|
100 |
aws_access_key_textbox:str='',
|
101 |
aws_secret_key_textbox:str='',
|
|
|
|
|
102 |
output_folder:str=output_folder,
|
103 |
progress=gr.Progress(track_tqdm=True)):
|
104 |
'''
|
@@ -136,6 +138,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
136 |
- match_fuzzy_whole_phrase_bool (bool, optional): A boolean where 'True' means that the whole phrase is fuzzy matched, and 'False' means that each word is fuzzy matched separately (excluding stop words).
|
137 |
- aws_access_key_textbox (str, optional): AWS access key for account with Textract and Comprehend permissions.
|
138 |
- aws_secret_key_textbox (str, optional): AWS secret key for account with Textract and Comprehend permissions.
|
|
|
139 |
- output_folder (str, optional): Output folder for results.
|
140 |
- progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
|
141 |
|
@@ -145,6 +148,13 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
145 |
tic = time.perf_counter()
|
146 |
all_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
|
147 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
148 |
#print("prepared_pdf_file_paths:", prepared_pdf_file_paths[0])
|
149 |
review_out_file_paths = [prepared_pdf_file_paths[0]]
|
150 |
|
@@ -212,7 +222,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
212 |
estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
|
213 |
print("Estimated total processing time:", str(estimate_total_processing_time))
|
214 |
|
215 |
-
return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths
|
216 |
|
217 |
# If we have reached the last page, return message
|
218 |
if current_loop_page >= number_of_pages:
|
@@ -228,7 +238,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
228 |
|
229 |
review_out_file_paths.extend(out_review_file_path)
|
230 |
|
231 |
-
return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = False, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths
|
232 |
|
233 |
# Create allow list
|
234 |
# If string, assume file path
|
@@ -241,45 +251,52 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
241 |
else:
|
242 |
in_allow_list_flat = []
|
243 |
|
244 |
-
|
245 |
-
# Try to connect to AWS services only if RUN_AWS_FUNCTIONS environmental variable is 1
|
246 |
if pii_identification_method == "AWS Comprehend":
|
247 |
print("Trying to connect to AWS Comprehend service")
|
248 |
-
if
|
249 |
-
|
250 |
-
|
|
|
251 |
comprehend_client = boto3.client('comprehend',
|
252 |
aws_access_key_id=aws_access_key_textbox,
|
253 |
aws_secret_access_key=aws_secret_key_textbox)
|
|
|
|
|
|
|
254 |
elif AWS_ACCESS_KEY and AWS_SECRET_KEY:
|
|
|
255 |
comprehend_client = boto3.client('comprehend',
|
256 |
aws_access_key_id=AWS_ACCESS_KEY,
|
257 |
-
aws_secret_access_key=AWS_SECRET_KEY)
|
258 |
else:
|
259 |
comprehend_client = ""
|
260 |
-
out_message = "Cannot connect to AWS Comprehend service. Please choose another PII identification method."
|
261 |
print(out_message)
|
262 |
-
return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths
|
263 |
else:
|
264 |
comprehend_client = ""
|
265 |
|
266 |
if in_redact_method == textract_option:
|
267 |
-
print("Trying to connect to AWS Textract service")
|
268 |
-
if
|
269 |
-
|
270 |
-
|
271 |
-
comprehend_client = boto3.client('textract',
|
272 |
aws_access_key_id=aws_access_key_textbox,
|
273 |
aws_secret_access_key=aws_secret_key_textbox)
|
|
|
|
|
|
|
274 |
elif AWS_ACCESS_KEY and AWS_SECRET_KEY:
|
275 |
-
|
|
|
276 |
aws_access_key_id=AWS_ACCESS_KEY,
|
277 |
-
aws_secret_access_key=AWS_SECRET_KEY)
|
278 |
else:
|
279 |
textract_client = ""
|
280 |
-
out_message = "Cannot connect to AWS Textract. Please choose another text extraction method."
|
281 |
print(out_message)
|
282 |
-
return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths
|
283 |
else:
|
284 |
textract_client = ""
|
285 |
|
@@ -301,9 +318,6 @@ def choose_and_run_redactor(file_paths:List[str],
 301     file_paths_list = file_paths
 302     file_paths_loop = [file_paths_list[int(latest_file_completed)]]
 303
-304     # print("file_paths_list in choose_redactor function:", file_paths_list)
-305
-306
 307     for file in file_paths_loop:
 308         if isinstance(file, str):
 309             file_path = file

@@ -313,7 +327,6 @@ def choose_and_run_redactor(file_paths:List[str],
 313         if file_path:
 314             pdf_file_name_without_ext = get_file_name_without_type(file_path)
 315             pdf_file_name_with_ext = os.path.basename(file_path)
-316             # print("Redacting file:", pdf_file_name_with_ext)
 317
 318         is_a_pdf = is_pdf(file_path) == True
 319         if is_a_pdf == False and in_redact_method == text_ocr_option:

@@ -324,14 +337,14 @@ def choose_and_run_redactor(file_paths:List[str],
 324             out_message = "No file selected"
 325             print(out_message)
 326
-327             return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths
 328
 329         if in_redact_method == tesseract_ocr_option or in_redact_method == textract_option:
 330
 331             #Analyse and redact image-based pdf or image
 332             if is_pdf_or_image(file_path) == False:
 333                 out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
-334                 return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths
 335
 336             print("Redacting file " + pdf_file_name_with_ext + " as an image-based file")
 337

@@ -361,14 +374,11 @@ def choose_and_run_redactor(file_paths:List[str],
 361                 custom_recogniser_word_list,
 362                 redact_whole_page_list,
 363                 max_fuzzy_spelling_mistakes_num,
-364                 match_fuzzy_whole_phrase_bool
-365
-366
-367             #print("log_files_output_paths at end of image redact function:", log_files_output_paths)
-368
 369             # Save Textract request metadata (if exists)
 370             if new_request_metadata:
-371                 #print("Request metadata:", new_request_metadata)
 372                 all_request_metadata.append(new_request_metadata)
 373
 374         elif in_redact_method == text_ocr_option:

@@ -377,7 +387,7 @@ def choose_and_run_redactor(file_paths:List[str],
 377
 378             if is_pdf(file_path) == False:
 379                 out_message = "Please upload a PDF file for text analysis. If you have an image, select 'Image analysis'."
-380                 return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths
 381
 382             # Analyse text-based pdf
 383             print('Redacting file as text-based PDF')

@@ -407,7 +417,7 @@ def choose_and_run_redactor(file_paths:List[str],
 407         else:
 408             out_message = "No redaction method selected"
 409             print(out_message)
-410             return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths
 411
 412         # If at last page, save to file
 413         if current_loop_page >= number_of_pages:
@@ -422,9 +432,6 @@ def choose_and_run_redactor(file_paths:List[str],
 422             # Save file
 423             if is_pdf(file_path) == False:
 424                 out_redacted_pdf_file_path = output_folder + pdf_file_name_without_ext + "_redacted_as_pdf.pdf"
-425                 #pymupdf_doc[0].save(out_redacted_pdf_file_path, "PDF" ,resolution=image_dpi, save_all=False)
-426                 #print("pymupdf_doc", pymupdf_doc)
-427                 #print("pymupdf_doc[0]", pymupdf_doc[0])
 428                 pymupdf_doc[-1].save(out_redacted_pdf_file_path, "PDF" ,resolution=image_dpi, save_all=False)#, append_images=pymupdf_doc[:1])
 429                 out_review_file_path = output_folder + pdf_file_name_without_ext + '_review_file.csv'
 430

@@ -434,10 +441,6 @@ def choose_and_run_redactor(file_paths:List[str],
 434
 435             out_file_paths.append(out_redacted_pdf_file_path)
 436
-437             #if log_files_output_paths:
-438             #    log_files_output_paths.extend(log_files_output_paths)
-439
-440
 441             out_orig_pdf_file_path = output_folder + pdf_file_name_with_ext
 442
 443             logs_output_file_name = out_orig_pdf_file_path + "_decision_process_output.csv"

@@ -450,27 +453,20 @@ def choose_and_run_redactor(file_paths:List[str],
 450
 451             # Save the gradio_annotation_boxes to a JSON file
 452             try:
-453
-454                 #print("Saving annotations to CSV")
-455
-456                 # Convert json to csv and also save this
-457                 #print("annotations_all_pages:", annotations_all_pages)
-458                 #print("all_decision_process_table:", all_decision_process_table)
-459
 460                 review_df = convert_review_json_to_pandas_df(annotations_all_pages, all_decision_process_table)
 461
 462                 out_review_file_path = out_orig_pdf_file_path + '_review_file.csv'
 463                 review_df.to_csv(out_review_file_path, index=None)
 464                 out_file_paths.append(out_review_file_path)
 465
-466                 print("Saved review file to csv")
 467
 468                 out_annotation_file_path = out_orig_pdf_file_path + '_review_file.json'
 469                 with open(out_annotation_file_path, 'w') as f:
 470                     json.dump(annotations_all_pages, f)
 471                 log_files_output_paths.append(out_annotation_file_path)
 472
-473                 print("Saving annotations to JSON")
 474
 475             except Exception as e:
 476                 print("Could not save annotations to json or csv file:", e)
@@ -488,7 +484,6 @@ def choose_and_run_redactor(file_paths:List[str],
 488             combined_out_message = combined_out_message + " " + out_time_message # Ensure this is a single string
 489
 490             estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
-491             #print("Estimated total processing time:", str(estimate_total_processing_time))
 492
 493         else:
 494             toc = time.perf_counter()

@@ -511,19 +506,12 @@ def choose_and_run_redactor(file_paths:List[str],
 511
 512     if combined_out_message: out_message = combined_out_message
 513
-514     #print("\nout_message at choose_and_run_redactor end is:", out_message)
-515
 516     # Ensure no duplicated output files
 517     log_files_output_paths = list(set(log_files_output_paths))
 518     out_file_paths = list(set(out_file_paths))
 519     review_out_file_paths = [prepared_pdf_file_paths[0], out_review_file_path]
 520
-521
-522     #print("out_file_paths:", out_file_paths)
-523     #print("review_out_file_paths:", review_out_file_paths)
-524
-525
-526     return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths
 527
 528 def convert_pikepdf_coords_to_pymupdf(pymupdf_page, pikepdf_bbox, type="pikepdf_annot"):
 529     '''

@@ -646,9 +634,6 @@ def convert_image_coords_to_pymupdf(pymupdf_page, annot, image:Image, type="imag
 646     # Unpack coordinates
 647     x1, y1, x2, y2 = rect_coordinates
 648
-649     #print("scale_width:", scale_width)
-650     #print("scale_height:", scale_height)
-651
 652     x1 = (x1* scale_width)# + page_x_adjust
 653     new_y1 = ((y2 + (y1 - y2))* scale_height)# - page_y_adjust # Calculate y1 correctly
 654     x2 = ((x1 + (x2 - x1)) * scale_width)# + page_x_adjust # Calculate x1
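The hunk above only strips debug prints from the image-to-PyMuPDF coordinate conversion, but for readers unfamiliar with the scaling involved, a minimal sketch of the idea follows (illustrative names, not the repo's exact function):

import fitz  # PyMuPDF
from PIL import Image

def scale_image_box_to_page(pymupdf_page: fitz.Page, image: Image.Image,
                            x1: float, y1: float, x2: float, y2: float) -> fitz.Rect:
    # Ratio between the PDF page size (points) and the rendered image size (pixels).
    scale_width = pymupdf_page.rect.width / image.width
    scale_height = pymupdf_page.rect.height / image.height
    # Both coordinate systems here have a top-left origin, so a pure scale
    # is enough; no y-axis flip is needed for PyMuPDF rectangles.
    return fitz.Rect(x1 * scale_width, y1 * scale_height,
                     x2 * scale_width, y2 * scale_height)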
@@ -1005,12 +990,10 @@ def redact_image_pdf(file_path:str,
 1005     if custom_recogniser_word_list:
 1006         nlp_analyser.registry.remove_recognizer("CUSTOM")
 1007         new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
-1008         #print("new_custom_recogniser:", new_custom_recogniser)
 1009         nlp_analyser.registry.add_recognizer(new_custom_recogniser)
 1010
 1011         nlp_analyser.registry.remove_recognizer("CustomWordFuzzyRecognizer")
 1012         new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_recogniser_word_list, spelling_mistakes_max=max_fuzzy_spelling_mistakes_num, search_whole_phrase=match_fuzzy_whole_phrase_bool)
-1013         #print("new_custom_recogniser:", new_custom_recogniser)
 1014         nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
 1015
 1016
@@ -1045,22 +1028,15 @@ def redact_image_pdf(file_path:str,
 1045     else: page_min = page_min - 1
 1046
 1047     print("Page range:", str(page_min + 1), "to", str(page_max))
-1048     #print("Current_loop_page:", current_loop_page)
 1049
 1050     # If running Textract, check if file already exists. If it does, load in existing data
-1051     # Import results from json and convert
 1052     if analysis_type == textract_option:
 1053
 1054         json_file_path = output_folder + file_name + "_textract.json"
 1055
-1056
 1057         if not os.path.exists(json_file_path):
 1058             print("No existing Textract results file found.")
 1059             textract_data = {}
-1060             #text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
-1061             #log_files_output_paths.append(json_file_path)
-1062             #request_metadata = request_metadata + "\n" + new_request_metadata
-1063             #wrapped_text_blocks = {"pages":[text_blocks]}
 1064         else:
 1065             # Open the file and load the JSON data
 1066             no_textract_file = False

@@ -1073,7 +1049,6 @@ def redact_image_pdf(file_path:str,
 1073                 textract_data = json.load(json_file)
 1074
 1075     ###
-1076
 1077     if current_loop_page == 0: page_loop_start = 0
 1078     else: page_loop_start = current_loop_page
 1079
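This hunk tidies the Textract caching logic: reuse a saved _textract.json when one exists, otherwise call the API once and persist the response. A sketch of the pattern, with fetch_page standing in for the repo's analyse_page_with_textract call:

import json, os

def load_or_fetch_textract(json_file_path: str, fetch_page):
    if os.path.exists(json_file_path):
        with open(json_file_path) as json_file:
            return json.load(json_file)           # cached response, no API cost
    textract_data = {"pages": [fetch_page()]}     # single paid API call
    with open(json_file_path, "w") as f:
        json.dump(textract_data, f)               # persist for later runs
    return textract_data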
@@ -1087,7 +1062,6 @@ def redact_image_pdf(file_path:str,
 1087         page_break_return = False
 1088
 1089         reported_page_number = str(page_no + 1)
-1090         #print("Redacting page:", reported_page_number)
 1091
 1092         # Assuming prepared_pdf_file_paths[page_no] is a PIL image object
 1093         try:

@@ -1104,7 +1078,6 @@ def redact_image_pdf(file_path:str,
 1104
 1105         #print("Image is in range of pages to redact")
 1106         if isinstance(image, str):
-1107             #print("image is a file path", image)
 1108             image = Image.open(image)
 1109
 1110         # Need image size to convert textract OCR outputs to the correct sizes

@@ -1153,7 +1126,7 @@ def redact_image_pdf(file_path:str,
 1153                     text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
 1154                 except Exception as e:
 1155                     print("Textract extraction for page", reported_page_number, "failed due to:", e)
-1156
 1157                     new_request_metadata = "Failed Textract API call"
 1158
 1159                 # Check if "pages" key exists, if not, initialize it as an empty list

@@ -1192,13 +1165,13 @@ def redact_image_pdf(file_path:str,
 1192             redaction_bboxes = []
 1193
 1194
-1195             if analysis_type == tesseract_ocr_option: interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".txt"
-1196             elif analysis_type == textract_option: interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.txt"
 1197
-1198             # Save decision making process
-1199             bboxes_str = str(redaction_bboxes)
-1200             with open(interim_results_file_path, "w") as f:
-1201                 f.write(bboxes_str)
 1202
 1203             # Merge close bounding boxes
 1204             merged_redaction_bboxes = merge_img_bboxes(redaction_bboxes, line_level_ocr_results_with_children, signature_recogniser_results, handwriting_recogniser_results, handwrite_signature_checkbox)
@@ -1210,7 +1183,6 @@ def redact_image_pdf(file_path:str,
 1210             all_image_annotations_boxes = []
 1211
 1212             for box in merged_redaction_bboxes:
-1213                 #print("box:", box)
 1214
 1215                 x0 = box.left
 1216                 y0 = box.top

@@ -1238,8 +1210,6 @@ def redact_image_pdf(file_path:str,
 1238
 1239         ## Apply annotations with pymupdf
 1240         else:
-1241             #print("merged_redaction_boxes:", merged_redaction_bboxes)
-1242             #print("redact_whole_page_list:", redact_whole_page_list)
 1243             if redact_whole_page_list:
 1244                 int_reported_page_number = int(reported_page_number)
 1245                 if int_reported_page_number in redact_whole_page_list: redact_whole_page = True

@@ -1284,8 +1254,6 @@ def redact_image_pdf(file_path:str,
 1284
 1285         time_taken = toc - tic
 1286
-1287         #print("toc - tic:", time_taken)
-1288
 1289         # Break if time taken is greater than max_time seconds
 1290         if time_taken > max_time:
 1291             print("Processing for", max_time, "seconds, breaking loop.")
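The break above enforces a per-call time budget so long documents can be processed across several invocations from the UI. A sketch of the idea, assuming a caller-supplied process_page helper:

import time

def redact_pages(pages, process_page, max_time: float = 30.0, current_loop_page: int = 0):
    tic = time.perf_counter()
    page_break_return = False
    for page_no in range(current_loop_page, len(pages)):
        process_page(pages[page_no])          # per-page work, supplied by the caller
        current_loop_page = page_no + 1
        if time.perf_counter() - tic > max_time:
            page_break_return = True          # signal the UI to call again from here
            break
    return current_loop_page, page_break_return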
@@ -1298,7 +1266,6 @@ def redact_image_pdf(file_path:str,
 1298             pymupdf_doc = images
 1299
 1300             # Check if the image already exists in annotations_all_pages
-1301             #print("annotations_all_pages:", annotations_all_pages)
 1302             existing_index = next((index for index, ann in enumerate(annotations_all_pages) if ann["image"] == image_annotations["image"]), None)
 1303             if existing_index is not None:
 1304                 # Replace the existing annotation

@@ -1315,6 +1282,8 @@ def redact_image_pdf(file_path:str,
 1315                 if json_file_path not in log_files_output_paths:
 1316                     log_files_output_paths.append(json_file_path)
 1317
 1318             current_loop_page += 1
 1319
 1320             return pymupdf_doc, all_decision_process_table, log_files_output_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number

@@ -1324,7 +1293,6 @@ def redact_image_pdf(file_path:str,
 1324         pymupdf_doc = images
 1325
 1326         # Check if the image already exists in annotations_all_pages
-1327         #print("annotations_all_pages:", annotations_all_pages)
 1328         existing_index = next((index for index, ann in enumerate(annotations_all_pages) if ann["image"] == image_annotations["image"]), None)
 1329         if existing_index is not None:
 1330             # Replace the existing annotation
@@ -1409,9 +1377,6 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
 1409
 1410         if isinstance(char, LTAnno):
 1411
-1412             # print("Character line:", "".join(character_text_objects_out))
-1413             # print("Char is an annotation object:", char)
-1414
 1415             added_text = char.get_text()
 1416
 1417             # Handle double quotes

@@ -1427,7 +1392,7 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
 1427
 1428             # Check for line break (assuming a new line is indicated by a specific character)
 1429             if '\n' in added_text:
-1430
 1431                 # Finalize the current line
 1432                 if current_word:
 1433                     word_bboxes.append((current_word, current_word_bbox))

@@ -1475,13 +1440,12 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
 1475         word_bboxes.append((current_word, current_word_bbox))
 1476
 1477     if full_text:
-1478         #print("full_text before:", full_text)
 1479         if re.search(r'[^\x00-\x7F]', full_text): # Matches any non-ASCII character
 1480             # Convert special characters to a human-readable format
-1481
 1482             full_text = clean_unicode_text(full_text)
 1483         full_text = full_text.strip()
-1484
 1485
 1486     line_level_results_out.append(OCRResult(full_text.strip(), round(overall_bbox[0],2), round(overall_bbox[1], 2), round(overall_bbox[2]-overall_bbox[0],2), round(overall_bbox[3]-overall_bbox[1],2)))
 1487
@@ -1498,9 +1462,6 @@ def create_text_redaction_process_results(analyser_results, analysed_bounding_bo
 1498     analysed_bounding_boxes_df_new = pd.DataFrame(analysed_bounding_boxes)
 1499
 1500     # Remove brackets and split the string into four separate columns
-1501     #print("analysed_bounding_boxes_df_new:", analysed_bounding_boxes_df_new['boundingBox'])
-1502     # analysed_bounding_boxes_df_new[['xmin', 'ymin', 'xmax', 'ymax']] = analysed_bounding_boxes_df_new['boundingBox'].str.strip('[]').str.split(',', expand=True)
-1503
 1504     # Split the boundingBox list into four separate columns
 1505     analysed_bounding_boxes_df_new[['xmin', 'ymin', 'xmax', 'ymax']] = analysed_bounding_boxes_df_new['boundingBox'].apply(pd.Series)
 1506

@@ -1512,8 +1473,6 @@ def create_text_redaction_process_results(analyser_results, analysed_bounding_bo
 1512     analysed_bounding_boxes_df_new = pd.concat([analysed_bounding_boxes_df_new, analysed_bounding_boxes_df_text], axis = 1)
 1513     analysed_bounding_boxes_df_new['page'] = page_num + 1
 1514     decision_process_table = pd.concat([decision_process_table, analysed_bounding_boxes_df_new], axis = 0).drop('result', axis=1)
-1515
-1516     #print('\n\ndecision_process_table:\n\n', decision_process_table)
 1517
 1518     return decision_process_table
 1519
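The change above keeps the list-based boundingBox expansion and drops the older string-splitting attempt. For reference, the kept idiom works like this: when each row holds a 4-element list, .apply(pd.Series) fans it out into four columns.

import pandas as pd

df = pd.DataFrame({"boundingBox": [[10.0, 20.0, 110.0, 40.0],
                                   [15.0, 60.0, 90.0, 80.0]]})
# Each list becomes a row of a temporary DataFrame whose columns we rename.
df[["xmin", "ymin", "xmax", "ymax"]] = df["boundingBox"].apply(pd.Series)
print(df[["xmin", "ymin", "xmax", "ymax"]])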
@@ -1607,7 +1566,6 @@ def redact_text_pdf(
 1607         return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
 1608
 1609     # Update custom word list analyser object with any new words that have been added to the custom deny list
-1610     #print("custom_recogniser_word_list:", custom_recogniser_word_list)
 1611     if custom_recogniser_word_list:
 1612         nlp_analyser.registry.remove_recognizer("CUSTOM")
 1613         new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)

@@ -1617,16 +1575,6 @@ def redact_text_pdf(
 1617         new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_recogniser_word_list, spelling_mistakes_max=max_fuzzy_spelling_mistakes_num, search_whole_phrase=match_fuzzy_whole_phrase_bool)
 1618         nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
 1619
-1620     # List all elements currently in the nlp_analyser registry
-1621     #print("Current recognizers in nlp_analyser registry:")
-1622     #for recognizer_name in nlp_analyser.registry.recognizers:
-1623     #    print(recognizer_name)
-1624     #    print(recognizer_name.name)
-1625
-1626     #print("Custom recogniser:", nlp_analyser.registry)
-1627
-1628     #print("custom_recogniser_word_list:", custom_recogniser_word_list)
-1629
 1630     tic = time.perf_counter()
 1631
 1632     # Open with Pikepdf to get text lines

@@ -1641,7 +1589,6 @@ def redact_text_pdf(
 1641     else: page_min = page_min - 1
 1642
 1643     print("Page range is",str(page_min + 1), "to", str(page_max))
-1644     print("Current_loop_page:", current_loop_page)
 1645
 1646     if current_loop_page == 0: page_loop_start = 0
 1647     else: page_loop_start = current_loop_page
@@ -1716,8 +1663,6 @@ def redact_text_pdf(
 1716             ### REDACTION
 1717
 1718             if chosen_redact_entities or chosen_redact_comprehend_entities:
-1719                 #print("Identifying redactions on page.")
-1720
 1721                 page_analysed_bounding_boxes = run_page_text_redaction(
 1722                     language,
 1723                     chosen_redact_entities,

@@ -1735,24 +1680,18 @@ def redact_text_pdf(
 1735                     comprehend_query_number
 1736                 )
 1737
-1738
-1739                 #print("page_analysed_bounding_boxes:", page_analysed_bounding_boxes)
-1740                 #print("image:", image)
 1741             else:
 1742                 page_analysed_bounding_boxes = []
 1743
 1744
 1745             page_analysed_bounding_boxes = convert_pikepdf_decision_output_to_image_coords(pymupdf_page, page_analysed_bounding_boxes, image)
 1746
-1747             #print("page_analysed_bounding_boxes_out_converted:", page_analysed_bounding_boxes)
 1748
 1749             # Annotate redactions on page
 1750             pikepdf_annotations_on_page = create_pikepdf_annotations_for_bounding_boxes(page_analysed_bounding_boxes)
 1751
-1752             # print("pikepdf_annotations_on_page:", pikepdf_annotations_on_page)
-1753
 1754             # Make pymupdf page redactions
-1755             #print("redact_whole_page_list:", redact_whole_page_list)
 1756             if redact_whole_page_list:
 1757                 int_reported_page_number = int(reported_page_number)
 1758                 if int_reported_page_number in redact_whole_page_list: redact_whole_page = True

@@ -1761,9 +1700,6 @@ def redact_text_pdf(
 1761
 1762             pymupdf_page, image_annotations = redact_page_with_pymupdf(pymupdf_page, pikepdf_annotations_on_page, image, redact_whole_page=redact_whole_page, convert_coords=False)
 1763
-1764             #print("image_annotations:", image_annotations)
-1765
-1766             #print("Did redact_page_with_pymupdf function")
 1767             reported_page_no = page_no + 1
 1768             print("For page number:", reported_page_no, "there are", len(image_annotations["boxes"]), "annotations")
 1769

@@ -1778,14 +1714,12 @@ def redact_text_pdf(
 1778
 1779             if not decision_process_table_on_page.empty:
 1780                 all_decision_process_table = pd.concat([all_decision_process_table, decision_process_table_on_page])
-1781
 1782
 1783             toc = time.perf_counter()
 1784
 1785             time_taken = toc - tic
 1786
-1787             #print("toc - tic:", time_taken)
-1788
 1789             # Break if time taken is greater than max_time seconds
 1790             if time_taken > max_time:
 1791                 print("Processing for", max_time, "seconds, breaking.")
 29   from tools.file_conversion import process_file, image_dpi, convert_review_json_to_pandas_df, redact_whole_pymupdf_page, redact_single_box, convert_pymupdf_to_image_coords
 30   from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser, CustomWordFuzzyRecognizer
 31   from tools.helper_functions import get_file_name_without_type, output_folder, clean_unicode_text, get_or_create_env_var, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
+32   from tools.file_conversion import process_file, is_pdf, is_pdf_or_image, prepare_image_or_pdf
 33   from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult
 34   from tools.presidio_analyzer_custom import recognizer_result_from_dict
 35

 78       custom_recogniser_word_list:List[str]=None,
 79       redact_whole_page_list:List[str]=None,
 80       latest_file_completed:int=0,
+81       out_message:List=[],
+82       out_file_paths:List=[],
+83       log_files_output_paths:List=[],
 84       first_loop_state:bool=False,
 85       page_min:int=0,
 86       page_max:int=999,

 99       match_fuzzy_whole_phrase_bool:bool=True,
 100      aws_access_key_textbox:str='',
 101      aws_secret_key_textbox:str='',
+102      annotate_max_pages:int=1,
+103      review_file_state=[],
 104      output_folder:str=output_folder,
 105      progress=gr.Progress(track_tqdm=True)):
 106      '''
 138      - match_fuzzy_whole_phrase_bool (bool, optional): A boolean where 'True' means that the whole phrase is fuzzy matched, and 'False' means that each word is fuzzy matched separately (excluding stop words).
 139      - aws_access_key_textbox (str, optional): AWS access key for account with Textract and Comprehend permissions.
 140      - aws_secret_key_textbox (str, optional): AWS secret key for account with Textract and Comprehend permissions.
+141      - annotate_max_pages (int, optional): Maximum page value for the annotation object
 142      - output_folder (str, optional): Output folder for results.
 143      - progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
 144

 148      tic = time.perf_counter()
 149      all_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
 150
+151      # If there are no prepared PDF file paths, it is most likely that the prepare_image_or_pdf function has not been run. So do it here to get the outputs you need
+152      if not pymupdf_doc:
+153          print("Prepared PDF file not found, running prepare_image_or_pdf function")
+154          out_message, prepared_pdf_file_paths, prepared_pdf_image_paths, annotate_max_pages, annotate_max_pages, pymupdf_doc, annotations_all_pages, review_file_state = prepare_image_or_pdf(file_paths, in_redact_method, latest_file_completed, out_message, first_loop_state, annotate_max_pages, annotations_all_pages)
+155
+156          annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state
+157
 158      #print("prepared_pdf_file_paths:", prepared_pdf_file_paths[0])
 159      review_out_file_paths = [prepared_pdf_file_paths[0]]
 160
 222      estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
 223      print("Estimated total processing time:", str(estimate_total_processing_time))
 224
+225      return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state
 226
 227      # If we have reached the last page, return message
 228      if current_loop_page >= number_of_pages:

 238
 239          review_out_file_paths.extend(out_review_file_path)
 240
+241          return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = False, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state
 242
 243      # Create allow list
 244      # If string, assume file path

 251      else:
 252          in_allow_list_flat = []
 253
+254      # Try to connect to AWS services directly only if RUN_AWS_FUNCTIONS environmental variable is 1, otherwise an environment variable or direct textbox input is needed.
 255      if pii_identification_method == "AWS Comprehend":
 256          print("Trying to connect to AWS Comprehend service")
+257          if aws_access_key_textbox and aws_secret_key_textbox:
+258              print("Connecting to Comprehend using AWS access key and secret keys from textboxes.")
+259              print("aws_access_key_textbox:", aws_access_key_textbox)
+260              print("aws_secret_access_key:", aws_secret_key_textbox)
 261              comprehend_client = boto3.client('comprehend',
 262                  aws_access_key_id=aws_access_key_textbox,
 263                  aws_secret_access_key=aws_secret_key_textbox)
+264          elif RUN_AWS_FUNCTIONS == "1":
+265              print("Connecting to Comprehend via existing SSO connection")
+266              comprehend_client = boto3.client('comprehend')
 267          elif AWS_ACCESS_KEY and AWS_SECRET_KEY:
+268              print("Getting Comprehend credentials from environment variables")
 269              comprehend_client = boto3.client('comprehend',
 270                  aws_access_key_id=AWS_ACCESS_KEY,
+271                  aws_secret_access_key=AWS_SECRET_KEY)
 272          else:
 273              comprehend_client = ""
+274              out_message = "Cannot connect to AWS Comprehend service. Please provide access keys under Textract settings on the Redaction settings tab, or choose another PII identification method."
 275              print(out_message)
+276              return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state
 277      else:
 278          comprehend_client = ""
 279
 280      if in_redact_method == textract_option:
+281          print("Trying to connect to AWS Textract service")
+282          if aws_access_key_textbox and aws_secret_key_textbox:
+283              print("Connecting to Textract using AWS access key and secret keys from textboxes.")
+284              textract_client = boto3.client('textract',
 285                  aws_access_key_id=aws_access_key_textbox,
 286                  aws_secret_access_key=aws_secret_key_textbox)
+287          elif RUN_AWS_FUNCTIONS == "1":
+288              print("Connecting to Textract via existing SSO connection")
+289              textract_client = boto3.client('textract')
 290          elif AWS_ACCESS_KEY and AWS_SECRET_KEY:
+291              print("Getting Textract credentials from environment variables.")
+292              textract_client = boto3.client('textract',
 293                  aws_access_key_id=AWS_ACCESS_KEY,
+294                  aws_secret_access_key=AWS_SECRET_KEY)
 295      else:
 296          textract_client = ""
+297          out_message = "Cannot connect to AWS Textract. Please provide access keys under Textract settings on the Redaction settings tab,choose another text extraction method."
 298          print(out_message)
+299          return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state
 300      else:
 301          textract_client = ""
 302
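The new connection code tries credentials in a fixed order: keys typed into the UI first, then an ambient (SSO or role-based) session when RUN_AWS_FUNCTIONS == "1", then environment-variable keys. (Echoing the secret key via print, as lines 259-260 do, is generally best avoided in logs.) A hedged sketch of the fallback using only standard boto3 calls; the helper name is illustrative:

import boto3

def make_client(service: str, textbox_key: str = "", textbox_secret: str = "",
                run_aws_functions: str = "0", env_key: str = "", env_secret: str = ""):
    if textbox_key and textbox_secret:
        return boto3.client(service, aws_access_key_id=textbox_key,
                            aws_secret_access_key=textbox_secret)
    if run_aws_functions == "1":
        return boto3.client(service)   # default credential chain: SSO profile, role, etc.
    if env_key and env_secret:
        return boto3.client(service, aws_access_key_id=env_key,
                            aws_secret_access_key=env_secret)
    raise RuntimeError(f"No credentials available for {service}")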
 318      file_paths_list = file_paths
 319      file_paths_loop = [file_paths_list[int(latest_file_completed)]]
 320
 321      for file in file_paths_loop:
 322          if isinstance(file, str):
 323              file_path = file

 327          if file_path:
 328              pdf_file_name_without_ext = get_file_name_without_type(file_path)
 329              pdf_file_name_with_ext = os.path.basename(file_path)
 330
 331          is_a_pdf = is_pdf(file_path) == True
 332          if is_a_pdf == False and in_redact_method == text_ocr_option:

 337              out_message = "No file selected"
 338              print(out_message)
 339
+340              return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state
 341
 342          if in_redact_method == tesseract_ocr_option or in_redact_method == textract_option:
 343
 344              #Analyse and redact image-based pdf or image
 345              if is_pdf_or_image(file_path) == False:
 346                  out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
+347                  return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state
 348
 349              print("Redacting file " + pdf_file_name_with_ext + " as an image-based file")
 350

 374                  custom_recogniser_word_list,
 375                  redact_whole_page_list,
 376                  max_fuzzy_spelling_mistakes_num,
+377                  match_fuzzy_whole_phrase_bool,
+378                  log_files_output_paths=log_files_output_paths)
+379
 380              # Save Textract request metadata (if exists)
 381              if new_request_metadata:
 382                  all_request_metadata.append(new_request_metadata)
 383
 384          elif in_redact_method == text_ocr_option:

 387
 388              if is_pdf(file_path) == False:
 389                  out_message = "Please upload a PDF file for text analysis. If you have an image, select 'Image analysis'."
+390                  return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state
 391
 392              # Analyse text-based pdf
 393              print('Redacting file as text-based PDF')

 417          else:
 418              out_message = "No redaction method selected"
 419              print(out_message)
+420              return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state
 421
 422          # If at last page, save to file
 423          if current_loop_page >= number_of_pages:

 432              # Save file
 433              if is_pdf(file_path) == False:
 434                  out_redacted_pdf_file_path = output_folder + pdf_file_name_without_ext + "_redacted_as_pdf.pdf"
 435                  pymupdf_doc[-1].save(out_redacted_pdf_file_path, "PDF" ,resolution=image_dpi, save_all=False)#, append_images=pymupdf_doc[:1])
 436                  out_review_file_path = output_folder + pdf_file_name_without_ext + '_review_file.csv'
 437

 441
 442              out_file_paths.append(out_redacted_pdf_file_path)
 443
 444              out_orig_pdf_file_path = output_folder + pdf_file_name_with_ext
 445
 446              logs_output_file_name = out_orig_pdf_file_path + "_decision_process_output.csv"

 453
 454              # Save the gradio_annotation_boxes to a JSON file
 455              try:
 456                  review_df = convert_review_json_to_pandas_df(annotations_all_pages, all_decision_process_table)
 457
 458                  out_review_file_path = out_orig_pdf_file_path + '_review_file.csv'
 459                  review_df.to_csv(out_review_file_path, index=None)
 460                  out_file_paths.append(out_review_file_path)
 461
+462                  #print("Saved review file to csv")
 463
 464                  out_annotation_file_path = out_orig_pdf_file_path + '_review_file.json'
 465                  with open(out_annotation_file_path, 'w') as f:
 466                      json.dump(annotations_all_pages, f)
 467                  log_files_output_paths.append(out_annotation_file_path)
 468
+469                  #print("Saving annotations to JSON")
 470
 471              except Exception as e:
 472                  print("Could not save annotations to json or csv file:", e)
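The save block persists the review output twice: raw JSON for logs and re-import, and a flattened CSV for the review UI. A sketch of the idea; the row-building here is a hypothetical, simpler stand-in for the repo's convert_review_json_to_pandas_df:

import json
import pandas as pd

def save_review_outputs(annotations_all_pages: list, base_path: str):
    with open(base_path + "_review_file.json", "w") as f:
        json.dump(annotations_all_pages, f)                  # exact annotation state
    rows = [{"page": i + 1, **box}
            for i, page in enumerate(annotations_all_pages)
            for box in page.get("boxes", [])]
    pd.DataFrame(rows).to_csv(base_path + "_review_file.csv", index=None)  # reviewer-friendly table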
 484              combined_out_message = combined_out_message + " " + out_time_message # Ensure this is a single string
 485
 486              estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
 487
 488          else:
 489              toc = time.perf_counter()

 506
 507      if combined_out_message: out_message = combined_out_message
 508
 509      # Ensure no duplicated output files
 510      log_files_output_paths = list(set(log_files_output_paths))
 511      out_file_paths = list(set(out_file_paths))
 512      review_out_file_paths = [prepared_pdf_file_paths[0], out_review_file_path]
 513
+514      return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state
 515
 516  def convert_pikepdf_coords_to_pymupdf(pymupdf_page, pikepdf_bbox, type="pikepdf_annot"):
 517      '''

 634      # Unpack coordinates
 635      x1, y1, x2, y2 = rect_coordinates
 636
 637      x1 = (x1* scale_width)# + page_x_adjust
 638      new_y1 = ((y2 + (y1 - y2))* scale_height)# - page_y_adjust # Calculate y1 correctly
 639      x2 = ((x1 + (x2 - x1)) * scale_width)# + page_x_adjust # Calculate x1
 990      if custom_recogniser_word_list:
 991          nlp_analyser.registry.remove_recognizer("CUSTOM")
 992          new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
 993          nlp_analyser.registry.add_recognizer(new_custom_recogniser)
 994
 995          nlp_analyser.registry.remove_recognizer("CustomWordFuzzyRecognizer")
 996          new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_recogniser_word_list, spelling_mistakes_max=max_fuzzy_spelling_mistakes_num, search_whole_phrase=match_fuzzy_whole_phrase_bool)
 997          nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
 998
 999

 1028     else: page_min = page_min - 1
 1029
 1030     print("Page range:", str(page_min + 1), "to", str(page_max))
 1031
 1032     # If running Textract, check if file already exists. If it does, load in existing data
 1033     if analysis_type == textract_option:
 1034
 1035         json_file_path = output_folder + file_name + "_textract.json"
 1036
 1037         if not os.path.exists(json_file_path):
 1038             print("No existing Textract results file found.")
 1039             textract_data = {}
 1040         else:
 1041             # Open the file and load the JSON data
 1042             no_textract_file = False

 1049             textract_data = json.load(json_file)
 1050
 1051     ###
 1052     if current_loop_page == 0: page_loop_start = 0
 1053     else: page_loop_start = current_loop_page
 1054

 1062         page_break_return = False
 1063
 1064         reported_page_number = str(page_no + 1)
 1065
 1066         # Assuming prepared_pdf_file_paths[page_no] is a PIL image object
 1067         try:

 1078
 1079         #print("Image is in range of pages to redact")
 1080         if isinstance(image, str):
 1081             image = Image.open(image)
 1082
 1083         # Need image size to convert textract OCR outputs to the correct sizes

 1126                     text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
 1127                 except Exception as e:
 1128                     print("Textract extraction for page", reported_page_number, "failed due to:", e)
+1129                     text_blocks = []
 1130                     new_request_metadata = "Failed Textract API call"
 1131
 1132                 # Check if "pages" key exists, if not, initialize it as an empty list

 1165             redaction_bboxes = []
 1166
 1167
+1168             # if analysis_type == tesseract_ocr_option: interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".txt"
+1169             # elif analysis_type == textract_option: interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.txt"
 1170
+1171             # # Save decision making process
+1172             # bboxes_str = str(redaction_bboxes)
+1173             # with open(interim_results_file_path, "w") as f:
+1174             #     f.write(bboxes_str)
 1175
 1176             # Merge close bounding boxes
 1177             merged_redaction_bboxes = merge_img_bboxes(redaction_bboxes, line_level_ocr_results_with_children, signature_recogniser_results, handwriting_recogniser_results, handwrite_signature_checkbox)
 1183             all_image_annotations_boxes = []
 1184
 1185             for box in merged_redaction_bboxes:
 1186
 1187                 x0 = box.left
 1188                 y0 = box.top

 1210
 1211         ## Apply annotations with pymupdf
 1212         else:
 1213             if redact_whole_page_list:
 1214                 int_reported_page_number = int(reported_page_number)
 1215                 if int_reported_page_number in redact_whole_page_list: redact_whole_page = True

 1254
 1255         time_taken = toc - tic
 1256
 1257         # Break if time taken is greater than max_time seconds
 1258         if time_taken > max_time:
 1259             print("Processing for", max_time, "seconds, breaking loop.")

 1266             pymupdf_doc = images
 1267
 1268             # Check if the image already exists in annotations_all_pages
 1269             existing_index = next((index for index, ann in enumerate(annotations_all_pages) if ann["image"] == image_annotations["image"]), None)
 1270             if existing_index is not None:
 1271                 # Replace the existing annotation

 1282                 if json_file_path not in log_files_output_paths:
 1283                     log_files_output_paths.append(json_file_path)
 1284
+1285                     print("At end of redact_image_pdf function where time over max.", json_file_path, "not found in log_files_output_paths, appended to list:", log_files_output_paths)
+1286
 1287             current_loop_page += 1
 1288
 1289             return pymupdf_doc, all_decision_process_table, log_files_output_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number

 1293         pymupdf_doc = images
 1294
 1295         # Check if the image already exists in annotations_all_pages
 1296         existing_index = next((index for index, ann in enumerate(annotations_all_pages) if ann["image"] == image_annotations["image"]), None)
 1297         if existing_index is not None:
 1298             # Replace the existing annotation
 1377
 1378         if isinstance(char, LTAnno):
 1379
 1380             added_text = char.get_text()
 1381
 1382             # Handle double quotes

 1392
 1393             # Check for line break (assuming a new line is indicated by a specific character)
 1394             if '\n' in added_text:
+1395
 1396                 # Finalize the current line
 1397                 if current_word:
 1398                     word_bboxes.append((current_word, current_word_bbox))

 1440         word_bboxes.append((current_word, current_word_bbox))
 1441
 1442     if full_text:
 1443         if re.search(r'[^\x00-\x7F]', full_text): # Matches any non-ASCII character
 1444             # Convert special characters to a human-readable format
+1445
 1446             full_text = clean_unicode_text(full_text)
 1447         full_text = full_text.strip()
+1448
 1449
 1450     line_level_results_out.append(OCRResult(full_text.strip(), round(overall_bbox[0],2), round(overall_bbox[1], 2), round(overall_bbox[2]-overall_bbox[0],2), round(overall_bbox[3]-overall_bbox[1],2)))
 1451
 1462     analysed_bounding_boxes_df_new = pd.DataFrame(analysed_bounding_boxes)
 1463
 1464     # Remove brackets and split the string into four separate columns
 1465     # Split the boundingBox list into four separate columns
 1466     analysed_bounding_boxes_df_new[['xmin', 'ymin', 'xmax', 'ymax']] = analysed_bounding_boxes_df_new['boundingBox'].apply(pd.Series)
 1467

 1473     analysed_bounding_boxes_df_new = pd.concat([analysed_bounding_boxes_df_new, analysed_bounding_boxes_df_text], axis = 1)
 1474     analysed_bounding_boxes_df_new['page'] = page_num + 1
 1475     decision_process_table = pd.concat([decision_process_table, analysed_bounding_boxes_df_new], axis = 0).drop('result', axis=1)
 1476
 1477     return decision_process_table
 1478
 1566         return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
 1567
 1568     # Update custom word list analyser object with any new words that have been added to the custom deny list
 1569     if custom_recogniser_word_list:
 1570         nlp_analyser.registry.remove_recognizer("CUSTOM")
 1571         new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)

 1575         new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_recogniser_word_list, spelling_mistakes_max=max_fuzzy_spelling_mistakes_num, search_whole_phrase=match_fuzzy_whole_phrase_bool)
 1576         nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
 1577
 1578     tic = time.perf_counter()
 1579
 1580     # Open with Pikepdf to get text lines

 1589     else: page_min = page_min - 1
 1590
 1591     print("Page range is",str(page_min + 1), "to", str(page_max))
 1592
 1593     if current_loop_page == 0: page_loop_start = 0
 1594     else: page_loop_start = current_loop_page

 1663             ### REDACTION
 1664
 1665             if chosen_redact_entities or chosen_redact_comprehend_entities:
 1666                 page_analysed_bounding_boxes = run_page_text_redaction(
 1667                     language,
 1668                     chosen_redact_entities,

 1680                     comprehend_query_number
 1681                 )
 1682
+1683
 1684             else:
 1685                 page_analysed_bounding_boxes = []
 1686
 1687
 1688             page_analysed_bounding_boxes = convert_pikepdf_decision_output_to_image_coords(pymupdf_page, page_analysed_bounding_boxes, image)
 1689
 1690
 1691             # Annotate redactions on page
 1692             pikepdf_annotations_on_page = create_pikepdf_annotations_for_bounding_boxes(page_analysed_bounding_boxes)
 1693
 1694             # Make pymupdf page redactions
 1695             if redact_whole_page_list:
 1696                 int_reported_page_number = int(reported_page_number)
 1697                 if int_reported_page_number in redact_whole_page_list: redact_whole_page = True

 1700
 1701             pymupdf_page, image_annotations = redact_page_with_pymupdf(pymupdf_page, pikepdf_annotations_on_page, image, redact_whole_page=redact_whole_page, convert_coords=False)
 1702
 1703             reported_page_no = page_no + 1
 1704             print("For page number:", reported_page_no, "there are", len(image_annotations["boxes"]), "annotations")
 1705

 1714
 1715             if not decision_process_table_on_page.empty:
 1716                 all_decision_process_table = pd.concat([all_decision_process_table, decision_process_table_on_page])
+1717
 1718
 1719             toc = time.perf_counter()
 1720
 1721             time_taken = toc - tic
 1722
 1723             # Break if time taken is greater than max_time seconds
 1724             if time_taken > max_time:
 1725                 print("Processing for", max_time, "seconds, breaking.")
tools/redaction_review.py
CHANGED
@@ -396,7 +396,7 @@ def df_select_callback(df: pd.DataFrame, evt: gr.SelectData):
 396      row_value_page = evt.row_value[0] # This is the page number value
 397      return row_value_page
 398
-399  def convert_image_coords_to_adobe(pdf_page_width, pdf_page_height, image_width, image_height, x1, y1, x2, y2):
 400      '''
 401      Converts coordinates from image space to Adobe PDF space.
 402

@@ -431,7 +431,7 @@ def convert_image_coords_to_adobe(pdf_page_width, pdf_page_height, image_width,
 431      return pdf_x1, pdf_y1, pdf_x2, pdf_y2
 432
 433
-434  def create_xfdf(df, pdf_path, pymupdf_doc, image_paths):
 435      '''
 436      Create an xfdf file from a review csv file and a pdf
 437      '''
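convert_image_coords_to_adobe (retyped below with explicit float hints) maps image-space boxes into Adobe PDF user space, which has a bottom-left origin, so the y-axis must be flipped after scaling. A sketch under that assumption, not the repo's exact body:

def image_to_adobe(pdf_page_width: float, pdf_page_height: float,
                   image_width: float, image_height: float,
                   x1: float, y1: float, x2: float, y2: float):
    scale_x = pdf_page_width / image_width
    scale_y = pdf_page_height / image_height
    pdf_x1, pdf_x2 = x1 * scale_x, x2 * scale_x
    # Flip y: image y grows downward, PDF user-space y grows upward.
    pdf_y1 = pdf_page_height - (y2 * scale_y)
    pdf_y2 = pdf_page_height - (y1 * scale_y)
    return pdf_x1, pdf_y1, pdf_x2, pdf_y2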
 396      row_value_page = evt.row_value[0] # This is the page number value
 397      return row_value_page
 398
+399  def convert_image_coords_to_adobe(pdf_page_width:float, pdf_page_height:float, image_width:float, image_height:float, x1:float, y1:float, x2:float, y2:float):
 400      '''
 401      Converts coordinates from image space to Adobe PDF space.
 402

 431      return pdf_x1, pdf_y1, pdf_x2, pdf_y2
 432
 433
+434  def create_xfdf(df:pd.DataFrame, pdf_path:str, pymupdf_doc, image_paths:List[str]):
 435      '''
 436      Create an xfdf file from a review csv file and a pdf
 437      '''
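create_xfdf writes the review table out as XFDF so redactions can round-trip through Adobe Acrobat. A hedged sketch of what such an export can look like using only the standard library; the repo's implementation likely differs in detail, and the row fields here are illustrative:

import xml.etree.ElementTree as ET

def rows_to_xfdf(rows, out_path: str):
    ns = "http://ns.adobe.com/xfdf/"
    ET.register_namespace("", ns)
    xfdf = ET.Element(f"{{{ns}}}xfdf")
    annots = ET.SubElement(xfdf, f"{{{ns}}}annots")
    for row in rows:  # row: dict with a 1-based page number and PDF-space coords
        square = ET.SubElement(annots, f"{{{ns}}}square")
        square.set("page", str(row["page"] - 1))  # XFDF pages are 0-based
        square.set("rect", "{0},{1},{2},{3}".format(row["x1"], row["y1"], row["x2"], row["y2"]))
        square.set("title", row.get("label", "Redaction"))
    ET.ElementTree(xfdf).write(out_path, xml_declaration=True, encoding="utf-8")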
|