Spaces:

seanpedrickcase
/

document_redaction

Running

App Files Files Community

seanpedrickcase committed on Mar 24

Commit

6319afc

1 Parent(s): 66e145d

More config options. Fixed some bugs with removing elements from the review page and with Adobe export. Some UI rearrangements.

Browse files

Files changed (16) hide show

app.py +39 -56
requirements.txt +1 -1
tools/auth.py +4 -24
tools/aws_functions.py +5 -32
tools/aws_textract.py +5 -12
tools/cli_redact.py +3 -2
tools/config.py +120 -0
tools/custom_csvlogger.py +0 -2
tools/custom_image_analyser_engine.py +4 -4
tools/data_anonymise.py +2 -3
tools/file_conversion.py +87 -45
tools/file_redaction.py +14 -21
tools/find_duplicate_pages.py +2 -2
tools/helper_functions.py +1 -47
tools/presidio_analyzer_custom.py +2 -2
tools/redaction_review.py +132 -57

app.py CHANGED Viewed

@@ -10,10 +10,11 @@ from datetime import datetime
 from gradio_image_annotation import image_annotator
 from gradio_image_annotation.image_annotator import AnnotatedImageData
-from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, reset_review_vars, merge_csv_files, load_all_output_files
 from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
 from tools.file_redaction import choose_and_run_redactor
-from tools.file_conversion import prepare_image_or_pdf, get_input_file_names, CUSTOM_BOX_COLOUR
 from tools.redaction_review import apply_redactions, modify_existing_page_redactions, decrease_page, increase_page, update_annotator, update_zoom, update_entities_df_recogniser_entities, update_entities_df_page, update_entities_df_text, df_select_callback, convert_df_to_xfdf, convert_xfdf_to_dataframe, reset_dropdowns, exclude_selected_items_from_redaction, undo_last_removal
 from tools.data_anonymise import anonymise_data_files
 from tools.auth import authenticate_user
@@ -142,9 +143,6 @@ with app:
     prepare_for_review_bool = gr.Checkbox(label="prepare_for_review_bool", value=True, visible=False)
     ## Settings page variables
-    default_allow_list_file_name = "default_allow_list.csv"
-    default_allow_list_loc = output_folder + "/" + default_allow_list_file_name
     default_deny_list_file_name = "default_deny_list.csv"
     default_deny_list_loc = output_folder + "/" + default_deny_list_file_name
     in_deny_list_text_in = gr.Textbox(value="Deny list", visible=False)
@@ -155,7 +153,11 @@ with app:
     # S3 settings for default allow list load
     s3_default_bucket = gr.Textbox(label = "Default S3 bucket", value=bucket_name, visible=False)
-    s3_default_allow_list_file = gr.Textbox(label = "Default allow list file", value=default_allow_list_file_name, visible=False)
     default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=default_allow_list_loc, visible=False)
     # Base dataframe for recognisers that is not modified subsequent to load
@@ -185,7 +187,7 @@ with app:
     ###
     with gr.Tab("Redact PDFs/images"):
         with gr.Accordion("Redact document", open = True):
-            in_doc_files = gr.File(label="Choose a document or image file (PDF, JPG, PNG)", file_count= "multiple", file_types=['.pdf', '.jpg', '.png', '.json'], height=file_input_height)
             # if RUN_AWS_FUNCTIONS == "1":
             in_redaction_method = gr.Radio(label="Choose text extraction method. AWS Textract has a cost per page - $3.50 per 1,000 pages with signature detection (default), $1.50 without. Go to Redaction settings - AWS Textract options to remove signature detection.", value = default_ocr_val, choices=[text_ocr_option, tesseract_ocr_option, textract_option])
             pii_identification_method_drop = gr.Radio(label = "Choose PII detection method. AWS Comprehend has a cost of approximately $0.01 per 10,000 characters.", value = default_pii_detector, choices=[local_pii_detector, aws_pii_detector])
@@ -217,18 +219,16 @@ with app:
     ###
     with gr.Tab("Review redactions", id="tab_object_annotation"):
-        with gr.Accordion(label = "Review redaction file", open=True):
-            output_review_files = gr.File(label="Review output files", file_count='multiple', height=file_input_height)
-            upload_previous_review_file_btn = gr.Button("Review previously created redaction file (upload original PDF and ...review_file.csv)", variant="primary")
-        with gr.Row():
-            annotation_button_apply = gr.Button("Apply revised redactions to pdf", variant="secondary")
         with gr.Row():
             annotate_zoom_in = gr.Button("Zoom in", visible=False)
             annotate_zoom_out = gr.Button("Zoom out", visible=False)
         with gr.Row():
             clear_all_redactions_on_page_btn = gr.Button("Clear all redactions on page", visible=False)
-        with gr.Row():
             with gr.Column(scale=2):
                 with gr.Row(equal_height=True):
                     annotation_last_page_button = gr.Button("Previous page", scale = 4)
@@ -236,7 +236,8 @@ with app:
                     annotate_max_pages = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 2, min_width=50)
                     annotation_next_page_button = gr.Button("Next page", scale = 4)
             with gr.Column(scale=1):
-                blank_markdown_top = gr.Markdown(value="", label="")
         with gr.Row():
             with gr.Column(scale=2):
@@ -261,12 +262,12 @@ with app:
                     interactive=False
                 )
             with gr.Column(scale=1):
-                with gr.Row():
                     recogniser_entity_dropdown = gr.Dropdown(label="Redaction category", value="ALL", allow_custom_value=True)
                     page_entity_dropdown = gr.Dropdown(label="Page", value="ALL", allow_custom_value=True)
                 text_entity_dropdown = gr.Dropdown(label="Text", value="ALL", allow_custom_value=True)
                 recogniser_entity_dataframe = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[]}), col_count=(3,"fixed"), type="pandas", label="Search results. Click to go to page", headers=["page", "label", "text"])
-                with gr.Row():
                     reset_dropdowns_btn = gr.Button(value="Reset filters")
                     exclude_selected_btn = gr.Button(value="Exclude items in table from redactions")
                 undo_last_removal_btn = gr.Button(value="Undo last element removal")
@@ -393,21 +394,22 @@ with app:
     ###
     in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list])
     document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, document_cropboxes]).\
-        success(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox],
-                    outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes], api_name="redact_doc").\
                     success(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown])
     # If the app has completed a batch of pages, it will rerun the redaction process until the end of all pages in the document
-    # current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox],
-    #                 outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes]).\
-    #                 success(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown])
     # If a file has been completed, the function will continue onto the next document
-    # latest_file_completed_text.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox],
-    #                 outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes]).\
-    #                 success(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown]).\
-    #                 success(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
     ###
     # REVIEW PDF REDACTIONS
@@ -479,8 +481,9 @@ with app:
         success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown])
     exclude_selected_btn.click(exclude_selected_items_from_redaction, inputs=[review_file_state, recogniser_entity_dataframe, images_pdf_state, page_sizes, all_image_annotations_state, recogniser_entity_dataframe_base], outputs=[review_file_state, all_image_annotations_state, recogniser_entity_dataframe_base, backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base]).\
-        success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown])#.\
-        #success(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
     undo_last_removal_btn.click(undo_last_removal, inputs=[backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base], outputs=[review_file_state, all_image_annotations_state, recogniser_entity_dataframe_base]).\
         success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown])
@@ -488,7 +491,7 @@ with app:
     # Convert review file to xfdf Adobe format
     convert_review_file_to_adobe_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
         success(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes]).\
-        success(convert_df_to_xfdf, inputs=[output_review_files, pdf_doc_state, images_pdf_state, output_folder_textbox, document_cropboxes], outputs=[adobe_review_files_out])
     # Convert xfdf Adobe file back to review_file.csv
     convert_adobe_to_review_file_btn.click(fn=get_input_file_names, inputs=[adobe_review_files_out], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
@@ -533,14 +536,14 @@ with app:
     # Get connection details on app load
     app.load(get_connection_params, inputs=[output_folder_textbox], outputs=[session_hash_state, output_folder_textbox, session_hash_textbox])
-    # If running on AWS, load in the default allow list file from S3
-    # if RUN_AWS_FUNCTIONS == "1":
-    #     print("default_allow_list_output_folder_location:", default_allow_list_loc)
-    #     if not os.path.exists(default_allow_list_loc):
-    #         app.load(download_file_from_s3, inputs=[s3_default_bucket, s3_default_allow_list_file, default_allow_list_output_folder_location]).\
-    #         success(load_in_default_allow_list, inputs = [default_allow_list_output_folder_location], outputs=[in_allow_list])
-    #     else:
-    #         app.load(load_in_default_allow_list, inputs = [default_allow_list_output_folder_location], outputs=[in_allow_list])
     # Log usernames and times of access to file (to know who is using the app when running on AWS)
     access_callback = CSVLogger_custom(dataset_file_name=log_file_name)
@@ -566,27 +569,7 @@ with app:
     latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, estimated_time_taken_number, textract_metadata_textbox, pii_identification_method_drop, comprehend_query_number], None, preprocess=False).\
     success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
-# Get some environment variables and Launch the Gradio app
-COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
-print(f'The value of COGNITO_AUTH is {COGNITO_AUTH}')
-1
-RUN_DIRECT_MODE = get_or_create_env_var('RUN_DIRECT_MODE', '0')
-print(f'The value of RUN_DIRECT_MODE is {RUN_DIRECT_MODE}')
-MAX_QUEUE_SIZE = int(get_or_create_env_var('MAX_QUEUE_SIZE', '5'))
-print(f'The value of RUN_DIRECT_MODE is {MAX_QUEUE_SIZE}')
-MAX_FILE_SIZE = get_or_create_env_var('MAX_FILE_SIZE', '250mb')
-print(f'The value of MAX_FILE_SIZE is {MAX_FILE_SIZE}')
-GRADIO_SERVER_PORT = int(get_or_create_env_var('GRADIO_SERVER_PORT', '7860'))
-print(f'The value of GRADIO_SERVER_PORT is {GRADIO_SERVER_PORT}')
-ROOT_PATH = get_or_create_env_var('ROOT_PATH', '')
-print(f'The value of ROOT_PATH is {ROOT_PATH}')
-DEFAULT_CONCURRENCY_LIMIT = get_or_create_env_var('DEFAULT_CONCURRENCY_LIMIT', '5')
-print(f'The value of DEFAULT_CONCURRENCY_LIMIT is {DEFAULT_CONCURRENCY_LIMIT}')
 if __name__ == "__main__":

 from gradio_image_annotation import image_annotator
 from gradio_image_annotation.image_annotator import AnnotatedImageData
+from tools.config import output_folder, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, DEFAULT_ALLOW_LIST_PATH
+from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, reset_review_vars, merge_csv_files, load_all_output_files
 from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
 from tools.file_redaction import choose_and_run_redactor
+from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
 from tools.redaction_review import apply_redactions, modify_existing_page_redactions, decrease_page, increase_page, update_annotator, update_zoom, update_entities_df_recogniser_entities, update_entities_df_page, update_entities_df_text, df_select_callback, convert_df_to_xfdf, convert_xfdf_to_dataframe, reset_dropdowns, exclude_selected_items_from_redaction, undo_last_removal
 from tools.data_anonymise import anonymise_data_files
 from tools.auth import authenticate_user
     prepare_for_review_bool = gr.Checkbox(label="prepare_for_review_bool", value=True, visible=False)
     ## Settings page variables
     default_deny_list_file_name = "default_deny_list.csv"
     default_deny_list_loc = output_folder + "/" + default_deny_list_file_name
     in_deny_list_text_in = gr.Textbox(value="Deny list", visible=False)
     # S3 settings for default allow list load
     s3_default_bucket = gr.Textbox(label = "Default S3 bucket", value=bucket_name, visible=False)
+    default_allow_list_file_name = "default_allow_list.csv"
+    default_allow_list_loc = output_folder + "/" + default_allow_list_file_name
+    s3_default_allow_list_file = gr.Textbox(label = "Default allow list file", value=DEFAULT_ALLOW_LIST_PATH, visible=False)
     default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=default_allow_list_loc, visible=False)
     # Base dataframe for recognisers that is not modified subsequent to load
     ###
     with gr.Tab("Redact PDFs/images"):
         with gr.Accordion("Redact document", open = True):
+            in_doc_files = gr.File(label="Choose a document or image file (PDF, JPG, PNG)", file_count= "multiple", file_types=['.pdf', '.jpg', '.png', '.json', '.zip'], height=file_input_height)
             # if RUN_AWS_FUNCTIONS == "1":
             in_redaction_method = gr.Radio(label="Choose text extraction method. AWS Textract has a cost per page - $3.50 per 1,000 pages with signature detection (default), $1.50 without. Go to Redaction settings - AWS Textract options to remove signature detection.", value = default_ocr_val, choices=[text_ocr_option, tesseract_ocr_option, textract_option])
             pii_identification_method_drop = gr.Radio(label = "Choose PII detection method. AWS Comprehend has a cost of approximately $0.01 per 10,000 characters.", value = default_pii_detector, choices=[local_pii_detector, aws_pii_detector])
     ###
     with gr.Tab("Review redactions", id="tab_object_annotation"):
+        with gr.Accordion(label = "Review PDF redactions", open=True):
+            output_review_files = gr.File(label="Upload original PDF and 'review_file' csv here to review suggested redactions", file_count='multiple', height=file_input_height)
+            upload_previous_review_file_btn = gr.Button("Review PDF and 'review file' csv provided above", variant="primary")
         with gr.Row():
             annotate_zoom_in = gr.Button("Zoom in", visible=False)
             annotate_zoom_out = gr.Button("Zoom out", visible=False)
         with gr.Row():
             clear_all_redactions_on_page_btn = gr.Button("Clear all redactions on page", visible=False)
+        with gr.Row(equal_height=True):
             with gr.Column(scale=2):
                 with gr.Row(equal_height=True):
                     annotation_last_page_button = gr.Button("Previous page", scale = 4)
                     annotate_max_pages = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 2, min_width=50)
                     annotation_next_page_button = gr.Button("Next page", scale = 4)
             with gr.Column(scale=1):
+                annotation_button_apply = gr.Button("Apply revised redactions to PDF", variant="secondary")
+                #blank_markdown_top = gr.Markdown(value="", label="")
         with gr.Row():
             with gr.Column(scale=2):
                     interactive=False
                 )
             with gr.Column(scale=1):
+                with gr.Row(equal_height=True):
                     recogniser_entity_dropdown = gr.Dropdown(label="Redaction category", value="ALL", allow_custom_value=True)
                     page_entity_dropdown = gr.Dropdown(label="Page", value="ALL", allow_custom_value=True)
                 text_entity_dropdown = gr.Dropdown(label="Text", value="ALL", allow_custom_value=True)
                 recogniser_entity_dataframe = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[]}), col_count=(3,"fixed"), type="pandas", label="Search results. Click to go to page", headers=["page", "label", "text"])
+                with gr.Row(equal_height=True):
                     reset_dropdowns_btn = gr.Button(value="Reset filters")
                     exclude_selected_btn = gr.Button(value="Exclude items in table from redactions")
                 undo_last_removal_btn = gr.Button(value="Undo last element removal")
     ###
     in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list])
+    # Run redaction function
     document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, document_cropboxes]).\
+        success(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes],
+                    outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, document_cropboxes], api_name="redact_doc").\
                     success(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown])
     # If the app has completed a batch of pages, it will rerun the redaction process until the end of all pages in the document
+    current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes],
+                    outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, document_cropboxes]).\
+                    success(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown])
     # If a file has been completed, the function will continue onto the next document
+    latest_file_completed_text.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes],
+                    outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, document_cropboxes]).\
+                    success(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown]).\
+                    success(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
     ###
     # REVIEW PDF REDACTIONS
         success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown])
     exclude_selected_btn.click(exclude_selected_items_from_redaction, inputs=[review_file_state, recogniser_entity_dataframe, images_pdf_state, page_sizes, all_image_annotations_state, recogniser_entity_dataframe_base], outputs=[review_file_state, all_image_annotations_state, recogniser_entity_dataframe_base, backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base]).\
+        success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown]).\
+        success(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
+    # success(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown, recogniser_entity_dataframe_base, review_file_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
     undo_last_removal_btn.click(undo_last_removal, inputs=[backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base], outputs=[review_file_state, all_image_annotations_state, recogniser_entity_dataframe_base]).\
         success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown])
     # Convert review file to xfdf Adobe format
     convert_review_file_to_adobe_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
         success(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes]).\
+        success(convert_df_to_xfdf, inputs=[output_review_files, pdf_doc_state, images_pdf_state, output_folder_textbox, document_cropboxes, page_sizes], outputs=[adobe_review_files_out])
     # Convert xfdf Adobe file back to review_file.csv
     convert_adobe_to_review_file_btn.click(fn=get_input_file_names, inputs=[adobe_review_files_out], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
     # Get connection details on app load
     app.load(get_connection_params, inputs=[output_folder_textbox], outputs=[session_hash_state, output_folder_textbox, session_hash_textbox])
+    # If relevant environment variable is set, load in the default allow list file from S3
+    if GET_DEFAULT_ALLOW_LIST == "True" and DEFAULT_ALLOW_LIST_PATH:
+        print("Loading allow list from default_allow_list_output_folder_location:", default_allow_list_loc)
+        if not os.path.exists(default_allow_list_loc):
+            app.load(download_file_from_s3, inputs=[s3_default_bucket, s3_default_allow_list_file, default_allow_list_output_folder_location]).\
+            success(load_in_default_allow_list, inputs = [default_allow_list_output_folder_location], outputs=[in_allow_list])
+        else:
+            app.load(load_in_default_allow_list, inputs = [default_allow_list_output_folder_location], outputs=[in_allow_list])
     # Log usernames and times of access to file (to know who is using the app when running on AWS)
     access_callback = CSVLogger_custom(dataset_file_name=log_file_name)
     latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, estimated_time_taken_number, textract_metadata_textbox, pii_identification_method_drop, comprehend_query_number], None, preprocess=False).\
     success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
 if __name__ == "__main__":

requirements.txt CHANGED Viewed

@@ -13,7 +13,7 @@ spacy==3.8.4
 en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
 #en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
 gradio==5.22.0
-boto3==1.36.26
 pyarrow==19.0.1
 openpyxl==3.1.5
 Faker==36.1.1

 en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
 #en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
 gradio==5.22.0
+boto3==1.37.17
 pyarrow==19.0.1
 openpyxl==3.1.5
 Faker==36.1.1

tools/auth.py CHANGED Viewed

@@ -1,32 +1,12 @@
-import os
 import boto3
-import gradio as gr
 import hmac
 import hashlib
 import base64
-def get_or_create_env_var(var_name, default_value):
-    # Get the environment variable if it exists
-    value = os.environ.get(var_name)
-    # If it doesn't exist, set it to the default value
-    if value is None:
-        os.environ[var_name] = default_value
-        value = default_value
-    return value
-client_id = get_or_create_env_var('AWS_CLIENT_ID', '')
-#print(f'The value of AWS_CLIENT_ID is {client_id}')
-client_secret = get_or_create_env_var('AWS_CLIENT_SECRET', '')
-#print(f'The value of AWS_CLIENT_SECRET is {client_secret}')
-user_pool_id = get_or_create_env_var('AWS_USER_POOL_ID', '')
-#print(f'The value of AWS_USER_POOL_ID is {user_pool_id}')
-def calculate_secret_hash(client_id, client_secret, username):
     message = username + client_id
     dig = hmac.new(
         str(client_secret).encode('utf-8'),

+#import os
 import boto3
+#import gradio as gr
 import hmac
 import hashlib
 import base64
+from tools.config import client_id, client_secret, user_pool_id
+def calculate_secret_hash(client_id:str, client_secret:str, username:str):
     message = username + client_id
     dig = hmac.new(
         str(client_secret).encode('utf-8'),

tools/aws_functions.py CHANGED Viewed

@@ -3,37 +3,13 @@ import pandas as pd
 import boto3
 import tempfile
 import os
-from tools.helper_functions import get_or_create_env_var
-from dotenv import load_dotenv
 PandasDataFrame = Type[pd.DataFrame]
 # Get AWS credentials
-bucket_name=""
-RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "0")
-print(f'The value of RUN_AWS_FUNCTIONS is {RUN_AWS_FUNCTIONS}')
-AWS_REGION = get_or_create_env_var('AWS_REGION', 'eu-west-2')
-print(f'The value of AWS_REGION is {AWS_REGION}')
-# If you have an aws_config env file in the config folder, you can load in AWS keys this way
-AWS_CONFIG_PATH = get_or_create_env_var('AWS_CONFIG_PATH', '/env/aws_config.env')
-print(f'The value of AWS_CONFIG_PATH is {AWS_CONFIG_PATH}')
-if os.path.exists(AWS_CONFIG_PATH):
-    print("Loading AWS keys from config folder")
-    load_dotenv(AWS_CONFIG_PATH)
-AWS_ACCESS_KEY = get_or_create_env_var('AWS_ACCESS_KEY', '')
-if AWS_ACCESS_KEY:
-    print(f'AWS_ACCESS_KEY found in environment variables')
-AWS_SECRET_KEY = get_or_create_env_var('AWS_SECRET_KEY', '')
-if AWS_SECRET_KEY:
-    print(f'AWS_SECRET_KEY found in environment variables')
 def get_assumed_role_info():
     sts_endpoint = 'https://sts.' + AWS_REGION + '.amazonaws.com'
@@ -49,14 +25,11 @@ def get_assumed_role_info():
     return assumed_role_arn, assumed_role_name
 if RUN_AWS_FUNCTIONS == "1":
-    try:
-        bucket_name = os.environ['DOCUMENT_REDACTION_BUCKET']
         session = boto3.Session()
-        #print("session:", session)
     except Exception as e:
-        print("Could not start boto3 session:", e)
     try:
         assumed_role_arn, assumed_role_name = get_assumed_role_info()

 import boto3
 import tempfile
 import os
+from tools.config import AWS_REGION, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET
 PandasDataFrame = Type[pd.DataFrame]
 # Get AWS credentials
+bucket_name = DOCUMENT_REDACTION_BUCKET
 def get_assumed_role_info():
     sts_endpoint = 'https://sts.' + AWS_REGION + '.amazonaws.com'
     return assumed_role_arn, assumed_role_name
 if RUN_AWS_FUNCTIONS == "1":
+    try:
         session = boto3.Session()
     except Exception as e:
+        print("Could not start boto3 session:", e)
     try:
         assumed_role_arn, assumed_role_name = get_assumed_role_info()

tools/aws_textract.py CHANGED Viewed

@@ -1,5 +1,4 @@
 import boto3
-#from PIL import Image
 from typing import List
 import io
 import os
@@ -7,12 +6,10 @@ import json
 from collections import defaultdict
 import pikepdf
 import time
-# Example: converting this single page to an image
-#from pdf2image import convert_from_bytes
 from tools.custom_image_analyser_engine import OCRResult, CustomImageRecognizerResult
-from tools.aws_functions import AWS_ACCESS_KEY, AWS_SECRET_KEY
-def extract_textract_metadata(response):
     """Extracts metadata from an AWS Textract response."""
     #print("Document metadata:", response['DocumentMetadata'])
@@ -83,8 +80,7 @@ def analyse_page_with_textract(pdf_page_bytes:object, page_no:int, client:str=""
     # Return a list containing the wrapped response and the metadata
     return wrapped_response, request_metadata  # Return as a list to match the desired structure
-def convert_pike_pdf_page_to_bytes(pdf, page_num):
     # Create a new empty PDF
     new_pdf = pikepdf.Pdf.new()
@@ -109,8 +105,7 @@ def convert_pike_pdf_page_to_bytes(pdf, page_num):
     return pdf_bytes
-def json_to_ocrresult(json_data, page_width, page_height, page_no):
     '''
     Convert the json response from textract to the OCRResult format used elsewhere in the code. Looks for lines, words, and signatures. Handwriting and signatures are set aside especially for later in case the user wants to override the default behaviour and redact all handwriting/signatures.
     '''
@@ -274,7 +269,7 @@ def json_to_ocrresult(json_data, page_width, page_height, page_no):
     return all_ocr_results, signature_or_handwriting_recogniser_results, signature_recogniser_results, handwriting_recogniser_results, ocr_results_with_children
-def load_and_convert_textract_json(textract_json_file_path, log_files_output_paths):
     """
     Loads Textract JSON from a file, detects if conversion is needed,
     and converts if necessary.
@@ -317,8 +312,6 @@ def load_and_convert_textract_json(textract_json_file_path, log_files_output_pat
         print("textract data:", textract_data)
         return {}, True, log_files_output_paths  # Return empty data if JSON is not recognized
 # Load Textract JSON output (assuming it's stored in a variable called `textract_output`)
 def restructure_textract_output(textract_output:object):
     '''

 import boto3
 from typing import List
 import io
 import os
 from collections import defaultdict
 import pikepdf
 import time
 from tools.custom_image_analyser_engine import OCRResult, CustomImageRecognizerResult
+from tools.config import AWS_ACCESS_KEY, AWS_SECRET_KEY
+def extract_textract_metadata(response:object):
     """Extracts metadata from an AWS Textract response."""
     #print("Document metadata:", response['DocumentMetadata'])
     # Return a list containing the wrapped response and the metadata
     return wrapped_response, request_metadata  # Return as a list to match the desired structure
+def convert_pike_pdf_page_to_bytes(pdf:object, page_num:int):
     # Create a new empty PDF
     new_pdf = pikepdf.Pdf.new()
     return pdf_bytes
+def json_to_ocrresult(json_data:dict, page_width:float, page_height:float, page_no:int):
     '''
     Convert the json response from textract to the OCRResult format used elsewhere in the code. Looks for lines, words, and signatures. Handwriting and signatures are set aside especially for later in case the user wants to override the default behaviour and redact all handwriting/signatures.
     '''
     return all_ocr_results, signature_or_handwriting_recogniser_results, signature_recogniser_results, handwriting_recogniser_results, ocr_results_with_children
+def load_and_convert_textract_json(textract_json_file_path:str, log_files_output_paths:str):
     """
     Loads Textract JSON from a file, detects if conversion is needed,
     and converts if necessary.
         print("textract data:", textract_data)
         return {}, True, log_files_output_paths  # Return empty data if JSON is not recognized
 # Load Textract JSON output (assuming it's stored in a variable called `textract_output`)
 def restructure_textract_output(textract_output:object):
     '''

tools/cli_redact.py CHANGED Viewed

@@ -1,12 +1,13 @@
 import argparse
 import os
-from tools.helper_functions import ensure_output_folder_exists, get_or_create_env_var, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
 from tools.file_conversion import get_input_file_names, prepare_image_or_pdf
 from tools.file_redaction import choose_and_run_redactor
 import pandas as pd
 from datetime import datetime
-chosen_comprehend_entities = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER', 'CREDIT_DEBIT_CVV',          'CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS',
                                 'NAME','PHONE', 'PASSPORT_NUMBER','DRIVER_ID', 'USERNAME','PASSWORD',
                                 'IP_ADDRESS','MAC_ADDRESS','LICENSE_PLATE',
                                 'VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER',

 import argparse
 import os
+from tools.config import get_or_create_env_var
+from tools.helper_functions import ensure_output_folder_exists,tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
 from tools.file_conversion import get_input_file_names, prepare_image_or_pdf
 from tools.file_redaction import choose_and_run_redactor
 import pandas as pd
 from datetime import datetime
+chosen_comprehend_entities = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER', 'CREDIT_DEBIT_CVV', 'CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS',
                                 'NAME','PHONE', 'PASSPORT_NUMBER','DRIVER_ID', 'USERNAME','PASSWORD',
                                 'IP_ADDRESS','MAC_ADDRESS','LICENSE_PLATE',
                                 'VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER',

tools/config.py ADDED Viewed

	@@ -0,0 +1,120 @@

+import os
+from dotenv import load_dotenv
+# Set or retrieve configuration variables for the redaction app
+def get_or_create_env_var(var_name:str, default_value:str, print_val:bool=False):
+    '''
+    Return the value of the environment variable `var_name`. If the variable is not set,
+    store `default_value` in os.environ under that name and return the default instead.
+    Args:
+        var_name (str): Name of the environment variable to look up.
+        default_value (str): Value written to os.environ and returned when the variable is unset.
+        print_val (optional, bool): If True, print the variable name and its resolved value.
+    Returns:
+        The existing value of the variable, or `default_value` if it was not set.
+    '''
+    # Get the environment variable if it exists
+    value = os.environ.get(var_name)
+    # If it doesn't exist, set the environment variable to the default value
+    if value is None:
+        os.environ[var_name] = default_value
+        value = default_value
+    # Optionally print the resolved value (callers leave this off for keys and secrets)
+    if print_val == True:
+        print(f'The value of {var_name} is {value}')
+    return value
+# If you have an app_config env file in the config folder, you can load app variables this way, e.g. '/env/app_config.env'
+# The default '' never exists as a path, so nothing is loaded unless APP_CONFIG_PATH is set.
+APP_CONFIG_PATH = get_or_create_env_var('APP_CONFIG_PATH', '', print_val=True)
+if os.path.exists(APP_CONFIG_PATH):
+    print(f"Loading APP variables from config file {APP_CONFIG_PATH}")
+    load_dotenv(APP_CONFIG_PATH)
+###
+# AWS CONFIG
+###
+# If you have an aws_config env file in the config folder, you can load in AWS keys this way, e.g. '/env/aws_config.env'
+# Loaded before the lookups below so that they can see values defined in the file.
+AWS_CONFIG_PATH = get_or_create_env_var('AWS_CONFIG_PATH', '', print_val=True)
+if os.path.exists(AWS_CONFIG_PATH):
+    print(f"Loading AWS variables from config file {AWS_CONFIG_PATH}")
+    load_dotenv(AWS_CONFIG_PATH)
+RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "0")
+AWS_REGION = get_or_create_env_var('AWS_REGION', 'eu-west-2')
+client_id = get_or_create_env_var('AWS_CLIENT_ID', '')
+client_secret = get_or_create_env_var('AWS_CLIENT_SECRET', '')
+user_pool_id = get_or_create_env_var('AWS_USER_POOL_ID', '')
+# For the access/secret keys only their presence is reported; the values are never printed
+AWS_ACCESS_KEY = get_or_create_env_var('AWS_ACCESS_KEY', '')
+if AWS_ACCESS_KEY: print(f'AWS_ACCESS_KEY found in environment variables')
+AWS_SECRET_KEY = get_or_create_env_var('AWS_SECRET_KEY', '')
+if AWS_SECRET_KEY: print(f'AWS_SECRET_KEY found in environment variables')
+DOCUMENT_REDACTION_BUCKET = get_or_create_env_var('DOCUMENT_REDACTION_BUCKET', '')
+# Custom headers e.g. if routing traffic through CloudFront
+# Retrieving or setting CUSTOM_HEADER (only its presence is printed)
+CUSTOM_HEADER = get_or_create_env_var('CUSTOM_HEADER', '')
+if CUSTOM_HEADER: print(f'CUSTOM_HEADER found')
+# Retrieving or setting CUSTOM_HEADER_VALUE (only its presence is printed)
+CUSTOM_HEADER_VALUE = get_or_create_env_var('CUSTOM_HEADER_VALUE', '')
+if CUSTOM_HEADER_VALUE: print(f'CUSTOM_HEADER_VALUE found')
+###
+# Images config
+###
+# These are kept as strings here; any conversion to numeric/boolean types must happen in the consumer
+IMAGES_DPI = get_or_create_env_var('IMAGES_DPI', '300.0')
+LOAD_TRUNCATED_IMAGES = get_or_create_env_var('LOAD_TRUNCATED_IMAGES', 'True')
+MAX_IMAGE_PIXELS = get_or_create_env_var('MAX_IMAGE_PIXELS', '') # Changed to None if blank in file_conversion.py
+###
+# File I/O config
+###
+# NOTE(review): the explicit prints below produce the same message that print_val=True would
+output_folder = get_or_create_env_var('GRADIO_OUTPUT_FOLDER', 'output/')
+print(f'The value of GRADIO_OUTPUT_FOLDER is {output_folder}')
+session_output_folder = get_or_create_env_var('SESSION_OUTPUT_FOLDER', 'False')
+print(f'The value of SESSION_OUTPUT_FOLDER is {session_output_folder}')
+input_folder = get_or_create_env_var('GRADIO_INPUT_FOLDER', 'input/')
+print(f'The value of GRADIO_INPUT_FOLDER is {input_folder}')
+###
+# REDACTION CONFIG
+###
+# Number of pages to loop through before breaking the function and restarting from the last finished page.
+page_break_value = get_or_create_env_var('page_break_value', '50000')
+# presumably a time limit for the same processing loop; units not visible here - TODO confirm
+max_time_value = get_or_create_env_var('max_time_value', '999999')
+CUSTOM_BOX_COLOUR = get_or_create_env_var("CUSTOM_BOX_COLOUR", "")
+###
+# APP RUN CONFIG
+###
+# Environment-driven settings for running the Gradio app (nothing is launched from these lines)
+COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
+RUN_DIRECT_MODE = get_or_create_env_var('RUN_DIRECT_MODE', '0')
+MAX_QUEUE_SIZE = int(get_or_create_env_var('MAX_QUEUE_SIZE', '5'))
+MAX_FILE_SIZE = get_or_create_env_var('MAX_FILE_SIZE', '250mb')
+GRADIO_SERVER_PORT = int(get_or_create_env_var('GRADIO_SERVER_PORT', '7860'))
+ROOT_PATH = get_or_create_env_var('ROOT_PATH', '')
+# NOTE(review): left as a string, unlike MAX_QUEUE_SIZE and GRADIO_SERVER_PORT which are cast to int - confirm the consumer converts it
+DEFAULT_CONCURRENCY_LIMIT = get_or_create_env_var('DEFAULT_CONCURRENCY_LIMIT', '5')
+# String flag: app.py compares this against the literal "True"
+GET_DEFAULT_ALLOW_LIST = get_or_create_env_var('GET_DEFAULT_ALLOW_LIST', 'False')
+DEFAULT_ALLOW_LIST_PATH = get_or_create_env_var('DEFAULT_ALLOW_LIST_PATH', '')

tools/custom_csvlogger.py CHANGED Viewed

@@ -8,9 +8,7 @@ from collections.abc import Sequence
 from multiprocessing import Lock
 from pathlib import Path
 from typing import TYPE_CHECKING, Any
 from gradio_client import utils as client_utils
 import gradio as gr
 from gradio import utils, wasm_utils

 from multiprocessing import Lock
 from pathlib import Path
 from typing import TYPE_CHECKING, Any
 from gradio_client import utils as client_utils
 import gradio as gr
 from gradio import utils, wasm_utils

tools/custom_image_analyser_engine.py CHANGED Viewed

@@ -405,7 +405,7 @@ def bounding_boxes_overlap(box1:List, box2:List):
     return (box1[0] < box2[2] and box2[0] < box1[2] and
             box1[1] < box2[3] and box2[1] < box1[3])
-def map_back_entity_results(page_analyser_result, page_text_mapping, all_text_line_results:List[Tuple]):
     for entity in page_analyser_result:
         entity_start = entity.start
         entity_end = entity.end
@@ -443,7 +443,7 @@ def map_back_entity_results(page_analyser_result, page_text_mapping, all_text_li
     return all_text_line_results
-def map_back_comprehend_entity_results(response, current_batch_mapping:List[Tuple], allow_list:List[str], chosen_redact_comprehend_entities:List[str], all_text_line_results:List[Tuple]):
     if not response or "Entities" not in response:
         return all_text_line_results
@@ -686,7 +686,7 @@ def run_page_text_redaction(
     return page_analysed_bounding_boxes
-def merge_text_bounding_boxes(analyser_results, characters: List[LTChar], combine_pixel_dist: int = 20, vertical_padding: int = 0):
     '''
     Merge identified bounding boxes containing PII that are very close to one another
     '''
@@ -776,7 +776,7 @@ def merge_text_bounding_boxes(analyser_results, characters: List[LTChar], combin
     return analysed_bounding_boxes
 # Function to combine OCR results into line-level results
-def combine_ocr_results(ocr_results, x_threshold=50, y_threshold=12):
     # Group OCR results into lines based on y_threshold
     lines = []
     current_line = []

     return (box1[0] < box2[2] and box2[0] < box1[2] and
             box1[1] < box2[3] and box2[1] < box1[3])
+def map_back_entity_results(page_analyser_result:dict, page_text_mapping:dict, all_text_line_results:List[Tuple]):
     for entity in page_analyser_result:
         entity_start = entity.start
         entity_end = entity.end
     return all_text_line_results
+def map_back_comprehend_entity_results(response:object, current_batch_mapping:List[Tuple], allow_list:List[str], chosen_redact_comprehend_entities:List[str], all_text_line_results:List[Tuple]):
     if not response or "Entities" not in response:
         return all_text_line_results
     return page_analysed_bounding_boxes
+def merge_text_bounding_boxes(analyser_results:dict, characters: List[LTChar], combine_pixel_dist: int = 20, vertical_padding: int = 0):
     '''
     Merge identified bounding boxes containing PII that are very close to one another
     '''
     return analysed_bounding_boxes
 # Function to combine OCR results into line-level results
+def combine_ocr_results(ocr_results:dict, x_threshold:float=50.0, y_threshold:float=12.0):
     # Group OCR results into lines based on y_threshold
     lines = []
     current_line = []

tools/data_anonymise.py CHANGED Viewed

@@ -13,12 +13,11 @@ from typing import List, Dict, Any
 from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, DictAnalyzerResult, RecognizerResult
 from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
 from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
-from tools.aws_functions import RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY
-from tools.helper_functions import output_folder, get_file_name_without_type, read_file, detect_file_type
 from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_word_list_recogniser, CustomWordFuzzyRecognizer, custom_entities
 from tools.custom_image_analyser_engine import do_aws_comprehend_call
 # Use custom version of analyze_dict to be able to track progress
 from tools.presidio_analyzer_custom import analyze_dict

 from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, DictAnalyzerResult, RecognizerResult
 from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
 from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
+from tools.config import RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, output_folder
+from tools.helper_functions import get_file_name_without_type, read_file, detect_file_type
 from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_word_list_recogniser, CustomWordFuzzyRecognizer, custom_entities
 from tools.custom_image_analyser_engine import do_aws_comprehend_call
 # Use custom version of analyze_dict to be able to track progress
 from tools.presidio_analyzer_custom import analyze_dict

tools/file_conversion.py CHANGED Viewed

@@ -1,13 +1,14 @@
 from pdf2image import convert_from_path, pdfinfo_from_path
-from tools.helper_functions import get_file_name_without_type, output_folder, tesseract_ocr_option, text_ocr_option, textract_option, read_file, get_or_create_env_var
 from PIL import Image, ImageFile
 import os
 import re
 import time
 import json
 import pymupdf
 import pandas as pd
-import numpy as np
 import shutil
 from pymupdf import Rect
 from fitz import Page
@@ -19,9 +20,13 @@ from pdf2image import convert_from_path
 from PIL import Image
 from scipy.spatial import cKDTree
-image_dpi = 300.0
-ImageFile.LOAD_TRUNCATED_IMAGES = True
-Image.MAX_IMAGE_PIXELS = None
 def is_pdf_or_image(filename):
     """
@@ -54,8 +59,7 @@ def is_pdf(filename):
 # %%
 ## Convert pdf to image if necessary
-CUSTOM_BOX_COLOUR = get_or_create_env_var("CUSTOM_BOX_COLOUR", "")
-print(f'The value of CUSTOM_BOX_COLOUR is {CUSTOM_BOX_COLOUR}')
 def check_image_size_and_reduce(out_path:str, image:Image):
     '''
@@ -360,6 +364,27 @@ def redact_whole_pymupdf_page(rect_height, rect_width, image, page, custom_colou
     return whole_page_img_annotation_box
 def prepare_image_or_pdf(
     file_paths: List[str],
     in_redact_method: str,
@@ -371,6 +396,7 @@ def prepare_image_or_pdf(
     prepare_for_review:bool = False,
     in_fully_redacted_list:List[int]=[],
     output_folder:str=output_folder,
     progress: Progress = Progress(track_tqdm=True)
 ) -> tuple[List[str], List[str]]:
     """
@@ -390,6 +416,7 @@ def prepare_image_or_pdf(
         prepare_for_review(optional, bool): Is this preparation step preparing pdfs and json files to review current redactions?
         in_fully_redacted_list(optional, List of int): A list of pages to fully redact
         output_folder (optional, str): The output folder for file save
         progress (optional, Progress): Progress tracker for the operation
@@ -400,6 +427,10 @@ def prepare_image_or_pdf(
     tic = time.perf_counter()
     json_from_csv = False
     original_cropboxes = []  # Store original CropBox values
     if isinstance(in_fully_redacted_list, pd.DataFrame):
         if not in_fully_redacted_list.empty:
@@ -426,11 +457,6 @@ def prepare_image_or_pdf(
     if isinstance(out_message, str):
         out_message = [out_message]
-    converted_file_paths = []
-    image_file_paths = []
-    pymupdf_doc = []
-    review_file_csv = pd.DataFrame()
     if not file_paths:
         file_paths = []
@@ -496,23 +522,35 @@ def prepare_image_or_pdf(
         # If a pdf, load as a pymupdf document
         if is_pdf(file_path):
             pymupdf_doc = pymupdf.open(file_path)
             # Load cropbox dimensions to use later
             converted_file_path = file_path
-            image_file_paths, image_sizes_width, image_sizes_height = process_file(file_path, prepare_for_review)
-            page_sizes = []
-            for i, page in enumerate(pymupdf_doc):
-                page_no = i
-                reported_page_no = i + 1
-                pymupdf_page = pymupdf_doc.load_page(page_no)
-                original_cropboxes.append(pymupdf_page.cropbox)  # Save original CropBox
-                # Create a page_sizes_object
-                out_page_image_sizes = {"page":reported_page_no, "image_width":image_sizes_width[page_no], "image_height":image_sizes_height[page_no], "mediabox_width":pymupdf_page.mediabox.width, "mediabox_height": pymupdf_page.mediabox.height, "cropbox_width":pymupdf_page.cropbox.width, "cropbox_height":pymupdf_page.cropbox.height}
-                page_sizes.append(out_page_image_sizes)
             #Create base version of the annotation object that doesn't have any annotations in it
             if (not all_annotations_object) & (prepare_for_review == True):
@@ -521,6 +559,7 @@ def prepare_image_or_pdf(
                 for image_path in image_file_paths:
                     annotation = {}
                     annotation["image"] = image_path
                     all_annotations_object.append(annotation)
@@ -546,7 +585,7 @@ def prepare_image_or_pdf(
             #print("image_file_paths:", image_file_paths)
             # Create a page_sizes_object
-            out_page_image_sizes = {"page":1, "image_width":image_sizes_width[page_no], "image_height":image_sizes_height[page_no], "mediabox_width":pymupdf_page.mediabox.width, "mediabox_height": pymupdf_page.mediabox.height, "cropbox_width":original_cropboxes[-1].width, "cropbox_height":original_cropboxes[-1].height}
             page_sizes.append(out_page_image_sizes)
             converted_file_path = output_folder + file_name_with_ext
@@ -557,7 +596,7 @@ def prepare_image_or_pdf(
         elif file_extension in ['.csv']:
             review_file_csv = read_file(file)
-            all_annotations_object = convert_pandas_df_to_review_json(review_file_csv, image_file_paths, page_sizes)
             json_from_csv = True
             print("Converted CSV review file to json")
@@ -708,7 +747,7 @@ def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str],
     return out_message, out_file_paths
-def join_values_within_threshold(df1, df2):
     # Threshold for matching
     threshold = 5
@@ -739,7 +778,7 @@ def join_values_within_threshold(df1, df2):
     print(final_df)
-def convert_review_json_to_pandas_df(all_annotations:List[dict], redaction_decision_output:pd.DataFrame=pd.DataFrame(), page_sizes:List[dict]=[]) -> pd.DataFrame:
     '''
     Convert the annotation json data to a dataframe format. Add on any text from the initial review_file dataframe by joining on pages/co-ordinates (doesn't work very well currently).
     '''
@@ -887,7 +926,7 @@ def convert_review_json_to_pandas_df(all_annotations:List[dict], redaction_decis
     #    review_file_df[col] = np.floor(review_file_df[col])
     # If colours are saved as list, convert to tuple
-    review_file_df["color"] = review_file_df["color"].apply(lambda x: tuple(x) if isinstance(x, list) else x)
     # print("page_sizes:", page_sizes)
@@ -910,32 +949,35 @@ def convert_review_json_to_pandas_df(all_annotations:List[dict], redaction_decis
     review_file_df = review_file_df.sort_values(['page', 'ymin', 'xmin', 'label'])
-    review_file_df.to_csv(output_folder + "review_file_test.csv", index=None)
     return review_file_df
-def convert_pandas_df_to_review_json(review_file_df: pd.DataFrame, image_paths: List[Image.Image], page_sizes:List[dict]=[]) -> List[dict]:
     '''
     Convert a review csv to a json file for use by the Gradio Annotation object.
     '''
-    if page_sizes:
         page_sizes_df = pd.DataFrame(page_sizes)
-        #print(page_sizes_df)
-        if "image_width" not in review_file_df.columns:
-            review_file_df = review_file_df.merge(page_sizes_df, how="left", on = "page")
-        #print("review_file_df in convert pandas df to review json function:", review_file_df[["xmin", "xmax", "ymin", "ymax"]])
-        # If all coordinates are less or equal to one, this is a relative page scaling - change back to image coordinates
-        if review_file_df["xmin"].max() <= 1 and review_file_df["xmax"].max() <= 1 and review_file_df["ymin"].max() <= 1 and review_file_df["ymax"].max() <= 1:
-            review_file_df["xmin"] = review_file_df["xmin"] * review_file_df["image_width"]
-            review_file_df["xmax"] = review_file_df["xmax"] * review_file_df["image_width"]
-            review_file_df["ymin"] = review_file_df["ymin"] * review_file_df["image_height"]
-            review_file_df["ymax"] = review_file_df["ymax"] * review_file_df["image_height"]
     # Keep only necessary columns
     review_file_df = review_file_df[["image", "page", "xmin", "ymin", "xmax", "ymax", "color", "label"]]
@@ -949,9 +991,8 @@ def convert_pandas_df_to_review_json(review_file_df: pd.DataFrame, image_paths:
     # Create a list to hold the JSON data
     json_data = []
-    for n, pdf_image_path in enumerate(image_paths):
-        reported_page_number = int(n + 1)
         if reported_page_number in review_file_df["page"].values:
@@ -969,6 +1010,7 @@ def convert_pandas_df_to_review_json(review_file_df: pd.DataFrame, image_paths:
         else:
             annotation = {}
             annotation["image"] = pdf_image_path
         # Append the structured data to the json_data list
         json_data.append(annotation)

 from pdf2image import convert_from_path, pdfinfo_from_path
 from PIL import Image, ImageFile
 import os
 import re
 import time
 import json
 import pymupdf
+from pymupdf import Document
 import pandas as pd
+#import numpy as np
 import shutil
 from pymupdf import Rect
 from fitz import Page
 from PIL import Image
 from scipy.spatial import cKDTree
+from tools.config import output_folder, IMAGES_DPI, LOAD_TRUNCATED_IMAGES, MAX_IMAGE_PIXELS, CUSTOM_BOX_COLOUR
+from tools.helper_functions import get_file_name_without_type, tesseract_ocr_option, text_ocr_option, textract_option, read_file
+image_dpi = float(IMAGES_DPI)
+if not MAX_IMAGE_PIXELS: Image.MAX_IMAGE_PIXELS = None
+else: Image.MAX_IMAGE_PIXELS = MAX_IMAGE_PIXELS
+ImageFile.LOAD_TRUNCATED_IMAGES = LOAD_TRUNCATED_IMAGES.lower() == "true"
 def is_pdf_or_image(filename):
     """
 # %%
 ## Convert pdf to image if necessary
 def check_image_size_and_reduce(out_path:str, image:Image):
     '''
     return whole_page_img_annotation_box
def create_page_size_objects(pymupdf_doc:"Document", image_sizes_width:List[float], image_sizes_height:List[float]):
    '''
    Build a size record for every page of a PDF and collect each page's original CropBox.

    Args:
        pymupdf_doc: The opened document; iterating it yields its pages in order.
        image_sizes_width: Width of the image created for each page, indexed by zero-based page number. Empty if no images were created.
        image_sizes_height: Height of the image created for each page, indexed by zero-based page number. Empty if no images were created.

    Returns:
        A tuple of (page_sizes, original_cropboxes):
        - page_sizes: one dict per page with the keys "page" (1-based page number), "image_width", "image_height", "mediabox_width", "mediabox_height", "cropbox_width" and "cropbox_height".
        - original_cropboxes: the cropbox of each page, in page order.
    '''
    page_sizes = []
    original_cropboxes = []

    # Image dimensions can only be reported when both lists were populated
    have_image_sizes = bool(image_sizes_width and image_sizes_height)

    # Iterating the document already yields each page, so there is no need to load it a second time
    for page_no, pymupdf_page in enumerate(pymupdf_doc):
        reported_page_no = page_no + 1

        cropbox = pymupdf_page.cropbox
        mediabox = pymupdf_page.mediabox
        original_cropboxes.append(cropbox)  # Save original CropBox

        # If images have been created, then image width and height come from those lists.
        # Otherwise they are recorded as missing. pd.NA is a singleton value, not a callable,
        # so it must be used as-is (calling it raises a TypeError).
        if have_image_sizes:
            image_width = image_sizes_width[page_no]
            image_height = image_sizes_height[page_no]
        else:
            image_width = pd.NA
            image_height = pd.NA

        page_sizes.append({
            "page": reported_page_no,
            "image_width": image_width,
            "image_height": image_height,
            "mediabox_width": mediabox.width,
            "mediabox_height": mediabox.height,
            "cropbox_width": cropbox.width,
            "cropbox_height": cropbox.height,
        })

    return page_sizes, original_cropboxes
 def prepare_image_or_pdf(
     file_paths: List[str],
     in_redact_method: str,
     prepare_for_review:bool = False,
     in_fully_redacted_list:List[int]=[],
     output_folder:str=output_folder,
+    prepare_images:bool=True,
     progress: Progress = Progress(track_tqdm=True)
 ) -> tuple[List[str], List[str]]:
     """
         prepare_for_review(optional, bool): Is this preparation step preparing pdfs and json files to review current redactions?
         in_fully_redacted_list(optional, List of int): A list of pages to fully redact
         output_folder (optional, str): The output folder for file save
+        prepare_images (optional, bool): A boolean indicating whether to create images for each PDF page. Defaults to true
         progress (optional, Progress): Progress tracker for the operation
     tic = time.perf_counter()
     json_from_csv = False
     original_cropboxes = []  # Store original CropBox values
+    converted_file_paths = []
+    image_file_paths = []
+    pymupdf_doc = []
+    review_file_csv = pd.DataFrame()
     if isinstance(in_fully_redacted_list, pd.DataFrame):
         if not in_fully_redacted_list.empty:
     if isinstance(out_message, str):
         out_message = [out_message]
     if not file_paths:
         file_paths = []
         # If a pdf, load as a pymupdf document
         if is_pdf(file_path):
             pymupdf_doc = pymupdf.open(file_path)
+            pymupdf_pages = pymupdf_doc.page_count
             # Load cropbox dimensions to use later
             converted_file_path = file_path
+            if prepare_images==True:
+                image_file_paths, image_sizes_width, image_sizes_height = process_file(file_path, prepare_for_review)
+            else:
+                print("Skipping image preparation")
+                image_file_paths=[]
+                image_sizes_width=[]
+                image_sizes_height=[]
+            # Create page sizes object
+            # page_sizes = []
+            # for i, page in enumerate(pymupdf_doc):
+            #     page_no = i
+            #     reported_page_no = i + 1
+            #     pymupdf_page = pymupdf_doc.load_page(page_no)
+            #     original_cropboxes.append(pymupdf_page.cropbox)  # Save original CropBox
+            #     # Create a page_sizes_object
+            #     out_page_image_sizes = {"page":reported_page_no, "image_width":image_sizes_width[page_no], "image_height":image_sizes_height[page_no], "mediabox_width":pymupdf_page.mediabox.width, "mediabox_height": pymupdf_page.mediabox.height, "cropbox_width":pymupdf_page.cropbox.width, "cropbox_height":pymupdf_page.cropbox.height}
+            #     page_sizes.append(out_page_image_sizes)
+            page_sizes, original_cropboxes = create_page_size_objects(pymupdf_doc, image_sizes_width, image_sizes_height)
             #Create base version of the annotation object that doesn't have any annotations in it
             if (not all_annotations_object) & (prepare_for_review == True):
                 for image_path in image_file_paths:
                     annotation = {}
                     annotation["image"] = image_path
+                    annotation["boxes"] = []
                     all_annotations_object.append(annotation)
             #print("image_file_paths:", image_file_paths)
             # Create a page_sizes_object
+            out_page_image_sizes = {"page":1, "image_width":image_sizes_width[0], "image_height":image_sizes_height[0], "mediabox_width":pymupdf_page.mediabox.width, "mediabox_height": pymupdf_page.mediabox.height, "cropbox_width":original_cropboxes[-1].width, "cropbox_height":original_cropboxes[-1].height}
             page_sizes.append(out_page_image_sizes)
             converted_file_path = output_folder + file_name_with_ext
         elif file_extension in ['.csv']:
             review_file_csv = read_file(file)
+            all_annotations_object = convert_review_df_to_annotation_json(review_file_csv, image_file_paths, page_sizes)
             json_from_csv = True
             print("Converted CSV review file to json")
     return out_message, out_file_paths
+def join_values_within_threshold(df1:pd.DataFrame, df2:pd.DataFrame):
     # Threshold for matching
     threshold = 5
     print(final_df)
+def convert_annotation_json_to_review_df(all_annotations:List[dict], redaction_decision_output:pd.DataFrame=pd.DataFrame(), page_sizes:List[dict]=[]) -> pd.DataFrame:
     '''
     Convert the annotation json data to a dataframe format. Add on any text from the initial review_file dataframe by joining on pages/co-ordinates (doesn't work very well currently).
     '''
     #    review_file_df[col] = np.floor(review_file_df[col])
     # If colours are saved as list, convert to tuple
+    review_file_df.loc[:,"color"] = review_file_df.loc[:,"color"].apply(lambda x: tuple(x) if isinstance(x, list) else x)
     # print("page_sizes:", page_sizes)
     review_file_df = review_file_df.sort_values(['page', 'ymin', 'xmin', 'label'])
+    #review_file_df.to_csv(output_folder + "review_file_test.csv", index=None)
     return review_file_df
+def convert_review_df_to_annotation_json(review_file_df:pd.DataFrame, image_paths:List[Image.Image], page_sizes:List[dict]=[]) -> List[dict]:
     '''
     Convert a review csv to a json file for use by the Gradio Annotation object.
     '''
+    # Convert relative co-ordinates into image coordinates for the image annotation output object
+    if page_sizes:
         page_sizes_df = pd.DataFrame(page_sizes)
+        # If there are no image coordinates, then just convert the first page to image to be able to see this at least.
+        if len(page_sizes_df.loc[page_sizes_df["image_width"].isnull(),"image_width"]) == len(page_sizes_df["image_width"]):
+            print("No image dimensions found, converting first page.")
+        # If no nulls, then can do image coordinate conversion
+        elif len(page_sizes_df.loc[page_sizes_df["image_width"].isnull(),"image_width"]) == 0:
+            if "image_width" not in review_file_df.columns:
+                    review_file_df = review_file_df.merge(page_sizes_df, how="left", on = "page")
+            # If all coordinates are less or equal to one, this is a relative page scaling - change back to image coordinates
+            if review_file_df["xmin"].max() <= 1 and review_file_df["xmax"].max() <= 1 and review_file_df["ymin"].max() <= 1 and review_file_df["ymax"].max() <= 1:
+                review_file_df["xmin"] = review_file_df["xmin"] * review_file_df["image_width"]
+                review_file_df["xmax"] = review_file_df["xmax"] * review_file_df["image_width"]
+                review_file_df["ymin"] = review_file_df["ymin"] * review_file_df["image_height"]
+                review_file_df["ymax"] = review_file_df["ymax"] * review_file_df["image_height"]
     # Keep only necessary columns
     review_file_df = review_file_df[["image", "page", "xmin", "ymin", "xmax", "ymax", "color", "label"]]
     # Create a list to hold the JSON data
     json_data = []
+    for page_no, pdf_image_path in enumerate(image_paths):
+        reported_page_number = int(page_no + 1)
         if reported_page_number in review_file_df["page"].values:
         else:
             annotation = {}
             annotation["image"] = pdf_image_path
+            annotation["boxes"] = []
         # Append the structured data to the json_data list
         json_data.append(annotation)

tools/file_redaction.py CHANGED Viewed

@@ -8,38 +8,29 @@ import copy
 from tqdm import tqdm
 from PIL import Image, ImageChops, ImageFile, ImageDraw
-ImageFile.LOAD_TRUNCATED_IMAGES = True
 from typing import List, Dict, Tuple
 import pandas as pd
-#from presidio_image_redactor.entities import ImageRecognizerResult
 from pdfminer.high_level import extract_pages
 from pdfminer.layout import LTTextContainer, LTChar, LTTextLine, LTTextLineHorizontal, LTAnno
 from pikepdf import Pdf, Dictionary, Name
-import pymupdf
-from pymupdf import Rect
-from fitz import Page
 import gradio as gr
 from gradio import Progress
 from collections import defaultdict  # For efficient grouping
-from presidio_analyzer import RecognizerResult
-from tools.aws_functions import RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY
 from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult, run_page_text_redaction, merge_text_bounding_boxes
-from tools.file_conversion import process_file, image_dpi, convert_review_json_to_pandas_df, redact_whole_pymupdf_page, redact_single_box, convert_pymupdf_to_image_coords
 from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser, CustomWordFuzzyRecognizer
-from tools.helper_functions import get_file_name_without_type, output_folder, clean_unicode_text, get_or_create_env_var, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
 from tools.file_conversion import process_file, is_pdf, is_pdf_or_image, prepare_image_or_pdf
 from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult, load_and_convert_textract_json
-from tools.presidio_analyzer_custom import recognizer_result_from_dict
-# Number of pages to loop through before breaking. Currently set very high, as functions are breaking on time metrics (e.g. every 105 seconds), rather than on number of pages redacted.
-page_break_value = get_or_create_env_var('page_break_value', '50000')
-print(f'The value of page_break_value is {page_break_value}')
-max_time_value = get_or_create_env_var('max_time_value', '999999')
-print(f'The value of max_time_value is {max_time_value}')
 def bounding_boxes_overlap(box1, box2):
     """Check if two bounding boxes overlap."""
@@ -103,6 +94,7 @@ def choose_and_run_redactor(file_paths:List[str],
  review_file_state:pd.DataFrame=[],
  output_folder:str=output_folder,
  document_cropboxes:List=[],
  progress=gr.Progress(track_tqdm=True)):
     '''
     This function orchestrates the redaction process based on the specified method and parameters. It takes the following inputs:
@@ -143,6 +135,7 @@ def choose_and_run_redactor(file_paths:List[str],
     - review_file_state (pd.DataFrame, optional): Output review file dataframe.
     - output_folder (str, optional): Output folder for results.
     - document_cropboxes (List, optional): List of document cropboxes for the PDF.
     - progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
     The function returns a redacted document along with processing logs.
@@ -239,7 +232,7 @@ def choose_and_run_redactor(file_paths:List[str],
         estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
         print("Estimated total processing time:", str(estimate_total_processing_time))
-        return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state, page_sizes
     # If we have reached the last page, return message and outputs
     if current_loop_page >= number_of_pages:
@@ -255,7 +248,7 @@ def choose_and_run_redactor(file_paths:List[str],
             review_out_file_paths.extend(out_review_file_path)
-        return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = False, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state, page_sizes
     # Create allow list
     # If string, assume file path
@@ -484,7 +477,7 @@ def choose_and_run_redactor(file_paths:List[str],
                 #print("all_decision_process_table before in choose and run redactor:", all_decision_process_table)
                 #print("page_sizes before in choose and run redactor:", page_sizes)
-                review_df = convert_review_json_to_pandas_df(annotations_all_pages, all_decision_process_table, page_sizes)
                 #print("annotation_all_pages:", annotations_all_pages)
                 #print("all_decision_process_table after in choose and run redactor:", all_decision_process_table)
@@ -560,7 +553,7 @@ def choose_and_run_redactor(file_paths:List[str],
     out_file_paths = list(set(out_file_paths))
     review_out_file_paths = [prepared_pdf_file_paths[0], out_review_file_path]
-    return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_df, page_sizes
 def convert_pikepdf_coords_to_pymupdf(pymupdf_page, pikepdf_bbox, type="pikepdf_annot"):
     '''

 from tqdm import tqdm
 from PIL import Image, ImageChops, ImageFile, ImageDraw
 from typing import List, Dict, Tuple
 import pandas as pd
 from pdfminer.high_level import extract_pages
 from pdfminer.layout import LTTextContainer, LTChar, LTTextLine, LTTextLineHorizontal, LTAnno
 from pikepdf import Pdf, Dictionary, Name
+from pymupdf import Rect, Page
 import gradio as gr
 from gradio import Progress
 from collections import defaultdict  # For efficient grouping
+from tools.config import output_folder, IMAGES_DPI, MAX_IMAGE_PIXELS, RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, page_break_value, max_time_value, LOAD_TRUNCATED_IMAGES
 from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult, run_page_text_redaction, merge_text_bounding_boxes
+from tools.file_conversion import process_file, convert_annotation_json_to_review_df, redact_whole_pymupdf_page, redact_single_box, convert_pymupdf_to_image_coords
 from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser, CustomWordFuzzyRecognizer
+from tools.helper_functions import get_file_name_without_type, clean_unicode_text, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
 from tools.file_conversion import process_file, is_pdf, is_pdf_or_image, prepare_image_or_pdf
 from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult, load_and_convert_textract_json
+ImageFile.LOAD_TRUNCATED_IMAGES = LOAD_TRUNCATED_IMAGES.lower() == "true"
+if not MAX_IMAGE_PIXELS: Image.MAX_IMAGE_PIXELS = None
+else: Image.MAX_IMAGE_PIXELS = MAX_IMAGE_PIXELS
+image_dpi = float(IMAGES_DPI)
 def bounding_boxes_overlap(box1, box2):
     """Check if two bounding boxes overlap."""
  review_file_state:pd.DataFrame=[],
  output_folder:str=output_folder,
  document_cropboxes:List=[],
+ page_sizes:List[dict]=[],
  progress=gr.Progress(track_tqdm=True)):
     '''
     This function orchestrates the redaction process based on the specified method and parameters. It takes the following inputs:
     - review_file_state (pd.DataFrame, optional): Output review file dataframe.
     - output_folder (str, optional): Output folder for results.
     - document_cropboxes (List, optional): List of document cropboxes for the PDF.
+    - page_sizes (List[dict], optional): List of dictionaries of PDF page sizes in PDF or image format.
     - progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
     The function returns a redacted document along with processing logs.
         estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
         print("Estimated total processing time:", str(estimate_total_processing_time))
+        return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state, page_sizes, document_cropboxes
     # If we have reached the last page, return message and outputs
     if current_loop_page >= number_of_pages:
             review_out_file_paths.extend(out_review_file_path)
+        return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = False, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state, page_sizes, document_cropboxes
     # Create allow list
     # If string, assume file path
                 #print("all_decision_process_table before in choose and run redactor:", all_decision_process_table)
                 #print("page_sizes before in choose and run redactor:", page_sizes)
+                review_df = convert_annotation_json_to_review_df(annotations_all_pages, all_decision_process_table, page_sizes)
                 #print("annotation_all_pages:", annotations_all_pages)
                 #print("all_decision_process_table after in choose and run redactor:", all_decision_process_table)
     out_file_paths = list(set(out_file_paths))
     review_out_file_paths = [prepared_pdf_file_paths[0], out_review_file_path]
+    return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_df, page_sizes, document_cropboxes
 def convert_pikepdf_coords_to_pymupdf(pymupdf_page, pikepdf_bbox, type="pikepdf_annot"):
     '''

tools/find_duplicate_pages.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import pandas as pd
-import argparse
-import glob
 import os
 import re
 from tools.helper_functions import output_folder

 import pandas as pd
+#import argparse
+#import glob
 import os
 import re
 from tools.helper_functions import output_folder

tools/helper_functions.py CHANGED Viewed

@@ -9,19 +9,7 @@ import unicodedata
 from typing import List
 from gradio_image_annotation import image_annotator
 from tools.auth import user_pool_id
-def get_or_create_env_var(var_name, default_value):
-    # Get the environment variable if it exists
-    value = os.environ.get(var_name)
-    # If it doesn't exist, set it to the default value
-    if value is None:
-        os.environ[var_name] = default_value
-        value = default_value
-    return value
 # Names for options labels
 text_ocr_option = "Local model - selectable text"
@@ -31,24 +19,6 @@ textract_option = "AWS Textract service - all PDF types"
 local_pii_detector = "Local"
 aws_pii_detector  = "AWS Comprehend"
-output_folder = get_or_create_env_var('GRADIO_OUTPUT_FOLDER', 'output/')
-print(f'The value of GRADIO_OUTPUT_FOLDER is {output_folder}')
-session_output_folder = get_or_create_env_var('SESSION_OUTPUT_FOLDER', 'False')
-print(f'The value of SESSION_OUTPUT_FOLDER is {session_output_folder}')
-input_folder = get_or_create_env_var('GRADIO_INPUT_FOLDER', 'input/')
-print(f'The value of GRADIO_INPUT_FOLDER is {input_folder}')
-# Retrieving or setting CUSTOM_HEADER
-CUSTOM_HEADER = get_or_create_env_var('CUSTOM_HEADER', '')
-print(f'CUSTOM_HEADER found')
-# Retrieving or setting CUSTOM_HEADER_VALUE
-CUSTOM_HEADER_VALUE = get_or_create_env_var('CUSTOM_HEADER_VALUE', '')
-print(f'CUSTOM_HEADER_VALUE found')
 def reset_state_vars():
     return [], [], pd.DataFrame(), pd.DataFrame(), 0, "", image_annotator(
             label="Modify redaction boxes",
@@ -268,24 +238,8 @@ def merge_csv_files(file_list):
     return output_files
 async def get_connection_params(request: gr.Request, output_folder_textbox:str='/output/'):
-    #print("request user:", request.username)
-    #request_data = await request.json()  # Parse JSON body
-    #print("All request data:", request_data)
-    #context_value = request_data.get('context')
-    #if 'context' in request_data:
-    #     print("Request context dictionary:", request_data['context'])
-    # print("Request headers dictionary:", request.headers)
-    # print("All host elements", request.client)
-    # print("IP address:", request.client.host)
-    # print("Query parameters:", dict(request.query_params))
-    # To get the underlying FastAPI items you would need to use await and some fancy @ stuff for a live query: https://fastapi.tiangolo.com/vi/reference/request/
-    #print("Request dictionary to object:", request.request.body())
     print("Session hash:", request.session_hash)
     if CUSTOM_HEADER and CUSTOM_HEADER_VALUE:

 from typing import List
 from gradio_image_annotation import image_annotator
 from tools.auth import user_pool_id
+from tools.config import CUSTOM_HEADER_VALUE, CUSTOM_HEADER, output_folder, session_output_folder
 # Names for options labels
 text_ocr_option = "Local model - selectable text"
 local_pii_detector = "Local"
 aws_pii_detector  = "AWS Comprehend"
 def reset_state_vars():
     return [], [], pd.DataFrame(), pd.DataFrame(), 0, "", image_annotator(
             label="Modify redaction boxes",
     return output_files
 async def get_connection_params(request: gr.Request, output_folder_textbox:str='/output/'):
     print("Session hash:", request.session_hash)
     if CUSTOM_HEADER and CUSTOM_HEADER_VALUE:

tools/presidio_analyzer_custom.py CHANGED Viewed

@@ -1,8 +1,8 @@
 import gradio as gr
 from typing import List, Iterable, Dict, Union, Any, Optional, Iterator, Tuple
-from tqdm import tqdm
-from presidio_analyzer import DictAnalyzerResult, RecognizerResult #, AnalyzerEngine
 from presidio_analyzer.nlp_engine import NlpArtifacts
 def recognizer_result_from_dict(data: Dict) -> RecognizerResult:

 import gradio as gr
 from typing import List, Iterable, Dict, Union, Any, Optional, Iterator, Tuple
+#from tqdm import tqdm
+from presidio_analyzer import DictAnalyzerResult, RecognizerResult
 from presidio_analyzer.nlp_engine import NlpArtifacts
 def recognizer_result_from_dict(data: Dict) -> RecognizerResult:

tools/redaction_review.py CHANGED Viewed

@@ -1,3 +1,5 @@
 import gradio as gr
 import pandas as pd
 import numpy as np
@@ -7,18 +9,18 @@ import uuid
 from typing import List
 from gradio_image_annotation import image_annotator
 from gradio_image_annotation.image_annotator import AnnotatedImageData
-from tools.file_conversion import is_pdf, convert_review_json_to_pandas_df, convert_pandas_df_to_review_json, CUSTOM_BOX_COLOUR
-from tools.helper_functions import get_file_name_without_type, output_folder, detect_file_type
-from tools.file_redaction import redact_page_with_pymupdf
-import json
-import os
-import re
 import pymupdf
-from fitz import Document, Rect
 from PIL import ImageDraw, Image
 from collections import defaultdict
-Image.MAX_IMAGE_PIXELS = None
 def decrease_page(number:int):
     '''
@@ -110,9 +112,7 @@ def get_filtered_recogniser_dataframe_and_dropdowns(image_annotator_object:Annot
     recogniser_dataframe_out = recogniser_dataframe_modified
     try:
-        review_dataframe = convert_review_json_to_pandas_df(image_annotator_object, review_df, page_sizes)
-        print("in get_filtered_recogniser_dataframe_and_dropdowns, recogniser_dropdown_value:", recogniser_dropdown_value)
         recogniser_entities_for_drop = update_dropdown_list_based_on_dataframe(review_dataframe, "label")
         recogniser_entities_drop = gr.Dropdown(value=recogniser_dropdown_value, choices=recogniser_entities_for_drop, allow_custom_value=True, interactive=True)
@@ -140,7 +140,6 @@ def get_filtered_recogniser_dataframe_and_dropdowns(image_annotator_object:Annot
     return recogniser_dataframe_out, recogniser_dataframe_out, recogniser_entities_drop, recogniser_entities_list, text_entities_drop, page_entities_drop
 def update_recogniser_dataframes(image_annotator_object:AnnotatedImageData, recogniser_dataframe_modified:pd.DataFrame, recogniser_entities_dropdown_value:str="ALL", text_dropdown_value:str="ALL", page_dropdown_value:str="ALL", review_df:pd.DataFrame=[], page_sizes:list[str]=[]):
     '''
     Update recogniser dataframe information that appears alongside the pdf pages on the review screen.
@@ -168,7 +167,6 @@ def update_recogniser_dataframes(image_annotator_object:AnnotatedImageData, reco
     return recogniser_entities_list, recogniser_dataframe_out, recogniser_dataframe_modified, recogniser_entities_drop, text_entities_drop, page_entities_drop
 def undo_last_removal(backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base):
     return backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base
@@ -191,15 +189,24 @@ def exclude_selected_items_from_redaction(review_df: pd.DataFrame, selected_rows
         # Keep only the rows that do not have a match in selected_rows_df
         out_review_df = merged_df[merged_df['_merge'] == 'left_only'].drop(columns=['_merge'])
-        out_image_annotations_state = convert_pandas_df_to_review_json(out_review_df, image_file_paths, page_sizes)
-        recogniser_entity_dataframe_base = out_review_df[["page", "label", "text"]]
     else:
         out_review_df = review_df
-        recogniser_entity_dataframe_base = pd.DataFrame()
-        out_image_annotations_state = {}
-    return out_review_df, out_image_annotations_state, recogniser_entity_dataframe_base, backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base
 def update_annotator(image_annotator_object:AnnotatedImageData,
                      page_num:int,
@@ -315,8 +322,6 @@ def modify_existing_page_redactions(image_annotator_object:AnnotatedImageData,
     if not current_page:
         current_page = 1
-    print("in modify_existing_page_redactions - recogniser_entities_dropdown_value:", recogniser_entities_dropdown_value)
     image_annotator_object['image'] = all_image_annotations[previous_page - 1]["image"]
     if clear_all == False:
@@ -471,10 +476,10 @@ def apply_redactions(image_annotator_object:AnnotatedImageData,
             #print("page_sizes before conversion in apply redactions:", page_sizes)
             # Convert json to csv and also save this
-            review_df = convert_review_json_to_pandas_df(all_image_annotations, review_file_state, page_sizes=page_sizes)
             out_review_file_file_path = output_folder + file_name_with_ext + '_review_file.csv'
-            print("Saving review file after convert_review_json function in apply redactions")
             review_df.to_csv(out_review_file_file_path, index=None)
             output_files.append(out_review_file_file_path)
@@ -589,6 +594,9 @@ def update_entities_df_text(choice:str, df:pd.DataFrame, label_dropdown_value:st
     return filtered_df, recogniser_entities_drop, page_entities_drop
 def reset_dropdowns():
     # Build three separate dropdown objects, each set to "ALL" with custom values allowed
     return tuple(gr.Dropdown(value="ALL", allow_custom_value=True) for _ in range(3))
 def df_select_callback(df: pd.DataFrame, evt: gr.SelectData):
@@ -612,10 +620,13 @@ def convert_image_coords_to_adobe(pdf_page_width:float, pdf_page_height:float, i
     - image_width: Width of the source image
     - image_height: Height of the source image
     - x1, y1, x2, y2: Coordinates in image space
     Returns:
     - Tuple of converted coordinates (x1, y1, x2, y2) in Adobe PDF space
     '''
     # Calculate scaling factors
     scale_width = pdf_page_width / image_width
@@ -636,12 +647,34 @@ def convert_image_coords_to_adobe(pdf_page_width:float, pdf_page_height:float, i
     return pdf_x1, pdf_y1, pdf_x2, pdf_y2
-def create_xfdf(df:pd.DataFrame, pdf_path:str, pymupdf_doc:object, image_paths:List[str], document_cropboxes:List=[]):
     '''
     Create an xfdf file from a review csv file and a pdf
     '''
     # Create root element
     xfdf = Element('xfdf', xmlns="http://ns.adobe.com/xfdf/", xml_space="preserve")
@@ -651,13 +684,49 @@ def create_xfdf(df:pd.DataFrame, pdf_path:str, pymupdf_doc:object, image_paths:L
     # Add annots
     annots = SubElement(xfdf, 'annots')
-    for _, row in df.iterrows():
         page_python_format = int(row["page"])-1
         pymupdf_page = pymupdf_doc.load_page(page_python_format)
-        # Load cropbox sizes
         if document_cropboxes:
             #print("Document cropboxes:", document_cropboxes)
@@ -672,13 +741,12 @@ def create_xfdf(df:pd.DataFrame, pdf_path:str, pymupdf_doc:object, image_paths:L
         else:
             print("Document cropboxes not found.")
         pdf_page_height = pymupdf_page.mediabox.height
         pdf_page_width = pymupdf_page.mediabox.width
         image = image_paths[page_python_format]
-        #print("image:", image)
         if isinstance(image, str):
             image = Image.open(image)
@@ -695,16 +763,22 @@ def create_xfdf(df:pd.DataFrame, pdf_path:str, pymupdf_doc:object, image_paths:L
         redact_annot.set('page', str(int(row['page']) - 1))
         # Convert coordinates
-        x1, y1, x2, y2 = convert_image_coords_to_adobe(
-            pdf_page_width,
-            pdf_page_height,
-            image_page_width,
-            image_page_height,
-            row['xmin'],
-            row['ymin'],
-            row['xmax'],
-            row['ymax']
-        )
         if CUSTOM_BOX_COLOUR == "grey":
             colour_str = "0.5,0.5,0.5"
@@ -756,12 +830,13 @@ def create_xfdf(df:pd.DataFrame, pdf_path:str, pymupdf_doc:object, image_paths:L
     return xml_str
-def convert_df_to_xfdf(input_files:List[str], pdf_doc, image_paths:List[str], output_folder:str = output_folder, document_cropboxes:List=[]):
     '''
     Load in files to convert a review file into an Adobe comment file format
     '''
     output_paths = []
     pdf_name = ""
     if isinstance(input_files, str):
         file_paths_list = [input_files]
@@ -778,29 +853,29 @@ def convert_df_to_xfdf(input_files:List[str], pdf_doc, image_paths:List[str], ou
         else:
             file_path = file.name
-    file_path_name = get_file_name_without_type(file_path)
-    file_path_end = detect_file_type(file_path)
-    if file_path_end == "pdf":
-        pdf_name = os.path.basename(file_path)
-    if file_path_end == "csv":
-        # If no pdf name, just get the name of the file path
-        if not pdf_name:
-            pdf_name = file_path_name
-        # Read CSV file
-        df = pd.read_csv(file_path)
-        df.fillna('', inplace=True)  # Replace NaN with an empty string
-        xfdf_content = create_xfdf(df, pdf_name, pdf_doc, image_paths, document_cropboxes)
-        output_path = output_folder + file_path_name + "_adobe.xfdf"
-        with open(output_path, 'w', encoding='utf-8') as f:
-            f.write(xfdf_content)
-        output_paths.append(output_path)
     return output_paths
@@ -841,7 +916,7 @@ def convert_adobe_coords_to_image(pdf_page_width:float, pdf_page_height:float, i
     return image_x1, image_y1, image_x2, image_y2
-def parse_xfdf(xfdf_path):
     '''
     Parse the XFDF file and extract redaction annotations.

+import os
+import re
 import gradio as gr
 import pandas as pd
 import numpy as np
 from typing import List
 from gradio_image_annotation import image_annotator
 from gradio_image_annotation.image_annotator import AnnotatedImageData
+from pymupdf import Document, Rect
 import pymupdf
+#from fitz
 from PIL import ImageDraw, Image
 from collections import defaultdict
+from tools.config import output_folder, CUSTOM_BOX_COLOUR, MAX_IMAGE_PIXELS
+from tools.file_conversion import is_pdf, convert_annotation_json_to_review_df, convert_review_df_to_annotation_json
+from tools.helper_functions import get_file_name_without_type,  detect_file_type
+from tools.file_redaction import redact_page_with_pymupdf
+# When the configured MAX_IMAGE_PIXELS is falsy, set PIL's Image.MAX_IMAGE_PIXELS to None (no pixel-count limit).
+# NOTE(review): a truthy MAX_IMAGE_PIXELS from tools.config is not assigned to PIL on this line - confirm it is applied elsewhere.
+if not MAX_IMAGE_PIXELS: Image.MAX_IMAGE_PIXELS = None
 def decrease_page(number:int):
     '''
     recogniser_dataframe_out = recogniser_dataframe_modified
     try:
+        review_dataframe = convert_annotation_json_to_review_df(image_annotator_object, review_df, page_sizes)
         recogniser_entities_for_drop = update_dropdown_list_based_on_dataframe(review_dataframe, "label")
         recogniser_entities_drop = gr.Dropdown(value=recogniser_dropdown_value, choices=recogniser_entities_for_drop, allow_custom_value=True, interactive=True)
     return recogniser_dataframe_out, recogniser_dataframe_out, recogniser_entities_drop, recogniser_entities_list, text_entities_drop, page_entities_drop
 def update_recogniser_dataframes(image_annotator_object:AnnotatedImageData, recogniser_dataframe_modified:pd.DataFrame, recogniser_entities_dropdown_value:str="ALL", text_dropdown_value:str="ALL", page_dropdown_value:str="ALL", review_df:pd.DataFrame=[], page_sizes:list[str]=[]):
     '''
     Update recogniser dataframe information that appears alongside the pdf pages on the review screen.
     return recogniser_entities_list, recogniser_dataframe_out, recogniser_dataframe_modified, recogniser_entities_drop, text_entities_drop, page_entities_drop
 def undo_last_removal(backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base):
     '''
     Hand back the three backup objects exactly as received, in the same order.
     Returns:
     - Tuple of (backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base)
     '''
     restored_objects = (backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base)
     return restored_objects
         # Keep only the rows that do not have a match in selected_rows_df
         out_review_df = merged_df[merged_df['_merge'] == 'left_only'].drop(columns=['_merge'])
+        out_image_annotations_state = convert_review_df_to_annotation_json(out_review_df, image_file_paths, page_sizes)
+        out_recogniser_entity_dataframe_base = out_review_df[["page", "label", "text"]]
+    # Either there is nothing left in the selection dataframe, or the review dataframe
     else:
         out_review_df = review_df
+        out_recogniser_entity_dataframe_base = recogniser_entity_dataframe_base
+        out_image_annotations_state = []
+        for page_no, page in enumerate(image_file_paths):
+            annotation = {}
+            annotation["image"] = image_file_paths[page_no]
+            annotation["boxes"] = []
+            out_image_annotations_state.append(annotation)
+    return out_review_df, out_image_annotations_state, out_recogniser_entity_dataframe_base, backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base
 def update_annotator(image_annotator_object:AnnotatedImageData,
                      page_num:int,
     if not current_page:
         current_page = 1
     image_annotator_object['image'] = all_image_annotations[previous_page - 1]["image"]
     if clear_all == False:
             #print("page_sizes before conversion in apply redactions:", page_sizes)
             # Convert json to csv and also save this
+            review_df = convert_annotation_json_to_review_df(all_image_annotations, review_file_state, page_sizes=page_sizes)[["image",	"page",	"text",	"label","color", "xmin", "ymin", "xmax",	"ymax"]]
             out_review_file_file_path = output_folder + file_name_with_ext + '_review_file.csv'
+            #print("Saving review file after convert_annotation_json_to_review_df function in apply redactions")
             review_df.to_csv(out_review_file_file_path, index=None)
             output_files.append(out_review_file_file_path)
     return filtered_df, recogniser_entities_drop, page_entities_drop
 def reset_dropdowns():
+    '''
+    Return a tuple of three Gradio dropdown objects, each with value 'ALL' and custom values allowed.
+    '''
     return gr.Dropdown(value="ALL", allow_custom_value=True), gr.Dropdown(value="ALL", allow_custom_value=True), gr.Dropdown(value="ALL", allow_custom_value=True)
 def df_select_callback(df: pd.DataFrame, evt: gr.SelectData):
     - image_width: Width of the source image
     - image_height: Height of the source image
     - x1, y1, x2, y2: Coordinates in image space
+    - page_sizes: List of dicts containing sizes of page as pymupdf page or PIL image
     Returns:
     - Tuple of converted coordinates (x1, y1, x2, y2) in Adobe PDF space
     '''
     # Calculate scaling factors
     scale_width = pdf_page_width / image_width
     return pdf_x1, pdf_y1, pdf_x2, pdf_y2
+def convert_pymupdf_coords_to_adobe(x1: float, y1: float, x2: float, y2: float):
+    """
+    Prepare box coordinates given in PyMuPDF (fitz) space for Adobe PDF export.
+    The x values are returned unchanged and the two y values are reordered so the smaller comes first. No scaling or offset is applied.
+    NOTE(review): PyMuPDF documents a top-left page origin while PDF user space uses a bottom-left origin; this function takes no page height, so it cannot flip the y axis - confirm callers do not need that.
+    Parameters:
+    - x1, y1, x2, y2: Coordinates in PyMuPDF space
+    Returns:
+    - Tuple (x1, min(y1, y2), x2, max(y1, y2))
+    """
+    # x values pass through as given; unlike y, they are not reordered
+    pdf_x1, pdf_x2 = x1, x2
+    # Ensure the first y value returned is the smaller one and the second is the larger
+    pdf_y1, pdf_y2 = min(y1, y2), max(y1, y2)
+    return pdf_x1, pdf_y1, pdf_x2, pdf_y2
+def create_xfdf(review_file_df:pd.DataFrame, pdf_path:str, pymupdf_doc:object, image_paths:List[str], document_cropboxes:List=[], page_sizes:List[dict]=[]):
     '''
     Create an xfdf file from a review csv file and a pdf
     '''
+    pages_are_images = True
     # Create root element
     xfdf = Element('xfdf', xmlns="http://ns.adobe.com/xfdf/", xml_space="preserve")
     # Add annots
     annots = SubElement(xfdf, 'annots')
+    # Check if page size object exists, and if current coordinates are in relative format or image coordinates format.
+    if page_sizes:
+        page_sizes_df = pd.DataFrame(page_sizes)
+        # If there are no image coordinates, then convert coordinates to pymupdf coordinates prior to export
+        if len(page_sizes_df.loc[page_sizes_df["image_width"].isnull(),"image_width"]) == len(page_sizes_df["image_width"]):
+            print("No image dimensions found, using pymupdf coordinates for conversion.")
+            if "mediabox_width" not in review_file_df.columns:
+                    review_file_df = review_file_df.merge(page_sizes_df, how="left", on = "page")
+            # If all coordinates are less or equal to one, this is a relative page scaling - change back to image coordinates
+            if review_file_df["xmin"].max() <= 1 and review_file_df["xmax"].max() <= 1 and review_file_df["ymin"].max() <= 1 and review_file_df["ymax"].max() <= 1:
+                review_file_df["xmin"] = review_file_df["xmin"] * review_file_df["mediabox_width"]
+                review_file_df["xmax"] = review_file_df["xmax"] * review_file_df["mediabox_width"]
+                review_file_df["ymin"] = review_file_df["ymin"] * review_file_df["mediabox_height"]
+                review_file_df["ymax"] = review_file_df["ymax"] * review_file_df["mediabox_height"]
+            pages_are_images = False
+        # If no nulls, then can do image coordinate conversion
+        elif len(page_sizes_df.loc[page_sizes_df["image_width"].isnull(),"image_width"]) == 0:
+            if "image_width" not in review_file_df.columns:
+                    review_file_df = review_file_df.merge(page_sizes_df, how="left", on = "page")
+            # If all coordinates are less or equal to one, this is a relative page scaling - change back to image coordinates
+            if review_file_df["xmin"].max() <= 1 and review_file_df["xmax"].max() <= 1 and review_file_df["ymin"].max() <= 1 and review_file_df["ymax"].max() <= 1:
+                review_file_df["xmin"] = review_file_df["xmin"] * review_file_df["image_width"]
+                review_file_df["xmax"] = review_file_df["xmax"] * review_file_df["image_width"]
+                review_file_df["ymin"] = review_file_df["ymin"] * review_file_df["image_height"]
+                review_file_df["ymax"] = review_file_df["ymax"] * review_file_df["image_height"]
+                pages_are_images = True
+    # Go through each row of the review_file_df, create an entry in the output Adobe xfdf file.
+    for _, row in review_file_df.iterrows():
         page_python_format = int(row["page"])-1
         pymupdf_page = pymupdf_doc.load_page(page_python_format)
+        # Load cropbox sizes. Set cropbox to the original cropbox sizes from when the document was loaded into the app.
         if document_cropboxes:
             #print("Document cropboxes:", document_cropboxes)
         else:
             print("Document cropboxes not found.")
         pdf_page_height = pymupdf_page.mediabox.height
         pdf_page_width = pymupdf_page.mediabox.width
         image = image_paths[page_python_format]
         if isinstance(image, str):
             image = Image.open(image)
         redact_annot.set('page', str(int(row['page']) - 1))
         # Convert coordinates
+        if pages_are_images == True:
+            x1, y1, x2, y2 = convert_image_coords_to_adobe(
+                pdf_page_width,
+                pdf_page_height,
+                image_page_width,
+                image_page_height,
+                row['xmin'],
+                row['ymin'],
+                row['xmax'],
+                row['ymax']
+            )
+        else:
+            x1, y1, x2, y2 = convert_pymupdf_coords_to_adobe(row['xmin'],
+                row['ymin'],
+                row['xmax'],
+                row['ymax'])
         if CUSTOM_BOX_COLOUR == "grey":
             colour_str = "0.5,0.5,0.5"
     return xml_str
+def convert_df_to_xfdf(input_files:List[str], pdf_doc:Document, image_paths:List[str], output_folder:str = output_folder, document_cropboxes:List=[], page_sizes:List[dict]=[]):
     '''
     Load in files to convert a review file into an Adobe comment file format
     '''
     output_paths = []
     pdf_name = ""
+    file_path_name = ""
     if isinstance(input_files, str):
         file_paths_list = [input_files]
         else:
             file_path = file.name
+        file_path_name = get_file_name_without_type(file_path)
+        file_path_end = detect_file_type(file_path)
+        if file_path_end == "pdf":
+            pdf_name = os.path.basename(file_path)
+        if file_path_end == "csv":
+            # If no pdf name, just get the name of the file path
+            if not pdf_name:
+                pdf_name = file_path_name
+            # Read CSV file
+            review_file_df = pd.read_csv(file_path)
+            review_file_df.fillna('', inplace=True)  # Replace NaN in review file with an empty string
+            xfdf_content = create_xfdf(review_file_df, pdf_name, pdf_doc, image_paths, document_cropboxes, page_sizes)
+            output_path = output_folder + file_path_name + "_adobe.xfdf"
+            with open(output_path, 'w', encoding='utf-8') as f:
+                f.write(xfdf_content)
+            output_paths.append(output_path)
     return output_paths
     return image_x1, image_y1, image_x2, image_y2
+def parse_xfdf(xfdf_path:str):
     '''
     Parse the XFDF file and extract redaction annotations.